2. model_selection_precomputed can now save all results as human-readable text. 3. Modify pygraph.utils.utils.floydTransformation and pygraph.utils.graphdataset.get_dataset_attributes. v0.1
@@ -1,7 +1,9 @@
# Jupyter Notebook
.ipynb_checkpoints
datasets
notebooks/results
datasets/*
!datasets/ds.py
notebooks/results/*
requirements/*
__pycache__
##*#
@@ -0,0 +1,112 @@
dslist = [
    {
        'name': 'Acyclic',
        'dataset': '../datasets/acyclic/dataset_bps.ds',
        'task': 'regression'
    },  # node_labeled
    {
        'name': 'COIL-DEL',
        'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'
    },  # edge_labeled
    {
        'name': 'PAH',
        'dataset': '../datasets/PAH/dataset.ds',
    },  # unlabeled
    {
        'name': 'Mutagenicity',
        'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'
    },  # fully_labeled
    {
        'name': 'MAO',
        'dataset': '../datasets/MAO/dataset.ds',
    },
    {
        'name': 'MUTAG',
        'dataset': '../datasets/MUTAG/MUTAG.mat',
        'extra_params': {
            'am_sp_al_nl_el': [0, 0, 3, 1, 2]
        }
    },
    {
        'name': 'Alkane',
        'dataset': '../datasets/Alkane/dataset.ds',
        'task': 'regression',
        'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',
    },
    {
        'name': 'BZR',
        'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'
    },
    {
        'name': 'COX2',
        'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'
    },
    {
        'name': 'ENZYMES',
        'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
    },
    {
        'name': 'DHFR',
        'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'
    },
    {
        'name': 'SYNTHETIC',
        'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'
    },
    {
        'name': 'MSRC9',
        'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'
    },
    {
        'name': 'MSRC21',
        'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'
    },
    {
        'name': 'FIRSTMM_DB',
        'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'
    },
    {
        'name': 'PROTEINS',
        'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'
    },
    {
        'name': 'PROTEINS_full',
        'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'
    },
    {
        'name': 'D&D',
        'dataset': '../datasets/D&D/DD.mat',
        'extra_params': {
            'am_sp_al_nl_el': [0, 1, 2, 1, -1]
        }
    },
    {
        'name': 'AIDS',
        'dataset': '../datasets/AIDS/AIDS_A.txt'
    },
    {
        'name': 'NCI1',
        'dataset': '../datasets/NCI1/NCI1.mat',
        'extra_params': {
            'am_sp_al_nl_el': [1, 1, 2, 0, -1]
        }
    },
    {
        'name': 'NCI109',
        'dataset': '../datasets/NCI109/NCI109.mat',
        'extra_params': {
            'am_sp_al_nl_el': [1, 1, 2, 0, -1]
        }
    },
    {
        'name': 'NCI-HIV',
        'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
        'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',
    },
    # # not working below
    # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
    # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
    # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
    # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
@@ -0,0 +1,8 @@
import sys
import pathlib
sys.path.insert(0, "../")
import numpy as np
from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
from datasets.ds import dslist
@@ -1,3 +0,0 @@
import sys
import pathlib
sys.path.insert(0, "../")
@@ -0,0 +1,56 @@
from libs import *
from pygraph.kernels.spKernel import spkernel
dslist = [
    # {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'},  # node_labeled
    # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'},  # edge_labeled
    # {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',},  # unlabeled
    {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},  # fully_labeled
    # {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',},
    # {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
    #  'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},
    # {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
    #  'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',},
    # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'},
    # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'},
    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
    # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'},
    # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'},
    # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'},
    # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'},
    # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},
    # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},
    # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'},
    # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
    #  'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},
    # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},
    # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
    #  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
    # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
    #  'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
    # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
    #  'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',},
    # # not working below
    # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
    # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
    # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
    # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = spkernel
param_grid_precomputed = {}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
              {'alpha': np.logspace(-10, 10, num=41, base=10)}]
for ds in dslist:
    print()
    print(ds['name'])
    model_selection_for_precomputed_kernel(
        ds['dataset'], estimator, param_grid_precomputed,
        (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]),
        (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,
        datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
        extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
    print()
@@ -1,18 +0,0 @@
def deltakernel(condition):
    """Return 1 if condition holds, 0 otherwise.
    Parameters
    ----------
    condition : Boolean
        A condition, according to which the kernel is set to 1 or 0.
    Return
    ------
    kernel : integer
        Delta kernel.
    References
    ----------
    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between labeled graphs. In Proceedings of the 20th International Conference on Machine Learning, Washington, DC, United States, 2003.
    """
    return condition  #(1 if condition else 0)
@@ -1,95 +0,0 @@
"""
@author: linlin
@references: S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
"""
import sys
import pathlib
sys.path.insert(0, "../")
import time
# from collections import Counter
import networkx as nx
import numpy as np
def randomwalkkernel(*args, node_label='atom', edge_label='bond_type', labeled=True, n=10, method=''):
    """Calculate random walk graph kernels.
    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    n : integer
        Longest length of walks.
    method : string
        Method used to compute the random walk kernel. Available methods are 'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'.
    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
    """
    method = method.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]  # arrange all graphs in a list
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    n = int(n)
    start_time = time.time()
    # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
    all_walks = [ find_all_walks_until_length(Gn[i], n, node_label = node_label, edge_label = edge_label, labeled = labeled) for i in range(0, len(Gn)) ]
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            Kmatrix[i][j] = _randomwalkkernel_do(all_walks[i], all_walks[j], node_label = node_label, edge_label = edge_label, labeled = labeled)
            Kmatrix[j][i] = Kmatrix[i][j]
    run_time = time.time() - start_time
    print("\n --- kernel matrix of walk kernel up to %d of size %d built in %s seconds ---" % (n, len(Gn), run_time))
    return Kmatrix, run_time
def _randomwalkkernel_do(walks1, walks2, node_label = 'atom', edge_label = 'bond_type', labeled = True, method=''):
    """Calculate walk graph kernels up to n between 2 graphs.
    Parameters
    ----------
    walks1, walks2 : list
        List of walks in 2 graphs, where for unlabeled graphs, each walk is represented by a list of nodes; while for labeled graphs, each walk is represented by a string consists of labels of nodes and edges on that walk.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    Return
    ------
    kernel : float
        Treelet Kernel between 2 graphs.
    """
    if method == 'sylvester':
        import warnings
        warnings.warn('The Sylvester equation (rather than generalized Sylvester equation) is used; only walks of length 1 is considered.')
        from control import dlyap
        dpg = nx.tensor_product(G1, G2)  # direct product graph
        X = dlyap(A, Q, C)
        pass
    else:
        raise Exception('No computation method specified.')
    return kernel
@@ -16,7 +16,7 @@ from pygraph.utils.utils import getSPGraph
from pygraph.utils.graphdataset import get_dataset_attributes
def spkernel(*args, node_label='atom', edge_weight=None):
def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None):
    """Calculate shortest-path kernels between graphs.
    Parameters
@@ -27,24 +27,50 @@ def spkernel(*args, node_label='atom', edge_weight=None):
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    edge_weight : string
        Edge attribute corresponding to the edge weight.
        Edge attribute name corresponding to the edge weight.
    node_kernels: dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb' for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' for both labels. The first 2 functions take two node labels as parameters, and the 'mix' function takes 4 parameters, a symbolic and a non-symbolic label for each of the two nodes. Each label is in the form of a 2-D array (n_samples, n_features). Each function returns a number as the kernel value. Ignored when nodes are unlabeled.
    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    try:
        some_weight = list(
            nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        weight = edge_label if isinstance(some_weight, float) or isinstance(
            some_weight, int) else None
    except:
        weight = None
    Gn = [nx.to_directed(G) for G in Gn]
    weight = None
    if edge_weight == None:
        print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, float) or isinstance(some_weight, int):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)
        except:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn, attr_names=['node_labeled'], node_label=node_label)
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)
    # remove graphs with no edges, as no sp can be found in their structures, so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))
    start_time = time.time()
@@ -54,44 +80,187 @@ def spkernel(*args, node_label='atom', edge_weight=None):
        for G in tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
    ]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    pbar = tqdm(
        total=((len(Gn) + 1) * len(Gn) / 2),
        desc='calculating kernels',
        file=sys.stdout)
    if ds_attrs['node_labeled']:
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        # cost of a node to itself equals to 0, cost between two disconnected nodes is Inf.
                        if e1[2]['cost'] != 0 and e1[2] != np.Inf and e1[2]['cost'] == e2[2]['cost'] and {
                                Gn[i].nodes[e1[0]][node_label],
                                Gn[i].nodes[e1[1]][node_label]
                        } == {
                                Gn[j].nodes[e2[0]][node_label],
                                Gn[j].nodes[e2[1]][node_label]
                        }:
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]
                pbar.update(1)
        # node symb and non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            if ds_attrs['is_directed']:
                for i in range(0, len(Gn)):
                    for j in range(i, len(Gn)):
                        for e1 in Gn[i].edges(data=True):
                            for e2 in Gn[j].edges(data=True):
                                if e1[2]['cost'] == e2[2]['cost']:
                                    kn = node_kernels['mix']
                                    try:
                                        n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[j].nodes[e2[1]]
                                        kn1 = kn(n11[node_label], n21[node_label], [n11['attributes']], [n21['attributes']]) * kn(n12[node_label], n22[node_label], [n12['attributes']], [n22['attributes']])
                                        Kmatrix[i][j] += kn1
                                    except KeyError:  # missing labels or attributes
                                        pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
            else:
                for i in range(0, len(Gn)):
                    for j in range(i, len(Gn)):
                        for e1 in Gn[i].edges(data=True):
                            for e2 in Gn[j].edges(data=True):
                                if e1[2]['cost'] == e2[2]['cost']:
                                    kn = node_kernels['mix']
                                    try:
                                        # each edge walk is counted twice, starting from both its extreme nodes.
                                        n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[j].nodes[e2[1]]
                                        kn1 = kn(n11[node_label], n21[node_label], [n11['attributes']], [n21['attributes']]) * kn(n12[node_label], n22[node_label], [n12['attributes']], [n22['attributes']])
                                        kn2 = kn(n11[node_label], n22[node_label], [n11['attributes']], [n22['attributes']]) * kn(n12[node_label], n21[node_label], [n12['attributes']], [n21['attributes']])
                                        Kmatrix[i][j] += kn1 + kn2
                                    except KeyError:  # missing labels or attributes
                                        pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
        # node symb labeled
        else:
            if ds_attrs['is_directed']:
                for i in range(0, len(Gn)):
                    for j in range(i, len(Gn)):
                        for e1 in Gn[i].edges(data=True):
                            for e2 in Gn[j].edges(data=True):
                                if e1[2]['cost'] == e2[2]['cost']:
                                    kn = node_kernels['symb']
                                    try:
                                        n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[j].nodes[e2[1]]
                                        kn1 = kn(n11[node_label], n21[node_label]) * kn(n12[node_label], n22[node_label])
                                        Kmatrix[i][j] += kn1
                                    except KeyError:  # missing labels
                                        pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
            else:
                for i in range(0, len(Gn)):
                    for j in range(i, len(Gn)):
                        for e1 in Gn[i].edges(data=True):
                            for e2 in Gn[j].edges(data=True):
                                if e1[2]['cost'] == e2[2]['cost']:
                                    kn = node_kernels['symb']
                                    try:
                                        # each edge walk is counted twice, starting from both its extreme nodes.
                                        n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[j].nodes[e2[1]]
                                        kn1 = kn(n11[node_label], n21[node_label]) * kn(n12[node_label], n22[node_label])
                                        kn2 = kn(n11[node_label], n22[node_label]) * kn(n12[node_label], n21[node_label])
                                        Kmatrix[i][j] += kn1 + kn2
                                    except KeyError:  # missing labels
                                        pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
    else:
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                # kernel_t = [ e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])) \
                #     for e1 in Sn[i].edges(data = True) for e2 in Sn[j].edges(data = True) ]
                # Kmatrix[i][j] = np.sum(kernel_t)
                # Kmatrix[j][i] = Kmatrix[i][j]
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2]['cost'] != 0 and e1[2] != np.Inf and e1[2]['cost'] == e2[2]['cost']:
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]
                pbar.update(1)
        # node non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            if ds_attrs['is_directed']:
                for i in range(0, len(Gn)):
                    for j in range(i, len(Gn)):
                        for e1 in Gn[i].edges(data=True):
                            for e2 in Gn[j].edges(data=True):
                                if e1[2]['cost'] == e2[2]['cost']:
                                    kn = node_kernels['nsymb']
                                    try:
                                        # each edge walk is counted twice, starting from both its extreme nodes.
                                        n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[j].nodes[e2[1]]
                                        kn1 = kn([n11['attributes']], [n21['attributes']]) * kn([n12['attributes']], [n22['attributes']])
                                        Kmatrix[i][j] += kn1
                                    except KeyError:  # missing attributes
                                        pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
            else:
                for i in range(0, len(Gn)):
                    for j in range(i, len(Gn)):
                        for e1 in Gn[i].edges(data=True):
                            for e2 in Gn[j].edges(data=True):
                                if e1[2]['cost'] == e2[2]['cost']:
                                    kn = node_kernels['nsymb']
                                    try:
                                        # each edge walk is counted twice, starting from both its extreme nodes.
                                        n11, n12, n21, n22 = Gn[i].nodes[e1[0]], Gn[i].nodes[e1[1]], Gn[j].nodes[e2[0]], Gn[j].nodes[e2[1]]
                                        kn1 = kn([n11['attributes']], [n21['attributes']]) * kn([n12['attributes']], [n22['attributes']])
                                        kn2 = kn([n11['attributes']], [n22['attributes']]) * kn([n12['attributes']], [n21['attributes']])
                                        Kmatrix[i][j] += kn1 + kn2
                                    except KeyError:  # missing attributes
                                        pass
                        Kmatrix[j][i] = Kmatrix[i][j]
                        pbar.update(1)
        # node unlabeled
        else:
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    for e1 in Gn[i].edges(data=True):
                        for e2 in Gn[j].edges(data=True):
                            if e1[2]['cost'] == e2[2]['cost']:
                                Kmatrix[i][j] += 1
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)
    run_time = time.time() - start_time
    print(
        "--- shortest path kernel matrix of size %d built in %s seconds ---" %
        (len(Gn), run_time))
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))
    return Kmatrix, run_time
    return Kmatrix, run_time, idx
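The new `node_kernels` argument is what the symbolic/non-symbolic branches above dispatch on. Below is a minimal sketch (not part of the patch) of a compatible dictionary, assuming the relocated `deltakernel` is importable from `pygraph.utils.kernels` (module path assumed; see the new kernels module further down) and using scikit-learn's `rbf_kernel` for the non-symbolic part; the helper names are illustrative only.

```python
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.utils.kernels import deltakernel  # module path assumed


def rbf_on_attrs(attrs1, attrs2, gamma=1.0):
    # spkernel passes node attributes wrapped as 2-D arrays of shape (1, n_features)
    return rbf_kernel(attrs1, attrs2, gamma=gamma)[0, 0]


def mix_kernel(label1, label2, attrs1, attrs2, gamma=1.0):
    # product of the symbolic (delta) and non-symbolic (RBF) parts
    return deltakernel(label1, label2) * rbf_on_attrs(attrs1, attrs2, gamma=gamma)


node_kernels = {
    'symb': deltakernel,    # two symbolic labels -> number
    'nsymb': rbf_on_attrs,  # two attribute arrays -> number
    'mix': mix_kernel,      # symbolic + non-symbolic labels of both nodes -> number
}

# Kmatrix, run_time, idx = spkernel(Gn, node_label='atom', node_kernels=node_kernels)
```

Since model_selection_for_precomputed_kernel forwards each entry of param_grid_precomputed to the estimator as keyword arguments, such a dictionary could presumably also be routed through the grid, e.g. param_grid_precomputed = {'node_kernels': [node_kernels]}.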
@@ -1,219 +0,0 @@
"""
@author: linlin
@references: Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre Baldi. Graph kernels for chemical informatics. Neural networks, 18(8):1093–1110, 2005.
"""
import sys
import pathlib
sys.path.insert(0, "../")
import time
from collections import Counter
import networkx as nx
import numpy as np
def untildpathkernel(*args, node_label='atom', edge_label='bond_type', labeled=True, depth=10, k_func='tanimoto'):
    """Calculate path graph kernels up to depth d between graphs.
    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    depth : integer
        Depth of search. Longest length of paths.
    k_func : function
        A kernel function used using different notions of fingerprint similarity.
    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
    """
    depth = int(depth)
    if len(args) == 1:  # for a list of graphs
        Gn = args[0]
        Kmatrix = np.zeros((len(Gn), len(Gn)))
        start_time = time.time()
        # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
        all_paths = [find_all_paths_until_length(
            Gn[i], depth, node_label=node_label, edge_label=edge_label, labeled=labeled) for i in range(0, len(Gn))]
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                Kmatrix[i][j] = _untildpathkernel_do(
                    all_paths[i], all_paths[j], k_func, node_label=node_label, edge_label=edge_label, labeled=labeled)
                Kmatrix[j][i] = Kmatrix[i][j]
        run_time = time.time() - start_time
        print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---" %
              (depth, len(Gn), run_time))
        return Kmatrix, run_time
    else:  # for only 2 graphs
        start_time = time.time()
        all_paths1 = find_all_paths_until_length(
            args[0], depth, node_label=node_label, edge_label=edge_label, labeled=labeled)
        all_paths2 = find_all_paths_until_length(
            args[1], depth, node_label=node_label, edge_label=edge_label, labeled=labeled)
        kernel = _untildpathkernel_do(
            all_paths1, all_paths2, k_func, node_label=node_label, edge_label=edge_label, labeled=labeled)
        run_time = time.time() - start_time
        print("\n --- path kernel up to %d built in %s seconds ---" %
              (depth, run_time))
        return kernel, run_time
def _untildpathkernel_do(paths1, paths2, k_func, node_label='atom', edge_label='bond_type', labeled=True):
    """Calculate path graph kernels up to depth d between 2 graphs.
    Parameters
    ----------
    paths1, paths2 : list
        List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
    k_func : function
        A kernel function used using different notions of fingerprint similarity.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    Return
    ------
    kernel : float
        Treelet Kernel between 2 graphs.
    """
    all_paths = list(set(paths1 + paths2))
    if k_func == 'tanimoto':
        vector1 = [(1 if path in paths1 else 0) for path in all_paths]
        vector2 = [(1 if path in paths2 else 0) for path in all_paths]
        kernel_uv = np.dot(vector1, vector2)
        kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv)
    else:  # MinMax kernel
        path_count1 = Counter(paths1)
        path_count2 = Counter(paths2)
        vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0)
                   for key in all_paths]
        vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0)
                   for key in all_paths]
        kernel = np.sum(np.minimum(vector1, vector2)) / \
            np.sum(np.maximum(vector1, vector2))
    return kernel
# this method find paths repetively, it could be faster.
def find_all_paths_until_length(G, length, node_label='atom', edge_label='bond_type', labeled=True):
    """Find all paths with a certain maximum length in a graph. A recursive depth first search is applied.
    Parameters
    ----------
    G : NetworkX graphs
        The graph in which paths are searched.
    length : integer
        The maximum length of paths.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    Return
    ------
    path : list
        List of paths retrieved, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
    """
    all_paths = []
    for i in range(0, length + 1):
        new_paths = find_all_paths(G, i)
        if new_paths == []:
            break
        all_paths.extend(new_paths)
    if labeled == True:  # convert paths to strings
        path_strs = []
        for path in all_paths:
            strlist = [G.node[node][node_label] + G[node]
                       [path[path.index(node) + 1]][edge_label] for node in path[:-1]]
            path_strs.append(''.join(strlist) + G.node[path[-1]][node_label])
        return path_strs
    return all_paths
def find_paths(G, source_node, length):
    """Find all paths with a certain length those start from a source node. A recursive depth first search is applied.
    Parameters
    ----------
    G : NetworkX graphs
        The graph in which paths are searched.
    source_node : integer
        The number of the node from where all paths start.
    length : integer
        The length of paths.
    Return
    ------
    path : list of list
        List of paths retrieved, where each path is represented by a list of nodes.
    """
    return [[source_node]] if length == 0 else \
        [[source_node] + path for neighbor in G[source_node]
         for path in find_paths(G, neighbor, length - 1) if source_node not in path]
def find_all_paths(G, length):
    """Find all paths with a certain length in a graph. A recursive depth first search is applied.
    Parameters
    ----------
    G : NetworkX graphs
        The graph in which paths are searched.
    length : integer
        The length of paths.
    Return
    ------
    path : list of list
        List of paths retrieved, where each path is represented by a list of nodes.
    """
    all_paths = []
    for node in G:
        all_paths.extend(find_paths(G, node, length))
    # The following process is not carried out according to the original article
    # all_paths_r = [ path[::-1] for path in all_paths ]
    # # For each path, two presentation are retrieved from its two extremities. Remove one of them.
    # for idx, path in enumerate(all_paths[:-1]):
    #     for path2 in all_paths_r[idx+1::]:
    #         if path == path2:
    #             all_paths[idx] = []
    #             break
    # return list(filter(lambda a: a != [], all_paths))
    return all_paths
@@ -1,262 +0,0 @@
"""
@author: linlin
@references:
    [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: Hardness results and efficient alternatives. Learning Theory and Kernel Machines, pages 129–143, 2003.
"""
import sys
import pathlib
sys.path.insert(0, "../")
import time
from collections import Counter
import networkx as nx
import numpy as np
def untilnwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     labeled=True,
                     n=None,
                     compute_method='direct'):
    """Calculate common walk graph kernels up to depth d between graphs.
    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    n : integer
        Longest length of walks.
    compute_method : string
        Method used to compute walk kernel. The Following choices are available:
        'direct' : direct product graph method, as shown in reference [1]. The time complexity is O(n^6) for unlabeled graphs with n vertices.
        'brute' : brute force, simply search for all walks and compare them.
    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to d between 2 graphs.
    """
    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    n = int(n)
    start_time = time.time()
    # direct product graph method
    if compute_method == 'direct':
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                Kmatrix[i][j] = _untilnwalkkernel_direct(
                    Gn[i], Gn[j], node_label, edge_label, labeled)
                Kmatrix[j][i] = Kmatrix[i][j]
    # search all paths use brute force.
    elif compute_method == 'brute':
        # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset.
        all_walks = [
            find_all_walks_until_length(Gn[i], n, node_label, edge_label,
                                        labeled) for i in range(0, len(Gn))
        ]
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                Kmatrix[i][j] = _untilnwalkkernel_brute(
                    all_walks[i],
                    all_walks[j],
                    node_label=node_label,
                    edge_label=edge_label,
                    labeled=labeled)
                Kmatrix[j][i] = Kmatrix[i][j]
    run_time = time.time() - start_time
    print(
        "\n --- kernel matrix of walk kernel up to %d of size %d built in %s seconds ---"
        % (n, len(Gn), run_time))
    return Kmatrix, run_time
def _untilnwalkkernel_direct(G1, G2, node_label, edge_label, labeled):
    """Calculate walk graph kernels up to n between 2 graphs using direct product graphs.
    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.
    labeled : boolean
        Whether the graphs are labeled.
    Return
    ------
    kernel : float
        Treelet Kernel between 2 graphs.
    """
    # get tensor product / direct product
    gp = nx.tensor_product(G1, G2)
    from matplotlib import pyplot as plt
    nx.draw_networkx(G1)
    plt.show()
    nx.draw_networkx(G2)
    plt.show()
    kernel = 0
    nx.draw_networkx(gp)
    plt.show()
    return kernel
def _untilnwalkkernel_brute(walks1,
                            walks2,
                            node_label='atom',
                            edge_label='bond_type',
                            labeled=True):
    """Calculate walk graph kernels up to n between 2 graphs.
    Parameters
    ----------
    walks1, walks2 : list
        List of walks in 2 graphs, where for unlabeled graphs, each walk is represented by a list of nodes; while for labeled graphs, each walk is represented by a string consists of labels of nodes and edges on that walk.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    Return
    ------
    kernel : float
        Treelet Kernel between 2 graphs.
    """
    counts_walks1 = dict(Counter(walks1))
    counts_walks2 = dict(Counter(walks2))
    all_walks = list(set(walks1 + walks2))
    vector1 = [(counts_walks1[walk] if walk in walks1 else 0)
               for walk in all_walks]
    vector2 = [(counts_walks2[walk] if walk in walks2 else 0)
               for walk in all_walks]
    kernel = np.dot(vector1, vector2)
    return kernel
# this method find walks repetively, it could be faster.
def find_all_walks_until_length(G,
                                length,
                                node_label='atom',
                                edge_label='bond_type',
                                labeled=True):
    """Find all walks with a certain maximum length in a graph. A recursive depth first search is applied.
    Parameters
    ----------
    G : NetworkX graphs
        The graph in which walks are searched.
    length : integer
        The maximum length of walks.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    Return
    ------
    walk : list
        List of walks retrieved, where for unlabeled graphs, each walk is represented by a list of nodes; while for labeled graphs, each walk is represented by a string consists of labels of nodes and edges on that walk.
    """
    all_walks = []
    # @todo: in this way, the time complexity is close to N(d^n+d^(n+1)+...+1), which could be optimized to O(Nd^n)
    for i in range(0, length + 1):
        new_walks = find_all_walks(G, i)
        if new_walks == []:
            break
        all_walks.extend(new_walks)
    if labeled == True:  # convert paths to strings
        walk_strs = []
        for walk in all_walks:
            strlist = [
                G.node[node][node_label] +
                G[node][walk[walk.index(node) + 1]][edge_label]
                for node in walk[:-1]
            ]
            walk_strs.append(''.join(strlist) + G.node[walk[-1]][node_label])
        return walk_strs
    return all_walks
def find_walks(G, source_node, length):
    """Find all walks with a certain length those start from a source node. A recursive depth first search is applied.
    Parameters
    ----------
    G : NetworkX graphs
        The graph in which walks are searched.
    source_node : integer
        The number of the node from where all walks start.
    length : integer
        The length of walks.
    Return
    ------
    walk : list of list
        List of walks retrieved, where each walk is represented by a list of nodes.
    """
    return [[source_node]] if length == 0 else \
        [ [source_node] + walk for neighbor in G[source_node] \
          for walk in find_walks(G, neighbor, length - 1) ]
def find_all_walks(G, length):
    """Find all walks with a certain length in a graph. A recursive depth first search is applied.
    Parameters
    ----------
    G : NetworkX graphs
        The graph in which walks are searched.
    length : integer
        The length of walks.
    Return
    ------
    walk : list of list
        List of walks retrieved, where each walk is represented by a list of nodes.
    """
    all_walks = []
    for node in G:
        all_walks.extend(find_walks(G, node, length))
    ### The following process is not carried out according to the original article
    # all_paths_r = [ path[::-1] for path in all_paths ]
    # # For each path, two presentation are retrieved from its two extremities. Remove one of them.
    # for idx, path in enumerate(all_paths[:-1]):
    #     for path2 in all_paths_r[idx+1::]:
    #         if path == path2:
    #             all_paths[idx] = []
    #             break
    # return list(filter(lambda a: a != [], all_paths))
    return all_walks
@@ -111,7 +111,7 @@ def get_dataset_attributes(Gn,
                if 'attributes' in attrs:
                    return len(attrs['attributes'])
                else:
                    return False
                    return 0
    def get_edge_attr_dim(Gn):
        for G in Gn:
@@ -120,8 +120,8 @@ def get_dataset_attributes(Gn,
                if 'attributes' in e[2]:
                    return len(e[2]['attributes'])
                else:
                    return False
        return False
                    return 0
        return 0
    if attr_names == []:
        attr_names = [
@@ -0,0 +1,83 @@
"""Those that are not graph kernels. We can be kernels for nodes or edges!
"""
def deltakernel(x, y):
    """Delta kernel. Return 1 if x == y, 0 otherwise.
    Parameters
    ----------
    x, y : any
        Two parts to compare.
    Return
    ------
    kernel : integer
        Delta kernel.
    References
    ----------
    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between labeled graphs. In Proceedings of the 20th International Conference on Machine Learning, Washington, DC, United States, 2003.
    """
    return x == y  #(1 if condition else 0)
def gaussiankernel(x, y):
    """Gaussian kernel. Use sklearn.metrics.pairwise.rbf_kernel instead.
    """
    pass
def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1):
    """Sum of a pair of kernels.
    k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22)
    Parameters
    ----------
    k1, k2 : function
        A pair of kernel functions.
    d11, d12:
        Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2.
    d21, d22:
        Inputs of k2.
    lamda1, lamda2: float
        Coefficients of the sum.
    Return
    ------
    kernel : integer
    """
    if d21 == None or d22 == None:
        kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12)
    else:
        kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22)
    return kernel
def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1):
    """Product of a pair of kernels.
    k = lamda * k1(d11, d12) * k2(d21, d22)
    Parameters
    ----------
    k1, k2 : function
        A pair of kernel functions.
    d11, d12:
        Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2.
    d21, d22:
        Inputs of k2.
    lamda: float
        Coefficient of the product.
    Return
    ------
    kernel : integer
    """
    if d21 == None or d22 == None:
        kernel = lamda * k1(d11, d12) * k2(d11, d12)
    else:
        kernel = lamda * k1(d11, d12) * k2(d21, d22)
    return kernel
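A quick illustration (not part of the commit) of how kernelsum and kernelproduct combine two of these building blocks. Since gaussiankernel is left unimplemented above and scikit-learn's rbf_kernel is suggested instead, an RBF stand-in is used here; the label and attribute values are made up.

```python
from sklearn.metrics.pairwise import rbf_kernel

# deltakernel, kernelsum and kernelproduct as defined in the module above
rbf = lambda a, b: rbf_kernel(a, b)[0, 0]

x_symb, y_symb = 'C', 'N'                    # symbolic node labels (hypothetical)
x_attr, y_attr = [[0.1, 0.3]], [[0.2, 0.3]]  # non-symbolic attributes, shape (1, n_features)

# 0.5 * delta(x_symb, y_symb) + 0.5 * rbf(x_attr, y_attr)
k_sum = kernelsum(deltakernel, rbf, x_symb, y_symb, x_attr, y_attr,
                  lamda1=0.5, lamda2=0.5)

# delta(x_symb, y_symb) * rbf(x_attr, y_attr)
k_prod = kernelproduct(deltakernel, rbf, x_symb, y_symb, x_attr, y_attr)
```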
@@ -4,7 +4,8 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||||
param_grid_precomputed, param_grid, | param_grid_precomputed, param_grid, | ||||
model_type, NUM_TRIALS=30, | model_type, NUM_TRIALS=30, | ||||
datafile_y=None, | datafile_y=None, | ||||
extra_params=None): | |||||
extra_params=None, | |||||
ds_name='ds-unknown'): | |||||
"""Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results. | """Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results. | ||||
Parameters | Parameters | ||||
@@ -14,9 +15,9 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||||
estimator : function | estimator : function | ||||
kernel function used to estimate. This function needs to return a gram matrix. | kernel function used to estimate. This function needs to return a gram matrix. | ||||
param_grid_precomputed : dictionary | param_grid_precomputed : dictionary | ||||
Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. | |||||
Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted. | |||||
param_grid : dictionary | param_grid : dictionary | ||||
Dictionary with names (string) of parameters used as penelties as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. | |||||
Dictionary with names (string) of parameters used as penelties as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted. | |||||
model_type : string | model_type : string | ||||
Typr of the problem, can be regression or classification. | Typr of the problem, can be regression or classification. | ||||
NUM_TRIALS : integer | NUM_TRIALS : integer | ||||
@@ -49,7 +50,7 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||||
import sys | import sys | ||||
sys.path.insert(0, "../") | sys.path.insert(0, "../") | ||||
import os | import os | ||||
from os.path import basename | |||||
from os.path import basename, splitext | |||||
from pygraph.utils.graphfiles import loadDataset | from pygraph.utils.graphfiles import loadDataset | ||||
from tqdm import tqdm | from tqdm import tqdm | ||||
tqdm.monitor_interval = 0 | tqdm.monitor_interval = 0 | ||||
@@ -57,291 +58,305 @@ def model_selection_for_precomputed_kernel(datafile, estimator, | |||||
results_dir = '../notebooks/results/' + estimator.__name__ | results_dir = '../notebooks/results/' + estimator.__name__ | ||||
if not os.path.exists(results_dir): | if not os.path.exists(results_dir): | ||||
os.makedirs(results_dir) | os.makedirs(results_dir) | ||||
results_name_pre = results_dir + '/' + basename(datafile) + '_' | |||||
# setup the model type | |||||
model_type = model_type.lower() | |||||
if model_type != 'regression' and model_type != 'classification': | |||||
raise Exception( | |||||
'The model type is incorrect! Please choose from regression or classification.') | |||||
print() | |||||
print('--- This is a %s problem ---' % model_type) | |||||
# open file to save all results for this dataset. | |||||
with open(results_dir + '/' + ds_name + '.txt', 'w') as fresults: | |||||
fresults.write('# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n') | |||||
# Load the dataset | |||||
print() | |||||
print('1. Loading dataset from file...') | |||||
dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params) | |||||
# import matplotlib.pyplot as plt | |||||
# import networkx as nx | |||||
# nx.draw_networkx(dataset[30]) | |||||
# plt.show() | |||||
# Grid of parameters with a discrete number of values for each. | |||||
param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) | |||||
param_list = list(ParameterGrid(param_grid)) | |||||
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt', | |||||
# [[key, value] for key, value in sorted(param_grid_precomputed)]) | |||||
# np.savetxt(results_name_pre + 'param_grid.dt', | |||||
# [[key, value] for key, value in sorted(param_grid)]) | |||||
gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed | |||||
gram_matrix_time = [] # a list to store time to calculate gram matrices | |||||
param_list_pre_revised = [] # list to store param grids precomputed ignoring the useless ones | |||||
# calculate all gram matrices | |||||
print() | |||||
print('2. Calculating gram matrices. This could take a while...') | |||||
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) | |||||
for params_out in param_list_precomputed: | |||||
# setup the model type | |||||
model_type = model_type.lower() | |||||
if model_type != 'regression' and model_type != 'classification': | |||||
raise Exception( | |||||
'The model type is incorrect! Please choose from regression or classification.') | |||||
print() | print() | ||||
if params_out != {}: | |||||
print('gram matrix with parameters', params_out, 'is: ') | |||||
print('--- This is a %s problem ---' % model_type) | |||||
fresults.write('This is a %s problem.\n\n' % model_type) | |||||
Kmatrix, current_run_time = estimator(dataset, **params_out) | |||||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||||
# Load the dataset | |||||
print() | |||||
print('\nI. Loading dataset from file...') | |||||
dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params) | |||||
# import matplotlib.pyplot as plt | |||||
# import networkx as nx | |||||
# nx.draw_networkx(dataset[30]) | |||||
# plt.show() | |||||
# Grid of parameters with a discrete number of values for each. | |||||
param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) | |||||
param_list = list(ParameterGrid(param_grid)) | |||||
# np.savetxt(results_name_pre + 'param_grid_precomputed.dt', | |||||
# [[key, value] for key, value in sorted(param_grid_precomputed)]) | |||||
# np.savetxt(results_name_pre + 'param_grid.dt', | |||||
# [[key, value] for key, value in sorted(param_grid)]) | |||||
gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed | |||||
gram_matrix_time = [] # a list to store time to calculate gram matrices | |||||
param_list_pre_revised = [] # list to store param grids precomputed ignoring the useless ones | |||||
# calculate all gram matrices | |||||
print() | |||||
print('2. Calculating gram matrices. This could take a while...') | |||||
fresults.write('\nI. Gram matrices.\n\n') | |||||
nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) | |||||
for idx, params_out in enumerate(param_list_precomputed): | |||||
rtn_data = estimator(dataset, **params_out) | |||||
Kmatrix = rtn_data[0] | |||||
current_run_time = rtn_data[1] | |||||
if len(rtn_data) == 3: | |||||
idx_trim = rtn_data[2] # the index of trimmed graph list | |||||
y = [y[idx] for idx in idx_trim] | |||||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||||
for i in range(len(Kmatrix)): | |||||
for j in range(i, len(Kmatrix)): | |||||
# if Kmatrix_diag[i] != 0 and Kmatrix_diag[j] != 0: | |||||
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||||
Kmatrix[j][i] = Kmatrix[i][j] | |||||
print() | |||||
if params_out == {}: | |||||
print('the gram matrix is: ') | |||||
fresults.write('the gram matrix is:\n\n') | |||||
else: | |||||
print('the gram matrix with parameters', params_out, 'is: ') | |||||
fresults.write('the gram matrix with parameters %s is:\n\n' % params_out) | |||||
if np.isnan(Kmatrix).any(): # if the matrix contains elements that are not numbers | |||||
nb_gm_ignore += 1 | |||||
print('ignored, as it contains elements that are not numbers.') | |||||
fresults.write('ignored, as it contains elements that are not numbers.\n\n') | |||||
else: | |||||
print(Kmatrix) | |||||
fresults.write(np.array2string(Kmatrix, separator=',', threshold=np.inf, floatmode='unique') + '\n\n') | |||||
plt.matshow(Kmatrix) | |||||
plt.colorbar() | |||||
fig_file_name = results_dir + '/GM[ds]' + ds_name | |||||
if params_out != {}: | |||||
fig_file_name += '[params]' + str(idx) | |||||
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) | |||||
plt.show() | |||||
gram_matrices.append(Kmatrix) | |||||
gram_matrix_time.append(current_run_time) | |||||
param_list_pre_revised.append(params_out) | |||||
print() | |||||
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore)) | |||||
fresults.write('{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)) | |||||
fresults.write('serial numbers of gram matrix figure and their corresponding parameters settings:\n\n') | |||||
fresults.write(''.join(['{}: {}\n'.format(idx, params_out) | |||||
for idx, params_out in enumerate(param_list_precomputed)])) | |||||
for i in range(len(Kmatrix)): | |||||
for j in range(i, len(Kmatrix)): | |||||
# print(Kmatrix[i][j]) | |||||
# if Kmatrix_diag[i] != 0 and Kmatrix_diag[j] != 0: | |||||
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||||
# print(i, j, Kmatrix[i][j], Kmatrix_diag[i], Kmatrix_diag[j]) | |||||
Kmatrix[j][i] = Kmatrix[i][j] | |||||
print() | |||||
print('3. Fitting and predicting using nested cross validation. This could really take a while...') | |||||
# Arrays to store scores | |||||
train_pref = np.zeros( | |||||
(NUM_TRIALS, len(param_list_pre_revised), len(param_list))) | |||||
val_pref = np.zeros( | |||||
(NUM_TRIALS, len(param_list_pre_revised), len(param_list))) | |||||
test_pref = np.zeros( | |||||
(NUM_TRIALS, len(param_list_pre_revised), len(param_list))) | |||||
# Loop for each trial | |||||
pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list), | |||||
desc='calculate performance', file=sys.stdout) | |||||
for trial in range(NUM_TRIALS): # Test set level | |||||
# loop for each outer param tuple | |||||
for index_out, params_out in enumerate(param_list_pre_revised): | |||||
# split gram matrix and y to app and test sets. | |||||
X_app, X_test, y_app, y_test = train_test_split( | |||||
gram_matrices[index_out], y, test_size=0.1) | |||||
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y] | |||||
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y] | |||||
X_app = X_app[:, split_index_app] | |||||
X_test = X_test[:, split_index_app] | |||||
y_app = np.array(y_app) | |||||
y_test = np.array(y_test) | |||||
# loop for each inner param tuple | |||||
for index_in, params_in in enumerate(param_list): | |||||
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial) | |||||
current_train_perf = [] | |||||
current_valid_perf = [] | |||||
current_test_perf = [] | |||||
# For regression use the Kernel Ridge method | |||||
try: | |||||
if model_type == 'regression': | |||||
KR = KernelRidge(kernel='precomputed', **params_in) | |||||
# loop for each split on validation set level | |||||
# validation set level | |||||
for train_index, valid_index in inner_cv.split(X_app): | |||||
KR.fit(X_app[train_index, :] | |||||
[:, train_index], y_app[train_index]) | |||||
# predict on the train, validation and test set | |||||
y_pred_train = KR.predict( | |||||
X_app[train_index, :][:, train_index]) | |||||
y_pred_valid = KR.predict( | |||||
X_app[valid_index, :][:, train_index]) | |||||
y_pred_test = KR.predict(X_test[:, train_index]) | |||||
# root mean squared errors | |||||
current_train_perf.append( | |||||
np.sqrt(mean_squared_error(y_app[train_index], y_pred_train))) | |||||
current_valid_perf.append( | |||||
np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid))) | |||||
current_test_perf.append( | |||||
np.sqrt(mean_squared_error(y_test, y_pred_test))) | |||||
# For clcassification use SVM | |||||
else: | |||||
KR = SVC(kernel='precomputed', **params_in) | |||||
# loop for each split on validation set level | |||||
# validation set level | |||||
for train_index, valid_index in inner_cv.split(X_app): | |||||
KR.fit(X_app[train_index, :] | |||||
[:, train_index], y_app[train_index]) | |||||
# predict on the train, validation and test set | |||||
y_pred_train = KR.predict( | |||||
X_app[train_index, :][:, train_index]) | |||||
y_pred_valid = KR.predict( | |||||
X_app[valid_index, :][:, train_index]) | |||||
y_pred_test = KR.predict( | |||||
X_test[:, train_index]) | |||||
# root mean squared errors | |||||
current_train_perf.append(accuracy_score( | |||||
y_app[train_index], y_pred_train)) | |||||
current_valid_perf.append(accuracy_score( | |||||
y_app[valid_index], y_pred_valid)) | |||||
current_test_perf.append( | |||||
accuracy_score(y_test, y_pred_test)) | |||||
except ValueError: | |||||
print(sys.exc_info()[0]) | |||||
print(params_out, params_in) | |||||
# average performance on inner splits | |||||
train_pref[trial][index_out][index_in] = np.mean( | |||||
current_train_perf) | |||||
val_pref[trial][index_out][index_in] = np.mean( | |||||
current_valid_perf) | |||||
test_pref[trial][index_out][index_in] = np.mean( | |||||
current_test_perf) | |||||
pbar.update(1) | |||||
pbar.clear() | |||||
# np.save(results_name_pre + 'train_pref.dt', train_pref) | |||||
# np.save(results_name_pre + 'val_pref.dt', val_pref) | |||||
# np.save(results_name_pre + 'test_pref.dt', test_pref) | |||||
if np.isnan(Kmatrix).any(): # if the matrix contains elements that are not numbers | |||||
nb_gm_ignore += 1 | |||||
print('ignored, as it contains elements that are not numbers.') | |||||
print() | |||||
print('4. Getting final performance...') | |||||
fresults.write('\nII. Performance.\n\n') | |||||
# averages and confidences of performances on outer trials for each combination of parameters | |||||
average_train_scores = np.mean(train_pref, axis=0) | |||||
average_val_scores = np.mean(val_pref, axis=0) | |||||
average_perf_scores = np.mean(test_pref, axis=0) | |||||
# sample std is used here | |||||
std_train_scores = np.std(train_pref, axis=0, ddof=1) | |||||
std_val_scores = np.std(val_pref, axis=0, ddof=1) | |||||
std_perf_scores = np.std(test_pref, axis=0, ddof=1) | |||||
if model_type == 'regression': | |||||
best_val_perf = np.amin(average_val_scores) | |||||
else:
print(Kmatrix) | |||||
plt.matshow(Kmatrix) | |||||
plt.colorbar() | |||||
fig_name_suffix = '_'.join(['{}-{}'.format(key, val) | |||||
for key, val in sorted(params_out.items())]) | |||||
plt.savefig( | |||||
results_name_pre + 'gram_matrix_{}.png'.format(fig_name_suffix)) | |||||
plt.show() | |||||
gram_matrices.append(Kmatrix) | |||||
gram_matrix_time.append(current_run_time) | |||||
param_list_pre_revised.append(params_out) | |||||
np.save(results_name_pre + 'gram_matrices.dt', gram_matrices) | |||||
np.save(results_name_pre + 'param_list_precomputed.dt', param_list_pre_revised) | |||||
np.save(results_name_pre + 'param_list.dt', param_list) | |||||
print() | |||||
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore)) | |||||
print() | |||||
print('3. Fitting and predicting using nested cross validation. This could really take a while...') | |||||
# Arrays to store scores | |||||
train_pref = np.zeros( | |||||
(NUM_TRIALS, len(param_list_pre_revised), len(param_list))) | |||||
val_pref = np.zeros( | |||||
(NUM_TRIALS, len(param_list_pre_revised), len(param_list))) | |||||
test_pref = np.zeros( | |||||
(NUM_TRIALS, len(param_list_pre_revised), len(param_list))) | |||||
# Loop for each trial | |||||
pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list), | |||||
desc='calculate performance', file=sys.stdout) | |||||
for trial in range(NUM_TRIALS): # Test set level | |||||
# loop for each outer param tuple | |||||
for index_out, params_out in enumerate(param_list_pre_revised): | |||||
# split gram matrix and y to app and test sets. | |||||
X_app, X_test, y_app, y_test = train_test_split( | |||||
gram_matrices[index_out], y, test_size=0.1) | |||||
split_index_app = [y.index(y_i) for y_i in y_app if y_i in y] | |||||
# split_index_test = [y.index(y_i) for y_i in y_test if y_i in y] | |||||
X_app = X_app[:, split_index_app] | |||||
X_test = X_test[:, split_index_app] | |||||
y_app = np.array(y_app) | |||||
y_test = np.array(y_test) | |||||
# loop for each inner param tuple | |||||
for index_in, params_in in enumerate(param_list): | |||||
inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial) | |||||
current_train_perf = [] | |||||
current_valid_perf = [] | |||||
current_test_perf = [] | |||||
# For regression use the Kernel Ridge method | |||||
try: | |||||
if model_type == 'regression': | |||||
KR = KernelRidge(kernel='precomputed', **params_in) | |||||
# loop for each split on validation set level | |||||
# validation set level | |||||
for train_index, valid_index in inner_cv.split(X_app): | |||||
KR.fit(X_app[train_index, :] | |||||
[:, train_index], y_app[train_index]) | |||||
# predict on the train, validation and test set | |||||
y_pred_train = KR.predict( | |||||
X_app[train_index, :][:, train_index]) | |||||
y_pred_valid = KR.predict( | |||||
X_app[valid_index, :][:, train_index]) | |||||
y_pred_test = KR.predict(X_test[:, train_index]) | |||||
# root mean squared errors | |||||
current_train_perf.append( | |||||
np.sqrt(mean_squared_error(y_app[train_index], y_pred_train))) | |||||
current_valid_perf.append( | |||||
np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid))) | |||||
current_test_perf.append( | |||||
np.sqrt(mean_squared_error(y_test, y_pred_test))) | |||||
# For classification use SVM
else: | |||||
KR = SVC(kernel='precomputed', **params_in) | |||||
# loop for each split on validation set level | |||||
# validation set level | |||||
for train_index, valid_index in inner_cv.split(X_app): | |||||
KR.fit(X_app[train_index, :] | |||||
[:, train_index], y_app[train_index]) | |||||
# predict on the train, validation and test set | |||||
y_pred_train = KR.predict( | |||||
X_app[train_index, :][:, train_index]) | |||||
y_pred_valid = KR.predict( | |||||
X_app[valid_index, :][:, train_index]) | |||||
y_pred_test = KR.predict( | |||||
X_test[:, train_index]) | |||||
# root mean squared errors | |||||
current_train_perf.append(accuracy_score( | |||||
y_app[train_index], y_pred_train)) | |||||
current_valid_perf.append(accuracy_score( | |||||
y_app[valid_index], y_pred_valid)) | |||||
current_test_perf.append( | |||||
accuracy_score(y_test, y_pred_test)) | |||||
except ValueError: | |||||
print(sys.exc_info()[0]) | |||||
print(params_out, params_in) | |||||
# average performance on inner splits | |||||
train_pref[trial][index_out][index_in] = np.mean( | |||||
current_train_perf) | |||||
val_pref[trial][index_out][index_in] = np.mean( | |||||
current_valid_perf) | |||||
test_pref[trial][index_out][index_in] = np.mean( | |||||
current_test_perf) | |||||
pbar.update(1) | |||||
pbar.clear() | |||||
np.save(results_name_pre + 'train_pref.dt', train_pref) | |||||
np.save(results_name_pre + 'val_pref.dt', val_pref) | |||||
np.save(results_name_pre + 'test_pref.dt', test_pref) | |||||
# print('val_pref: ', val_pref) ##### | |||||
# print(val_pref.shape) | |||||
print() | |||||
print('4. Getting final performances...') | |||||
# averages and confidences of performances on outer trials for each combination of parameters | |||||
average_train_scores = np.mean(train_pref, axis=0) | |||||
average_val_scores = np.mean(val_pref, axis=0) | |||||
# print('average_val_scores: ', average_val_scores) ##### | |||||
# print(average_val_scores.shape) | |||||
average_perf_scores = np.mean(test_pref, axis=0) | |||||
# sample std is used here | |||||
std_train_scores = np.std(train_pref, axis=0, ddof=1) | |||||
std_val_scores = np.std(val_pref, axis=0, ddof=1) | |||||
std_perf_scores = np.std(test_pref, axis=0, ddof=1) | |||||
if model_type == 'regression': | |||||
best_val_perf = np.amin(average_val_scores) | |||||
else: | |||||
best_val_perf = np.amax(average_val_scores) | |||||
# print() | |||||
# print('best_val_perf: ', best_val_perf) ##### | |||||
# print(best_val_perf.shape) | |||||
best_params_index = np.where(average_val_scores == best_val_perf) | |||||
# print('best_params_index: ', best_params_index) ##### | |||||
#print(best_params_index[0]) | |||||
#print(best_params_index[1]) | |||||
# print(best_params_index.shape) | |||||
best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]] | |||||
best_params_in = [param_list[i] for i in best_params_index[1]] | |||||
# print('best_params_index: ', best_params_index) | |||||
print('best_params_out: ', best_params_out) | |||||
print('best_params_in: ', best_params_in) | |||||
print() | |||||
print('best_val_perf: ', best_val_perf) | |||||
# below: only one performance is reported; multiple best settings might exist
best_val_std = std_val_scores[best_params_index[0] | |||||
[0]][best_params_index[1][0]] | |||||
print('best_val_std: ', best_val_std) | |||||
final_performance = average_perf_scores[best_params_index[0] | |||||
[0]][best_params_index[1][0]] | |||||
final_confidence = std_perf_scores[best_params_index[0] | |||||
[0]][best_params_index[1][0]] | |||||
print('final_performance: ', final_performance) | |||||
print('final_confidence: ', final_confidence) | |||||
train_performance = average_train_scores[best_params_index[0] | |||||
[0]][best_params_index[1][0]] | |||||
train_std = std_train_scores[best_params_index[0] | |||||
[0]][best_params_index[1][0]] | |||||
print('train_performance: ', train_performance) | |||||
print('train_std: ', train_std) | |||||
print() | |||||
average_gram_matrix_time = np.mean(gram_matrix_time) | |||||
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) | |||||
best_gram_matrix_time = gram_matrix_time[best_params_index[0][0]] | |||||
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time)) | |||||
print('time to calculate best gram matrix: ', best_gram_matrix_time, 's') | |||||
# save results to file | |||||
np.savetxt(results_name_pre + 'average_train_scores.dt', | |||||
average_train_scores) | |||||
np.savetxt(results_name_pre + 'average_val_scores', average_val_scores) | |||||
np.savetxt(results_name_pre + 'average_perf_scores.dt', | |||||
average_perf_scores) | |||||
np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) | |||||
np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) | |||||
np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) | |||||
np.save(results_name_pre + 'best_params_index', best_params_index) | |||||
np.save(results_name_pre + 'best_params_pre.dt', best_params_out) | |||||
np.save(results_name_pre + 'best_params_in.dt', best_params_in) | |||||
np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) | |||||
np.save(results_name_pre + 'best_val_std.dt', best_val_std) | |||||
np.save(results_name_pre + 'final_performance.dt', final_performance) | |||||
np.save(results_name_pre + 'final_confidence.dt', final_confidence) | |||||
np.save(results_name_pre + 'train_performance.dt', train_performance) | |||||
np.save(results_name_pre + 'train_std.dt', train_std) | |||||
best_val_perf = np.amax(average_val_scores) | |||||
best_params_index = np.where(average_val_scores == best_val_perf) | |||||
# find smallest val std with best val perf. | |||||
best_val_stds = [std_val_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||||
min_val_std = np.amin(best_val_stds) | |||||
best_params_index = np.where(std_val_scores == min_val_std) | |||||
best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]] | |||||
best_params_in = [param_list[i] for i in best_params_index[1]] | |||||
print('best_params_out: ', best_params_out) | |||||
print('best_params_in: ', best_params_in) | |||||
print() | |||||
print('best_val_perf: ', best_val_perf) | |||||
print('best_val_std: ', min_val_std) | |||||
fresults.write('best settings of hyper-params to build gram matrix: %s\n' % best_params_out) | |||||
fresults.write('best settings of other hyper-params: %s\n\n' % best_params_in) | |||||
fresults.write('best_val_perf: %s\n' % best_val_perf) | |||||
fresults.write('best_val_std: %s\n' % min_val_std) | |||||
final_performance = [average_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||||
final_confidence = [std_perf_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||||
print('final_performance: ', final_performance) | |||||
print('final_confidence: ', final_confidence) | |||||
fresults.write('final_performance: %s\n' % final_performance) | |||||
fresults.write('final_confidence: %s\n' % final_confidence) | |||||
train_performance = [average_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||||
train_std = [std_train_scores[value][best_params_index[1][idx]] for idx, value in enumerate(best_params_index[0])] | |||||
print('train_performance: %s' % train_performance) | |||||
print('train_std: ', train_std) | |||||
fresults.write('train_performance: %s\n' % train_performance) | |||||
fresults.write('train_std: %s\n\n' % train_std) | |||||
np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) | |||||
np.save(results_name_pre + 'average_gram_matrix_time.dt', | |||||
average_gram_matrix_time) | |||||
np.save(results_name_pre + 'std_gram_matrix_time.dt', | |||||
std_gram_matrix_time) | |||||
np.save(results_name_pre + 'best_gram_matrix_time.dt', | |||||
best_gram_matrix_time) | |||||
print() | |||||
average_gram_matrix_time = np.mean(gram_matrix_time) | |||||
std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) | |||||
best_gram_matrix_time = [gram_matrix_time[i] for i in best_params_index[0]] | |||||
ave_bgmt = np.mean(best_gram_matrix_time) | |||||
std_bgmt = np.std(best_gram_matrix_time, ddof=1) | |||||
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||||
.format(average_gram_matrix_time, std_gram_matrix_time)) | |||||
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt)) | |||||
fresults.write('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n' | |||||
.format(average_gram_matrix_time, std_gram_matrix_time)) | |||||
fresults.write('time to calculate best gram matrix: {:.2f}±{:.2f}s\n\n'.format(ave_bgmt, std_bgmt)) | |||||
# # save results to file | |||||
# np.savetxt(results_name_pre + 'average_train_scores.dt', | |||||
# average_train_scores) | |||||
# np.savetxt(results_name_pre + 'average_val_scores', average_val_scores) | |||||
# np.savetxt(results_name_pre + 'average_perf_scores.dt', | |||||
# average_perf_scores) | |||||
# np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) | |||||
# np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) | |||||
# np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) | |||||
# np.save(results_name_pre + 'best_params_index', best_params_index) | |||||
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out) | |||||
# np.save(results_name_pre + 'best_params_in.dt', best_params_in) | |||||
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) | |||||
# np.save(results_name_pre + 'best_val_std.dt', best_val_std) | |||||
# np.save(results_name_pre + 'final_performance.dt', final_performance) | |||||
# np.save(results_name_pre + 'final_confidence.dt', final_confidence) | |||||
# np.save(results_name_pre + 'train_performance.dt', train_performance) | |||||
# np.save(results_name_pre + 'train_std.dt', train_std) | |||||
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) | |||||
# np.save(results_name_pre + 'average_gram_matrix_time.dt', | |||||
# average_gram_matrix_time) | |||||
# np.save(results_name_pre + 'std_gram_matrix_time.dt', | |||||
# std_gram_matrix_time) | |||||
# np.save(results_name_pre + 'best_gram_matrix_time.dt', | |||||
# best_gram_matrix_time) | |||||
# print out as table. | |||||
from collections import OrderedDict | |||||
from tabulate import tabulate | |||||
table_dict = {} | |||||
if model_type == 'regression': | |||||
for param_in in param_list: | |||||
param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) | |||||
else: | |||||
for param_in in param_list: | |||||
param_in['C'] = '{:.2e}'.format(param_in['C']) | |||||
table_dict['params'] = [{**param_out, **param_in} | |||||
for param_in in param_list for param_out in param_list_pre_revised] | |||||
table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out]) | |||||
for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)] | |||||
table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in]) | |||||
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)] | |||||
table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in]) | |||||
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)] | |||||
table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in]) | |||||
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)] | |||||
keyorder = ['params', 'train_perf', 'valid_perf', | |||||
'test_perf', 'gram_matrix_time'] | |||||
print() | |||||
tb_print = tabulate(OrderedDict(sorted(table_dict.items(), | |||||
key=lambda i: keyorder.index(i[0]))), headers='keys') | |||||
print(tb_print) | |||||
fresults.write('table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print) | |||||
# print out as table. | |||||
from collections import OrderedDict | |||||
from tabulate import tabulate | |||||
table_dict = {} | |||||
if model_type == 'regression': | |||||
for param_in in param_list: | |||||
param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) | |||||
else: | |||||
for param_in in param_list: | |||||
param_in['C'] = '{:.2e}'.format(param_in['C']) | |||||
table_dict['params'] = [{**param_out, **param_in} | |||||
for param_in in param_list for param_out in param_list_pre_revised] | |||||
table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out]) | |||||
for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)] | |||||
table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in]) | |||||
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)] | |||||
table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in]) | |||||
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)] | |||||
table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in]) | |||||
for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)] | |||||
keyorder = ['params', 'train_perf', 'valid_perf', | |||||
'test_perf', 'gram_matrix_time'] | |||||
print() | |||||
print(tabulate(OrderedDict(sorted(table_dict.items(), | |||||
key=lambda i: keyorder.index(i[0]))), headers='keys')) | |||||
np.save(results_name_pre + 'results_vs_params.dt', table_dict) | |||||
fresults.close() |
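For reference, the selection rule applied after the loops above can be summarized in a few lines: validation scores are averaged over trials, the (pre-computed, estimator) hyper-parameter pair with the best mean validation score is kept (lowest RMSE for regression, highest accuracy for classification), and the averaged test score at that setting is reported as the final performance. Below is a minimal sketch with synthetic stand-in arrays; all names, shapes and values are illustrative, not part of the library's API.

import numpy as np

rng = np.random.default_rng(0)
NUM_TRIALS, n_out, n_in = 30, 2, 3                   # trials x gram-matrix params x estimator params
val_pref = rng.random((NUM_TRIALS, n_out, n_in))     # stand-in for validation scores
test_pref = rng.random((NUM_TRIALS, n_out, n_in))    # stand-in for test scores
model_type = 'classification'                        # or 'regression'

# average over outer trials
average_val_scores = np.mean(val_pref, axis=0)
average_perf_scores = np.mean(test_pref, axis=0)
std_perf_scores = np.std(test_pref, axis=0, ddof=1)

# pick the best averaged validation score
if model_type == 'regression':
    best_val_perf = np.amin(average_val_scores)      # RMSE: lower is better
else:
    best_val_perf = np.amax(average_val_scores)      # accuracy: higher is better
best_out, best_in = [idx[0] for idx in np.where(average_val_scores == best_val_perf)]

print('best validation score: {:.2f}'.format(best_val_perf))
print('test score at that setting: {:.2f}±{:.2f}'.format(
    average_perf_scores[best_out][best_in], std_perf_scores[best_out][best_in]))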
@@ -0,0 +1,320 @@ | |||||
""" | |||||
@author: linlin | |||||
@references: | |||||
[1] https://github.com/ptrus/suffix-trees/blob/master/suffix_trees/STree.py, 2018.6 | |||||
""" | |||||
import sys | |||||
class STree(): | |||||
"""Class representing the suffix tree. The generalized suffix tree is supported.""" | |||||
def __init__(self, input=''): | |||||
self.root = _SNode() | |||||
self.root.depth = 0 | |||||
self.root.idx = 0 | |||||
self.root.parent = self.root | |||||
self.root._add_suffix_link(self.root) | |||||
if not input == '': | |||||
self.build(input) | |||||
def _check_input(self, input): | |||||
"""Checks the validity of the input. | |||||
In case of an invalid input throws ValueError. | |||||
""" | |||||
if isinstance(input, str): | |||||
return 'st' | |||||
elif isinstance(input, list): | |||||
if all(isinstance(item, str) for item in input): | |||||
return 'gst' | |||||
raise ValueError("String argument should be of type String or" | |||||
" a list of strings") | |||||
def build(self, x): | |||||
"""Builds the Suffix tree on the given input. | |||||
If the input is of type List of Strings: | |||||
Generalized Suffix Tree is built. | |||||
:param x: String or List of Strings | |||||
""" | |||||
type = self._check_input(x) | |||||
if type == 'st': | |||||
x += next(self._terminalSymbolsGenerator()) | |||||
self._build(x) | |||||
if type == 'gst': | |||||
self._build_generalized(x) | |||||
def _build(self, x): | |||||
"""Builds a Suffix tree.""" | |||||
self.word = x | |||||
self._build_McCreight(x) | |||||
def _build_McCreight(self, x): | |||||
"""Builds a Suffix tree using McCreight O(n) algorithm. | |||||
Algorithm based on: | |||||
McCreight, Edward M. "A space-economical suffix tree construction algorithm." - ACM, 1976. | |||||
Implementation based on: | |||||
UH CS - 58093 String Processing Algorithms Lecture Notes | |||||
""" | |||||
u = self.root | |||||
d = 0 | |||||
for i in range(len(x)): | |||||
while u.depth == d and u._has_transition(x[d + i]): | |||||
u = u._get_transition_link(x[d + i]) | |||||
d = d + 1 | |||||
while d < u.depth and x[u.idx + d] == x[i + d]: | |||||
d = d + 1 | |||||
if d < u.depth: | |||||
u = self._create_node(x, u, d) | |||||
self._create_leaf(x, i, u, d) | |||||
if not u._get_suffix_link(): | |||||
self._compute_slink(x, u) | |||||
u = u._get_suffix_link() | |||||
d = d - 1 | |||||
if d < 0: | |||||
d = 0 | |||||
def _create_node(self, x, u, d): | |||||
i = u.idx | |||||
p = u.parent | |||||
v = _SNode(idx=i, depth=d) | |||||
v._add_transition_link(u, x[i + d]) | |||||
u.parent = v | |||||
p._add_transition_link(v, x[i + p.depth]) | |||||
v.parent = p | |||||
return v | |||||
def _create_leaf(self, x, i, u, d): | |||||
w = _SNode() | |||||
w.idx = i | |||||
w.depth = len(x) - i | |||||
u._add_transition_link(w, x[i + d]) | |||||
w.parent = u | |||||
return w | |||||
def _compute_slink(self, x, u): | |||||
d = u.depth | |||||
v = u.parent._get_suffix_link() | |||||
while v.depth < d - 1: | |||||
v = v._get_transition_link(x[u.idx + v.depth + 1]) | |||||
if v.depth > d - 1: | |||||
v = self._create_node(x, v, d - 1) | |||||
u._add_suffix_link(v) | |||||
def _build_Ukkonen(self, x): | |||||
"""Builds a Suffix tree using Ukkonen's online O(n) algorithm. | |||||
Algorithm based on: | |||||
Ukkonen, Esko. "On-line construction of suffix trees." - Algorithmica, 1995. | |||||
""" | |||||
# TODO. | |||||
raise NotImplementedError() | |||||
def _build_generalized(self, xs): | |||||
"""Builds a Generalized Suffix Tree (GST) from the array of strings provided. | |||||
""" | |||||
terminal_gen = self._terminalSymbolsGenerator() | |||||
_xs = ''.join([x + next(terminal_gen) for x in xs]) | |||||
self.word = _xs | |||||
self._generalized_word_starts(xs) | |||||
self._build(_xs) | |||||
self.root._traverse(self._label_generalized) | |||||
def _label_generalized(self, node): | |||||
"""Helper method that labels the nodes of GST with indexes of strings | |||||
found in their descendants. | |||||
""" | |||||
if node.is_leaf(): | |||||
x = {self._get_word_start_index(node.idx)} | |||||
else: | |||||
x = { | |||||
n | |||||
for ns in node.transition_links for n in ns[0].generalized_idxs | |||||
} | |||||
node.generalized_idxs = x | |||||
def _get_word_start_index(self, idx): | |||||
"""Helper method that returns the index of the string based on node's | |||||
starting index""" | |||||
i = 0 | |||||
for _idx in self.word_starts[1:]: | |||||
if idx < _idx: | |||||
return i | |||||
else: | |||||
i += 1 | |||||
return i | |||||
def lcs(self, stringIdxs=-1):
"""Returns the Longest Common Substring of the strings whose indexes are given in stringIdxs.
If stringIdxs is not provided, the LCS of all strings is returned.
:param stringIdxs: Optional: List of indexes of strings.
""" | |||||
if stringIdxs == -1 or not isinstance(stringIdxs, list): | |||||
stringIdxs = set(range(len(self.word_starts))) | |||||
else: | |||||
stringIdxs = set(stringIdxs) | |||||
deepestNode = self._find_lcs(self.root, stringIdxs) | |||||
start = deepestNode.idx | |||||
end = deepestNode.idx + deepestNode.depth | |||||
return self.word[start:end] | |||||
def _find_lcs(self, node, stringIdxs):
"""Helper method that finds the LCS by traversing the labeled GST."""
nodes = [ | |||||
self._find_lcs(n, stringIdxs) for (n, _) in node.transition_links | |||||
if n.generalized_idxs.issuperset(stringIdxs) | |||||
] | |||||
if nodes == []: | |||||
return node | |||||
deepestNode = max(nodes, key=lambda n: n.depth) | |||||
return deepestNode | |||||
def _generalized_word_starts(self, xs): | |||||
"""Helper method returns the starting indexes of strings in GST""" | |||||
self.word_starts = [] | |||||
i = 0 | |||||
for n in range(len(xs)): | |||||
self.word_starts.append(i) | |||||
i += len(xs[n]) + 1 | |||||
def find(self, y): | |||||
"""Returns starting position of the substring y in the string used for | |||||
building the Suffix tree. | |||||
:param y: String | |||||
:return: Index of the starting position of string y in the string used for building the Suffix tree | |||||
-1 if y is not a substring. | |||||
""" | |||||
node = self.root | |||||
while True: | |||||
edge = self._edgeLabel(node, node.parent) | |||||
if edge.startswith(y): | |||||
return node.idx | |||||
i = 0 | |||||
while (i < len(edge) and edge[i] == y[0]): | |||||
y = y[1:] | |||||
i += 1 | |||||
if i != 0: | |||||
if i == len(edge) and y != '': | |||||
pass | |||||
else: | |||||
return -1 | |||||
node = node._get_transition_link(y[0]) | |||||
if not node: | |||||
return -1 | |||||
def find_all(self, y): | |||||
y_input = y | |||||
node = self.root | |||||
while True: | |||||
edge = self._edgeLabel(node, node.parent) | |||||
if edge.startswith(y): | |||||
break | |||||
i = 0 | |||||
while (i < len(edge) and edge[i] == y[0]): | |||||
y = y[1:] | |||||
i += 1 | |||||
if i != 0: | |||||
if i == len(edge) and y != '': | |||||
pass | |||||
else: | |||||
return [] | |||||
node = node._get_transition_link(y[0]) | |||||
if not node: | |||||
return [] | |||||
leaves = node._get_leaves() | |||||
return [n.idx for n in leaves] | |||||
def _edgeLabel(self, node, parent):
"""Helper method, returns the edge label between a node and its parent"""
return self.word[node.idx + parent.depth:node.idx + node.depth] | |||||
def _terminalSymbolsGenerator(self): | |||||
"""Generator of unique terminal symbols used for building the Generalized Suffix Tree. | |||||
Unicode Private Use Areas (U+E000..U+F8FF, U+F0000..U+FFFFD and U+100000..U+10FFFD) are used to ensure that terminal symbols
are not part of the input string.
""" | |||||
py2 = sys.version[0] < '3' | |||||
UPPAs = list( | |||||
list(range(0xE000, 0xF8FF + 1)) + | |||||
list(range(0xF0000, 0xFFFFD + 1)) + | |||||
list(range(0x100000, 0x10FFFD + 1))) | |||||
for i in UPPAs: | |||||
if py2: | |||||
yield (unichr(i)) | |||||
else: | |||||
yield (chr(i)) | |||||
raise ValueError("Too many input strings.")
class _SNode(): | |||||
"""Class representing a Node in the Suffix tree.""" | |||||
def __init__(self, idx=-1, parentNode=None, depth=-1): | |||||
# Links | |||||
self._suffix_link = None | |||||
self.transition_links = [] | |||||
# Properties | |||||
self.idx = idx | |||||
self.depth = depth | |||||
self.parent = parentNode | |||||
self.generalized_idxs = {} | |||||
def __str__(self): | |||||
return ("SNode: idx:" + str(self.idx) + " depth:" + str(self.depth) +
" transitions:" + str(self.transition_links))
def _add_suffix_link(self, snode): | |||||
self._suffix_link = snode | |||||
def _get_suffix_link(self): | |||||
if self._suffix_link is not None:
return self._suffix_link | |||||
else: | |||||
return False | |||||
def _get_transition_link(self, suffix): | |||||
for node, _suffix in self.transition_links: | |||||
if _suffix == '__@__' or suffix == _suffix: | |||||
return node | |||||
return False | |||||
def _add_transition_link(self, snode, suffix=''): | |||||
tl = self._get_transition_link(suffix) | |||||
if tl:  # TODO: improve this.
self.transition_links.remove((tl, suffix)) | |||||
self.transition_links.append((snode, suffix)) | |||||
def _has_transition(self, suffix): | |||||
for node, _suffix in self.transition_links: | |||||
if _suffix == '__@__' or suffix == _suffix: | |||||
return True | |||||
return False | |||||
def is_leaf(self): | |||||
return self.transition_links == [] | |||||
def _traverse(self, f): | |||||
for (node, _) in self.transition_links: | |||||
node._traverse(f) | |||||
f(self) | |||||
def _get_leaves(self): | |||||
if self.is_leaf(): | |||||
return [self] | |||||
else: | |||||
return [ | |||||
x for (n, _) in self.transition_links for x in n._get_leaves() | |||||
] |
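A minimal usage sketch of the suffix-tree class above; the example strings follow the upstream suffix-trees project, and the only assumption is that STree is importable from wherever this file ends up in the package (the snippet runs as-is if appended to the module).

# build a generalized suffix tree over several strings
st = STree(['abeceda', 'abecednik', 'abeabecedabeabeced'])
print(st.lcs())          # longest common substring of all three strings, here 'abeced'
print(st.lcs([0, 1]))    # LCS restricted to the first two strings

# build a plain suffix tree over a single string and query it
single = STree('abcdefghab')
print(single.find('ab'))       # a starting index of one occurrence of 'ab' (0 or 8)
print(single.find_all('ab'))   # all starting indexes, here 0 and 8 (order not guaranteed)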
@@ -62,8 +62,9 @@ def floydTransformation(G, edge_weight=None):
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    for i in range(0, G.number_of_nodes()):
        for j in range(i, G.number_of_nodes()):
            S.add_edge(i, j, cost=spMatrix[i, j])
        for j in range(i + 1, G.number_of_nodes()):
            if spMatrix[i, j] != np.inf:
                S.add_edge(i, j, cost=spMatrix[i, j])
    return S
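The updated inner loop starts at i + 1 (so no self-loop edges are added) and skips node pairs whose shortest-path distance is infinite, so graphs with several connected components no longer receive edges with infinite cost. A small, self-contained sketch of the fixed loop on a toy disconnected graph (this is an illustration, not the library's floydTransformation itself; names mirror the hunk above):

import networkx as nx
import numpy as np

G = nx.Graph()
G.add_nodes_from(range(4))
G.add_edges_from([(0, 1), (1, 2)])       # node 3 stays isolated
spMatrix = nx.floyd_warshall_numpy(G)    # pairwise shortest-path distances, inf if unreachable

S = nx.Graph()
S.add_nodes_from(G.nodes(data=True))
for i in range(0, G.number_of_nodes()):
    for j in range(i + 1, G.number_of_nodes()):   # i + 1: no self-loop edges
        if spMatrix[i, j] != np.inf:              # skip unreachable pairs
            S.add_edge(i, j, cost=spMatrix[i, j])
print(S.edges(data=True))   # edges only within the connected component {0, 1, 2}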