@@ -79,3 +79,9 @@ outputs/ | |||
# PyCharm.
.idea/ | |||
# tests. | |||
gklearn/tests/datasets/ | |||
# Experiments. | |||
gklearn/experiments/datasets/ |
@@ -1,5 +1,5 @@ | |||
# graphkit-learn | |||
[](https://travis-ci.org/jajupmochi/graphkit-learn) | |||
[](https://travis-ci.com/jajupmochi/graphkit-learn) | |||
[](https://ci.appveyor.com/project/jajupmochi/graphkit-learn) | |||
[](https://codecov.io/gh/jajupmochi/graphkit-learn) | |||
[](https://graphkit-learn.readthedocs.io/en/master/?badge=master) | |||
@@ -68,7 +68,7 @@ The docs of the library can be found [here](https://graphkit-learn.readthedocs.i | |||
* [The common walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/common_walk.py) [1] | |||
* Exponential | |||
* Geometric | |||
* [The marginalized kenrel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py) | |||
* [The marginalized kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py) | |||
* With tottering [2] | |||
* Without tottering [7] | |||
* [The generalized random walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/random_walk.py) [3] | |||
@@ -40,6 +40,7 @@ class Dataset(object): | |||
self._edge_attr_dim = None | |||
self._class_number = None | |||
self._ds_name = None | |||
self._task_type = None | |||
if inputs is None: | |||
self._graphs = None | |||
@@ -117,11 +118,16 @@ class Dataset(object): | |||
ds_file = [os.path.join(path, fn) for fn in load_files[0]] | |||
fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None | |||
# Get extra_params. | |||
if 'extra_params' in DATASET_META[ds_name]: | |||
kwargs = DATASET_META[ds_name]['extra_params'] | |||
else: | |||
kwargs = {} | |||
# Get the task type that is associated with the dataset. If it is classification, get the number of classes. | |||
self._get_task_type(ds_name) | |||
self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets, **kwargs).data | |||
self._node_labels = label_names['node_labels'] | |||
@@ -276,7 +282,8 @@ class Dataset(object): | |||
'edge_attr_dim', | |||
'class_number', | |||
'all_degree_entropy', | |||
'ave_degree_entropy' | |||
'ave_degree_entropy', | |||
'task_type'
] | |||
# dataset size | |||
@@ -408,7 +415,7 @@ class Dataset(object): | |||
if 'class_number' in keys: | |||
if self._class_number is None: | |||
self._class_number = self._get_class_number() | |||
self._class_number = self._get_class_num() | |||
infos['class_number'] = self._class_number | |||
if 'node_attr_dim' in keys: | |||
@@ -437,6 +444,11 @@ class Dataset(object): | |||
base = None | |||
infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) | |||
if 'task_type' in keys: | |||
if self._task_type is None: | |||
self._task_type = self._get_task_type(self._ds_name)
infos['task_type'] = self._task_type | |||
return infos | |||
@@ -790,6 +802,13 @@ class Dataset(object): | |||
return degree_entropy | |||
def _get_task_type(self, ds_name): | |||
if 'task_type' in DATASET_META[ds_name]: | |||
self._task_type = DATASET_META[ds_name]['task_type'] | |||
if self._task_type == 'classification' and self._class_number is None and 'class_number' in DATASET_META[ds_name]: | |||
self._class_number = DATASET_META[ds_name]['class_number'] | |||
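For reference, a minimal sketch of how this lookup is expected to behave, assuming a hypothetical DATASET_META entry of the shape below (the real metadata may carry more fields):

# Hypothetical metadata entry, for illustration only.
meta = {'MUTAG': {'task_type': 'classification', 'class_number': 2}}

def lookup_task_type(ds_name, meta):
    entry = meta.get(ds_name, {})
    task_type = entry.get('task_type')        # e.g. 'classification' or 'regression'
    class_number = entry.get('class_number')  # only meaningful for classification datasets
    return task_type, class_number

print(lookup_task_type('MUTAG', meta))  # ('classification', 2)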
@property | |||
def graphs(self): | |||
return self._graphs | |||
@@ -13,7 +13,7 @@ import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import time | |||
from utils import get_dataset, set_edit_cost_consts | |||
from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation | |||
import sys | |||
from group_results import group_trials, check_group_existence, update_group_marker | |||
@@ -37,7 +37,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
# the distance between non-symbolic node/edge labels is computed by euclidean distance. | |||
'attr_distance': 'euclidean', | |||
'ratio_runs_from_initial_solutions': 0.25, | |||
# parallel threads. Do not work if mpg_options['parallel'] = False. | |||
# parallel threads. Set to 1 automatically if parallel=True in compute_geds(). | |||
'threads': multiprocessing.cpu_count(), | |||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
} | |||
@@ -98,7 +98,7 @@ def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
# Group trials and Remove single files. | |||
# Group trials and remove single files. | |||
# @todo: if the program stops between the following lines, then there may be errors. | |||
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) | |||
@@ -111,21 +111,25 @@ def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for ratio in ratio_list: | |||
for params in list(param_grid): | |||
print() | |||
print('Ratio:', ratio) | |||
for num_solutions in num_solutions_list: | |||
print() | |||
print('# of solutions:', num_solutions) | |||
save_trials_as_group(dataset, ds_name, num_solutions, ratio) | |||
print(params) | |||
save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio']) | |||
def get_param_lists(ds_name, test=False): | |||
if test: | |||
num_solutions_list = [1, 10, 20, 30, 40, 50] | |||
def get_param_lists(ds_name, mode='test'): | |||
if mode == 'test': | |||
num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] | |||
ratio_list = [10] | |||
return num_solutions_list, ratio_list | |||
elif mode == 'simple': | |||
from sklearn.model_selection import ParameterGrid | |||
param_grid = ParameterGrid([ | |||
{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]}, | |||
{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]) | |||
# print(list(param_grid)) | |||
if ds_name == 'AIDS_symb': | |||
num_solutions_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
@@ -133,7 +137,7 @@ def get_param_lists(ds_name, test=False): | |||
num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] | |||
return num_solutions_list, ratio_list | |||
return param_grid | |||
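For clarity, ParameterGrid expands a list of sub-grids into a flat sequence of parameter dicts; a shortened sketch of the grid built above:

from sklearn.model_selection import ParameterGrid

# Shortened values, just to show the expansion.
grid = ParameterGrid([
    {'num_solutions': [1, 2, 4], 'ratio': [10]},
    {'num_solutions': [10], 'ratio': [0.1, 0.5, 1]},
])
for params in grid:
    print(params)
# {'num_solutions': 1, 'ratio': 10}
# {'num_solutions': 2, 'ratio': 10}
# ... then {'num_solutions': 10, 'ratio': 0.1}, and so on.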
if __name__ == '__main__': | |||
@@ -141,7 +145,7 @@ if __name__ == '__main__': | |||
ds_name_list = sys.argv[1:] | |||
else: | |||
ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] | |||
# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled'] | |||
# ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] | |||
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' | |||
@@ -151,5 +155,5 @@ if __name__ == '__main__': | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
num_solutions_list, ratio_list = get_param_lists(ds_name, test=False) | |||
param_grid = get_param_lists(ds_name, mode='simple') | |||
results_for_a_dataset(ds_name) |
@@ -16,12 +16,12 @@ from gklearn.experiments import DATASET_ROOT | |||
def get_dataset(ds_name): | |||
# The node/edge labels that will not be used in the computation. | |||
# if ds_name == 'MAO': | |||
# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||
# if ds_name == 'Monoterpenoides': | |||
# irrelevant_labels = {'edge_labels': ['valence']} | |||
# elif ds_name == 'MUTAG': | |||
# irrelevant_labels = {'edge_labels': ['label_0']} | |||
# if ds_name == 'MAO': | |||
# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||
# if ds_name == 'Monoterpenoides': | |||
# irrelevant_labels = {'edge_labels': ['valence']} | |||
# elif ds_name == 'MUTAG': | |||
# irrelevant_labels = {'edge_labels': ['label_0']} | |||
if ds_name == 'AIDS_symb': | |||
irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']} | |||
ds_name = 'AIDS' | |||
@@ -49,34 +49,36 @@ def set_edit_cost_consts(ratio, node_labeled=True, edge_labeled=True, mode='unif | |||
def nested_keys_exists(element, *keys): | |||
''' | |||
Check if *keys (nested) exists in `element` (dict). | |||
''' | |||
if not isinstance(element, dict): | |||
raise AttributeError('keys_exists() expects dict as first argument.') | |||
if len(keys) == 0: | |||
raise AttributeError('keys_exists() expects at least two arguments, one given.') | |||
_element = element | |||
for key in keys: | |||
try: | |||
_element = _element[key] | |||
except KeyError: | |||
return False | |||
return True | |||
''' | |||
Check if *keys (nested) exists in `element` (dict). | |||
''' | |||
if not isinstance(element, dict): | |||
raise AttributeError('keys_exists() expects dict as first argument.') | |||
if len(keys) == 0: | |||
raise AttributeError('keys_exists() expects at least two arguments, one given.') | |||
_element = element | |||
for key in keys: | |||
try: | |||
_element = _element[key] | |||
except KeyError: | |||
return False | |||
return True | |||
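A quick usage example of the helper above:

settings = {'edit_costs': {'node': {'insertion': 1}}}
print(nested_keys_exists(settings, 'edit_costs', 'node', 'insertion'))  # True
print(nested_keys_exists(settings, 'edit_costs', 'edge'))               # False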
# Check average relative error along elements in two ged matrices. | |||
def matrices_ave_relative_error(m1, m2): | |||
error = 0 | |||
base = 0 | |||
for i in range(m1.shape[0]): | |||
for j in range(m1.shape[1]): | |||
error += np.abs(m1[i, j] - m2[i, j]) | |||
base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2 | |||
error = 0 | |||
base = 0 | |||
for i in range(m1.shape[0]): | |||
for j in range(m1.shape[1]): | |||
error += np.abs(m1[i, j] - m2[i, j]) | |||
# base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) | |||
base += (m1[i, j] + m2[i, j]) # Require only 25% of the time of "base += (np.abs(m1[i, j]) + np.abs(m2[i, j]))". | |||
return error / base | |||
base = base / 2 | |||
return error / base | |||
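Dropping np.abs from the base is only valid because GED matrices are entry-wise non-negative; a small worked example under that assumption, using the function above:

import numpy as np

m1 = np.array([[0., 2.], [2., 0.]])
m2 = np.array([[0., 3.], [1., 0.]])
# error = |0-0| + |2-3| + |2-1| + |0-0| = 2
# base  = ((0+0) + (2+3) + (2+1) + (0+0)) / 2 = 4
print(matrices_ave_relative_error(m1, m2))  # 0.5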
def compute_relative_error(ged_mats): | |||
@@ -92,9 +94,9 @@ def compute_relative_error(ged_mats): | |||
errors = [] | |||
for i, mat in enumerate(ged_mats): | |||
err = matrices_ave_relative_error(mat, ged_mat_s) | |||
# if not per_correct: | |||
# print('matrix # ', str(i)) | |||
# pass | |||
# if not per_correct: | |||
# print('matrix # ', str(i)) | |||
# pass | |||
errors.append(err) | |||
else: | |||
errors = [0] | |||
@@ -107,11 +109,11 @@ def parse_group_file_name(fn): | |||
key1 = splits_all[1] | |||
pos2 = splits_all[2].rfind('_') | |||
# key2 = splits_all[2][:pos2] | |||
# key2 = splits_all[2][:pos2] | |||
val2 = splits_all[2][pos2+1:] | |||
pos3 = splits_all[3].rfind('_') | |||
# key3 = splits_all[3][:pos3] | |||
# key3 = splits_all[3][:pos3] | |||
val3 = splits_all[3][pos3+1:] + '.' + splits_all[4] | |||
return key1, val2, val3 | |||
@@ -232,7 +234,7 @@ def set_axis_style(ax): | |||
ax.tick_params(labelsize=8, color='w', pad=1, grid_color='w') | |||
ax.tick_params(axis='x', pad=-2) | |||
ax.tick_params(axis='y', labelrotation=-40, pad=-2) | |||
# ax.zaxis._axinfo['juggled'] = (1, 2, 0) | |||
# ax.zaxis._axinfo['juggled'] = (1, 2, 0) | |||
ax.set_xlabel(ax.get_xlabel(), fontsize=10, labelpad=-3) | |||
ax.set_ylabel(ax.get_ylabel(), fontsize=10, labelpad=-2, rotation=50) | |||
ax.set_zlabel(ax.get_zlabel(), fontsize=10, labelpad=-2) | |||
@@ -240,16 +242,99 @@ def set_axis_style(ax): | |||
return | |||
def dichotomous_permutation(arr, layer=0): | |||
import math | |||
# def seperate_arr(arr, new_arr): | |||
# if (length % 2) == 0: | |||
# half = int(length / 2) | |||
# new_arr += [arr[half - 1], arr[half]] | |||
# subarr1 = [arr[i] for i in range(1, half - 1)] | |||
# else: | |||
# half = math.floor(length / 2) | |||
# new_arr.append(arr[half]) | |||
# subarr1 = [arr[i] for i in range(1, half)] | |||
# subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
# subarrs = [subarr1, subarr2] | |||
# return subarrs | |||
if layer == 0: | |||
length = len(arr) | |||
if length <= 2: | |||
return arr | |||
new_arr = [arr[0], arr[-1]] | |||
if (length % 2) == 0: | |||
half = int(length / 2) | |||
new_arr += [arr[half - 1], arr[half]] | |||
subarr1 = [arr[i] for i in range(1, half - 1)] | |||
else: | |||
half = math.floor(length / 2) | |||
new_arr.append(arr[half]) | |||
subarr1 = [arr[i] for i in range(1, half)] | |||
subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
subarrs = [subarr1, subarr2] | |||
# subarrs = seperate_arr(arr, new_arr) | |||
new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||
else: | |||
new_arr = [] | |||
subarrs = [] | |||
for a in arr: | |||
length = len(a) | |||
if length <= 2: | |||
new_arr += a | |||
else: | |||
# subarrs += seperate_arr(a, new_arr) | |||
if (length % 2) == 0: | |||
half = int(length / 2) | |||
new_arr += [a[half - 1], a[half]] | |||
subarr1 = [a[i] for i in range(0, half - 1)] | |||
else: | |||
half = math.floor(length / 2) | |||
new_arr.append(a[half]) | |||
subarr1 = [a[i] for i in range(0, half)] | |||
subarr2 = [a[i] for i in range(half + 1, length)] | |||
subarrs += [subarr1, subarr2] | |||
if len(subarrs) > 0: | |||
new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||
return new_arr | |||
# length = len(arr) | |||
# if length <= 2: | |||
# return arr | |||
# new_arr = [arr[0], arr[-1]] | |||
# if (length % 2) == 0: | |||
# half = int(length / 2) | |||
# new_arr += [arr[half - 1], arr[half]] | |||
# subarr1 = [arr[i] for i in range(1, half - 1)] | |||
# else: | |||
# half = math.floor(length / 2) | |||
# new_arr.append(arr[half]) | |||
# subarr1 = [arr[i] for i in range(1, half)] | |||
# subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
# if len(subarr1) > 0: | |||
# new_arr += dichotomous_permutation(subarr1) | |||
# if len(subarr2) > 0: | |||
# new_arr += dichotomous_permutation(subarr2) | |||
# return new_arr | |||
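The permutation visits the endpoints and midpoints first and then recursively fills in the remaining values, so the most informative parameter settings are evaluated early. For example:

print(dichotomous_permutation(list(range(1, 11))))
# [1, 10, 5, 6, 3, 8, 2, 4, 7, 9]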
if __name__ == '__main__': | |||
root_dir = 'outputs/CRIANN/' | |||
# for dir_ in sorted(os.listdir(root_dir)): | |||
# if os.path.isdir(root_dir): | |||
# full_dir = os.path.join(root_dir, dir_) | |||
# print('---', full_dir,':') | |||
# save_dir = os.path.join(full_dir, 'groups/') | |||
# if os.path.exists(save_dir): | |||
# try: | |||
# get_relative_errors(save_dir) | |||
# except Exception as exp: | |||
# print('An exception occurred when running this experiment:') | |||
# print(repr(exp)) | |||
# for dir_ in sorted(os.listdir(root_dir)): | |||
# if os.path.isdir(root_dir): | |||
# full_dir = os.path.join(root_dir, dir_) | |||
# print('---', full_dir,':') | |||
# save_dir = os.path.join(full_dir, 'groups/') | |||
# if os.path.exists(save_dir): | |||
# try: | |||
# get_relative_errors(save_dir) | |||
# except Exception as exp: | |||
# print('An exception occurred when running this experiment:') | |||
# print(repr(exp)) |
@@ -0,0 +1,29 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Jan 26 09:53:33 2021 | |||
@author: ljia | |||
""" | |||
if __name__ == '__main__': | |||
tasks = [ | |||
{'path': 'thesis/graph_kernels/fcsp', | |||
'file': 'run_jobs_compare_fcsp.py' | |||
}, | |||
{'path': 'thesis/graph_kernels/fcsp', | |||
'file': 'run_jobs_compare_fcsp_space.py' | |||
}, | |||
{'path': 'ged/stability', | |||
'file': 'run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py' | |||
}, | |||
] | |||
import os | |||
for t in tasks: | |||
print(t['file']) | |||
command = '' | |||
command += 'cd ' + t['path'] + '\n' | |||
command += 'python3 ' + t['file'] + '\n' | |||
# command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n' | |||
os.system(command) |
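Each os.system call runs its command string in a fresh shell, so the per-task cd does not leak into the next task (which is why the commented-out cd-back line is unnecessary). An equivalent sketch using subprocess, if preferred:

import subprocess

for t in tasks:
    print(t['file'])
    subprocess.run(['python3', t['file']], cwd=t['path'], check=False)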
@@ -19,7 +19,15 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||
('StructuralSP', 'NCI1', 'False'), | |||
('ShortestPath', 'NCI109', 'False'), | |||
('StructuralSP', 'NCI109', 'True'), | |||
('ShortestPath', 'NCI-H23', 'True'), | |||
('ShortestPath', 'NCI-H23', 'False'), | |||
('StructuralSP', 'NCI-H23', 'True'), | |||
('StructuralSP', 'NCI-H23', 'False'), | |||
('StructuralSP', 'NCI109', 'False'), | |||
('ShortestPath', 'NCI-H23H', 'True'), | |||
('ShortestPath', 'NCI-H23H', 'False'), | |||
('StructuralSP', 'NCI-H23H', 'True'), | |||
('StructuralSP', 'NCI-H23H', 'False'), | |||
('ShortestPath', 'DD', 'True'), | |||
('ShortestPath', 'DD', 'False'), | |||
('StructuralSP', 'BZR', 'False'), | |||
@@ -27,9 +35,37 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||
('StructuralSP', 'COX2', 'False'), | |||
('ShortestPath', 'DHFR', 'False'), | |||
('StructuralSP', 'DHFR', 'False'), | |||
('ShortestPath', 'MCF-7', 'True'), | |||
('ShortestPath', 'MCF-7', 'False'), | |||
('StructuralSP', 'MCF-7', 'True'), | |||
('StructuralSP', 'MCF-7', 'False'), | |||
('ShortestPath', 'MCF-7H', 'True'), | |||
('ShortestPath', 'MCF-7H', 'False'), | |||
('StructuralSP', 'MCF-7H', 'True'), | |||
('StructuralSP', 'MCF-7H', 'False'), | |||
('ShortestPath', 'MOLT-4', 'True'), | |||
('ShortestPath', 'MOLT-4', 'False'), | |||
('StructuralSP', 'MOLT-4', 'True'), | |||
('StructuralSP', 'MOLT-4', 'False'), | |||
('ShortestPath', 'MOLT-4H', 'True'), | |||
('ShortestPath', 'MOLT-4H', 'False'), | |||
('StructuralSP', 'MOLT-4H', 'True'), | |||
('StructuralSP', 'MOLT-4H', 'False'), | |||
('StructuralSP', 'OHSU', 'True'), | |||
('StructuralSP', 'OHSU', 'False'), | |||
('StructuralSP', 'SYNTHETIC', 'False'), | |||
('ShortestPath', 'OVCAR-8', 'True'), | |||
('ShortestPath', 'OVCAR-8', 'False'), | |||
('StructuralSP', 'OVCAR-8', 'True'), | |||
('StructuralSP', 'OVCAR-8', 'False'), | |||
('ShortestPath', 'OVCAR-8H', 'True'), | |||
('ShortestPath', 'OVCAR-8H', 'False'), | |||
('StructuralSP', 'OVCAR-8H', 'True'), | |||
('StructuralSP', 'OVCAR-8H', 'False'), | |||
('ShortestPath', 'P388', 'False'), | |||
('ShortestPath', 'P388', 'True'), | |||
('StructuralSP', 'P388', 'True'), | |||
('StructuralSP', 'Steroid', 'False'), | |||
('ShortestPath', 'SYNTHETIC', 'False'), | |||
('StructuralSP', 'SYNTHETIC', 'True'), | |||
('StructuralSP', 'SYNTHETIC', 'False'), | |||
('ShortestPath', 'SYNTHETICnew', 'False'), | |||
@@ -47,6 +83,9 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||
('StructuralSP', 'Mutagenicity', 'False'), | |||
('StructuralSP', 'REDDIT-BINARY', 'True'), | |||
('StructuralSP', 'REDDIT-BINARY', 'False'), | |||
('StructuralSP', 'Vitamin_D', 'False'), | |||
('ShortestPath', 'Web', 'True'), | |||
('ShortestPath', 'Web', 'False'), | |||
}) | |||
OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'), | |||
@@ -17,6 +17,7 @@ OUT_TIME_LIST = [] | |||
OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | |||
('ShortestPath', 'REDDIT-BINARY', 'False'), | |||
('StructuralSP', 'ENZYMES', 'False'), | |||
('StructuralSP', 'AIDS', 'False'), | |||
('ShortestPath', 'DD', 'True'), | |||
('ShortestPath', 'DD', 'False'), | |||
('StructuralSP', 'DD', 'True'), | |||
@@ -55,6 +56,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | |||
('ShortestPath', 'P388H', 'False'), | |||
('StructuralSP', 'P388H', 'True'), | |||
('StructuralSP', 'P388H', 'False'), | |||
('StructuralSP', 'NCI1', 'False'), | |||
('ShortestPath', 'NCI-H23', 'True'), | |||
('ShortestPath', 'NCI-H23', 'False'), | |||
('StructuralSP', 'NCI-H23', 'True'), | |||
@@ -63,6 +65,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | |||
('ShortestPath', 'NCI-H23H', 'False'), | |||
('StructuralSP', 'NCI-H23H', 'True'), | |||
('StructuralSP', 'NCI-H23H', 'False'), | |||
('StructuralSP', 'OHSU', 'False'), | |||
('ShortestPath', 'OVCAR-8', 'True'), | |||
('ShortestPath', 'OVCAR-8', 'False'), | |||
('StructuralSP', 'OVCAR-8', 'True'), | |||
@@ -208,11 +211,12 @@ def check_task_status(save_dir, *params): | |||
# Check if the task is already computed. | |||
file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl') | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
data = pickle.load(f) | |||
if data['completed']: | |||
return True | |||
if os.path.getsize(file_name) > 0: | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
data = pickle.load(f) | |||
if data['completed']: | |||
return True | |||
return False | |||
@@ -7,7 +7,6 @@ __version__ = "0.1" | |||
__author__ = "Linlin Jia" | |||
__date__ = "November 2018" | |||
from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels | |||
from gklearn.kernels.graph_kernel import GraphKernel | |||
from gklearn.kernels.common_walk import CommonWalk | |||
@@ -24,6 +23,8 @@ from gklearn.kernels.path_up_to_h import PathUpToH | |||
from gklearn.kernels.treelet import Treelet | |||
from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree | |||
from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels | |||
# old version. | |||
from gklearn.kernels.commonWalkKernel import commonwalkkernel | |||
from gklearn.kernels.marginalizedKernel import marginalizedkernel | |||
@@ -32,4 +33,4 @@ from gklearn.kernels.spKernel import spkernel | |||
from gklearn.kernels.structuralspKernel import structuralspkernel | |||
from gklearn.kernels.untilHPathKernel import untilhpathkernel | |||
from gklearn.kernels.treeletKernel import treeletkernel | |||
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel |
@@ -47,7 +47,7 @@ class CommonWalk(GraphKernel): | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
length=len_itr, verbose=(self._verbose >= 2)) | |||
length=len_itr, verbose=(self.verbose >= 2)) | |||
# direct product graph method - exponential | |||
if self._compute_method == 'exp': | |||
@@ -86,7 +86,7 @@ class CommonWalk(GraphKernel): | |||
do_fun = self._wrapper_kernel_do_geo | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm, | |||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
return gram_matrix | |||
@@ -100,9 +100,9 @@ class CommonWalk(GraphKernel): | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', | |||
file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
else: | |||
iterator = range(len(g_list)) | |||
@@ -148,7 +148,7 @@ class CommonWalk(GraphKernel): | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered', | |||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
return kernel_list | |||
@@ -35,7 +35,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
def _compute_gm_series(self): | |||
self._check_edge_weight(self._graphs, self._verbose) | |||
self._check_edge_weight(self._graphs, self.verbose) | |||
self._check_graphs(self._graphs) | |||
lmda = self._weight | |||
@@ -44,7 +44,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
@@ -52,7 +52,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
from itertools import combinations_with_replacement | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
for i, j in iterator: | |||
kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) | |||
@@ -66,7 +66,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
def _compute_gm_imap_unordered(self): | |||
self._check_edge_weight(self._graphs, self._verbose) | |||
self._check_edge_weight(self._graphs, self.verbose) | |||
self._check_graphs(self._graphs) | |||
# Compute Gram matrix. | |||
@@ -74,7 +74,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
# @todo: parallel this. | |||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
@@ -86,7 +86,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
do_fun = self._wrapper_kernel_do | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
else: # @todo | |||
pass | |||
@@ -95,7 +95,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
def _compute_kernel_list_series(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1], self._verbose) | |||
self._check_edge_weight(g_list + [g1], self.verbose) | |||
self._check_graphs(g_list + [g1]) | |||
lmda = self._weight | |||
@@ -105,11 +105,11 @@ class ConjugateGradient(RandomWalkMeta): | |||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | |||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i in iterator: | |||
kernel = self._kernel_do(g1, g_list[i], lmda) | |||
@@ -122,7 +122,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1], self._verbose) | |||
self._check_edge_weight(g_list + [g1], self.verbose) | |||
self._check_graphs(g_list + [g1]) | |||
# compute kernel list. | |||
@@ -131,7 +131,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | |||
# @todo: parallel this. | |||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
@@ -149,7 +149,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
else: # @todo | |||
pass | |||
@@ -162,7 +162,7 @@ class ConjugateGradient(RandomWalkMeta): | |||
def _compute_single_kernel_series(self, g1, g2): | |||
self._check_edge_weight([g1] + [g2], self._verbose) | |||
self._check_edge_weight([g1] + [g2], self.verbose) | |||
self._check_graphs([g1] + [g2]) | |||
lmda = self._weight | |||
@@ -35,7 +35,7 @@ class FixedPoint(RandomWalkMeta): | |||
def _compute_gm_series(self): | |||
self._check_edge_weight(self._graphs, self._verbose) | |||
self._check_edge_weight(self._graphs, self.verbose) | |||
self._check_graphs(self._graphs) | |||
lmda = self._weight | |||
@@ -44,7 +44,7 @@ class FixedPoint(RandomWalkMeta): | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self.verbose >= 2)) | |||
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
@@ -52,7 +52,7 @@ class FixedPoint(RandomWalkMeta): | |||
from itertools import combinations_with_replacement | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
for i, j in iterator: | |||
kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) | |||
@@ -66,7 +66,7 @@ class FixedPoint(RandomWalkMeta): | |||
def _compute_gm_imap_unordered(self): | |||
self._check_edge_weight(self._graphs, self._verbose) | |||
self._check_edge_weight(self._graphs, self.verbose) | |||
self._check_graphs(self._graphs) | |||
# Compute Gram matrix. | |||
@@ -74,7 +74,7 @@ class FixedPoint(RandomWalkMeta): | |||
# @todo: parallel this. | |||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
@@ -86,7 +86,7 @@ class FixedPoint(RandomWalkMeta): | |||
do_fun = self._wrapper_kernel_do | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
else: # @todo | |||
pass | |||
@@ -95,7 +95,7 @@ class FixedPoint(RandomWalkMeta): | |||
def _compute_kernel_list_series(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1], self._verbose) | |||
self._check_edge_weight(g_list + [g1], self.verbose) | |||
self._check_graphs(g_list + [g1]) | |||
lmda = self._weight | |||
@@ -105,12 +105,12 @@ class FixedPoint(RandomWalkMeta): | |||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | |||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i in iterator: | |||
kernel = self._kernel_do(g1, g_list[i], lmda) | |||
@@ -123,7 +123,7 @@ class FixedPoint(RandomWalkMeta): | |||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1], self._verbose) | |||
self._check_edge_weight(g_list + [g1], self.verbose) | |||
self._check_graphs(g_list + [g1]) | |||
# compute kernel list. | |||
@@ -132,7 +132,7 @@ class FixedPoint(RandomWalkMeta): | |||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | |||
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | |||
# @todo: parallel this. | |||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | |||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | |||
@@ -150,7 +150,7 @@ class FixedPoint(RandomWalkMeta): | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
else: # @todo | |||
pass | |||
@@ -163,7 +163,7 @@ class FixedPoint(RandomWalkMeta): | |||
def _compute_single_kernel_series(self, g1, g2): | |||
self._check_edge_weight([g1] + [g2], self._verbose) | |||
self._check_edge_weight([g1] + [g2], self.verbose) | |||
self._check_graphs([g1] + [g2]) | |||
lmda = self._weight | |||
@@ -9,55 +9,433 @@ import numpy as np | |||
import networkx as nx | |||
import multiprocessing | |||
import time | |||
# from abc import ABC, abstractmethod | |||
from sklearn.base import BaseEstimator # , TransformerMixin | |||
from sklearn.utils.validation import check_is_fitted # check_X_y, check_array, | |||
from sklearn.exceptions import NotFittedError | |||
from gklearn.utils import normalize_gram_matrix | |||
class GraphKernel(object): | |||
class GraphKernel(BaseEstimator): #, ABC): | |||
"""The basic graph kernel class. | |||
def __init__(self): | |||
self._graphs = None | |||
self._parallel = '' | |||
self._n_jobs = 0 | |||
self._verbose = None | |||
self._normalize = True | |||
self._run_time = 0 | |||
self._gram_matrix = None | |||
self._gram_matrix_unnorm = None | |||
Attributes | |||
---------- | |||
_graphs : list | |||
Stores the input graphs passed to the `fit` method.
The default format of the list objects is `NetworkX` graphs.
**We don't guarantee that the input graphs remain unchanged during the | |||
computation.** | |||
References | |||
---------- | |||
https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. | |||
""" | |||
def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2): | |||
"""`__init__` for `GraphKernel` object.""" | |||
# @todo: the default settings of the parameters are different from those in the self.compute method. | |||
# self._graphs = None | |||
self.parallel = parallel | |||
self.n_jobs = n_jobs | |||
self.chunksize = chunksize | |||
self.normalize = normalize | |||
self.verbose = verbose | |||
# self._run_time = 0 | |||
# self._gram_matrix = None | |||
# self._gram_matrix_unnorm = None | |||
def compute(self, *graphs, **kwargs): | |||
self._parallel = kwargs.get('parallel', 'imap_unordered') | |||
self._n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) | |||
self._normalize = kwargs.get('normalize', True) | |||
self._verbose = kwargs.get('verbose', 2) | |||
########################################################################## | |||
# The following is the 1st paradigm to compute kernel matrix, which is | |||
# compatible with `scikit-learn`. | |||
# ------------------------------------------------------------------- | |||
# Special thanks to the "GraKeL" library for providing an excellent template! | |||
########################################################################## | |||
def fit(self, X, y=None): | |||
"""Fit a graph dataset for a transformer. | |||
Parameters | |||
---------- | |||
X : iterable
An iterable of input graphs.
y : None, optional | |||
There is no need for a target in a transformer, yet the `scikit-learn`
pipeline API requires this parameter. | |||
Returns | |||
------- | |||
object | |||
Returns self. | |||
""" | |||
# self._is_tranformed = False | |||
# Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; | |||
self.clear_attributes() | |||
# Validate parameters for the transformer. | |||
self.validate_parameters() | |||
# Validate the input. | |||
self._graphs = self.validate_input(X) | |||
# self._X = X | |||
# self._kernel = self._get_kernel_instance() | |||
# Return the transformer. | |||
return self | |||
def transform(self, X): | |||
"""Compute the graph kernel matrix between given and fitted data. | |||
Parameters | |||
---------- | |||
X : list of graphs
The target graphs.
Raises
------
ValueError
If the input is invalid.
Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
The kernel matrix between X and the fitted graphs.
""" | |||
# Check if method "fit" had been called. | |||
check_is_fitted(self, '_graphs') | |||
# Validate the input. | |||
Y = self.validate_input(X) | |||
# Transform: compute the graph kernel matrix. | |||
kernel_matrix = self.compute_kernel_matrix(Y) | |||
self._Y = Y | |||
# The self-transform flag must be set before the diagonals() call used for normalization.
self._is_transformed = True | |||
if self.normalize: | |||
X_diag, Y_diag = self.diagonals() | |||
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||
try: | |||
kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) | |||
except: | |||
raise | |||
finally: | |||
np.seterr(**old_settings) | |||
return kernel_matrix | |||
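The division above applies the usual cosine normalization of a kernel:

\[ \tilde{k}(x, y) = \frac{k(x, y)}{\sqrt{k(x, x)\, k(y, y)}} \]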
def fit_transform(self, X): | |||
"""Fit and transform: compute Gram matrix on the same data. | |||
Parameters | |||
---------- | |||
X : list of graphs | |||
Input graphs. | |||
Returns | |||
------- | |||
gram_matrix : numpy array, shape = [len(X), len(X)] | |||
The Gram matrix of X. | |||
""" | |||
self.fit(X) | |||
# Transform: compute Gram matrix. | |||
gram_matrix = self.compute_kernel_matrix() | |||
# Normalize. | |||
if self.normalize: | |||
self._X_diag = np.diagonal(gram_matrix).copy() | |||
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||
try: | |||
gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) | |||
except: | |||
raise | |||
finally: | |||
np.seterr(**old_settings) | |||
return gram_matrix | |||
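A minimal usage sketch of this scikit-learn-style paradigm; ShortestPath is only an example subclass, the graph lists are hypothetical, and the exact constructor arguments of the concrete kernels may differ:

from gklearn.kernels import ShortestPath

# train_graphs / test_graphs are hypothetical lists of NetworkX graphs.
kernel = ShortestPath(parallel=None, normalize=True, verbose=0)
gram = kernel.fit_transform(train_graphs)  # Gram matrix on the fitted graphs.
kmat = kernel.transform(test_graphs)       # kernel matrix between new and fitted graphs.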
def get_params(self): | |||
pass | |||
def set_params(self): | |||
pass | |||
def clear_attributes(self): | |||
if hasattr(self, '_X_diag'): | |||
delattr(self, '_X_diag') | |||
if hasattr(self, '_graphs'): | |||
delattr(self, '_graphs') | |||
if hasattr(self, '_Y'): | |||
delattr(self, '_Y') | |||
if hasattr(self, '_run_time'): | |||
delattr(self, '_run_time') | |||
def validate_parameters(self): | |||
"""Validate all parameters for the transformer. | |||
Returns | |||
------- | |||
None. | |||
""" | |||
if self.parallel is not None and self.parallel != 'imap_unordered': | |||
raise ValueError('Parallel mode is not set correctly.') | |||
if self.parallel == 'imap_unordered' and self.n_jobs is None: | |||
self.n_jobs = multiprocessing.cpu_count() | |||
def validate_input(self, X): | |||
"""Validate the given input and raise errors if it is invalid. | |||
Parameters | |||
---------- | |||
X : list | |||
The input to check. Should be a list of graphs.
Raises | |||
------ | |||
ValueError | |||
Raise if the input is not correct. | |||
Returns | |||
------- | |||
X : list | |||
The input: a list of graphs.
""" | |||
if X is None: | |||
raise ValueError('Please add graphs before computing.') | |||
elif not isinstance(X, list): | |||
raise ValueError('Cannot detect graphs.') | |||
elif len(X) == 0: | |||
raise ValueError('The graph list given is empty. No computation will be performed.') | |||
return X | |||
def compute_kernel_matrix(self, Y=None): | |||
"""Compute the kernel matrix between a given target graphs (Y) and | |||
the fitted graphs (X / self._graphs) or the Gram matrix for the fitted | |||
graphs (X / self._graphs). | |||
Parameters | |||
---------- | |||
Y : list of graphs, optional | |||
The target graphs. The default is None. If None, the kernel is computed
between X and itself.
Returns | |||
------- | |||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
The computed kernel matrix. | |||
""" | |||
if Y is None: | |||
# Compute Gram matrix for self._graphs (X). | |||
kernel_matrix = self._compute_gram_matrix() | |||
# self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||
else: | |||
# Compute kernel matrix between Y and self._graphs (X). | |||
start_time = time.time() | |||
if self.parallel == 'imap_unordered': | |||
kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y) | |||
elif self.parallel is None: | |||
kernel_matrix = self._compute_kernel_matrix_series(Y) | |||
self._run_time = time.time() - start_time | |||
if self.verbose: | |||
print('Kernel matrix of size (%d, %d) built in %s seconds.' | |||
% (len(Y), len(self._graphs), self._run_time)) | |||
return kernel_matrix | |||
def _compute_kernel_matrix_series(self, Y): | |||
"""Compute the kernel matrix between a given target graphs (Y) and | |||
the fitted graphs (X / self._graphs) without parallelization. | |||
Parameters | |||
---------- | |||
Y : list of graphs
The target graphs. | |||
Returns | |||
------- | |||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
The computed kernel matrix. | |||
""" | |||
kernel_matrix = np.zeros((len(Y), len(self._graphs))) | |||
for i_y, g_y in enumerate(Y): | |||
for i_x, g_x in enumerate(self._graphs): | |||
kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x) | |||
return kernel_matrix | |||
def _compute_kernel_matrix_imap_unordered(self, Y): | |||
"""Compute the kernel matrix between a given target graphs (Y) and | |||
the fitted graphs (X / self._graphs) using imap unordered parallelization. | |||
Parameters | |||
---------- | |||
Y : list of graphs
The target graphs. | |||
Returns | |||
------- | |||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
The computed kernel matrix. | |||
""" | |||
raise Exception('Parallelization for kernel matrix is not implemented.') | |||
def diagonals(self): | |||
"""Compute the kernel matrix diagonals of the fit/transformed data. | |||
Returns | |||
------- | |||
X_diag : numpy array | |||
The diagonal of the kernel matrix between the fitted data. | |||
This consists of each element calculated with itself. | |||
Y_diag : numpy array | |||
The diagonal of the kernel matrix, of the transform. | |||
This consists of each element calculated with itself. | |||
""" | |||
# Check if method "fit" had been called. | |||
check_is_fitted(self, ['_graphs']) | |||
# Check if the diagonals of X exist. | |||
try: | |||
check_is_fitted(self, ['_X_diag']) | |||
except NotFittedError: | |||
# Compute diagonals of X. | |||
self._X_diag = np.empty(shape=(len(self._graphs),)) | |||
for i, x in enumerate(self._graphs): | |||
self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? | |||
try: | |||
# If transform has happened, return both diagonals. | |||
check_is_fitted(self, ['_Y']) | |||
self._Y_diag = np.empty(shape=(len(self._Y),)) | |||
for (i, y) in enumerate(self._Y): | |||
self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? | |||
return self._X_diag, self._Y_diag | |||
except NotFittedError: | |||
# Otherwise return only X_diag.
return self._X_diag | |||
# @abstractmethod | |||
def pairwise_kernel(self, x, y): | |||
"""Compute pairwise kernel between two graphs. | |||
Parameters | |||
---------- | |||
x, y : NetworkX Graph. | |||
Graphs between which the kernel is computed.
Returns | |||
------- | |||
kernel: float | |||
The computed kernel. | |||
# Notes | |||
# ----- | |||
# This method is abstract and must be implemented by a subclass. | |||
""" | |||
raise NotImplementedError('Pairwise kernel computation is not implemented!') | |||
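As a toy illustration of the intended override (not a kernel shipped with gklearn), a subclass only needs to supply pairwise_kernel:

import networkx as nx

class VertexCountKernel(GraphKernel):
    """Toy kernel: the product of the vertex counts of the two graphs."""

    def pairwise_kernel(self, x, y):
        return nx.number_of_nodes(x) * nx.number_of_nodes(y)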
########################################################################## | |||
# The following is the 2nd paradigm to compute kernel matrix. It is | |||
# simplified and not compatible with `scikit-learn`. | |||
########################################################################## | |||
def compute(self, *graphs, **kwargs): | |||
self.parallel = kwargs.get('parallel', 'imap_unordered') | |||
self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) | |||
self.normalize = kwargs.get('normalize', True) | |||
self.verbose = kwargs.get('verbose', 2) | |||
self.copy_graphs = kwargs.get('copy_graphs', True) | |||
self.save_unnormed = kwargs.get('save_unnormed', True) | |||
self.validate_parameters() | |||
# If the input is a list of graphs.
if len(graphs) == 1: | |||
if not isinstance(graphs[0], list): | |||
raise Exception('Cannot detect graphs.') | |||
elif len(graphs[0]) == 0: | |||
raise Exception('The graph list given is empty. No computation was performed.') | |||
else: | |||
self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. | |||
if self.copy_graphs: | |||
self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. | |||
else: | |||
self._graphs = graphs | |||
self._gram_matrix = self._compute_gram_matrix() | |||
self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||
if self._normalize: | |||
if self.save_unnormed: | |||
self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||
if self.normalize: | |||
self._gram_matrix = normalize_gram_matrix(self._gram_matrix) | |||
return self._gram_matrix, self._run_time | |||
elif len(graphs) == 2: | |||
# If the inputs are two graphs. | |||
if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): | |||
kernel = self._compute_single_kernel(graphs[0].copy(), graphs[1].copy()) | |||
if self.copy_graphs: | |||
G0, G1 = graphs[0].copy(), graphs[1].copy() | |||
else: | |||
G0, G1 = graphs[0], graphs[1] | |||
kernel = self._compute_single_kernel(G0, G1) | |||
return kernel, self._run_time | |||
# If the inputs are a graph and a list of graphs. | |||
elif self.is_graph(graphs[0]) and isinstance(graphs[1], list): | |||
g1 = graphs[0].copy() | |||
g_list = [g.copy() for g in graphs[1]] | |||
kernel_list = self._compute_kernel_list(g1, g_list) | |||
if self.copy_graphs: | |||
g1 = graphs[0].copy() | |||
g_list = [g.copy() for g in graphs[1]] | |||
kernel_list = self._compute_kernel_list(g1, g_list) | |||
else: | |||
kernel_list = self._compute_kernel_list(graphs[0], graphs[1]) | |||
return kernel_list, self._run_time | |||
elif isinstance(graphs[0], list) and self.is_graph(graphs[1]): | |||
g1 = graphs[1].copy() | |||
g_list = [g.copy() for g in graphs[0]] | |||
kernel_list = self._compute_kernel_list(g1, g_list) | |||
if self.copy_graphs: | |||
g1 = graphs[1].copy() | |||
g_list = [g.copy() for g in graphs[0]] | |||
kernel_list = self._compute_kernel_list(g1, g_list) | |||
else: | |||
kernel_list = self._compute_kernel_list(graphs[1], graphs[0]) | |||
return kernel_list, self._run_time | |||
else: | |||
raise Exception('Cannot detect graphs.') | |||
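For comparison, a usage sketch of this simplified paradigm; kernel is an instance of a concrete subclass and graphs a hypothetical list of NetworkX graphs:

# Gram matrix over a list of graphs (serial mode, normalized).
gram_matrix, run_time = kernel.compute(graphs, parallel=None, normalize=True, verbose=0)

# Kernel value between two single graphs.
value, run_time = kernel.compute(graphs[0], graphs[1], parallel=None, verbose=0)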
@@ -103,15 +481,15 @@ class GraphKernel(object): | |||
def _compute_gram_matrix(self): | |||
start_time = time.time() | |||
if self._parallel == 'imap_unordered': | |||
if self.parallel == 'imap_unordered': | |||
gram_matrix = self._compute_gm_imap_unordered() | |||
elif self._parallel is None: | |||
elif self.parallel is None: | |||
gram_matrix = self._compute_gm_series() | |||
else: | |||
raise Exception('Parallel mode is not set correctly.') | |||
self._run_time = time.time() - start_time | |||
if self._verbose: | |||
if self.verbose: | |||
print('Gram matrix of size %d built in %s seconds.' | |||
% (len(self._graphs), self._run_time)) | |||
@@ -129,15 +507,15 @@ class GraphKernel(object): | |||
def _compute_kernel_list(self, g1, g_list): | |||
start_time = time.time() | |||
if self._parallel == 'imap_unordered': | |||
if self.parallel == 'imap_unordered': | |||
kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) | |||
elif self._parallel is None: | |||
elif self.parallel is None: | |||
kernel_list = self._compute_kernel_list_series(g1, g_list) | |||
else: | |||
raise Exception('Parallel mode is not set correctly.') | |||
self._run_time = time.time() - start_time | |||
if self._verbose: | |||
if self.verbose: | |||
print('Graph kernel between a graph and a list of %d graphs built in %s seconds.'
% (len(g_list), self._run_time)) | |||
@@ -158,7 +536,7 @@ class GraphKernel(object): | |||
kernel = self._compute_single_kernel_series(g1, g2) | |||
self._run_time = time.time() - start_time | |||
if self._verbose: | |||
if self.verbose: | |||
print('Graph kernel between two graphs built in %s seconds.' % (self._run_time))
return kernel | |||
@@ -185,24 +563,24 @@ class GraphKernel(object): | |||
return self._graphs | |||
@property | |||
def parallel(self): | |||
return self._parallel | |||
# @property | |||
# def parallel(self): | |||
# return self.parallel | |||
@property | |||
def n_jobs(self): | |||
return self._n_jobs | |||
# @property | |||
# def n_jobs(self): | |||
# return self.n_jobs | |||
@property | |||
def verbose(self): | |||
return self._verbose | |||
# @property | |||
# def verbose(self): | |||
# return self.verbose | |||
@property | |||
def normalize(self): | |||
return self._normalize | |||
# @property | |||
# def normalize(self): | |||
# return self.normalize | |||
@property | |||
@@ -46,7 +46,7 @@ class Marginalized(GraphKernel): | |||
self._add_dummy_labels(self._graphs) | |||
if self._remove_totters: | |||
iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
# @todo: this may not work. | |||
self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] | |||
@@ -57,7 +57,7 @@ class Marginalized(GraphKernel): | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
length=len_itr, verbose=(self._verbose >= 2)) | |||
length=len_itr, verbose=(self.verbose >= 2)) | |||
for i, j in iterator: | |||
kernel = self._kernel_do(self._graphs[i], self._graphs[j]) | |||
gram_matrix[i][j] = kernel | |||
@@ -70,16 +70,16 @@ class Marginalized(GraphKernel): | |||
self._add_dummy_labels(self._graphs) | |||
if self._remove_totters: | |||
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
itr = range(0, len(self._graphs)) | |||
if len(self._graphs) < 100 * self._n_jobs: | |||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
if len(self._graphs) < 100 * self.n_jobs: | |||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
remove_fun = self._wrapper_untotter | |||
iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), | |||
desc='removing tottering', file=sys.stdout, | |||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
for i, g in iterator: | |||
self._graphs[i] = g | |||
pool.close() | |||
@@ -93,7 +93,7 @@ class Marginalized(GraphKernel): | |||
G_gn = gn_toshare | |||
do_fun = self._wrapper_kernel_do | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
return gram_matrix | |||
@@ -103,13 +103,13 @@ class Marginalized(GraphKernel): | |||
if self._remove_totters: | |||
g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. | |||
iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
# @todo: this may not work. | |||
g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i in iterator: | |||
kernel = self._kernel_do(g1, g_list[i]) | |||
kernel_list[i] = kernel | |||
@@ -122,16 +122,16 @@ class Marginalized(GraphKernel): | |||
if self._remove_totters: | |||
g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. | |||
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
itr = range(0, len(g_list)) | |||
if len(g_list) < 100 * self._n_jobs: | |||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
if len(g_list) < 100 * self.n_jobs: | |||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
remove_fun = self._wrapper_untotter | |||
iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), | |||
desc='removing tottering', file=sys.stdout, | |||
length=len(g_list), verbose=(self._verbose >= 2)) | |||
length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i, g in iterator: | |||
g_list[i] = g | |||
pool.close() | |||
@@ -151,7 +151,7 @@ class Marginalized(GraphKernel): | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
return kernel_list | |||
@@ -5,23 +5,35 @@ Created on Fri Nov 6 10:11:08 2020 | |||
@author: ljia | |||
""" | |||
from gklearn.kernels.common_walk import CommonWalk | |||
from gklearn.kernels.marginalized import Marginalized | |||
from gklearn.kernels.sylvester_equation import SylvesterEquation | |||
from gklearn.kernels.conjugate_gradient import ConjugateGradient | |||
from gklearn.kernels.fixed_point import FixedPoint | |||
from gklearn.kernels.spectral_decomposition import SpectralDecomposition | |||
from gklearn.kernels.shortest_path import ShortestPath | |||
from gklearn.kernels.structural_sp import StructuralSP | |||
from gklearn.kernels.path_up_to_h import PathUpToH | |||
from gklearn.kernels.treelet import Treelet | |||
from gklearn.kernels.weisfeiler_lehman import WLSubtree | |||
# The metadata of all graph kernels. | |||
GRAPH_KERNELS = { | |||
### based on walks. | |||
'common walk': '', | |||
'marginalized': '', | |||
'sylvester equation': '', | |||
'fixed point': '', | |||
'conjugate gradient': '', | |||
'spectral decomposition': '', | |||
'common walk': CommonWalk, | |||
'marginalized': Marginalized, | |||
'sylvester equation': SylvesterEquation, | |||
'fixed point': FixedPoint, | |||
'conjugate gradient': ConjugateGradient, | |||
'spectral decomposition': SpectralDecomposition, | |||
### based on paths. | |||
'shortest path': '', | |||
'structural shortest path': '', | |||
'path up to length h': '', | |||
'shortest path': ShortestPath, | |||
'structural shortest path': StructuralSP, | |||
'path up to length h': PathUpToH, | |||
### based on non-linear patterns. | |||
'weisfeiler-lehman subtree': '', | |||
'treelet': '', | |||
'weisfeiler-lehman subtree': WLSubtree, | |||
'treelet': Treelet, | |||
} | |||
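# A minimal usage sketch of the registry (the constructor arguments and the
# `compute` call below are illustrative assumptions, not the definitive API of
# each class):
#
#   kernel_class = GRAPH_KERNELS['shortest path']        # -> ShortestPath
#   kernel = kernel_class(ds_infos={'directed': False})  # hypothetical arguments
#   # gram_matrix = kernel.compute(graphs)               # hypothetical call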
@@ -41,10 +41,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
from itertools import combinations_with_replacement | |||
itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator_kernel = get_iters(itr_kernel, desc='Computing kernels', | |||
file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
@@ -69,10 +69,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
# get all paths of all graphs before computing kernels to save time, | |||
# but this may cost a lot of memory for large datasets. | |||
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
itr = zip(self._graphs, range(0, len(self._graphs))) | |||
if len(self._graphs) < 100 * self._n_jobs: | |||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
if len(self._graphs) < 100 * self.n_jobs: | |||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
all_paths = [[] for _ in range(len(self._graphs))] | |||
@@ -84,7 +84,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) | |||
iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), | |||
desc='getting paths', file=sys.stdout, | |||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
for i, ps in iterator: | |||
all_paths[i] = ps | |||
pool.close() | |||
@@ -109,7 +109,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
G_plist = plist_toshare | |||
do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this? | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(all_paths,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
return gram_matrix | |||
@@ -117,8 +117,8 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
def _compute_kernel_list_series(self, g1, g_list): | |||
self._add_dummy_labels(g_list + [g1]) | |||
iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
kernel_list = [None] * len(g_list) | |||
@@ -143,10 +143,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
# get all paths of all graphs before computing kernels to save time, | |||
# but this may cost a lot of memory for large datasets. | |||
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
itr = zip(g_list, range(0, len(g_list))) | |||
if len(g_list) < 100 * self._n_jobs: | |||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
if len(g_list) < 100 * self.n_jobs: | |||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
paths_g_list = [[] for _ in range(len(g_list))] | |||
@@ -161,7 +161,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) | |||
iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), | |||
desc='getting paths', file=sys.stdout, | |||
length=len(g_list), verbose=(self._verbose >= 2)) | |||
length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i, ps in iterator: | |||
paths_g_list[i] = ps | |||
pool.close() | |||
@@ -180,7 +180,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||
itr = range(len(g_list)) | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
return kernel_list | |||
@@ -38,7 +38,7 @@ class ShortestPath(GraphKernel): | |||
def _compute_gm_series(self): | |||
self._all_graphs_have_edges(self._graphs) | |||
# get shortest path graph of each graph. | |||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] | |||
# compute Gram matrix. | |||
@@ -48,7 +48,7 @@ class ShortestPath(GraphKernel): | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', | |||
length=len_itr, file=sys.stdout,verbose=(self._verbose >= 2)) | |||
length=len_itr, file=sys.stdout,verbose=(self.verbose >= 2)) | |||
for i, j in iterator: | |||
kernel = self._sp_do(self._graphs[i], self._graphs[j]) | |||
gram_matrix[i][j] = kernel | |||
@@ -60,16 +60,16 @@ class ShortestPath(GraphKernel): | |||
def _compute_gm_imap_unordered(self): | |||
self._all_graphs_have_edges(self._graphs) | |||
# get shortest path graph of each graph. | |||
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
get_sp_graphs_fun = self._wrapper_get_sp_graphs | |||
itr = zip(self._graphs, range(0, len(self._graphs))) | |||
if len(self._graphs) < 100 * self._n_jobs: | |||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
if len(self._graphs) < 100 * self.n_jobs: | |||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), | |||
desc='getting sp graphs', file=sys.stdout, | |||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
for i, g in iterator: | |||
self._graphs[i] = g | |||
pool.close() | |||
@@ -83,7 +83,7 @@ class ShortestPath(GraphKernel): | |||
G_gs = gs_toshare | |||
do_fun = self._wrapper_sp_do | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
return gram_matrix | |||
@@ -92,12 +92,12 @@ class ShortestPath(GraphKernel): | |||
self._all_graphs_have_edges([g1] + g_list) | |||
# get shortest path graphs of g1 and each graph in g_list. | |||
g1 = getSPGraph(g1, edge_weight=self._edge_weight) | |||
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i in iterator: | |||
kernel = self._sp_do(g1, g_list[i]) | |||
kernel_list[i] = kernel | |||
@@ -109,16 +109,16 @@ class ShortestPath(GraphKernel): | |||
self._all_graphs_have_edges([g1] + g_list) | |||
# get shortest path graphs of g1 and each graph in g_list. | |||
g1 = getSPGraph(g1, edge_weight=self._edge_weight) | |||
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
get_sp_graphs_fun = self._wrapper_get_sp_graphs | |||
itr = zip(g_list, range(0, len(g_list))) | |||
if len(g_list) < 100 * self._n_jobs: | |||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
if len(g_list) < 100 * self.n_jobs: | |||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), | |||
desc='getting sp graphs', file=sys.stdout, | |||
length=len(g_list), verbose=(self._verbose >= 2)) | |||
length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i, g in iterator: | |||
g_list[i] = g | |||
pool.close() | |||
@@ -137,7 +137,7 @@ class ShortestPath(GraphKernel): | |||
itr = range(len(g_list)) | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
return kernel_list | |||
@@ -28,9 +28,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
def _compute_gm_series(self): | |||
self._check_edge_weight(self._graphs, self._verbose) | |||
self._check_edge_weight(self._graphs, self.verbose) | |||
self._check_graphs(self._graphs) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
@@ -41,7 +41,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
# precompute the spectral decomposition of each graph. | |||
P_list = [] | |||
D_list = [] | |||
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
for G in iterator: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A actually is the transpose of the adjacency matrix. | |||
@@ -58,7 +58,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
from itertools import combinations_with_replacement | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
for i, j in iterator: | |||
kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel) | |||
@@ -74,9 +74,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
def _compute_gm_imap_unordered(self): | |||
self._check_edge_weight(self._graphs, self._verbose) | |||
self._check_edge_weight(self._graphs, self.verbose) | |||
self._check_graphs(self._graphs) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
@@ -87,7 +87,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
# precompute the spectral decomposition of each graph. | |||
P_list = [] | |||
D_list = [] | |||
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
for G in iterator: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A actually is the transpose of the adjacency matrix. | |||
@@ -107,7 +107,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
do_fun = self._wrapper_kernel_do | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(q_T_list, P_list, D_list), n_jobs=self.n_jobs, verbose=self.verbose) | |||
else: # @todo | |||
pass | |||
@@ -118,9 +118,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
def _compute_kernel_list_series(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1], self._verbose) | |||
self._check_edge_weight(g_list + [g1], self.verbose) | |||
self._check_graphs(g_list + [g1]) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
@@ -133,7 +133,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
D1, P1 = np.linalg.eig(A1) | |||
P_list = [] | |||
D_list = [] | |||
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
for G in iterator: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A actually is the transpose of the adjacency matrix. | |||
@@ -145,7 +145,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
if self._p is None: # p is uniform distribution as default. | |||
q_T1 = 1 / nx.number_of_nodes(g1) | |||
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i in iterator: | |||
kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel) | |||
@@ -160,9 +160,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1], self._verbose) | |||
self._check_edge_weight(g_list + [g1], self.verbose) | |||
self._check_graphs(g_list + [g1]) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
@@ -175,8 +175,8 @@ class SpectralDecomposition(RandomWalkMeta): | |||
D1, P1 = np.linalg.eig(A1) | |||
P_list = [] | |||
D_list = [] | |||
if self._verbose >= 2: | |||
iterator = tqdm(g_list, desc='spectral decompose', file=sys.stdout) | |||
if self.verbose >= 2: | |||
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout) | |||
else: | |||
iterator = g_list | |||
for G in iterator: | |||
@@ -207,7 +207,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||
itr = range(len(g_list)) | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
else: # @todo | |||
pass | |||
@@ -222,9 +222,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||
def _compute_single_kernel_series(self, g1, g2): | |||
self._check_edge_weight([g1] + [g2], self._verbose) | |||
self._check_edge_weight([g1] + [g2], self.verbose) | |||
self._check_graphs([g1] + [g2]) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | |||
@@ -41,7 +41,7 @@ class StructuralSP(GraphKernel): | |||
def _compute_gm_series(self): | |||
# get shortest paths of each graph in the graphs. | |||
splist = [] | |||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for g in iterator: | |||
splist.append(self._get_sps_as_trie(g)) | |||
@@ -56,7 +56,7 @@ class StructuralSP(GraphKernel): | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
length=len_itr, verbose=(self._verbose >= 2)) | |||
length=len_itr, verbose=(self.verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for i, j in iterator: | |||
kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) | |||
@@ -76,10 +76,10 @@ class StructuralSP(GraphKernel): | |||
def _compute_gm_imap_unordered(self): | |||
# get shortest paths of each graph in the graphs. | |||
splist = [None] * len(self._graphs) | |||
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
itr = zip(self._graphs, range(0, len(self._graphs))) | |||
if len(self._graphs) < 100 * self._n_jobs: | |||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
if len(self._graphs) < 100 * self.n_jobs: | |||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
# get shortest path graphs of self._graphs | |||
@@ -89,7 +89,7 @@ class StructuralSP(GraphKernel): | |||
get_sps_fun = self._wrapper_get_sps_naive | |||
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | |||
desc='getting shortest paths', file=sys.stdout, | |||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
for i, sp in iterator: | |||
splist[i] = sp | |||
pool.close() | |||
@@ -107,7 +107,7 @@ class StructuralSP(GraphKernel): | |||
else: | |||
do_fun = self._wrapper_ssp_do_naive | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(splist, self._graphs), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(splist, self._graphs), n_jobs=self.n_jobs, verbose=self.verbose) | |||
return gram_matrix | |||
@@ -117,7 +117,7 @@ class StructuralSP(GraphKernel): | |||
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) | |||
splist = [] | |||
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, | |||
verbose=(self._verbose >= 2)) | |||
verbose=(self.verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for g in iterator: | |||
splist.append(self._get_sps_as_trie(g)) | |||
@@ -128,7 +128,7 @@ class StructuralSP(GraphKernel): | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', | |||
file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for i in iterator: | |||
kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i]) | |||
@@ -145,10 +145,10 @@ class StructuralSP(GraphKernel): | |||
# get shortest paths of g1 and each graph in g_list. | |||
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) | |||
splist = [None] * len(g_list) | |||
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
itr = zip(g_list, range(0, len(g_list))) | |||
if len(g_list) < 100 * self._n_jobs: | |||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
if len(g_list) < 100 * self.n_jobs: | |||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
# get shortest path graphs of g_list | |||
@@ -158,7 +158,7 @@ class StructuralSP(GraphKernel): | |||
get_sps_fun = self._wrapper_get_sps_naive | |||
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | |||
desc='getting shortest paths', file=sys.stdout, | |||
length=len(g_list), verbose=(self._verbose >= 2)) | |||
length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i, sp in iterator: | |||
splist[i] = sp | |||
pool.close() | |||
@@ -182,7 +182,7 @@ class StructuralSP(GraphKernel): | |||
itr = range(len(g_list)) | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
return kernel_list | |||
@@ -14,6 +14,7 @@ import sys | |||
from gklearn.utils import get_iters | |||
import numpy as np | |||
import networkx as nx | |||
from control import dlyap | |||
from gklearn.utils.parallel import parallel_gm, parallel_me | |||
from gklearn.kernels import RandomWalkMeta | |||
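# Note: `dlyap` comes from the python-control package. The Sylvester-equation
# formulation of the random-walk kernel below reduces each pairwise kernel
# computation to a discrete-time Sylvester/Lyapunov equation, which `dlyap` solves.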
@@ -22,14 +23,13 @@ class SylvesterEquation(RandomWalkMeta): | |||
def __init__(self, **kwargs): | |||
from control import dlyap | |||
super().__init__(**kwargs) | |||
def _compute_gm_series(self): | |||
self._check_edge_weight(self._graphs, self._verbose) | |||
self._check_edge_weight(self._graphs, self.verbose) | |||
self._check_graphs(self._graphs) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
@@ -41,7 +41,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
if self._q is None: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A_wave_list actually contains the transposes of the adjacency matrices. | |||
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] | |||
# # normalized adjacency matrices | |||
# A_wave_list = [] | |||
@@ -55,7 +55,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
from itertools import combinations_with_replacement | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
for i, j in iterator: | |||
kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda) | |||
@@ -71,9 +71,9 @@ class SylvesterEquation(RandomWalkMeta): | |||
def _compute_gm_imap_unordered(self): | |||
self._check_edge_weight(self._graphs, self._verbose) | |||
self._check_edge_weight(self._graphs, self.verbose) | |||
self._check_graphs(self._graphs) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
@@ -83,7 +83,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
if self._q is None: | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A_wave_list actually contains the transposes of the adjacency matrices. | |||
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? | |||
if self._p is None: # p is uniform distribution as default. | |||
@@ -94,7 +94,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
do_fun = self._wrapper_kernel_do | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(A_wave_list,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
else: # @todo | |||
pass | |||
@@ -105,9 +105,9 @@ class SylvesterEquation(RandomWalkMeta): | |||
def _compute_kernel_list_series(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1], self._verbose) | |||
self._check_edge_weight(g_list + [g1], self.verbose) | |||
self._check_graphs(g_list + [g1]) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
@@ -120,11 +120,11 @@ class SylvesterEquation(RandomWalkMeta): | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A_wave_list actually contains the transposes of the adjacency matrices. | |||
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | |||
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] | |||
if self._p is None: # p is uniform distribution as default. | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i in iterator: | |||
kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda) | |||
@@ -139,9 +139,9 @@ class SylvesterEquation(RandomWalkMeta): | |||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
self._check_edge_weight(g_list + [g1], self._verbose) | |||
self._check_edge_weight(g_list + [g1], self.verbose) | |||
self._check_graphs(g_list + [g1]) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
@@ -152,7 +152,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
# don't normalize adjacency matrices if q is a uniform vector. Note | |||
# A_wave_list actually contains the transposes of the adjacency matrices. | |||
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | |||
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? | |||
if self._p is None: # p is uniform distribution as default. | |||
@@ -169,7 +169,7 @@ class SylvesterEquation(RandomWalkMeta): | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', | |||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
else: # @todo | |||
pass | |||
@@ -184,9 +184,9 @@ class SylvesterEquation(RandomWalkMeta): | |||
def _compute_single_kernel_series(self, g1, g2): | |||
self._check_edge_weight([g1] + [g2], self._verbose) | |||
self._check_edge_weight([g1] + [g2], self.verbose) | |||
self._check_graphs([g1] + [g2]) | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('All labels are ignored.') | |||
@@ -18,6 +18,8 @@ import numpy as np | |||
import networkx as nx | |||
from collections import Counter | |||
from itertools import chain | |||
from sklearn.utils.validation import check_is_fitted | |||
from sklearn.exceptions import NotFittedError | |||
from gklearn.utils import SpecialLabel | |||
from gklearn.utils.parallel import parallel_gm, parallel_me | |||
from gklearn.utils.utils import find_all_paths, get_mlti_dim_node_attrs | |||
@@ -26,14 +28,211 @@ from gklearn.kernels import GraphKernel | |||
class Treelet(GraphKernel): | |||
def __init__(self, **kwargs): | |||
GraphKernel.__init__(self) | |||
self._node_labels = kwargs.get('node_labels', []) | |||
self._edge_labels = kwargs.get('edge_labels', []) | |||
self._sub_kernel = kwargs.get('sub_kernel', None) | |||
self._ds_infos = kwargs.get('ds_infos', {}) | |||
if self._sub_kernel is None: | |||
raise Exception('Sub kernel not set.') | |||
def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs): | |||
"""Initialise a treelet kernel. | |||
""" | |||
super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose) | |||
self.node_labels = kwargs.get('node_labels', []) | |||
self.edge_labels = kwargs.get('edge_labels', []) | |||
self.sub_kernel = kwargs.get('sub_kernel', None) | |||
self.ds_infos = kwargs.get('ds_infos', {}) | |||
self.precompute_canonkeys = precompute_canonkeys | |||
self.save_canonkeys = save_canonkeys | |||
########################################################################## | |||
# The following is the 1st paradigm to compute kernel matrix, which is | |||
# compatible with `scikit-learn`. | |||
# ------------------------------------------------------------------- | |||
# Special thanks to the "GraKeL" library for providing an excellent template! | |||
########################################################################## | |||
def clear_attributes(self): | |||
super().clear_attributes() | |||
if hasattr(self, '_canonkeys'): | |||
delattr(self, '_canonkeys') | |||
if hasattr(self, '_Y_canonkeys'): | |||
delattr(self, '_Y_canonkeys') | |||
if hasattr(self, '_dummy_labels_considered'): | |||
delattr(self, '_dummy_labels_considered') | |||
def validate_parameters(self): | |||
"""Validate all parameters for the transformer. | |||
Returns | |||
------- | |||
None. | |||
""" | |||
super().validate_parameters() | |||
if self.sub_kernel is None: | |||
raise ValueError('Sub-kernel not set.') | |||
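# Note: `sub_kernel` is assumed to be a callable taking two numpy vectors of
# treelet counts and returning a float. An illustrative (not necessarily
# default) choice is a Gaussian kernel:
#   sub_kernel = lambda x, y: np.exp(-np.sum((x - y) ** 2) / (2 * 1.0 ** 2))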
def _compute_kernel_matrix_series(self, Y): | |||
"""Compute the kernel matrix between a given target graphs (Y) and | |||
the fitted graphs (X / self._graphs) without parallelization. | |||
Parameters | |||
---------- | |||
Y : list of graphs, optional | |||
The target graphs. | |||
Returns | |||
------- | |||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
The computed kernel matrix. | |||
""" | |||
# self._add_dummy_labels will modify the input in place. | |||
self._add_dummy_labels() # For self._graphs | |||
# Y = [g.copy() for g in Y] # @todo: ? | |||
self._add_dummy_labels(Y) | |||
# get all canonical keys of all graphs before computing kernels to save | |||
# time, but this may cost a lot of memory for large datasets.
# Canonical keys for self._graphs. | |||
try: | |||
check_is_fitted(self, ['_canonkeys']) | |||
canonkeys_list1 = self._canonkeys | |||
except NotFittedError: | |||
canonkeys_list1 = [] | |||
iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
for g in iterator: | |||
canonkeys_list1.append(self._get_canonkeys(g)) | |||
if self.save_canonkeys: | |||
self._canonkeys = canonkeys_list1 | |||
# Canonical keys for Y. | |||
canonkeys_list2 = [] | |||
iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
for g in iterator: | |||
canonkeys_list2.append(self._get_canonkeys(g)) | |||
if self.save_canonkeys: | |||
self._Y_canonkeys = canonkeys_list2 | |||
# compute kernel matrix. | |||
kernel_matrix = np.zeros((len(Y), len(canonkeys_list1))) | |||
from itertools import product | |||
itr = product(range(len(Y)), range(len(canonkeys_list1))) | |||
len_itr = int(len(Y) * len(canonkeys_list1)) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
length=len_itr, verbose=(self.verbose >= 2)) | |||
for i_y, i_x in iterator: | |||
kernel = self._kernel_do(canonkeys_list2[i_y], canonkeys_list1[i_x]) | |||
kernel_matrix[i_y][i_x] = kernel | |||
return kernel_matrix | |||
def _compute_kernel_matrix_imap_unordered(self, Y): | |||
"""Compute the kernel matrix between a given target graphs (Y) and | |||
the fitted graphs (X / self._graphs) using imap unordered parallelization. | |||
Parameters | |||
---------- | |||
Y : list of graphs, optional | |||
The target graphs. | |||
Returns | |||
------- | |||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||
The computed kernel matrix. | |||
""" | |||
raise Exception('Parallelization for kernel matrix is not implemented.') | |||
def pairwise_kernel(self, x, y, are_keys=False): | |||
"""Compute pairwise kernel between two graphs. | |||
Parameters | |||
---------- | |||
x, y : NetworkX Graph. | |||
Graphs between which the kernel is computed.
are_keys : boolean, optional | |||
If `True`, `x` and `y` are canonical keys, otherwise are graphs. | |||
The default is False. | |||
Returns | |||
------- | |||
kernel: float | |||
The computed kernel. | |||
""" | |||
if are_keys: | |||
# x, y are canonical keys. | |||
kernel = self._kernel_do(x, y) | |||
else: | |||
# x, y are graphs. | |||
kernel = self._compute_single_kernel_series(x, y) | |||
return kernel | |||
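# Usage note: `pairwise_kernel(g1, g2)` works directly on graphs, while
# `pairwise_kernel(key1, key2, are_keys=True)` reuses precomputed canonical
# keys and skips the treelet-extraction step.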
def diagonals(self): | |||
"""Compute the kernel matrix diagonals of the fit/transformed data. | |||
Returns | |||
------- | |||
X_diag : numpy array
The diagonal of the kernel matrix of the fitted data.
This consists of each element calculated with itself.
Y_diag : numpy array
The diagonal of the kernel matrix of the transformed data.
This consists of each element calculated with itself.
""" | |||
# Check if method "fit" had been called. | |||
check_is_fitted(self, ['_graphs']) | |||
# Check if the diagonals of X exist. | |||
try: | |||
check_is_fitted(self, ['_X_diag']) | |||
except NotFittedError: | |||
# Compute diagonals of X. | |||
self._X_diag = np.empty(shape=(len(self._graphs),)) | |||
try: | |||
check_is_fitted(self, ['_canonkeys']) | |||
for i, x in enumerate(self._canonkeys): | |||
self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=True) # @todo: parallel? | |||
except NotFittedError: | |||
for i, x in enumerate(self._graphs): | |||
self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=False) # @todo: parallel? | |||
try: | |||
# If transform has happened, return both diagonals. | |||
check_is_fitted(self, ['_Y']) | |||
self._Y_diag = np.empty(shape=(len(self._Y),)) | |||
try: | |||
check_is_fitted(self, ['_Y_canonkeys']) | |||
for (i, y) in enumerate(self._Y_canonkeys): | |||
self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=True) # @todo: parallel? | |||
except NotFittedError: | |||
for (i, y) in enumerate(self._Y): | |||
self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=False) # @todo: parallel? | |||
return self._X_diag, self._Y_diag | |||
except NotFittedError: | |||
# Else just return X_diag.
return self._X_diag | |||
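# A usage sketch of the scikit-learn-style paradigm above (the `fit`/`transform`
# method names are assumed from the GraphKernel base class; `some_vector_kernel`
# is a placeholder):
#
#   kernel = Treelet(sub_kernel=some_vector_kernel)
#   kernel.fit(train_graphs)              # stores self._graphs (and canonkeys)
#   K = kernel.transform(test_graphs)     # kernel matrix, shape [n_test, n_train]
#   X_diag, Y_diag = kernel.diagonals()   # self-kernels of fitted / transformed data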
########################################################################## | |||
# The following is the 2nd paradigm to compute kernel matrix. It is | |||
# simplified and not compatible with `scikit-learn`. | |||
########################################################################## | |||
def _compute_gm_series(self): | |||
@@ -43,10 +242,13 @@ class Treelet(GraphKernel): | |||
# time, but this may cost a lot of memory for large datasets.
canonkeys = [] | |||
iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout, | |||
verbose=(self._verbose >= 2)) | |||
verbose=(self.verbose >= 2)) | |||
for g in iterator: | |||
canonkeys.append(self._get_canonkeys(g)) | |||
if self.save_canonkeys: | |||
self._canonkeys = canonkeys | |||
# compute Gram matrix. | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
@@ -54,7 +256,7 @@ class Treelet(GraphKernel): | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
length=len_itr, verbose=(self._verbose >= 2)) | |||
length=len_itr, verbose=(self.verbose >= 2)) | |||
for i, j in iterator: | |||
kernel = self._kernel_do(canonkeys[i], canonkeys[j]) | |||
gram_matrix[i][j] = kernel | |||
@@ -68,22 +270,25 @@ class Treelet(GraphKernel): | |||
# get all canonical keys of all graphs before computing kernels to save | |||
# time, but this may cost a lot of memory for large datasets.
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
itr = zip(self._graphs, range(0, len(self._graphs))) | |||
if len(self._graphs) < 100 * self._n_jobs: | |||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
if len(self._graphs) < 100 * self.n_jobs: | |||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
canonkeys = [[] for _ in range(len(self._graphs))] | |||
get_fun = self._wrapper_get_canonkeys | |||
iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), | |||
desc='getting canonkeys', file=sys.stdout, | |||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||
for i, ck in iterator: | |||
canonkeys[i] = ck | |||
pool.close() | |||
pool.join() | |||
if self.save_canonkeys: | |||
self._canonkeys = canonkeys | |||
# compute Gram matrix. | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
@@ -92,25 +297,25 @@ class Treelet(GraphKernel): | |||
G_canonkeys = canonkeys_toshare | |||
do_fun = self._wrapper_kernel_do | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(canonkeys,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
return gram_matrix | |||
def _compute_kernel_list_series(self, g1, g_list): | |||
self._add_dummy_labels(g_list + [g1]) | |||
# self._add_dummy_labels(g_list + [g1]) | |||
# get all canonical keys of all graphs before computing kernels to save | |||
# time, but this may cost a lot of memory for large datasets.
canonkeys_1 = self._get_canonkeys(g1) | |||
canonkeys_list = [] | |||
iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self.verbose >= 2)) | |||
for g in iterator: | |||
canonkeys_list.append(self._get_canonkeys(g)) | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i in iterator: | |||
kernel = self._kernel_do(canonkeys_1, canonkeys_list[i]) | |||
kernel_list[i] = kernel | |||
@@ -125,16 +330,16 @@ class Treelet(GraphKernel): | |||
# time, but this may cost a lot of memory for large datasets.
canonkeys_1 = self._get_canonkeys(g1) | |||
canonkeys_list = [[] for _ in range(len(g_list))] | |||
pool = Pool(self._n_jobs) | |||
pool = Pool(self.n_jobs) | |||
itr = zip(g_list, range(0, len(g_list))) | |||
if len(g_list) < 100 * self._n_jobs: | |||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
if len(g_list) < 100 * self.n_jobs: | |||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
get_fun = self._wrapper_get_canonkeys | |||
iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), | |||
desc='getting canonkeys', file=sys.stdout, | |||
length=len(g_list), verbose=(self._verbose >= 2)) | |||
length=len(g_list), verbose=(self.verbose >= 2)) | |||
for i, ck in iterator: | |||
canonkeys_list[i] = ck | |||
pool.close() | |||
@@ -154,7 +359,7 @@ class Treelet(GraphKernel): | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered', | |||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
return kernel_list | |||
@@ -164,13 +369,13 @@ class Treelet(GraphKernel): | |||
def _compute_single_kernel_series(self, g1, g2): | |||
self._add_dummy_labels([g1] + [g2]) | |||
# self._add_dummy_labels([g1] + [g2]) | |||
canonkeys_1 = self._get_canonkeys(g1) | |||
canonkeys_2 = self._get_canonkeys(g2) | |||
kernel = self._kernel_do(canonkeys_1, canonkeys_2) | |||
return kernel | |||
# @profile | |||
def _kernel_do(self, canonkey1, canonkey2): | |||
"""Compute treelet graph kernel between 2 graphs. | |||
@@ -187,7 +392,24 @@ class Treelet(GraphKernel): | |||
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | |||
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | |||
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) | |||
kernel = self._sub_kernel(vector1, vector2) | |||
# vector1, vector2 = [], [] | |||
# keys1, keys2 = canonkey1, canonkey2 | |||
# keys_searched = {} | |||
# for k, v in canonkey1.items(): | |||
# if k in keys2: | |||
# vector1.append(v) | |||
# vector2.append(canonkey2[k]) | |||
# keys_searched[k] = v | |||
# for k, v in canonkey2.items(): | |||
# if k in keys1 and k not in keys_searched: | |||
# vector1.append(canonkey1[k]) | |||
# vector2.append(v) | |||
# vector1, vector2 = np.array(vector1), np.array(vector2) | |||
kernel = self.sub_kernel(vector1, vector2) | |||
return kernel | |||
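# Worked example (illustrative): for canonkey1 = {'0': 5, '1': 4, '2': 3} and
# canonkey2 = {'0': 6, '1': 2}, the shared keys are {'0', '1'}, so (up to key
# ordering) vector1 = [5, 4], vector2 = [6, 2] and the result is
# sub_kernel(vector1, vector2).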
@@ -223,7 +445,7 @@ class Treelet(GraphKernel): | |||
patterns['0'] = list(G.nodes()) | |||
canonkey['0'] = nx.number_of_nodes(G) | |||
for i in range(1, 6): # for i in range(1, 6): | |||
patterns[str(i)] = find_all_paths(G, i, self._ds_infos['directed']) | |||
patterns[str(i)] = find_all_paths(G, i, self.ds_infos['directed']) | |||
canonkey[str(i)] = len(patterns[str(i)]) | |||
# n-star patterns | |||
@@ -317,11 +539,11 @@ class Treelet(GraphKernel): | |||
### pattern obtained in the structural analysis section above, which is a | |||
### string corresponding to a unique treelet. A dictionary is built to keep | |||
### track of the amount of every treelet. | |||
if len(self._node_labels) > 0 or len(self._edge_labels) > 0: | |||
if len(self.node_labels) > 0 or len(self.edge_labels) > 0: | |||
canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet. | |||
# linear patterns | |||
canonkey_t = Counter(get_mlti_dim_node_attrs(G, self._node_labels)) | |||
canonkey_t = Counter(get_mlti_dim_node_attrs(G, self.node_labels)) | |||
for key in canonkey_t: | |||
canonkey_l[('0', key)] = canonkey_t[key] | |||
@@ -330,9 +552,9 @@ class Treelet(GraphKernel): | |||
for pattern in patterns[str(i)]: | |||
canonlist = [] | |||
for idx, node in enumerate(pattern[:-1]): | |||
canonlist.append(tuple(G.nodes[node][nl] for nl in self._node_labels)) | |||
canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self._edge_labels)) | |||
canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self._node_labels)) | |||
canonlist.append(tuple(G.nodes[node][nl] for nl in self.node_labels)) | |||
canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self.edge_labels)) | |||
canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self.node_labels)) | |||
canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1] | |||
treelet.append(tuple([str(i)] + canonkey_t)) | |||
canonkey_l.update(Counter(treelet)) | |||
@@ -343,13 +565,13 @@ class Treelet(GraphKernel): | |||
for pattern in patterns[str(i) + 'star']: | |||
canonlist = [] | |||
for leaf in pattern[1:]: | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
canonlist.append(tuple((nlabels, elabels))) | |||
canonlist.sort() | |||
canonlist = list(chain.from_iterable(canonlist)) | |||
canonkey_t = tuple(['d' if i == 5 else str(i * 2)] + | |||
[tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||
[tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||
+ canonlist) | |||
treelet.append(canonkey_t) | |||
canonkey_l.update(Counter(treelet)) | |||
@@ -359,17 +581,17 @@ class Treelet(GraphKernel): | |||
for pattern in patterns['7']: | |||
canonlist = [] | |||
for leaf in pattern[1:3]: | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
canonlist.append(tuple((nlabels, elabels))) | |||
canonlist.sort() | |||
canonlist = list(chain.from_iterable(canonlist)) | |||
canonkey_t = tuple(['7'] | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist | |||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] | |||
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] | |||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] | |||
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]) | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist | |||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] | |||
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] | |||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] | |||
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)]) | |||
treelet.append(canonkey_t) | |||
canonkey_l.update(Counter(treelet)) | |||
@@ -378,38 +600,38 @@ class Treelet(GraphKernel): | |||
for pattern in patterns['11']: | |||
canonlist = [] | |||
for leaf in pattern[1:4]: | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
canonlist.append(tuple((nlabels, elabels))) | |||
canonlist.sort() | |||
canonlist = list(chain.from_iterable(canonlist)) | |||
canonkey_t = tuple(['b'] | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist | |||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] | |||
+ [tuple(G[pattern[4]][pattern[0]][el] for el in self._edge_labels)] | |||
+ [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)] | |||
+ [tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)]) | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist | |||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] | |||
+ [tuple(G[pattern[4]][pattern[0]][el] for el in self.edge_labels)] | |||
+ [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels)] | |||
+ [tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)]) | |||
treelet.append(canonkey_t) | |||
canonkey_l.update(Counter(treelet)) | |||
# pattern 10 | |||
treelet = [] | |||
for pattern in patterns['10']: | |||
canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels), | |||
tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)] | |||
canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels), | |||
tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)] | |||
canonlist = [] | |||
for leaf in pattern[1:3]: | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
canonlist.append(tuple((nlabels, elabels))) | |||
canonlist.sort() | |||
canonkey0 = list(chain.from_iterable(canonlist)) | |||
canonkey_t = tuple(['a'] | |||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] | |||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] | |||
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)] | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] | |||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] | |||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] | |||
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)] | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)] | |||
+ canonkey4 + canonkey0) | |||
treelet.append(canonkey_t) | |||
canonkey_l.update(Counter(treelet)) | |||
@@ -419,15 +641,15 @@ class Treelet(GraphKernel): | |||
for pattern in patterns['12']: | |||
canonlist0 = [] | |||
for leaf in pattern[1:3]: | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||
canonlist0.append(tuple((nlabels, elabels))) | |||
canonlist0.sort() | |||
canonlist0 = list(chain.from_iterable(canonlist0)) | |||
canonlist3 = [] | |||
for leaf in pattern[4:6]: | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||
elabels = tuple(G[leaf][pattern[3]][el] for el in self._edge_labels) | |||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||
elabels = tuple(G[leaf][pattern[3]][el] for el in self.edge_labels) | |||
canonlist3.append(tuple((nlabels, elabels))) | |||
canonlist3.sort() | |||
canonlist3 = list(chain.from_iterable(canonlist3)) | |||
@@ -435,14 +657,14 @@ class Treelet(GraphKernel): | |||
# 2 possible keys can be generated from 2 nodes with extended label 3,
# select the one with lower lexicographic order. | |||
canonkey_t1 = tuple(['c'] | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0 | |||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] | |||
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist0 | |||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] | |||
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] | |||
+ canonlist3) | |||
canonkey_t2 = tuple(['c'] | |||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3 | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] | |||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] + canonlist3 | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)] | |||
+ canonlist0) | |||
treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) | |||
canonkey_l.update(Counter(treelet)) | |||
@@ -450,24 +672,24 @@ class Treelet(GraphKernel): | |||
# pattern 9 | |||
treelet = [] | |||
for pattern in patterns['9']: | |||
canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels), | |||
tuple(G[pattern[4]][pattern[2]][el] for el in self._edge_labels)] | |||
canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels), | |||
tuple(G[pattern[5]][pattern[3]][el] for el in self._edge_labels)] | |||
prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self._node_labels), | |||
tuple(G[pattern[2]][pattern[0]][el] for el in self._edge_labels)] | |||
prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels), | |||
tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] | |||
canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels), | |||
tuple(G[pattern[4]][pattern[2]][el] for el in self.edge_labels)] | |||
canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels), | |||
tuple(G[pattern[5]][pattern[3]][el] for el in self.edge_labels)] | |||
prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self.node_labels), | |||
tuple(G[pattern[2]][pattern[0]][el] for el in self.edge_labels)] | |||
prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels), | |||
tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] | |||
if prekey2 + canonkey2 < prekey3 + canonkey3: | |||
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ | |||
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ | |||
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \ | |||
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \ | |||
+ prekey2 + prekey3 + canonkey2 + canonkey3 | |||
else: | |||
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ | |||
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ | |||
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \ | |||
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \ | |||
+ prekey3 + prekey2 + canonkey3 + canonkey2 | |||
treelet.append(tuple(['9'] | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||
+ canonkey_t)) | |||
canonkey_l.update(Counter(treelet)) | |||
@@ -482,12 +704,33 @@ class Treelet(GraphKernel): | |||
return i, self._get_canonkeys(g) | |||
def _add_dummy_labels(self, Gn): | |||
if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): | |||
for i in range(len(Gn)): | |||
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
self._node_labels = [SpecialLabel.DUMMY] | |||
if len(self._edge_labels) == 0 or (len(self._edge_labels) == 1 and self._edge_labels[0] == SpecialLabel.DUMMY): | |||
for i in range(len(Gn)): | |||
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
self._edge_labels = [SpecialLabel.DUMMY] | |||
def _add_dummy_labels(self, Gn=None): | |||
def _add_dummy(Gn): | |||
if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY): | |||
for i in range(len(Gn)): | |||
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
self.node_labels = [SpecialLabel.DUMMY] | |||
if len(self.edge_labels) == 0 or (len(self.edge_labels) == 1 and self.edge_labels[0] == SpecialLabel.DUMMY): | |||
for i in range(len(Gn)): | |||
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
self.edge_labels = [SpecialLabel.DUMMY] | |||
if Gn is None or Gn is self._graphs: | |||
# Add dummy labels for the copy of self._graphs. | |||
try: | |||
check_is_fitted(self, ['_dummy_labels_considered']) | |||
if not self._dummy_labels_considered: | |||
Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs] | |||
_add_dummy(Gn) | |||
self._graphs = Gn | |||
self._dummy_labels_considered = True | |||
except NotFittedError: | |||
Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs] | |||
_add_dummy(Gn) | |||
self._graphs = Gn | |||
self._dummy_labels_considered = True | |||
else: | |||
# Add dummy labels for the input. | |||
_add_dummy(Gn) | |||
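# Effect (descriptive note): when a graph has no node/edge labels, every node
# and edge is given the attribute SpecialLabel.DUMMY with value '0', so
# unlabeled graphs are treated as uniformly labeled when building canonical keys.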
@@ -14,30 +14,48 @@ Created on Tue Apr 14 15:16:34 2020 | |||
import numpy as np | |||
import networkx as nx | |||
import sys | |||
from collections import Counter | |||
# from functools import partial | |||
from itertools import combinations_with_replacement | |||
from gklearn.utils import SpecialLabel | |||
from gklearn.utils.parallel import parallel_gm, parallel_me | |||
from gklearn.kernels import GraphKernel | |||
from gklearn.utils.iters import get_iters | |||
class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
def __init__(self, **kwargs): | |||
GraphKernel.__init__(self) | |||
self._node_labels = kwargs.get('node_labels', []) | |||
self._edge_labels = kwargs.get('edge_labels', []) | |||
self._height = int(kwargs.get('height', 0)) | |||
self.node_labels = kwargs.get('node_labels', []) | |||
self.edge_labels = kwargs.get('edge_labels', []) | |||
self.height = int(kwargs.get('height', 0)) | |||
self._base_kernel = kwargs.get('base_kernel', 'subtree') | |||
self._ds_infos = kwargs.get('ds_infos', {}) | |||
########################################################################## | |||
# The following is the 1st paradigm to compute kernel matrix, which is | |||
# compatible with `scikit-learn`. | |||
# ------------------------------------------------------------------- | |||
# Special thanks to the "GraKeL" library for providing an excellent template! | |||
########################################################################## | |||
########################################################################## | |||
# The following is the 2nd paradigm to compute kernel matrix. It is | |||
# simplified and not compatible with `scikit-learn`. | |||
########################################################################## | |||
def _compute_gm_series(self): | |||
# if self._verbose >= 2: | |||
# if self.verbose >= 2: | |||
# import warnings | |||
# warnings.warn('A part of the computation is parallelized.') | |||
self._add_dummy_node_labels(self._graphs) | |||
# self._add_dummy_node_labels(self._graphs) | |||
# for WL subtree kernel | |||
if self._base_kernel == 'subtree': | |||
@@ -59,7 +77,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
def _compute_gm_imap_unordered(self): | |||
self._add_dummy_node_labels(self._graphs) | |||
# self._add_dummy_node_labels(self._graphs) | |||
if self._base_kernel == 'subtree': | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
@@ -74,17 +92,17 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
G_gn = gn_toshare | |||
do_fun = self._wrapper_pairwise | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
return gram_matrix | |||
else: | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') | |||
return self._compute_gm_series() | |||
def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better. | |||
# if self._verbose >= 2: | |||
# if self.verbose >= 2: | |||
# import warnings | |||
# warnings.warn('A part of the computation is parallelized.') | |||
@@ -126,10 +144,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||
return kernel_list | |||
else: | |||
if self._verbose >= 2: | |||
if self.verbose >= 2: | |||
import warnings | |||
warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') | |||
return self._compute_kernel_list_series(g1, g_list) | |||
@@ -160,6 +178,30 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
return gram_matrix[0][1] | |||
########################################################################## | |||
# The following are the methods used by both diagrams. | |||
########################################################################## | |||
def validate_parameters(self): | |||
"""Validate all parameters for the transformer. | |||
Returns | |||
------- | |||
None. | |||
""" | |||
super().validate_parameters() | |||
if len(self.node_labels) == 0: | |||
if len(self.edge_labels) == 0: | |||
self._subtree_kernel_do = self._subtree_kernel_do_unlabeled | |||
else: | |||
self._subtree_kernel_do = self._subtree_kernel_do_el | |||
else: | |||
if len(self.edge_labels) == 0: | |||
self._subtree_kernel_do = self._subtree_kernel_do_nl | |||
else: | |||
self._subtree_kernel_do = self._subtree_kernel_do_labeled | |||
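The validate_parameters override above selects one of four subtree implementations from the node/edge label lists. A small sketch of that dispatch, with hypothetical stand-in names, assuming an empty list means "no labels of that kind":

def pick_subtree_variant(node_labels, edge_labels):
    # Mirrors the branching in validate_parameters above (names are illustrative).
    if len(node_labels) == 0:
        return 'unlabeled' if len(edge_labels) == 0 else 'edge-labeled'
    return 'node-labeled' if len(edge_labels) == 0 else 'fully-labeled'

assert pick_subtree_variant([], []) == 'unlabeled'
assert pick_subtree_variant(['atom'], ['bond_type']) == 'fully-labeled'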
def pairwise_kernel(self, g1, g2): | |||
Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster! | |||
@@ -172,9 +214,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
for G in Gn: | |||
# set all labels into a tuple. | |||
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | |||
G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels) | |||
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||
# get the set of original labels | |||
labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||
labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||
# number of occurence of each label in G | |||
all_num_of_each_label.append(dict(Counter(labels_ori))) | |||
@@ -182,22 +224,22 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
kernel = self._compute_kernel_itr(kernel, all_num_of_each_label) | |||
# iterate each height | |||
for h in range(1, self._height + 1): | |||
for h in range(1, self.height + 1): | |||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far across all graphs in this iteration | |||
# all_labels_ori = set() # all unique original labels in all graphs in this iteration | |||
all_num_of_each_label = [] # number of occurence of each label in G | |||
# @todo: parallel this part. | |||
for idx, G in enumerate(Gn): | |||
for G in Gn: | |||
all_multisets = [] | |||
for node, attrs in G.nodes(data=True): | |||
# Multiset-label determination. | |||
multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]] | |||
multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] | |||
# sorting each multiset | |||
multiset.sort() | |||
multiset = [attrs['label_tuple']] + multiset # add the prefix | |||
multiset = [attrs['lt']] + multiset # add the prefix | |||
all_multisets.append(tuple(multiset)) | |||
# label compression | |||
@@ -208,19 +250,19 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
# else assign the number of labels occured + 1 as the compressed label. | |||
for value in set_unique: | |||
if value in all_set_compressed.keys(): | |||
set_compressed.update({value: all_set_compressed[value]}) | |||
set_compressed[value] = all_set_compressed[value] | |||
else: | |||
set_compressed.update({value: str(num_of_labels_occured + 1)}) | |||
set_compressed[value] = str(num_of_labels_occured + 1) | |||
num_of_labels_occured += 1 | |||
all_set_compressed.update(set_compressed) | |||
# relabel nodes | |||
for idx, node in enumerate(G.nodes()): | |||
G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]] | |||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
# get the set of compressed labels | |||
labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||
# all_labels_ori.update(labels_comp) | |||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
@@ -249,8 +291,8 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
return kernel | |||
def _subtree_kernel_do(self, Gn): | |||
"""Compute Weisfeiler-Lehman kernels between graphs. | |||
def _subtree_kernel_do_nl(self, Gn): | |||
"""Compute Weisfeiler-Lehman kernels between graphs with node labels. | |||
Parameters | |||
---------- | |||
@@ -268,12 +310,16 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||
# for each graph | |||
for G in Gn: | |||
# set all labels into a tuple. | |||
if self.verbose >= 2: | |||
iterator = get_iters(Gn, desc='Setting all labels into a tuple') | |||
else: | |||
iterator = Gn | |||
for G in iterator: | |||
# set all labels into a tuple. # @todo: remove this original labels or not? | |||
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | |||
G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels) | |||
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||
# get the set of original labels | |||
labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||
labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||
# number of occurence of each label in G | |||
all_num_of_each_label.append(dict(Counter(labels_ori))) | |||
@@ -281,74 +327,398 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
# iterate each height | |||
for h in range(1, self._height + 1): | |||
for h in range(1, self.height + 1): | |||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far across all graphs in this iteration | |||
# all_labels_ori = set() # all unique original labels in all graphs in this iteration | |||
all_num_of_each_label = [] # number of occurence of each label in G | |||
# @todo: parallel this part. | |||
for idx, G in enumerate(Gn): | |||
# if self.verbose >= 2: | |||
# iterator = get_iters(enumerate(Gn), desc='Going through iteration ' + str(h), length=len(Gn)) | |||
# else: | |||
# iterator = enumerate(Gn) | |||
for G in Gn: | |||
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
all_multisets = [] | |||
for node, attrs in G.nodes(data=True): | |||
# Multiset-label determination. | |||
multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]] | |||
# sorting each multiset | |||
multiset.sort() | |||
multiset = [attrs['label_tuple']] + multiset # add the prefix | |||
all_multisets.append(tuple(multiset)) | |||
# Compute subtree kernel with h iterations and add it to the final kernel | |||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
# label compression | |||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
# a dictionary mapping original labels to new ones. | |||
set_compressed = {} | |||
# if a label occured before, assign its former compressed label, | |||
# else assign the number of labels occured + 1 as the compressed label. | |||
for value in set_unique: | |||
if value in all_set_compressed.keys(): | |||
set_compressed.update({value: all_set_compressed[value]}) | |||
else: | |||
set_compressed.update({value: str(num_of_labels_occured + 1)}) | |||
num_of_labels_occured += 1 | |||
return gram_matrix | |||
all_set_compressed.update(set_compressed) | |||
# relabel nodes | |||
for idx, node in enumerate(G.nodes()): | |||
G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]] | |||
def _subtree_kernel_do_el(self, Gn): | |||
"""Compute Weisfeiler-Lehman kernels between graphs with edge labels. | |||
# get the set of compressed labels | |||
labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||
# all_labels_ori.update(labels_comp) | |||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are computed. | |||
# Compute subtree kernel with h iterations and add it to the final kernel | |||
Return | |||
------ | |||
gram_matrix : Numpy matrix | |||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs. | |||
""" | |||
gram_matrix = np.zeros((len(Gn), len(Gn))) | |||
# initial for height = 0 | |||
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||
# Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||
for i, j in iterator: | |||
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||
gram_matrix[j][i] = gram_matrix[i][j] | |||
# if h >= 1. | |||
if self.height > 0: | |||
# Set all edge labels into a tuple. # @todo: remove this original labels or not? | |||
if self.verbose >= 2: | |||
iterator = get_iters(Gn, desc='Setting all labels into a tuple') | |||
else: | |||
iterator = Gn | |||
for G in iterator: | |||
for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way. | |||
G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels) | |||
# When h == 1, compute the kernel. | |||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far across all graphs in this iteration | |||
all_num_of_each_label = [] # number of occurence of each label in G | |||
# @todo: parallel this part. | |||
for G in Gn: | |||
num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
# Iterate along heights (>= 2). | |||
for h in range(2, self.height + 1): | |||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far across all graphs in this iteration | |||
all_num_of_each_label = [] # number of occurence of each label in G | |||
# @todo: parallel this part. | |||
for G in Gn: | |||
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
return gram_matrix | |||
def _subtree_kernel_do_labeled(self, Gn): | |||
"""Compute Weisfeiler-Lehman kernels between graphs with both node and | |||
edge labels. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are computed. | |||
Return | |||
------ | |||
gram_matrix : Numpy matrix | |||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs. | |||
""" | |||
gram_matrix = np.zeros((len(Gn), len(Gn))) | |||
# initial for height = 0 | |||
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||
# Set all node labels into a tuple and get # of occurence of each label. | |||
if self.verbose >= 2: | |||
iterator = get_iters(Gn, desc='Setting all node labels into a tuple') | |||
else: | |||
iterator = Gn | |||
for G in iterator: | |||
# Set all node labels into a tuple. # @todo: remove this original labels or not? | |||
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | |||
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||
# Get the set of original labels. | |||
labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||
# number of occurence of each label in G | |||
all_num_of_each_label.append(dict(Counter(labels_ori))) | |||
# Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
# if h >= 1. | |||
if self.height > 0: | |||
# Set all edge labels into a tuple. # @todo: remove this original labels or not? | |||
if self.verbose >= 2: | |||
iterator = get_iters(Gn, desc='Setting all edge labels into a tuple') | |||
else: | |||
iterator = Gn | |||
for G in iterator: | |||
for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way. | |||
G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels) | |||
# When h == 1, compute the kernel. | |||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far across all graphs in this iteration | |||
all_num_of_each_label = [] # number of occurence of each label in G | |||
# @todo: parallel this part. | |||
for G in Gn: | |||
num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
# Iterate along heights. | |||
for h in range(2, self.height + 1): | |||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far across all graphs in this iteration | |||
all_num_of_each_label = [] # number of occurence of each label in G | |||
# @todo: parallel this part. | |||
for G in Gn: | |||
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
return gram_matrix | |||
def _subtree_kernel_do_unlabeled(self, Gn): | |||
"""Compute Weisfeiler-Lehman kernels between graphs without labels. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs between which the kernels are computed. | |||
Return | |||
------ | |||
gram_matrix : Numpy matrix | |||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between two graphs. | |||
""" | |||
gram_matrix = np.zeros((len(Gn), len(Gn))) | |||
# initial for height = 0 | |||
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||
# Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||
for i, j in iterator: | |||
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||
gram_matrix[j][i] = gram_matrix[i][j] | |||
# if h >= 1. | |||
if self.height > 0: | |||
# When h == 1, compute the kernel. | |||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far across all graphs in this iteration | |||
all_num_of_each_label = [] # number of occurence of each label in G | |||
# @todo: parallel this part. | |||
for G in Gn: | |||
num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
# Iterate along heights (>= 2). | |||
for h in range(2, self.height + 1): | |||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far across all graphs in this iteration | |||
all_num_of_each_label = [] # number of occurence of each label in G | |||
# @todo: parallel this part. | |||
for G in Gn: | |||
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||
return gram_matrix | |||
def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||
all_multisets = [] | |||
for node, attrs in G.nodes(data=True): | |||
# Multiset-label determination. | |||
multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] | |||
# sorting each multiset | |||
multiset.sort() | |||
multiset = [attrs['lt']] + multiset # add the prefix | |||
all_multisets.append(tuple(multiset)) | |||
# label compression | |||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
# a dictionary mapping original labels to new ones. | |||
set_compressed = {} | |||
# If a label occurred before, assign its former compressed label; | |||
# otherwise assign (number of labels occurred so far + 1) as the | |||
# compressed label. | |||
for value in set_unique: | |||
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||
set_compressed[value] = all_set_compressed[value] | |||
else: | |||
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? and what if num_of_labels_occured is extremely big. | |||
num_of_labels_occured += 1 | |||
all_set_compressed.update(set_compressed) | |||
# Relabel nodes. | |||
for idx, node in enumerate(G.nodes()): | |||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
# Get the set of compressed labels. | |||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
return num_of_labels_occured | |||
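For orientation, here is a self-contained sketch of one WL relabelling round on a single node-labelled graph, mirroring _subtree_1graph_nl above; the names and toy labels are illustrative, and the shared dictionary plays the role of all_set_compressed.

import networkx as nx
from collections import Counter

def wl_relabel_once(G, key, compressed):
    # Each node's new label = (own label, sorted multiset of neighbour labels),
    # compressed to a short string id shared across graphs via `compressed`.
    multisets = []
    for node, attrs in G.nodes(data=True):
        neigh = sorted(G.nodes[n][key] for n in G[node])
        multisets.append((attrs[key], *neigh))
    for node, ms in zip(G.nodes(), multisets):
        if ms not in compressed:
            compressed[ms] = str(len(compressed) + 1)
        G.nodes[node][key] = compressed[ms]
    return Counter(nx.get_node_attributes(G, key).values())

G = nx.path_graph(3)
nx.set_node_attributes(G, {0: 'C', 1: 'O', 2: 'C'}, 'lt')
print(wl_relabel_once(G, 'lt', {}))  # Counter({'1': 2, '2': 1})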
def _subtree_1graph_el(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||
all_multisets = [] | |||
# for node, attrs in G.nodes(data=True): | |||
for node in G.nodes(): | |||
# Multiset-label determination. | |||
multiset = [G.edges[(node, neighbors)]['lt'] for neighbors in G[node]] # @todo: check reference for this. | |||
# sorting each multiset | |||
multiset.sort() | |||
# multiset = [attrs['lt']] + multiset # add the prefix | |||
all_multisets.append(tuple(multiset)) | |||
# label compression | |||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
# a dictionary mapping original labels to new ones. | |||
set_compressed = {} | |||
# If a label occurred before, assign its former compressed label; | |||
# otherwise assign (number of labels occurred so far + 1) as the | |||
# compressed label. | |||
for value in set_unique: | |||
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||
set_compressed[value] = all_set_compressed[value] | |||
else: | |||
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? | |||
num_of_labels_occured += 1 | |||
all_set_compressed.update(set_compressed) | |||
# Relabel nodes. | |||
for idx, node in enumerate(G.nodes()): | |||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
# Get the set of compressed labels. | |||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) # @todo: maybe can be faster. | |||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
return num_of_labels_occured | |||
def _subtree_1graph_labeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||
all_multisets = [] | |||
for node, attrs in G.nodes(data=True): | |||
# Multiset-label determination. | |||
multiset = [tuple((G.edges[(node, neighbors)]['lt'], G.nodes[neighbors]['lt'])) for neighbors in G[node]] # @todo: check reference for this. | |||
# sorting each multiset | |||
multiset.sort() | |||
multiset = [attrs['lt']] + multiset # add the prefix | |||
all_multisets.append(tuple(multiset)) | |||
# label compression | |||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
# a dictionary mapping original labels to new ones. | |||
set_compressed = {} | |||
# If a label occurred before, assign its former compressed label; | |||
# otherwise assign (number of labels occurred so far + 1) as the | |||
# compressed label. | |||
for value in set_unique: | |||
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||
set_compressed[value] = all_set_compressed[value] | |||
else: | |||
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? | |||
num_of_labels_occured += 1 | |||
all_set_compressed.update(set_compressed) | |||
# Relabel nodes. | |||
for idx, node in enumerate(G.nodes()): | |||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
# Get the set of compressed labels. | |||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
return num_of_labels_occured | |||
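In the fully labelled variant each neighbour contributes an (edge label, neighbour node label) pair rather than the node label alone. A small illustrative sketch (the toy graph and labels are assumptions):

import networkx as nx

def labeled_multiset(G, node, key='lt'):
    # Mirrors the multiset built in _subtree_1graph_labeled: sorted
    # (edge label, neighbour label) pairs, prefixed by the node's own label.
    pairs = sorted((G.edges[node, nbr][key], G.nodes[nbr][key]) for nbr in G[node])
    return (G.nodes[node][key], *pairs)

G = nx.Graph()
G.add_node(0, lt='C'); G.add_node(1, lt='O'); G.add_node(2, lt='N')
G.add_edge(0, 1, lt='single'); G.add_edge(0, 2, lt='double')
print(labeled_multiset(G, 0))  # ('C', ('double', 'N'), ('single', 'O'))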
def _subtree_1graph_unlabeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||
# all_multisets = [] | |||
# for node, attrs in G.nodes(data=True): # @todo: it can be better. | |||
# # Multiset-label determination. | |||
# multiset = [0 for neighbors in G[node]] | |||
# # sorting each multiset | |||
# multiset.sort() | |||
# multiset = [0] + multiset # add the prefix | |||
# all_multisets.append(tuple(multiset)) | |||
all_multisets = [len(G[node]) for node in G.nodes()] | |||
# label compression | |||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||
# a dictionary mapping original labels to new ones. | |||
set_compressed = {} | |||
# If a label occurred before, assign its former compressed label; | |||
# otherwise assign (number of labels occurred so far + 1) as the | |||
# compressed label. | |||
for value in set_unique: | |||
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||
set_compressed[value] = all_set_compressed[value] | |||
else: | |||
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? | |||
num_of_labels_occured += 1 | |||
all_set_compressed.update(set_compressed) | |||
# Relabel nodes. | |||
for idx, node in enumerate(G.nodes()): | |||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||
# Get the set of compressed labels. | |||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||
return num_of_labels_occured | |||
def _compute_gram_itr(self, gram_matrix, all_num_of_each_label): | |||
"""Compute Gram matrix using the base kernel. | |||
""" | |||
# if self._parallel == 'imap_unordered': | |||
# if self.parallel == 'imap_unordered': | |||
# # compute kernels. | |||
# def init_worker(alllabels_toshare): | |||
# global G_alllabels | |||
# G_alllabels = alllabels_toshare | |||
# do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix) | |||
# parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker, | |||
# glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
# elif self._parallel is None: | |||
for i in range(len(gram_matrix)): | |||
for j in range(i, len(gram_matrix)): | |||
gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i], | |||
all_num_of_each_label[j], gram_matrix[i][j]) | |||
gram_matrix[j][i] = gram_matrix[i][j] | |||
def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel): | |||
# glbv=(all_num_of_each_label,), n_jobs=self.n_jobs, verbose=self.verbose) | |||
# elif self.parallel is None: | |||
itr = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||
len_itr = int(len(gram_matrix) * (len(gram_matrix) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing Gram matrix for this iteration', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||
for i, j in iterator: | |||
# for i in iterator: | |||
# for j in range(i, len(gram_matrix)): | |||
gram_matrix[i][j] += self._compute_subtree_kernel(all_num_of_each_label[i], | |||
all_num_of_each_label[j]) | |||
gram_matrix[j][i] = gram_matrix[i][j] | |||
def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2): | |||
"""Compute the subtree kernel. | |||
""" | |||
labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys())) | |||
@@ -358,7 +728,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
vector2 = np.array([(num_of_each_label2[label] | |||
if (label in num_of_each_label2.keys()) else 0) | |||
for label in labels]) | |||
kernel += np.dot(vector1, vector2) | |||
kernel = np.dot(vector1, vector2) | |||
return kernel | |||
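The per-iteration contribution computed by _compute_subtree_kernel is simply the dot product of the two graphs' label-count vectors over the union of labels appearing in either graph; a minimal sketch:

import numpy as np

def subtree_term(counts1, counts2):
    # counts1/counts2 map a label to its number of occurrences in each graph.
    labels = set(counts1) | set(counts2)
    v1 = np.array([counts1.get(l, 0) for l in labels])
    v2 = np.array([counts2.get(l, 0) for l in labels])
    return np.dot(v1, v2)

print(subtree_term({'a': 2, 'b': 1}, {'a': 1, 'c': 3}))  # 2*1 + 1*0 + 0*3 = 2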
@@ -426,9 +796,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | |||
for value in set_unique: | |||
if value in all_set_compressed.keys(): | |||
set_compressed.update({ value : all_set_compressed[value] }) | |||
set_compressed[value] = all_set_compressed[value] | |||
else: | |||
set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||
set_compressed[value] = str(num_of_labels_occured + 1) | |||
num_of_labels_occured += 1 | |||
all_set_compressed.update(set_compressed) | |||
@@ -504,9 +874,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | |||
for value in set_unique: | |||
if value in all_set_compressed.keys(): | |||
set_compressed.update({ value : all_set_compressed[value] }) | |||
set_compressed[value] = all_set_compressed[value] | |||
else: | |||
set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||
set_compressed[value] = str(num_of_labels_occured + 1) | |||
num_of_labels_occured += 1 | |||
all_set_compressed.update(set_compressed) | |||
@@ -577,9 +947,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | |||
for value in set_unique: | |||
if value in all_set_compressed.keys(): | |||
set_compressed.update({ value : all_set_compressed[value] }) | |||
set_compressed[value] = all_set_compressed[value] | |||
else: | |||
set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||
set_compressed[value] = str(num_of_labels_occured + 1) | |||
num_of_labels_occured += 1 | |||
all_set_compressed.update(set_compressed) | |||
@@ -595,10 +965,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||
def _add_dummy_node_labels(self, Gn): | |||
if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): | |||
if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY): | |||
for i in range(len(Gn)): | |||
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
self._node_labels = [SpecialLabel.DUMMY] | |||
self.node_labels = [SpecialLabel.DUMMY] | |||
class WLSubtree(WeisfeilerLehman): | |||
@@ -0,0 +1,14 @@ | |||
# -*-coding:utf-8 -*- | |||
""" | |||
model learning. | |||
""" | |||
# info | |||
__version__ = "0.2" | |||
__author__ = "Linlin Jia" | |||
__date__ = "November 2020" | |||
from gklearn.model_learning.nested_cv import NestedCV | |||
from gklearn.model_learning.workflow import Workflow | |||
from gklearn.model_learning.parameters import dichotomous_permutation |
@@ -0,0 +1,714 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Fri Nov 27 18:59:28 2020 | |||
@author: ljia | |||
""" | |||
import os | |||
import datetime | |||
import time | |||
import sys | |||
from tqdm import tqdm | |||
from multiprocessing import Pool, Array | |||
from functools import partial | |||
import numpy as np | |||
from matplotlib import pyplot as plt | |||
from sklearn.model_selection import KFold, train_test_split, ParameterGrid | |||
from sklearn.kernel_ridge import KernelRidge | |||
from sklearn.svm import SVC | |||
from sklearn.metrics import accuracy_score, mean_squared_error | |||
class NestedCV(object): | |||
"""Perform model selection, fitting and testing for precomputed kernels | |||
using nested CV. Print out neccessary data during the process then finally | |||
the results. | |||
Parameters | |||
---------- | |||
datafile : string | |||
Path of dataset file. | |||
estimator : function | |||
Kernel function used as the estimator; it needs to return a gram matrix. | |||
param_grid_precomputed : dictionary | |||
Dictionary with names (string) of parameters used to calculate gram | |||
matrices as keys and lists of parameter settings to try as values. This | |||
enables searching over any sequence of parameter settings. Params with | |||
length 1 will be omitted. | |||
param_grid : dictionary | |||
Dictionary with names (string) of parameters used as penalties as keys | |||
and lists of parameter settings to try as values. This enables | |||
searching over any sequence of parameter settings. Params with length 1 | |||
will be omitted. | |||
model_type : string | |||
Type of the problem, can be 'regression' or 'classification'. | |||
NUM_TRIALS : integer | |||
Number of random trials of the outer CV loop. The default is 30. | |||
datafile_y : string | |||
Path of file storing y data. This parameter is optional depending on | |||
the given dataset file. | |||
extra_params : dict | |||
Extra parameters for loading dataset. See function gklearn.utils. | |||
graphfiles.loadDataset for detail. | |||
ds_name : string | |||
Name of the dataset. | |||
n_jobs : int | |||
Number of jobs for parallelization. | |||
read_gm_from_file : boolean | |||
Whether gram matrices are loaded from a file. | |||
Examples | |||
-------- | |||
>>> import numpy as np | |||
>>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel | |||
>>> from gklearn.kernels.untilHPathKernel import untilhpathkernel | |||
>>> | |||
>>> datafile = '../datasets/MUTAG/MUTAG_A.txt' | |||
>>> estimator = untilhpathkernel | |||
>>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func': | |||
['MinMax', 'tanimoto'], 'compute_method': ['trie']} | |||
>>> # 'C' for classification problems and 'alpha' for regression problems. | |||
>>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha': | |||
np.logspace(-10, 10, num=41, base=10)}] | |||
>>> | |||
>>> model_selection_for_precomputed_kernel(datafile, estimator, | |||
param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG') | |||
""" | |||
def __init__(self, dataset, estimator, param_grid_precomputed=None, param_grid=None, model_type=None, num_trials=30, output_dir=None, n_jobs=1, save_gms=True, save_gm_figs=False, logging=True, verbose=True, **kwargs): | |||
tqdm.monitor_interval = 0 | |||
self._ds = dataset | |||
self._estimator = estimator | |||
self._num_trials = num_trials | |||
self._n_jobs = n_jobs | |||
self._save_gms = save_gms | |||
self._save_gm_figs = save_gm_figs | |||
self._logging = logging | |||
self._verbose = verbose | |||
self._kwargs = kwargs | |||
# Set dataset name. | |||
if self._ds._ds_name is None: | |||
self._ds_name = 'ds-unknown' | |||
else: | |||
self._ds_name = self._ds._ds_name | |||
# The output directory. | |||
if output_dir is None: | |||
self._output_dir = os.path.join('outputs/', estimator.__name__) | |||
else: | |||
self._output_dir = output_dir | |||
os.makedirs(self._output_dir, exist_ok=True) | |||
# Setup the model type. | |||
if model_type is None: | |||
self._model_type = dataset._task_type | |||
else: | |||
self._model_type = model_type.lower() | |||
if self._model_type != 'regression' and self._model_type != 'classification': | |||
raise Exception('The model type is incorrect! Please choose from regression or classification.') | |||
# @todo: Set param_grid_precomputed and param_grid. | |||
self._param_grid_precomputed = param_grid_precomputed | |||
self._param_grid = param_grid | |||
if self._verbose: | |||
print() | |||
print('--- This is a %s problem ---' % self._model_type) | |||
# A string to save all the results. | |||
if self._logging: | |||
self._str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' | |||
self._str_fw += '# This file contains results of ' + self._estimator.__name__ + ' on dataset ' + self._ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' | |||
self._str_fw += 'This is a %s problem.\n' % self._model_type | |||
self.run() | |||
def run(self): | |||
self.fit() | |||
self.compute_gram_matrices() | |||
if len(self._gram_matrices) == 0: | |||
if self._verbose: | |||
print('All gram matrices are ignored, no results obtained.') | |||
if self._logging: | |||
self._str_fw += '\nAll gram matrices are ignored, no results obtained.\n\n' | |||
else: | |||
self.do_cv() | |||
# print out results as table. | |||
if self._logging: | |||
self._str_fw += self.printResultsInTable(self._param_list, self._param_list_pre_revised, self._average_val_scores, self._std_val_scores, self._average_perf_scores, self._std_perf_scores, self._average_train_scores, self._std_train_scores, self._gram_matrix_time, self._model_type, self._verbose) | |||
# open file to save all results for this dataset. | |||
if not os.path.exists(self._output_dir + '/' + self._ds_name + '.output.txt'): | |||
with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'w') as f: | |||
f.write(self._str_fw) | |||
else: | |||
with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'r+') as f: | |||
content = f.read() | |||
f.seek(0, 0) | |||
f.write(self._str_fw + '\n\n\n' + content) | |||
return self._final_performance, self._final_confidence | |||
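A hedged usage sketch of the class-based interface defined below; it relies only on the constructor signature shown in __init__, and the placeholder estimator and the commented-out call are assumptions (a real estimator would be one of the gklearn kernel functions returning a gram matrix and its run time):

import numpy as np
from gklearn.model_learning import NestedCV

def my_kernel(graphs, **params):
    # Placeholder estimator (assumption): must return (gram_matrix, run_time).
    n = len(graphs)
    return np.eye(n), 0.0

# `dataset` is assumed to be a gklearn Dataset exposing .graphs, .targets and a task type.
# nested_cv = NestedCV(dataset, my_kernel,
#                      param_grid_precomputed={'depth': [2, 4, 6]},
#                      param_grid={'C': np.logspace(-5, 5, num=11)},
#                      model_type='classification', num_trials=10, n_jobs=4)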
def fit(self): | |||
return | |||
def compute_gram_matrices(self): | |||
"""Compute all gram matrices. | |||
Returns | |||
------- | |||
None. | |||
""" | |||
# Grid of parameters with a discrete number of values for each. | |||
self._param_list_precomputed = list(ParameterGrid(self._param_grid_precomputed)) | |||
self._param_list = list(ParameterGrid(self._param_grid)) | |||
self._gram_matrices = [ | |||
] # a list to store gram matrices for all param_grid_precomputed | |||
self._gram_matrix_time = [ | |||
] # a list to store time to calculate gram matrices | |||
self._param_list_pre_revised = [ | |||
] # list to store param grids precomputed ignoring the useless ones | |||
if self._verbose: | |||
print() | |||
print('\n1. Computing gram matrices. This could take a while...') | |||
if self._logging: | |||
self._str_fw += '\nI. Gram matrices.\n\n' | |||
self._tts = time.time() # start training time | |||
nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN) | |||
for idx, params_out in enumerate(self._param_list_precomputed): | |||
y = self._ds.targets[:] | |||
params_out['n_jobs'] = self._n_jobs | |||
params_out['verbose'] = self._verbose | |||
# print(dataset) | |||
# import networkx as nx | |||
# nx.draw_networkx(dataset[1]) | |||
# plt.show() | |||
rtn_data = self._estimator(self._ds.graphs[:], **params_out) # @todo: Attention! this will not copy the graphs. | |||
Kmatrix = rtn_data[0] | |||
current_run_time = rtn_data[1] | |||
# for some kernels, some graphs in datasets may not meet the | |||
# kernels' requirements for graph structure. These graphs are trimmed. | |||
if len(rtn_data) == 3: | |||
idx_trim = rtn_data[2] # the index of trimmed graph list | |||
y = [y[idxt] for idxt in idx_trim] # trim y accordingly | |||
# Kmatrix = np.random.rand(2250, 2250) | |||
# current_run_time = 0.1 | |||
# remove graphs whose kernels with themselves are zeros | |||
# @todo: y not changed accordingly? | |||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||
nb_g_ignore = 0 | |||
for idxk, diag in enumerate(Kmatrix_diag): | |||
if diag == 0: | |||
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) | |||
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) | |||
nb_g_ignore += 1 | |||
# normalization | |||
# @todo: works only for undirected graph? | |||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||
for i in range(len(Kmatrix)): | |||
for j in range(i, len(Kmatrix)): | |||
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||
Kmatrix[j][i] = Kmatrix[i][j] | |||
if self._verbose: | |||
print() | |||
if params_out == {}: | |||
if self._verbose: | |||
print('the gram matrix is: ') | |||
if self._logging: | |||
self._str_fw += 'the gram matrix is:\n\n' | |||
else: | |||
if self._verbose: | |||
print('the gram matrix with parameters', params_out, 'is: \n\n') | |||
if self._logging: | |||
self._str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out | |||
if len(Kmatrix) < 2: | |||
nb_gm_ignore += 1 | |||
if self._verbose: | |||
print('ignored, as at most one of its diagonal values is non-zero.') | |||
if self._logging: | |||
self._str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n' | |||
else: | |||
if np.isnan(Kmatrix).any( | |||
): # if the matrix contains elements that are not numbers | |||
nb_gm_ignore += 1 | |||
if self._verbose: | |||
print('ignored, as it contains elements that are not numbers.') | |||
if self._logging: | |||
self._str_fw += 'ignored, as it contains elements that are not numbers.\n\n' | |||
else: | |||
# print(Kmatrix) | |||
if self._logging: | |||
self._str_fw += np.array2string( | |||
Kmatrix, | |||
separator=',') + '\n\n' | |||
# separator=',', | |||
# threshold=np.inf, | |||
# floatmode='unique') + '\n\n' | |||
# Draw and save Gram matrix figures. | |||
if self._save_gm_figs: | |||
fig_file_name = self._output_dir + '/GM[ds]' + self._ds_name | |||
if params_out != {}: | |||
fig_file_name += '[params]' + str(idx) | |||
plt.imshow(Kmatrix) | |||
plt.colorbar() | |||
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) | |||
# plt.show() | |||
plt.clf() | |||
self._gram_matrices.append(Kmatrix) | |||
self._gram_matrix_time.append(current_run_time) | |||
self._param_list_pre_revised.append(params_out) | |||
if nb_g_ignore > 0: | |||
if self._verbose: | |||
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) | |||
if self._logging: | |||
self._str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore | |||
if self._verbose: | |||
print() | |||
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(self._param_list_precomputed), nb_gm_ignore)) | |||
if self._logging: | |||
self._str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(self._param_list_precomputed), nb_gm_ignore) | |||
self._str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' | |||
self._str_fw += ''.join(['{}: {}\n'.format(idx, params_out) for idx, params_out in enumerate(self._param_list_precomputed)]) | |||
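The per-entry loop in compute_gram_matrices above normalizes the gram matrix by K[i][j] / sqrt(K[i][i] * K[j][j]) (cosine normalization). A vectorized equivalent, as a sketch assuming a symmetric matrix with a strictly positive diagonal:

import numpy as np

def normalize_gram(K):
    # Divide every entry by the geometric mean of the corresponding diagonal entries.
    d = np.sqrt(np.diag(K))
    return K / np.outer(d, d)

K = np.array([[4.0, 2.0], [2.0, 9.0]])
print(normalize_gram(K))  # [[1.0, 0.333...], [0.333..., 1.0]]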
def do_cv(self): | |||
# save gram matrices to file. | |||
# np.savez(output_dir + '/' + ds_name + '.gm', | |||
# gms=gram_matrices, params=param_list_pre_revised, y=y, | |||
# gmtime=gram_matrix_time) | |||
if self._verbose: | |||
print('2. Fitting and predicting using nested cross validation. This could really take a while...') | |||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
# def func_assign(result, var_to_assign): | |||
# for idx, itm in enumerate(var_to_assign): | |||
# itm.append(result[idx]) | |||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) | |||
# | |||
# parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, | |||
# [train_pref, val_pref, test_pref], glbv=gram_matrices, | |||
# method='imap_unordered', n_jobs=n_jobs, chunksize=1, | |||
# itr_desc='cross validation') | |||
def init_worker(gms_toshare): | |||
global G_gms | |||
G_gms = gms_toshare | |||
# gram_matrices = np.array(gram_matrices) | |||
# gms_shape = gram_matrices.shape | |||
# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C')) | |||
# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape)) | |||
pool = Pool(processes=self._n_jobs, initializer=init_worker, initargs=(self._gram_matrices,)) | |||
trial_do_partial = partial(self._parallel_trial_do, self._param_list_pre_revised, self._param_list, self._ds.targets[:], self._model_type) # @todo: maybe self._ds.targets[:] should be y. | |||
train_pref = [] | |||
val_pref = [] | |||
test_pref = [] | |||
# if NUM_TRIALS < 1000 * n_jobs: | |||
# chunksize = int(NUM_TRIALS / n_jobs) + 1 | |||
# else: | |||
# chunksize = 1000 | |||
chunksize = 1 | |||
if self._verbose: | |||
iterator = tqdm(pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize), desc='cross validation', file=sys.stdout) | |||
else: | |||
iterator = pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize) | |||
for o1, o2, o3 in iterator: | |||
train_pref.append(o1) | |||
val_pref.append(o2) | |||
test_pref.append(o3) | |||
pool.close() | |||
pool.join() | |||
# # ---- use pool.map to parallel. ---- | |||
# pool = Pool(n_jobs) | |||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type) | |||
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||
# train_pref = [item[0] for item in result_perf] | |||
# val_pref = [item[1] for item in result_perf] | |||
# test_pref = [item[2] for item in result_perf] | |||
# # ---- direct running, normally use a single CPU core. ---- | |||
# train_pref = [] | |||
# val_pref = [] | |||
# test_pref = [] | |||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||
# train_pref.append(o1) | |||
# val_pref.append(o2) | |||
# test_pref.append(o3) | |||
# print() | |||
if self._verbose: | |||
print() | |||
print('3. Getting final performance...') | |||
if self._logging: | |||
self._str_fw += '\nII. Performance.\n\n' | |||
# averages and confidences of performances on outer trials for each combination of parameters | |||
self._average_train_scores = np.mean(train_pref, axis=0) | |||
# print('val_pref: ', val_pref[0][0]) | |||
self._average_val_scores = np.mean(val_pref, axis=0) | |||
# print('test_pref: ', test_pref[0][0]) | |||
self._average_perf_scores = np.mean(test_pref, axis=0) | |||
# sample std is used here | |||
self._std_train_scores = np.std(train_pref, axis=0, ddof=1) | |||
self._std_val_scores = np.std(val_pref, axis=0, ddof=1) | |||
self._std_perf_scores = np.std(test_pref, axis=0, ddof=1) | |||
if self._model_type == 'regression': | |||
best_val_perf = np.amin(self._average_val_scores) | |||
else: | |||
best_val_perf = np.amax(self._average_val_scores) | |||
# print('average_val_scores: ', self._average_val_scores) | |||
# print('best_val_perf: ', best_val_perf) | |||
# print() | |||
best_params_index = np.where(self._average_val_scores == best_val_perf) | |||
# find smallest val std with best val perf. | |||
best_val_stds = [ | |||
self._std_val_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
min_val_std = np.amin(best_val_stds) | |||
best_params_index = np.where(self._std_val_scores == min_val_std) | |||
best_params_out = [self._param_list_pre_revised[i] for i in best_params_index[0]] | |||
best_params_in = [self._param_list[i] for i in best_params_index[1]] | |||
if self._verbose: | |||
print('best_params_out: ', best_params_out) | |||
print('best_params_in: ', best_params_in) | |||
print() | |||
print('best_val_perf: ', best_val_perf) | |||
print('best_val_std: ', min_val_std) | |||
if self._logging: | |||
self._str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out | |||
self._str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in | |||
self._str_fw += 'best_val_perf: %s\n' % best_val_perf | |||
self._str_fw += 'best_val_std: %s\n' % min_val_std | |||
# print(best_params_index) | |||
# print(best_params_index[0]) | |||
# print(self._average_perf_scores) | |||
self._final_performance = [ | |||
self._average_perf_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
self._final_confidence = [ | |||
self._std_perf_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
if self._verbose: | |||
print('final_performance: ', self._final_performance) | |||
print('final_confidence: ', self._final_confidence) | |||
if self._logging: | |||
self._str_fw += 'final_performance: %s\n' % self._final_performance | |||
self._str_fw += 'final_confidence: %s\n' % self._final_confidence | |||
train_performance = [ | |||
self._average_train_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
train_std = [ | |||
self._std_train_scores[value][best_params_index[1][idx]] | |||
for idx, value in enumerate(best_params_index[0]) | |||
] | |||
if self._verbose: | |||
print('train_performance: %s' % train_performance) | |||
print('train_std: ', train_std) | |||
if self._logging: | |||
self._str_fw += 'train_performance: %s\n' % train_performance | |||
self._str_fw += 'train_std: %s\n\n' % train_std | |||
if self._verbose: | |||
print() | |||
tt_total = time.time() - self._tts # training time for all hyper-parameters | |||
average_gram_matrix_time = np.mean(self._gram_matrix_time) | |||
std_gram_matrix_time = np.std(self._gram_matrix_time, ddof=1) if len(self._gram_matrix_time) > 1 else 0 | |||
best_gram_matrix_time = [self._gram_matrix_time[i] for i in best_params_index[0]] | |||
ave_bgmt = np.mean(best_gram_matrix_time) | |||
std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 | |||
if self._verbose: | |||
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||
.format(average_gram_matrix_time, std_gram_matrix_time)) | |||
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( | |||
ave_bgmt, std_bgmt)) | |||
print('total training time with all hyper-param choices: {:.2f}s'.format( | |||
tt_total)) | |||
if self._logging: | |||
self._str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) | |||
self._str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) | |||
self._str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) | |||
# # save results to file | |||
# np.savetxt(results_name_pre + 'average_train_scores.dt', | |||
# average_train_scores) | |||
# np.savetxt(results_name_pre + 'average_val_scores', self._average_val_scores) | |||
# np.savetxt(results_name_pre + 'average_perf_scores.dt', | |||
# average_perf_scores) | |||
# np.savetxt(results_name_pre + 'std_train_scores.dt', self._std_train_scores) | |||
# np.savetxt(results_name_pre + 'std_val_scores.dt', self._std_val_scores) | |||
# np.savetxt(results_name_pre + 'std_perf_scores.dt', self._std_perf_scores) | |||
# np.save(results_name_pre + 'best_params_index', best_params_index) | |||
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out) | |||
# np.save(results_name_pre + 'best_params_in.dt', best_params_in) | |||
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) | |||
# np.save(results_name_pre + 'best_val_std.dt', best_val_std) | |||
# np.save(results_name_pre + 'final_performance.dt', self._final_performance) | |||
# np.save(results_name_pre + 'final_confidence.dt', self._final_confidence) | |||
# np.save(results_name_pre + 'train_performance.dt', train_performance) | |||
# np.save(results_name_pre + 'train_std.dt', train_std) | |||
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) | |||
# np.save(results_name_pre + 'average_gram_matrix_time.dt', | |||
# average_gram_matrix_time) | |||
# np.save(results_name_pre + 'std_gram_matrix_time.dt', | |||
# std_gram_matrix_time) | |||
# np.save(results_name_pre + 'best_gram_matrix_time.dt', | |||
# best_gram_matrix_time) | |||
def trial_do(self, param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level | |||
# # get gram matrices from global variables. | |||
# gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C') | |||
# Arrays to store scores | |||
train_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||
# randomness added to seeds of the split function below. "high" is "size" times | |||
# 10 so that at least 10 different random outputs will be yielded. Remove | |||
# these lines if identical outputs are required. | |||
rdm_out = np.random.RandomState(seed=None) | |||
rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10, | |||
size=len(param_list_pre_revised)) | |||
# print(trial, rdm_seed_out_l) | |||
# print() | |||
# loop for each outer param tuple | |||
for index_out, params_out in enumerate(param_list_pre_revised): | |||
# get gram matrices from global variables. | |||
# gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]] | |||
# gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C') | |||
gm_now = gram_matrices[index_out].copy() | |||
# split gram matrix and y to app and test sets. | |||
indices = range(len(y)) | |||
# The argument "random_state" in function "train_test_split" can not be | |||
# set to None, because it will use RandomState instance used by | |||
# np.random, which is possible for multiple subprocesses to inherit the | |||
# same seed if they forked at the same time, leading to identical | |||
# random variates for different subprocesses. Instead, we use "trial" | |||
# and "index_out" parameters to generate different seeds for different | |||
# trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add | |||
# randomness into seeds, so that it yields a different output every | |||
# time the program is run. To yield identical outputs every time, | |||
# remove the second line below. Same method is used to the "KFold" | |||
# function in the inner loop. | |||
rdm_seed_out = (trial + 1) * (index_out + 1) | |||
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1) | |||
# print(trial, rdm_seed_out) | |||
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( | |||
gm_now, y, indices, test_size=0.1, | |||
random_state=rdm_seed_out, shuffle=True) | |||
# print(trial, idx_app, idx_test) | |||
# print() | |||
X_app = X_app[:, idx_app] | |||
X_test = X_test[:, idx_app] | |||
y_app = np.array(y_app) | |||
y_test = np.array(y_test) | |||
rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10, | |||
size=len(param_list)) | |||
# loop for each inner param tuple | |||
for index_in, params_in in enumerate(param_list): | |||
# if trial == 0: | |||
# print(index_out, index_in) | |||
# print('params_in: ', params_in) | |||
# st = time.time() | |||
rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1) | |||
# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in) | |||
rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1) | |||
# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in) | |||
inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in) | |||
current_train_perf = [] | |||
current_valid_perf = [] | |||
current_test_perf = [] | |||
# For regression use the Kernel Ridge method | |||
# try: | |||
if self._model_type == 'regression': | |||
kr = KernelRidge(kernel='precomputed', **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split(X_app): | |||
# print("train_index, valid_index: ", trial, index_in, train_index, valid_index) | |||
# if trial == 0: | |||
# print('train_index: ', train_index) | |||
# print('valid_index: ', valid_index) | |||
# print('idx_test: ', idx_test) | |||
# print('y_app[train_index]: ', y_app[train_index]) | |||
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||
kr.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = kr.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = kr.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
# if trial == 0: | |||
# print('y_pred_valid: ', y_pred_valid) | |||
# print() | |||
y_pred_test = kr.predict( | |||
X_test[:, train_index]) | |||
# root mean squared errors | |||
current_train_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_app[train_index], y_pred_train))) | |||
current_valid_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_app[valid_index], y_pred_valid))) | |||
# if trial == 0: | |||
# print(mean_squared_error( | |||
# y_app[valid_index], y_pred_valid)) | |||
current_test_perf.append( | |||
np.sqrt( | |||
mean_squared_error( | |||
y_test, y_pred_test))) | |||
# For classification, use SVM.
else: | |||
svc = SVC(kernel='precomputed', cache_size=200, | |||
verbose=False, **params_in) | |||
# loop for each split on validation set level | |||
# validation set level | |||
for train_index, valid_index in inner_cv.split(X_app): | |||
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) | |||
# if trial == 0: | |||
# print('train_index: ', train_index) | |||
# print('valid_index: ', valid_index) | |||
# print('idx_test: ', idx_test) | |||
# print('y_app[train_index]: ', y_app[train_index]) | |||
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||
svc.fit(X_app[train_index, :][:, train_index], | |||
y_app[train_index]) | |||
# predict on the train, validation and test set | |||
y_pred_train = svc.predict( | |||
X_app[train_index, :][:, train_index]) | |||
y_pred_valid = svc.predict( | |||
X_app[valid_index, :][:, train_index]) | |||
y_pred_test = svc.predict( | |||
X_test[:, train_index]) | |||
# accuracy scores
current_train_perf.append( | |||
accuracy_score(y_app[train_index], | |||
y_pred_train)) | |||
current_valid_perf.append( | |||
accuracy_score(y_app[valid_index], | |||
y_pred_valid)) | |||
current_test_perf.append( | |||
accuracy_score(y_test, y_pred_test)) | |||
# except ValueError: | |||
# print(sys.exc_info()[0]) | |||
# print(params_out, params_in) | |||
# average performance on inner splits | |||
train_pref[index_out][index_in] = np.mean( | |||
current_train_perf) | |||
val_pref[index_out][index_in] = np.mean( | |||
current_valid_perf) | |||
test_pref[index_out][index_in] = np.mean( | |||
current_test_perf) | |||
# print(time.time() - st) | |||
# if trial == 0: | |||
# print('val_pref: ', val_pref) | |||
# print('test_pref: ', test_pref) | |||
return train_pref, val_pref, test_pref | |||
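# Standalone sketch (illustration only, not part of this class; the helper name
# is hypothetical) of the precomputed-kernel convention used above: rows of each
# gram-matrix slice are the samples to predict on, columns are the training samples.
import numpy as np
from sklearn.kernel_ridge import KernelRidge

def _demo_precomputed_fit_predict(K, y, train_index, valid_index, alpha=1e-3):
	# Fit on the train-vs-train block, predict from the valid-vs-train block.
	kr = KernelRidge(kernel='precomputed', alpha=alpha)
	kr.fit(K[np.ix_(train_index, train_index)], y[train_index])
	return kr.predict(K[np.ix_(valid_index, train_index)])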
def _parallel_trial_do(self, param_list_pre_revised, param_list, y, model_type, trial): | |||
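# "G_gms" presumably refers to the module-level gram matrices shared with the
# worker processes (set when the parallel pool is initialized); it is not an
# argument of this method.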
train_pref, val_pref, test_pref = self._trial_do(param_list_pre_revised, | |||
param_list, G_gms, y, | |||
model_type, trial) | |||
return train_pref, val_pref, test_pref | |||
def printResultsInTable(self, param_list, param_list_pre_revised, average_val_scores, | |||
std_val_scores, average_perf_scores, std_perf_scores, | |||
average_train_scores, std_train_scores, gram_matrix_time, | |||
model_type, verbose): | |||
from collections import OrderedDict | |||
from tabulate import tabulate | |||
table_dict = {} | |||
if model_type == 'regression': | |||
for param_in in param_list: | |||
param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) | |||
else: | |||
for param_in in param_list: | |||
param_in['C'] = '{:.2e}'.format(param_in['C']) | |||
table_dict['params'] = [{**param_out, **param_in} | |||
for param_in in param_list for param_out in param_list_pre_revised] | |||
table_dict['gram_matrix_time'] = [ | |||
'{:.2f}'.format(gram_matrix_time[index_out]) | |||
for param_in in param_list | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
table_dict['valid_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], | |||
std_val_scores[index_out][index_in]) | |||
for index_in, _ in enumerate(param_list) | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
table_dict['test_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], | |||
std_perf_scores[index_out][index_in]) | |||
for index_in, _ in enumerate(param_list) | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
table_dict['train_perf'] = [ | |||
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], | |||
std_train_scores[index_out][index_in]) | |||
for index_in, _ in enumerate(param_list) | |||
for index_out, _ in enumerate(param_list_pre_revised) | |||
] | |||
keyorder = [ | |||
'params', 'train_perf', 'valid_perf', 'test_perf', | |||
'gram_matrix_time' | |||
] | |||
if verbose: | |||
print() | |||
tb_print = tabulate(OrderedDict(sorted(table_dict.items(), | |||
key=lambda i: keyorder.index(i[0]))), headers='keys') | |||
# print(tb_print) | |||
return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print |
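# Illustration (assumption: the `tabulate` package is installed) of the column
# ordering used in printResultsInTable above: a dict of columns is sorted into a
# fixed key order before being rendered.
from collections import OrderedDict
from tabulate import tabulate
demo = {'test_perf': ['0.90±0.02'], 'params': [{'C': '1.00e+00'}], 'train_perf': ['0.95±0.01']}
keyorder = ['params', 'train_perf', 'test_perf']
print(tabulate(OrderedDict(sorted(demo.items(), key=lambda i: keyorder.index(i[0]))), headers='keys'))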
@@ -0,0 +1,89 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Fri May 21 12:18:02 2021 | |||
@author: ljia | |||
""" | |||
def dichotomous_permutation(arr, layer=0): | |||
import math | |||
# def seperate_arr(arr, new_arr): | |||
# if (length % 2) == 0: | |||
# half = int(length / 2) | |||
# new_arr += [arr[half - 1], arr[half]] | |||
# subarr1 = [arr[i] for i in range(1, half - 1)] | |||
# else: | |||
# half = math.floor(length / 2) | |||
# new_arr.append(arr[half]) | |||
# subarr1 = [arr[i] for i in range(1, half)] | |||
# subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
# subarrs = [subarr1, subarr2] | |||
# return subarrs | |||
if layer == 0: | |||
length = len(arr) | |||
if length <= 2: | |||
return arr | |||
new_arr = [arr[0], arr[-1]] | |||
if (length % 2) == 0: | |||
half = int(length / 2) | |||
new_arr += [arr[half - 1], arr[half]] | |||
subarr1 = [arr[i] for i in range(1, half - 1)] | |||
else: | |||
half = math.floor(length / 2) | |||
new_arr.append(arr[half]) | |||
subarr1 = [arr[i] for i in range(1, half)] | |||
subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
subarrs = [subarr1, subarr2] | |||
# subarrs = seperate_arr(arr, new_arr) | |||
new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||
else: | |||
new_arr = [] | |||
subarrs = [] | |||
for a in arr: | |||
length = len(a) | |||
if length <= 2: | |||
new_arr += a | |||
else: | |||
# subarrs += seperate_arr(a, new_arr) | |||
if (length % 2) == 0: | |||
half = int(length / 2) | |||
new_arr += [a[half - 1], a[half]] | |||
subarr1 = [a[i] for i in range(0, half - 1)] | |||
else: | |||
half = math.floor(length / 2) | |||
new_arr.append(a[half]) | |||
subarr1 = [a[i] for i in range(0, half)] | |||
subarr2 = [a[i] for i in range(half + 1, length)] | |||
subarrs += [subarr1, subarr2] | |||
if len(subarrs) > 0: | |||
new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||
return new_arr | |||
# length = len(arr) | |||
# if length <= 2: | |||
# return arr | |||
# new_arr = [arr[0], arr[-1]] | |||
# if (length % 2) == 0: | |||
# half = int(length / 2) | |||
# new_arr += [arr[half - 1], arr[half]] | |||
# subarr1 = [arr[i] for i in range(1, half - 1)] | |||
# else: | |||
# half = math.floor(length / 2) | |||
# new_arr.append(arr[half]) | |||
# subarr1 = [arr[i] for i in range(1, half)] | |||
# subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||
# if len(subarr1) > 0: | |||
# new_arr += dichotomous_permutation(subarr1) | |||
# if len(subarr2) > 0: | |||
# new_arr += dichotomous_permutation(subarr2) | |||
# return new_arr |
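# Illustrative check of the permutation above (run as a script): the endpoints
# come first, then the midpoints of each remaining half, recursively.
if __name__ == '__main__':
	print(dichotomous_permutation(list(range(9))))
	# -> [0, 8, 4, 2, 6, 1, 3, 5, 7]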
@@ -0,0 +1,109 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Fri Nov 27 19:33:51 2020 | |||
@author: ljia | |||
""" | |||
import os | |||
import numpy as np | |||
import pickle | |||
from gklearn.dataset import Dataset | |||
from gklearn.model_learning import NestedCV | |||
from gklearn.kernels import GRAPH_KERNELS | |||
class Workflow(object): | |||
def __init__(self, **kwargs): | |||
self._job_prefix = kwargs.get('job_prefix', 'gktask') | |||
self._max_num_running_tasks = kwargs.get('max_num_running_tasks', np.inf) | |||
self._root_dir = kwargs.get('root_dir', 'outputs/') | |||
def run(self, tasks): | |||
### Check inputs. | |||
if self._check_inputs(tasks): | |||
self._tasks = tasks | |||
else: | |||
raise ValueError('The input "tasks" is not correct.') | |||
### Sort tasks. | |||
self.sort_tasks_by_complexity() | |||
### The main process. | |||
complete = False | |||
while not complete: | |||
self.get_running_tasks() | |||
if self._num_running_tasks < self._max_num_running_tasks: | |||
### Load results from table. | |||
self.load_results_from_table() | |||
for task in self._tasks: | |||
state = self.get_task_state(task) | |||
if state != 'complete' and state != 'running':
self.run_task(task) | |||
if self._num_running_tasks >= self._max_num_running_tasks: | |||
break | |||
### Save results. | |||
self.save_results() | |||
complete = self.check_completeness() | |||
# sleep() | |||
def _check_inputs(self, tasks): | |||
if not isinstance(tasks, list): | |||
return False | |||
else: | |||
for i in tasks: | |||
if 'kernel' not in i or 'dataset' not in i:
return False | |||
return True | |||
def sort_tasks_by_complexity(self): | |||
return | |||
def get_running_tasks(self): | |||
command = 'squeue --user $USER --format "%.50j" --noheader' | |||
stream = os.popen(command) | |||
output = stream.readlines() | |||
running_tasks = [o for o in output if o.strip().startswith(self._job_prefix)] | |||
self._num_running_tasks = len(running_tasks) | |||
def load_results_from_table(self): | |||
pass | |||
def get_task_state(self, task): | |||
task_dir = os.path.join(self._root_dir, task['kernel'] + '.' + task['dataset'] + '/') | |||
fn_summary = os.path.join(task_dir, 'results_summary.pkl') | |||
if os.path.isfile(fn_summary): | |||
# Load the pickled summary from file (pickle.loads expects bytes, not a path).
with open(fn_summary, 'rb') as f:
	output = pickle.load(f)
state = output['state'] | |||
return state | |||
else: | |||
return 'unstarted' | |||
def run_task(self, task): | |||
ds_name = task['dataset'] | |||
k_name = task['kernel'] | |||
# Get dataset. | |||
ds = Dataset(ds_name) | |||
graph_kernel = GRAPH_KERNELS[k_name] | |||
# Start CV. | |||
results = NestedCV(ds, graph_kernel) |
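# Hedged usage sketch (assumptions: a SLURM environment where `squeue` is
# available, and 'kernel' values that are keys of GRAPH_KERNELS, e.g.
# 'shortest path'); run() polls until every task is complete.
if __name__ == '__main__':
	tasks = [{'kernel': 'shortest path', 'dataset': 'MAO'},
			 {'kernel': 'treelet', 'dataset': 'Acyclic'}]
	workflow = Workflow(job_prefix='gktask', max_num_running_tasks=10)
	print(workflow._check_inputs(tasks))  # True: every task names a kernel and a dataset.
	# workflow.run(tasks)  # Uncomment on a cluster to launch the full workflow.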
@@ -25,34 +25,40 @@ def chooseDataset(ds_name): | |||
current_path = os.path.dirname(os.path.realpath(__file__)) + '/' | |||
root = current_path + '../../datasets/' | |||
# no node labels (and no edge labels). | |||
if ds_name == 'Alkane': | |||
# no labels at all. | |||
if ds_name == 'Alkane_unlabeled': | |||
dataset = Dataset('Alkane_unlabeled', root=root) | |||
dataset.trim_dataset(edge_required=False) | |||
dataset.cut_graphs(range(1, 10)) | |||
# node symbolic labels. | |||
# node symbolic labels only. | |||
elif ds_name == 'Acyclic': | |||
dataset = Dataset('Acyclic', root=root) | |||
dataset.trim_dataset(edge_required=False) | |||
# node non-symbolic labels. | |||
# node non-symbolic labels only. | |||
elif ds_name == 'Letter-med': | |||
dataset = Dataset('Letter-med', root=root) | |||
dataset.trim_dataset(edge_required=False) | |||
# node symbolic and non-symbolic labels (and edge symbolic labels). | |||
# node symbolic + non-symbolic labels + edge symbolic labels. | |||
elif ds_name == 'AIDS': | |||
dataset = Dataset('AIDS', root=root) | |||
dataset.trim_dataset(edge_required=False) | |||
# edge non-symbolic labels (no node labels). | |||
elif ds_name == 'Fingerprint_edge': | |||
# node non-symbolic labels + edge non-symbolic labels. | |||
elif ds_name == 'Fingerprint': | |||
dataset = Dataset('Fingerprint', root=root) | |||
dataset.trim_dataset(edge_required=True) | |||
# edge symbolic only.
elif ds_name == 'MAO':
dataset = Dataset('MAO', root=root)
dataset.trim_dataset(edge_required=True)
irrelevant_labels = {'node_labels': ['atom_symbol'], 'node_attrs': ['x', 'y']}
dataset.remove_labels(**irrelevant_labels)
# edge non-symbolic labels (and node non-symbolic labels).
elif ds_name == 'Fingerprint':
# edge non-symbolic labels only.
elif ds_name == 'Fingerprint_edge':
dataset = Dataset('Fingerprint', root=root)
dataset.trim_dataset(edge_required=True)
irrelevant_labels = {'edge_attrs': ['orient', 'angle']}
dataset.remove_labels(**irrelevant_labels)
# edge symbolic and non-symbolic labels (and node symbolic and non-symbolic labels).
# node symbolic and non-symbolic labels + edge symbolic and non-symbolic labels. | |||
elif ds_name == 'Cuneiform': | |||
dataset = Dataset('Cuneiform', root=root) | |||
dataset.trim_dataset(edge_required=True) | |||
@@ -91,7 +97,7 @@ def assert_equality(compute_fun, **kwargs): | |||
assert np.array_equal(lst[i], lst[i + 1]) | |||
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||
@pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) | |||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
def test_CommonWalk(ds_name, weight, compute_method): | |||
@@ -126,7 +132,7 @@ def test_CommonWalk(ds_name, weight, compute_method): | |||
assert_equality(compute, parallel=['imap_unordered', None]) | |||
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||
@pytest.mark.parametrize('remove_totters', [False]) #[True, False]) | |||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
def test_Marginalized(ds_name, remove_totters): | |||
@@ -319,13 +325,13 @@ def test_SpectralDecomposition(ds_name, sub_kernel): | |||
# @pytest.mark.parametrize( | |||
# 'compute_method,ds_name,sub_kernel', | |||
# [ | |||
# ('sylvester', 'Alkane', None), | |||
# ('conjugate', 'Alkane', None), | |||
# ('sylvester', 'Alkane_unlabeled', None), | |||
# ('conjugate', 'Alkane_unlabeled', None), | |||
# ('conjugate', 'AIDS', None), | |||
# ('fp', 'Alkane', None), | |||
# ('fp', 'Alkane_unlabeled', None), | |||
# ('fp', 'AIDS', None), | |||
# ('spectral', 'Alkane', 'exp'), | |||
# ('spectral', 'Alkane', 'geo'), | |||
# ('spectral', 'Alkane_unlabeled', 'exp'), | |||
# ('spectral', 'Alkane_unlabeled', 'geo'), | |||
# ] | |||
# ) | |||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
@@ -365,7 +371,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel): | |||
# assert False, exception | |||
@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
def test_ShortestPath(ds_name): | |||
"""Test shortest path kernel. | |||
@@ -401,8 +407,8 @@ def test_ShortestPath(ds_name): | |||
assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) | |||
#@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||
@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform']) | |||
#@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform']) | |||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
def test_StructuralSP(ds_name): | |||
"""Test structural shortest path kernel. | |||
@@ -441,7 +447,7 @@ def test_StructuralSP(ds_name): | |||
assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) | |||
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
#@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto', None]) | |||
@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto']) | |||
@@ -476,7 +482,7 @@ def test_PathUpToH(ds_name, k_func): | |||
compute_method=['trie', 'naive']) | |||
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
def test_Treelet(ds_name): | |||
"""Test treelet kernel. | |||
@@ -510,7 +516,7 @@ def test_Treelet(ds_name): | |||
assert_equality(compute, parallel=['imap_unordered', None]) | |||
@pytest.mark.parametrize('ds_name', ['Acyclic']) | |||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'MAO', 'AIDS']) | |||
#@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge']) | |||
# @pytest.mark.parametrize('base_kernel', ['subtree']) | |||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | |||
@@ -540,17 +546,17 @@ def test_WLSubtree(ds_name): | |||
else: | |||
return gram_matrix, kernel_list, kernel | |||
assert_equality(compute, parallel=['imap_unordered', None]) | |||
assert_equality(compute, parallel=[None, 'imap_unordered']) | |||
if __name__ == "__main__": | |||
test_list_graph_kernels() | |||
# test_spkernel('Alkane', 'imap_unordered') | |||
# test_ShortestPath('Alkane') | |||
# test_list_graph_kernels() | |||
# test_spkernel('Alkane_unlabeled', 'imap_unordered') | |||
# test_ShortestPath('Alkane_unlabeled') | |||
# test_StructuralSP('Fingerprint_edge', 'imap_unordered') | |||
# test_StructuralSP('Acyclic') | |||
# test_StructuralSP('Cuneiform', None) | |||
# test_WLSubtree('Acyclic') | |||
test_WLSubtree('MAO') # 'Alkane_unlabeled', 'Acyclic', 'AIDS' | |||
# test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') | |||
# test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') | |||
# test_RandomWalk('Acyclic', 'fp', None, None) | |||
@@ -559,7 +565,7 @@ if __name__ == "__main__": | |||
# test_Marginalized('Acyclic', False) | |||
# test_ShortestPath('Acyclic') | |||
# test_PathUpToH('Acyclic', 'MinMax') | |||
# test_Treelet('Acyclic') | |||
# test_Treelet('AIDS') | |||
# test_SylvesterEquation('Acyclic') | |||
# test_ConjugateGradient('Acyclic') | |||
# test_FixedPoint('Acyclic') |
@@ -3,156 +3,230 @@ These kernels are defined between pairs of vectors. | |||
""" | |||
import numpy as np | |||
def delta_kernel(x, y): | |||
"""Delta kernel. Return 1 if x == y, 0 otherwise. | |||
Parameters | |||
---------- | |||
x, y : any | |||
Two parts to compare. | |||
Return | |||
------ | |||
kernel : integer | |||
Delta kernel. | |||
References | |||
---------- | |||
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||
labeled graphs. In Proceedings of the 20th International Conference on | |||
Machine Learning, Washington, DC, United States, 2003. | |||
""" | |||
return x == y #(1 if condition else 0) | |||
def deltakernel(x, y):
	return delta_kernel(x, y)
def gaussian_kernel(x, y, gamma=None):
	"""Gaussian kernel.
	Compute the rbf (gaussian) kernel between x and y:

		K(x, y) = exp(-gamma ||x-y||^2).

	Read more in the `User Guide of scikit-learn library <https://scikit-learn.org/stable/modules/metrics.html#rbf-kernel>`__.

	Parameters
	----------
	x, y : array

	gamma : float, default None
		If None, defaults to 1.0 / n_features

	Returns
	-------
	kernel : float
	"""
	if gamma is None:
		gamma = 1.0 / len(x)

#	xt = np.array([float(itm) for itm in x]) # @todo: move this to dataset or datafile to speed up.
#	yt = np.array([float(itm) for itm in y])
#	kernel = xt - yt
#	kernel = kernel ** 2
#	kernel = np.sum(kernel)
#	kernel *= -gamma
#	kernel = np.exp(kernel)
#	return kernel

	return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma)


def gaussiankernel(x, y, gamma=None):
	return gaussian_kernel(x, y, gamma=gamma)
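# Quick sanity check (illustrative): the Gaussian kernel of a vector with
# itself is 1, and it decays with the squared Euclidean distance:
# >>> gaussian_kernel([1., 2.], [1., 2.])
# 1.0
# >>> round(gaussian_kernel([1., 2.], [3., 4.]), 6)   # exp(-0.5 * 8)
# 0.018316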
def polynomial_kernel(x, y, gamma=1, coef0=0, d=1): | |||
return (np.dot(x, y) * gamma + coef0) ** d | |||
def highest_polynomial_kernel(x, y, d=1, c=0):
	"""Polynomial kernel.
	Compute the polynomial kernel between x and y:

		K(x, y) = <x, y> ^d + c.

	Parameters
	----------
	x, y : array

	d : integer, default 1

	c : float, default 0

	Returns
	-------
	kernel : float
	"""
	return np.dot(x, y) ** d + c


def polynomialkernel(x, y, d=1, c=0):
	return highest_polynomial_kernel(x, y, d=d, c=c)
def linear_kernel(x, y):
	"""Linear kernel.
	Compute the linear kernel between x and y:

		K(x, y) = <x, y>.

	Parameters
	----------
	x, y : array

	Returns
	-------
	kernel : float
	"""
	return np.dot(x, y)


def linearkernel(x, y):
	return linear_kernel(x, y)
def cosine_kernel(x, y): | |||
	# Use vector norms here; np.abs would only give element-wise magnitudes.
	return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
def sigmoid_kernel(x, y, gamma=None, coef0=1): | |||
if gamma is None: | |||
gamma = 1.0 / len(x) | |||
k = np.dot(x, y) | |||
k *= gamma | |||
k += coef0 | |||
k = np.tanh(k) | |||
# k = np.tanh(k, k) # compute tanh in-place | |||
return k | |||
def laplacian_kernel(x, y, gamma=None): | |||
if gamma is None: | |||
gamma = 1.0 / len(x) | |||
	k = -gamma * np.sum(np.abs(np.subtract(x, y)))  # L1 distance, so the kernel is a scalar.
k = np.exp(k) | |||
return k | |||
def chi2_kernel(x, y, gamma=1.0): | |||
k = np.divide(np.subtract(x, y) ** 2, np.add(x, y)) | |||
k = np.sum(k) | |||
k *= -gamma | |||
return np.exp(k) | |||
def exponential_kernel(x, y, gamma=None): | |||
if gamma is None: | |||
gamma = 1.0 / len(x) | |||
return np.exp(np.dot(x, y) * gamma) | |||
def intersection_kernel(x, y):
	return np.sum(np.minimum(x, y))


def multiquadratic_kernel(x, y, c=0):
	return np.sqrt((np.sum(np.subtract(x, y) ** 2)) + c)


def inverse_multiquadratic_kernel(x, y, c=0):
	return 1 / multiquadratic_kernel(x, y, c=c)
def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): | |||
"""Sum of a pair of kernels. | |||
"""Sum of a pair of kernels. | |||
k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||
k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||
Parameters | |||
---------- | |||
k1, k2 : function | |||
A pair of kernel functions. | |||
d11, d12: | |||
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||
d21, d22: | |||
Inputs of k2. | |||
lamda1, lamda2: float | |||
Coefficients of the product. | |||
Parameters | |||
---------- | |||
k1, k2 : function | |||
A pair of kernel functions. | |||
d11, d12: | |||
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||
d21, d22: | |||
Inputs of k2. | |||
lamda1, lamda2: float | |||
Coefficients of the product. | |||
Return | |||
------ | |||
kernel : integer | |||
Return | |||
------ | |||
kernel : integer | |||
""" | |||
if d21 == None or d22 == None: | |||
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12) | |||
else: | |||
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||
return kernel | |||
""" | |||
if d21 == None or d22 == None: | |||
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12) | |||
else: | |||
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||
return kernel | |||
def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1): | |||
"""Product of a pair of kernels. | |||
k = lamda * k1(d11, d12) * k2(d21, d22) | |||
Parameters | |||
---------- | |||
k1, k2 : function | |||
A pair of kernel functions. | |||
d11, d12: | |||
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||
d21, d22: | |||
Inputs of k2. | |||
lamda: float | |||
Coefficient of the product. | |||
Return | |||
------ | |||
kernel : integer | |||
""" | |||
if d21 == None or d22 == None: | |||
kernel = lamda * k1(d11, d12) * k2(d11, d12) | |||
else: | |||
kernel = lamda * k1(d11, d12) * k2(d21, d22) | |||
return kernel | |||
"""Product of a pair of kernels. | |||
k = lamda * k1(d11, d12) * k2(d21, d22) | |||
Parameters | |||
---------- | |||
k1, k2 : function | |||
A pair of kernel functions. | |||
d11, d12: | |||
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||
d21, d22: | |||
Inputs of k2. | |||
lamda: float | |||
Coefficient of the product. | |||
Return | |||
------ | |||
kernel : integer | |||
""" | |||
if d21 == None or d22 == None: | |||
kernel = lamda * k1(d11, d12) * k2(d11, d12) | |||
else: | |||
kernel = lamda * k1(d11, d12) * k2(d21, d22) | |||
return kernel | |||
if __name__ == '__main__': | |||
	o = polynomialkernel([1, 2], [3, 4], 2, 3)
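	# A few more illustrative calls (added as examples; values hand-checked):
	print(o)  # (1*3 + 2*4) ** 2 + 3 = 124
	print(kernelsum(linearkernel, gaussiankernel, [1., 2.], [3., 4.]))  # 11 + exp(-4) ≈ 11.0183
	print(kernelproduct(deltakernel, linearkernel, 'a', 'a', [1., 2.], [3., 4.]))  # 1 * 11 = 11.0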
@@ -366,19 +366,62 @@ def get_edge_labels(Gn, edge_label): | |||
def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs): | |||
if len(kwargs) != 0: | |||
kernel_options = kwargs | |||
if name == 'Marginalized': | |||
if name == 'CommonWalk' or name == 'common walk': | |||
from gklearn.kernels import CommonWalk | |||
graph_kernel = CommonWalk(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'Marginalized' or name == 'marginalized': | |||
from gklearn.kernels import Marginalized | |||
graph_kernel = Marginalized(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'ShortestPath': | |||
elif name == 'SylvesterEquation' or name == 'sylvester equation': | |||
from gklearn.kernels import SylvesterEquation | |||
graph_kernel = SylvesterEquation( | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'FixedPoint' or name == 'fixed point': | |||
from gklearn.kernels import FixedPoint | |||
graph_kernel = FixedPoint(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
node_attrs=node_attrs, | |||
edge_attrs=edge_attrs, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'ConjugateGradient' or name == 'conjugate gradient': | |||
from gklearn.kernels import ConjugateGradient | |||
graph_kernel = ConjugateGradient(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
node_attrs=node_attrs, | |||
edge_attrs=edge_attrs, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'SpectralDecomposition' or name == 'spectral decomposition': | |||
from gklearn.kernels import SpectralDecomposition | |||
graph_kernel = SpectralDecomposition(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
node_attrs=node_attrs, | |||
edge_attrs=edge_attrs, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'ShortestPath' or name == 'shortest path': | |||
from gklearn.kernels import ShortestPath | |||
graph_kernel = ShortestPath(node_labels=node_labels, | |||
node_attrs=node_attrs, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'StructuralSP': | |||
elif name == 'StructuralSP' or name == 'structural shortest path': | |||
from gklearn.kernels import StructuralSP | |||
graph_kernel = StructuralSP(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
@@ -386,25 +429,29 @@ def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attr | |||
edge_attrs=edge_attrs, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'PathUpToH': | |||
elif name == 'PathUpToH' or name == 'path up to length h': | |||
from gklearn.kernels import PathUpToH | |||
graph_kernel = PathUpToH(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'Treelet': | |||
elif name == 'Treelet' or name == 'treelet': | |||
from gklearn.kernels import Treelet | |||
graph_kernel = Treelet(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'WLSubtree': | |||
elif name == 'WLSubtree' or name == 'weisfeiler-lehman subtree': | |||
from gklearn.kernels import WLSubtree | |||
graph_kernel = WLSubtree(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'WeisfeilerLehman': | |||
elif name == 'WeisfeilerLehman' or name == 'weisfeiler-lehman': | |||
from gklearn.kernels import WeisfeilerLehman | |||
graph_kernel = WeisfeilerLehman(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
@@ -541,10 +588,18 @@ def get_mlti_dim_edge_attrs(G, attr_names): | |||
def normalize_gram_matrix(gram_matrix): | |||
diag = gram_matrix.diagonal().copy() | |||
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||
for i in range(len(gram_matrix)): | |||
for j in range(i, len(gram_matrix)): | |||
gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) | |||
gram_matrix[j][i] = gram_matrix[i][j] | |||
try: | |||
gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) | |||
except: | |||
# rollback() | |||
np.seterr(**old_settings) | |||
raise | |||
else: | |||
gram_matrix[j][i] = gram_matrix[i][j] | |||
np.seterr(**old_settings) | |||
return gram_matrix | |||
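# Equivalent vectorized form of the loop above (illustrative, assuming a
# strictly positive diagonal so that the division is well defined):
# 	d = np.sqrt(gram_matrix.diagonal())
# 	gram_matrix_normalized = gram_matrix / np.outer(d, d)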