@@ -79,3 +79,9 @@ outputs/ | |||||
# pyCharm. | # pyCharm. | ||||
.idea/ | .idea/ | ||||
# tests. | |||||
gklearn/tests/datasets/ | |||||
# Experiments. | |||||
gklearn/experiments/datasets/ |
@@ -1,5 +1,5 @@ | |||||
# graphkit-learn | # graphkit-learn | ||||
[](https://travis-ci.org/jajupmochi/graphkit-learn) | |||||
[](https://travis-ci.com/jajupmochi/graphkit-learn) | |||||
[](https://ci.appveyor.com/project/jajupmochi/graphkit-learn) | [](https://ci.appveyor.com/project/jajupmochi/graphkit-learn) | ||||
[](https://codecov.io/gh/jajupmochi/graphkit-learn) | [](https://codecov.io/gh/jajupmochi/graphkit-learn) | ||||
[](https://graphkit-learn.readthedocs.io/en/master/?badge=master) | [](https://graphkit-learn.readthedocs.io/en/master/?badge=master) | ||||
@@ -68,7 +68,7 @@ The docs of the library can be found [here](https://graphkit-learn.readthedocs.i | |||||
* [The common walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/common_walk.py) [1] | * [The common walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/common_walk.py) [1] | ||||
* Exponential | * Exponential | ||||
* Geometric | * Geometric | ||||
* [The marginalized kenrel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py) | |||||
* [The marginalized kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py) | |||||
* With tottering [2] | * With tottering [2] | ||||
* Without tottering [7] | * Without tottering [7] | ||||
* [The generalized random walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/random_walk.py) [3] | * [The generalized random walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/random_walk.py) [3] | ||||
@@ -40,6 +40,7 @@ class Dataset(object): | |||||
self._edge_attr_dim = None | self._edge_attr_dim = None | ||||
self._class_number = None | self._class_number = None | ||||
self._ds_name = None | self._ds_name = None | ||||
self._task_type = None | |||||
if inputs is None: | if inputs is None: | ||||
self._graphs = None | self._graphs = None | ||||
@@ -117,11 +118,16 @@ class Dataset(object): | |||||
ds_file = [os.path.join(path, fn) for fn in load_files[0]] | ds_file = [os.path.join(path, fn) for fn in load_files[0]] | ||||
fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None | fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None | ||||
# Get extra_params. | |||||
if 'extra_params' in DATASET_META[ds_name]: | if 'extra_params' in DATASET_META[ds_name]: | ||||
kwargs = DATASET_META[ds_name]['extra_params'] | kwargs = DATASET_META[ds_name]['extra_params'] | ||||
else: | else: | ||||
kwargs = {} | kwargs = {} | ||||
# Get the task type that is associated with the dataset. If it is classification, get the number of classes. | |||||
self._get_task_type(ds_name) | |||||
self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets, **kwargs).data | self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets, **kwargs).data | ||||
self._node_labels = label_names['node_labels'] | self._node_labels = label_names['node_labels'] | ||||
@@ -276,7 +282,8 @@ class Dataset(object): | |||||
'edge_attr_dim', | 'edge_attr_dim', | ||||
'class_number', | 'class_number', | ||||
'all_degree_entropy', | 'all_degree_entropy', | ||||
'ave_degree_entropy' | |||||
'ave_degree_entropy', | |||||
'task_type' | |||||
] | ] | ||||
# dataset size | # dataset size | ||||
@@ -408,7 +415,7 @@ class Dataset(object): | |||||
if 'class_number' in keys: | if 'class_number' in keys: | ||||
if self._class_number is None: | if self._class_number is None: | ||||
self._class_number = self._get_class_number() | |||||
self._class_number = self._get_class_num() | |||||
infos['class_number'] = self._class_number | infos['class_number'] = self._class_number | ||||
if 'node_attr_dim' in keys: | if 'node_attr_dim' in keys: | ||||
@@ -437,6 +444,11 @@ class Dataset(object): | |||||
base = None | base = None | ||||
infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) | infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) | ||||
if 'task_type' in keys: | |||||
if self._task_type is None and self._ds_name is not None: | |||||
self._get_task_type(self._ds_name) | |||||
infos['task_type'] = self._task_type | |||||
return infos | return infos | ||||
@@ -790,6 +802,13 @@ class Dataset(object): | |||||
return degree_entropy | return degree_entropy | ||||
def _get_task_type(self, ds_name): | |||||
if 'task_type' in DATASET_META[ds_name]: | |||||
self._task_type = DATASET_META[ds_name]['task_type'] | |||||
if self._task_type == 'classification' and self._class_number is None and 'class_number' in DATASET_META[ds_name]: | |||||
self._class_number = DATASET_META[ds_name]['class_number'] | |||||
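For readers unfamiliar with `DATASET_META`, a rough sketch of the entry shape that `load_predefined_dataset` and `_get_task_type` assume is given below. Only the keys referenced in this diff (`extra_params`, `task_type`, `class_number`) are shown; the concrete values are purely illustrative, not taken from the library.

```python
# Illustrative only: the per-dataset metadata shape assumed by the lookups above.
DATASET_META_EXAMPLE = {
    'MUTAG': {
        'task_type': 'classification',  # read by _get_task_type()
        'class_number': 2,              # cached into self._class_number
        'extra_params': {},             # forwarded to DataLoader as **kwargs
    },
    'Acyclic': {
        'task_type': 'regression',      # regression datasets carry no class_number
    },
}
```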
@property | @property | ||||
def graphs(self): | def graphs(self): | ||||
return self._graphs | return self._graphs | ||||
@@ -13,7 +13,7 @@ import pickle | |||||
import logging | import logging | ||||
from gklearn.ged.util import compute_geds | from gklearn.ged.util import compute_geds | ||||
import time | import time | ||||
from utils import get_dataset, set_edit_cost_consts | |||||
from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation | |||||
import sys | import sys | ||||
from group_results import group_trials, check_group_existence, update_group_marker | from group_results import group_trials, check_group_existence, update_group_marker | ||||
@@ -37,7 +37,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||||
# the distance between non-symbolic node/edge labels is computed by euclidean distance. | # the distance between non-symbolic node/edge labels is computed by euclidean distance. | ||||
'attr_distance': 'euclidean', | 'attr_distance': 'euclidean', | ||||
'ratio_runs_from_initial_solutions': 0.25, | 'ratio_runs_from_initial_solutions': 0.25, | ||||
# parallel threads. Do not work if mpg_options['parallel'] = False. | |||||
# parallel threads. Set to 1 automatically if parallel=True in compute_geds(). | |||||
'threads': multiprocessing.cpu_count(), | 'threads': multiprocessing.cpu_count(), | ||||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | ||||
} | } | ||||
@@ -98,7 +98,7 @@ def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||||
ged_mats.append(ged_mat) | ged_mats.append(ged_mat) | ||||
runtimes.append(runtime) | runtimes.append(runtime) | ||||
# Group trials and Remove single files. | |||||
# Group trials and remove single files. | |||||
# @todo: if the program stops between the following lines, then there may be errors. | # @todo: if the program stops between the following lines, then there may be errors. | ||||
name_prefix = 'ged_matrix' + name_middle | name_prefix = 'ged_matrix' + name_middle | ||||
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) | group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) | ||||
@@ -111,21 +111,25 @@ def results_for_a_dataset(ds_name): | |||||
"""**1. Get dataset.**""" | """**1. Get dataset.**""" | ||||
dataset = get_dataset(ds_name) | dataset = get_dataset(ds_name) | ||||
for ratio in ratio_list: | |||||
for params in list(param_grid): | |||||
print() | print() | ||||
print('Ratio:', ratio) | |||||
for num_solutions in num_solutions_list: | |||||
print() | |||||
print('# of solutions:', num_solutions) | |||||
save_trials_as_group(dataset, ds_name, num_solutions, ratio) | |||||
print(params) | |||||
save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio']) | |||||
def get_param_lists(ds_name, test=False): | |||||
if test: | |||||
num_solutions_list = [1, 10, 20, 30, 40, 50] | |||||
def get_param_lists(ds_name, mode='test'): | |||||
if mode == 'test': | |||||
num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] | |||||
ratio_list = [10] | ratio_list = [10] | ||||
return num_solutions_list, ratio_list | return num_solutions_list, ratio_list | ||||
elif mode == 'simple': | |||||
from sklearn.model_selection import ParameterGrid | |||||
param_grid = ParameterGrid([ | |||||
{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]}, | |||||
{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]) | |||||
# print(list(param_grid)) | |||||
if ds_name == 'AIDS_symb': | if ds_name == 'AIDS_symb': | ||||
num_solutions_list = [1, 20, 40, 60, 80, 100] | num_solutions_list = [1, 20, 40, 60, 80, 100] | ||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | ||||
@@ -133,7 +137,7 @@ def get_param_lists(ds_name, test=False): | |||||
num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] | num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] | ||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] | ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] | ||||
return num_solutions_list, ratio_list | |||||
return param_grid | |||||
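As a side note on the new parameter handling: `sklearn.model_selection.ParameterGrid` expands a list of grids independently and concatenates the results, so the sweep over `num_solutions` at fixed `ratio` and the sweep over `ratio` at fixed `num_solutions` are not crossed with each other. A minimal sketch with shortened, illustrative values:

```python
from sklearn.model_selection import ParameterGrid

param_grid = ParameterGrid([
    {'num_solutions': [1, 2, 4], 'ratio': [10]},
    {'num_solutions': [10], 'ratio': [0.1, 0.5, 1]},
])
for params in param_grid:
    print(params)
# {'num_solutions': 1, 'ratio': 10}
# {'num_solutions': 2, 'ratio': 10}
# {'num_solutions': 4, 'ratio': 10}
# {'num_solutions': 10, 'ratio': 0.1}
# {'num_solutions': 10, 'ratio': 0.5}
# {'num_solutions': 10, 'ratio': 1}
```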
if __name__ == '__main__': | if __name__ == '__main__': | ||||
@@ -141,7 +145,7 @@ if __name__ == '__main__': | |||||
ds_name_list = sys.argv[1:] | ds_name_list = sys.argv[1:] | ||||
else: | else: | ||||
ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] | ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] | ||||
# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled'] | |||||
# ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] | |||||
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ||||
save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' | save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' | ||||
@@ -151,5 +155,5 @@ if __name__ == '__main__': | |||||
for ds_name in ds_name_list: | for ds_name in ds_name_list: | ||||
print() | print() | ||||
print('Dataset:', ds_name) | print('Dataset:', ds_name) | ||||
num_solutions_list, ratio_list = get_param_lists(ds_name, test=False) | |||||
param_grid = get_param_lists(ds_name, mode='simple') | |||||
results_for_a_dataset(ds_name) | results_for_a_dataset(ds_name) |
@@ -16,12 +16,12 @@ from gklearn.experiments import DATASET_ROOT | |||||
def get_dataset(ds_name): | def get_dataset(ds_name): | ||||
# The node/edge labels that will not be used in the computation. | # The node/edge labels that will not be used in the computation. | ||||
# if ds_name == 'MAO': | |||||
# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||||
# if ds_name == 'Monoterpenoides': | |||||
# irrelevant_labels = {'edge_labels': ['valence']} | |||||
# elif ds_name == 'MUTAG': | |||||
# irrelevant_labels = {'edge_labels': ['label_0']} | |||||
# if ds_name == 'MAO': | |||||
# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||||
# if ds_name == 'Monoterpenoides': | |||||
# irrelevant_labels = {'edge_labels': ['valence']} | |||||
# elif ds_name == 'MUTAG': | |||||
# irrelevant_labels = {'edge_labels': ['label_0']} | |||||
if ds_name == 'AIDS_symb': | if ds_name == 'AIDS_symb': | ||||
irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']} | irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']} | ||||
ds_name = 'AIDS' | ds_name = 'AIDS' | ||||
@@ -49,34 +49,36 @@ def set_edit_cost_consts(ratio, node_labeled=True, edge_labeled=True, mode='unif | |||||
def nested_keys_exists(element, *keys): | def nested_keys_exists(element, *keys): | ||||
''' | |||||
Check if *keys (nested) exists in `element` (dict). | |||||
''' | |||||
if not isinstance(element, dict): | |||||
raise AttributeError('keys_exists() expects dict as first argument.') | |||||
if len(keys) == 0: | |||||
raise AttributeError('keys_exists() expects at least two arguments, one given.') | |||||
_element = element | |||||
for key in keys: | |||||
try: | |||||
_element = _element[key] | |||||
except KeyError: | |||||
return False | |||||
return True | |||||
''' | |||||
Check if *keys (nested) exists in `element` (dict). | |||||
''' | |||||
if not isinstance(element, dict): | |||||
raise AttributeError('keys_exists() expects dict as first argument.') | |||||
if len(keys) == 0: | |||||
raise AttributeError('keys_exists() expects at least two arguments, one given.') | |||||
_element = element | |||||
for key in keys: | |||||
try: | |||||
_element = _element[key] | |||||
except KeyError: | |||||
return False | |||||
return True | |||||
# Check average relative error along elements in two ged matrices. | # Check average relative error along elements in two ged matrices. | ||||
def matrices_ave_relative_error(m1, m2): | def matrices_ave_relative_error(m1, m2): | ||||
error = 0 | |||||
base = 0 | |||||
for i in range(m1.shape[0]): | |||||
for j in range(m1.shape[1]): | |||||
error += np.abs(m1[i, j] - m2[i, j]) | |||||
base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2 | |||||
error = 0 | |||||
base = 0 | |||||
for i in range(m1.shape[0]): | |||||
for j in range(m1.shape[1]): | |||||
error += np.abs(m1[i, j] - m2[i, j]) | |||||
# base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) | |||||
base += (m1[i, j] + m2[i, j]) # Requires only ~25% of the time of "base += (np.abs(m1[i, j]) + np.abs(m2[i, j]))"; safe here because GED entries are non-negative. | |||||
return error / base | |||||
base = base / 2 | |||||
return error / base | |||||
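A tiny worked example of what `matrices_ave_relative_error` computes (the sum of absolute element-wise differences divided by the sum of element-wise means), assuming the function from this module is in scope:

```python
import numpy as np

m1 = np.array([[0., 2.], [2., 0.]])
m2 = np.array([[0., 3.], [1., 0.]])
# sum of |m1 - m2| = 2; sum of (m1 + m2) / 2 = 4  ->  relative error = 0.5
print(matrices_ave_relative_error(m1, m2))  # 0.5
```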
def compute_relative_error(ged_mats): | def compute_relative_error(ged_mats): | ||||
@@ -92,9 +94,9 @@ def compute_relative_error(ged_mats): | |||||
errors = [] | errors = [] | ||||
for i, mat in enumerate(ged_mats): | for i, mat in enumerate(ged_mats): | ||||
err = matrices_ave_relative_error(mat, ged_mat_s) | err = matrices_ave_relative_error(mat, ged_mat_s) | ||||
# if not per_correct: | |||||
# print('matrix # ', str(i)) | |||||
# pass | |||||
# if not per_correct: | |||||
# print('matrix # ', str(i)) | |||||
# pass | |||||
errors.append(err) | errors.append(err) | ||||
else: | else: | ||||
errors = [0] | errors = [0] | ||||
@@ -107,11 +109,11 @@ def parse_group_file_name(fn): | |||||
key1 = splits_all[1] | key1 = splits_all[1] | ||||
pos2 = splits_all[2].rfind('_') | pos2 = splits_all[2].rfind('_') | ||||
# key2 = splits_all[2][:pos2] | |||||
# key2 = splits_all[2][:pos2] | |||||
val2 = splits_all[2][pos2+1:] | val2 = splits_all[2][pos2+1:] | ||||
pos3 = splits_all[3].rfind('_') | pos3 = splits_all[3].rfind('_') | ||||
# key3 = splits_all[3][:pos3] | |||||
# key3 = splits_all[3][:pos3] | |||||
val3 = splits_all[3][pos3+1:] + '.' + splits_all[4] | val3 = splits_all[3][pos3+1:] + '.' + splits_all[4] | ||||
return key1, val2, val3 | return key1, val2, val3 | ||||
@@ -232,7 +234,7 @@ def set_axis_style(ax): | |||||
ax.tick_params(labelsize=8, color='w', pad=1, grid_color='w') | ax.tick_params(labelsize=8, color='w', pad=1, grid_color='w') | ||||
ax.tick_params(axis='x', pad=-2) | ax.tick_params(axis='x', pad=-2) | ||||
ax.tick_params(axis='y', labelrotation=-40, pad=-2) | ax.tick_params(axis='y', labelrotation=-40, pad=-2) | ||||
# ax.zaxis._axinfo['juggled'] = (1, 2, 0) | |||||
# ax.zaxis._axinfo['juggled'] = (1, 2, 0) | |||||
ax.set_xlabel(ax.get_xlabel(), fontsize=10, labelpad=-3) | ax.set_xlabel(ax.get_xlabel(), fontsize=10, labelpad=-3) | ||||
ax.set_ylabel(ax.get_ylabel(), fontsize=10, labelpad=-2, rotation=50) | ax.set_ylabel(ax.get_ylabel(), fontsize=10, labelpad=-2, rotation=50) | ||||
ax.set_zlabel(ax.get_zlabel(), fontsize=10, labelpad=-2) | ax.set_zlabel(ax.get_zlabel(), fontsize=10, labelpad=-2) | ||||
@@ -240,16 +242,99 @@ def set_axis_style(ax): | |||||
return | return | ||||
def dichotomous_permutation(arr, layer=0): | |||||
import math | |||||
# def seperate_arr(arr, new_arr): | |||||
# if (length % 2) == 0: | |||||
# half = int(length / 2) | |||||
# new_arr += [arr[half - 1], arr[half]] | |||||
# subarr1 = [arr[i] for i in range(1, half - 1)] | |||||
# else: | |||||
# half = math.floor(length / 2) | |||||
# new_arr.append(arr[half]) | |||||
# subarr1 = [arr[i] for i in range(1, half)] | |||||
# subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||||
# subarrs = [subarr1, subarr2] | |||||
# return subarrs | |||||
if layer == 0: | |||||
length = len(arr) | |||||
if length <= 2: | |||||
return arr | |||||
new_arr = [arr[0], arr[-1]] | |||||
if (length % 2) == 0: | |||||
half = int(length / 2) | |||||
new_arr += [arr[half - 1], arr[half]] | |||||
subarr1 = [arr[i] for i in range(1, half - 1)] | |||||
else: | |||||
half = math.floor(length / 2) | |||||
new_arr.append(arr[half]) | |||||
subarr1 = [arr[i] for i in range(1, half)] | |||||
subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||||
subarrs = [subarr1, subarr2] | |||||
# subarrs = seperate_arr(arr, new_arr) | |||||
new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||||
else: | |||||
new_arr = [] | |||||
subarrs = [] | |||||
for a in arr: | |||||
length = len(a) | |||||
if length <= 2: | |||||
new_arr += a | |||||
else: | |||||
# subarrs += seperate_arr(a, new_arr) | |||||
if (length % 2) == 0: | |||||
half = int(length / 2) | |||||
new_arr += [a[half - 1], a[half]] | |||||
subarr1 = [a[i] for i in range(0, half - 1)] | |||||
else: | |||||
half = math.floor(length / 2) | |||||
new_arr.append(a[half]) | |||||
subarr1 = [a[i] for i in range(0, half)] | |||||
subarr2 = [a[i] for i in range(half + 1, length)] | |||||
subarrs += [subarr1, subarr2] | |||||
if len(subarrs) > 0: | |||||
new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||||
return new_arr | |||||
# length = len(arr) | |||||
# if length <= 2: | |||||
# return arr | |||||
# new_arr = [arr[0], arr[-1]] | |||||
# if (length % 2) == 0: | |||||
# half = int(length / 2) | |||||
# new_arr += [arr[half - 1], arr[half]] | |||||
# subarr1 = [arr[i] for i in range(1, half - 1)] | |||||
# else: | |||||
# half = math.floor(length / 2) | |||||
# new_arr.append(arr[half]) | |||||
# subarr1 = [arr[i] for i in range(1, half)] | |||||
# subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||||
# if len(subarr1) > 0: | |||||
# new_arr += dichotomous_permutation(subarr1) | |||||
# if len(subarr2) > 0: | |||||
# new_arr += dichotomous_permutation(subarr2) | |||||
# return new_arr | |||||
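To make the intended ordering concrete, this is what `dichotomous_permutation` produces for two small sorted lists (traced by hand from the code above): the endpoints come first, then the middle, then the midpoints of the remaining halves, so a parameter sweep covers the whole range coarsely before refining it.

```python
print(dichotomous_permutation([1, 2, 3, 4, 5]))
# [1, 5, 3, 2, 4]
print(dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
# [1, 10, 5, 6, 3, 8, 2, 4, 7, 9]
```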
if __name__ == '__main__': | if __name__ == '__main__': | ||||
root_dir = 'outputs/CRIANN/' | root_dir = 'outputs/CRIANN/' | ||||
# for dir_ in sorted(os.listdir(root_dir)): | |||||
# if os.path.isdir(root_dir): | |||||
# full_dir = os.path.join(root_dir, dir_) | |||||
# print('---', full_dir,':') | |||||
# save_dir = os.path.join(full_dir, 'groups/') | |||||
# if os.path.exists(save_dir): | |||||
# try: | |||||
# get_relative_errors(save_dir) | |||||
# except Exception as exp: | |||||
# print('An exception occured when running this experiment:') | |||||
# print(repr(exp)) | |||||
# for dir_ in sorted(os.listdir(root_dir)): | |||||
# if os.path.isdir(root_dir): | |||||
# full_dir = os.path.join(root_dir, dir_) | |||||
# print('---', full_dir,':') | |||||
# save_dir = os.path.join(full_dir, 'groups/') | |||||
# if os.path.exists(save_dir): | |||||
# try: | |||||
# get_relative_errors(save_dir) | |||||
# except Exception as exp: | |||||
# print('An exception occured when running this experiment:') | |||||
# print(repr(exp)) |
@@ -0,0 +1,29 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Tue Jan 26 09:53:33 2021 | |||||
@author: ljia | |||||
""" | |||||
if __name__ == '__main__': | |||||
tasks = [ | |||||
{'path': 'thesis/graph_kernels/fcsp', | |||||
'file': 'run_jobs_compare_fcsp.py' | |||||
}, | |||||
{'path': 'thesis/graph_kernels/fcsp', | |||||
'file': 'run_jobs_compare_fcsp_space.py' | |||||
}, | |||||
{'path': 'ged/stability', | |||||
'file': 'run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py' | |||||
}, | |||||
] | |||||
import os | |||||
for t in tasks: | |||||
print(t['file']) | |||||
command = '' | |||||
command += 'cd ' + t['path'] + '\n' | |||||
command += 'python3 ' + t['file'] + '\n' | |||||
# command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n' | |||||
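# Note: os.system() runs the whole multi-line string in a single subshell, so the cd above only affects this call, not the Python process. | |||||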
os.system(command) |
@@ -19,7 +19,15 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||||
('StructuralSP', 'NCI1', 'False'), | ('StructuralSP', 'NCI1', 'False'), | ||||
('ShortestPath', 'NCI109', 'False'), | ('ShortestPath', 'NCI109', 'False'), | ||||
('StructuralSP', 'NCI109', 'True'), | ('StructuralSP', 'NCI109', 'True'), | ||||
('ShortestPath', 'NCI-H23', 'True'), | |||||
('ShortestPath', 'NCI-H23', 'False'), | |||||
('StructuralSP', 'NCI-H23', 'True'), | |||||
('StructuralSP', 'NCI-H23', 'False'), | |||||
('StructuralSP', 'NCI109', 'False'), | ('StructuralSP', 'NCI109', 'False'), | ||||
('ShortestPath', 'NCI-H23H', 'True'), | |||||
('ShortestPath', 'NCI-H23H', 'False'), | |||||
('StructuralSP', 'NCI-H23H', 'True'), | |||||
('StructuralSP', 'NCI-H23H', 'False'), | |||||
('ShortestPath', 'DD', 'True'), | ('ShortestPath', 'DD', 'True'), | ||||
('ShortestPath', 'DD', 'False'), | ('ShortestPath', 'DD', 'False'), | ||||
('StructuralSP', 'BZR', 'False'), | ('StructuralSP', 'BZR', 'False'), | ||||
@@ -27,9 +35,37 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||||
('StructuralSP', 'COX2', 'False'), | ('StructuralSP', 'COX2', 'False'), | ||||
('ShortestPath', 'DHFR', 'False'), | ('ShortestPath', 'DHFR', 'False'), | ||||
('StructuralSP', 'DHFR', 'False'), | ('StructuralSP', 'DHFR', 'False'), | ||||
('ShortestPath', 'MCF-7', 'True'), | |||||
('ShortestPath', 'MCF-7', 'False'), | |||||
('StructuralSP', 'MCF-7', 'True'), | |||||
('StructuralSP', 'MCF-7', 'False'), | |||||
('ShortestPath', 'MCF-7H', 'True'), | |||||
('ShortestPath', 'MCF-7H', 'False'), | |||||
('StructuralSP', 'MCF-7H', 'True'), | |||||
('StructuralSP', 'MCF-7H', 'False'), | |||||
('ShortestPath', 'MOLT-4', 'True'), | |||||
('ShortestPath', 'MOLT-4', 'False'), | |||||
('StructuralSP', 'MOLT-4', 'True'), | |||||
('StructuralSP', 'MOLT-4', 'False'), | |||||
('ShortestPath', 'MOLT-4H', 'True'), | |||||
('ShortestPath', 'MOLT-4H', 'False'), | |||||
('StructuralSP', 'MOLT-4H', 'True'), | |||||
('StructuralSP', 'MOLT-4H', 'False'), | |||||
('StructuralSP', 'OHSU', 'True'), | ('StructuralSP', 'OHSU', 'True'), | ||||
('StructuralSP', 'OHSU', 'False'), | ('StructuralSP', 'OHSU', 'False'), | ||||
('StructuralSP', 'SYNTHETIC', 'False'), | |||||
('ShortestPath', 'OVCAR-8', 'True'), | |||||
('ShortestPath', 'OVCAR-8', 'False'), | |||||
('StructuralSP', 'OVCAR-8', 'True'), | |||||
('StructuralSP', 'OVCAR-8', 'False'), | |||||
('ShortestPath', 'OVCAR-8H', 'True'), | |||||
('ShortestPath', 'OVCAR-8H', 'False'), | |||||
('StructuralSP', 'OVCAR-8H', 'True'), | |||||
('StructuralSP', 'OVCAR-8H', 'False'), | |||||
('ShortestPath', 'P388', 'False'), | |||||
('ShortestPath', 'P388', 'True'), | |||||
('StructuralSP', 'P388', 'True'), | |||||
('StructuralSP', 'Steroid', 'False'), | |||||
('ShortestPath', 'SYNTHETIC', 'False'), | |||||
('StructuralSP', 'SYNTHETIC', 'True'), | ('StructuralSP', 'SYNTHETIC', 'True'), | ||||
('StructuralSP', 'SYNTHETIC', 'False'), | ('StructuralSP', 'SYNTHETIC', 'False'), | ||||
('ShortestPath', 'SYNTHETICnew', 'False'), | ('ShortestPath', 'SYNTHETICnew', 'False'), | ||||
@@ -47,6 +83,9 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||||
('StructuralSP', 'Mutagenicity', 'False'), | ('StructuralSP', 'Mutagenicity', 'False'), | ||||
('StructuralSP', 'REDDIT-BINARY', 'True'), | ('StructuralSP', 'REDDIT-BINARY', 'True'), | ||||
('StructuralSP', 'REDDIT-BINARY', 'False'), | ('StructuralSP', 'REDDIT-BINARY', 'False'), | ||||
('StructuralSP', 'Vitamin_D', 'False'), | |||||
('ShortestPath', 'Web', 'True'), | |||||
('ShortestPath', 'Web', 'False'), | |||||
}) | }) | ||||
OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'), | OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'), | ||||
@@ -17,6 +17,7 @@ OUT_TIME_LIST = [] | |||||
OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | ||||
('ShortestPath', 'REDDIT-BINARY', 'False'), | ('ShortestPath', 'REDDIT-BINARY', 'False'), | ||||
('StructuralSP', 'ENZYMES', 'False'), | ('StructuralSP', 'ENZYMES', 'False'), | ||||
('StructuralSP', 'AIDS', 'False'), | |||||
('ShortestPath', 'DD', 'True'), | ('ShortestPath', 'DD', 'True'), | ||||
('ShortestPath', 'DD', 'False'), | ('ShortestPath', 'DD', 'False'), | ||||
('StructuralSP', 'DD', 'True'), | ('StructuralSP', 'DD', 'True'), | ||||
@@ -55,6 +56,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | |||||
('ShortestPath', 'P388H', 'False'), | ('ShortestPath', 'P388H', 'False'), | ||||
('StructuralSP', 'P388H', 'True'), | ('StructuralSP', 'P388H', 'True'), | ||||
('StructuralSP', 'P388H', 'False'), | ('StructuralSP', 'P388H', 'False'), | ||||
('StructuralSP', 'NCI1', 'False'), | |||||
('ShortestPath', 'NCI-H23', 'True'), | ('ShortestPath', 'NCI-H23', 'True'), | ||||
('ShortestPath', 'NCI-H23', 'False'), | ('ShortestPath', 'NCI-H23', 'False'), | ||||
('StructuralSP', 'NCI-H23', 'True'), | ('StructuralSP', 'NCI-H23', 'True'), | ||||
@@ -63,6 +65,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | |||||
('ShortestPath', 'NCI-H23H', 'False'), | ('ShortestPath', 'NCI-H23H', 'False'), | ||||
('StructuralSP', 'NCI-H23H', 'True'), | ('StructuralSP', 'NCI-H23H', 'True'), | ||||
('StructuralSP', 'NCI-H23H', 'False'), | ('StructuralSP', 'NCI-H23H', 'False'), | ||||
('StructuralSP', 'OHSU', 'False'), | |||||
('ShortestPath', 'OVCAR-8', 'True'), | ('ShortestPath', 'OVCAR-8', 'True'), | ||||
('ShortestPath', 'OVCAR-8', 'False'), | ('ShortestPath', 'OVCAR-8', 'False'), | ||||
('StructuralSP', 'OVCAR-8', 'True'), | ('StructuralSP', 'OVCAR-8', 'True'), | ||||
@@ -208,11 +211,12 @@ def check_task_status(save_dir, *params): | |||||
# Check if the task is already computed. | # Check if the task is already computed. | ||||
file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl') | file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl') | ||||
if os.path.isfile(file_name): | |||||
with open(file_name, 'rb') as f: | |||||
data = pickle.load(f) | |||||
if data['completed']: | |||||
return True | |||||
if os.path.isfile(file_name) and os.path.getsize(file_name) > 0:  # Check existence before size; os.path.getsize raises on missing files. | |||||
with open(file_name, 'rb') as f: | |||||
data = pickle.load(f) | |||||
if data['completed']: | |||||
return True | |||||
return False | return False | ||||
@@ -7,7 +7,6 @@ __version__ = "0.1" | |||||
__author__ = "Linlin Jia" | __author__ = "Linlin Jia" | ||||
__date__ = "November 2018" | __date__ = "November 2018" | ||||
from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels | |||||
from gklearn.kernels.graph_kernel import GraphKernel | from gklearn.kernels.graph_kernel import GraphKernel | ||||
from gklearn.kernels.common_walk import CommonWalk | from gklearn.kernels.common_walk import CommonWalk | ||||
@@ -24,6 +23,8 @@ from gklearn.kernels.path_up_to_h import PathUpToH | |||||
from gklearn.kernels.treelet import Treelet | from gklearn.kernels.treelet import Treelet | ||||
from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree | from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree | ||||
from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels | |||||
# old version. | # old version. | ||||
from gklearn.kernels.commonWalkKernel import commonwalkkernel | from gklearn.kernels.commonWalkKernel import commonwalkkernel | ||||
from gklearn.kernels.marginalizedKernel import marginalizedkernel | from gklearn.kernels.marginalizedKernel import marginalizedkernel | ||||
@@ -32,4 +33,4 @@ from gklearn.kernels.spKernel import spkernel | |||||
from gklearn.kernels.structuralspKernel import structuralspkernel | from gklearn.kernels.structuralspKernel import structuralspkernel | ||||
from gklearn.kernels.untilHPathKernel import untilhpathkernel | from gklearn.kernels.untilHPathKernel import untilhpathkernel | ||||
from gklearn.kernels.treeletKernel import treeletkernel | from gklearn.kernels.treeletKernel import treeletkernel | ||||
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel | |||||
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel |
@@ -47,7 +47,7 @@ class CommonWalk(GraphKernel): | |||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | ||||
length=len_itr, verbose=(self._verbose >= 2)) | |||||
length=len_itr, verbose=(self.verbose >= 2)) | |||||
# direct product graph method - exponential | # direct product graph method - exponential | ||||
if self._compute_method == 'exp': | if self._compute_method == 'exp': | ||||
@@ -86,7 +86,7 @@ class CommonWalk(GraphKernel): | |||||
do_fun = self._wrapper_kernel_do_geo | do_fun = self._wrapper_kernel_do_geo | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm, | ||||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
return gram_matrix | return gram_matrix | ||||
@@ -100,9 +100,9 @@ class CommonWalk(GraphKernel): | |||||
# compute kernel list. | # compute kernel list. | ||||
kernel_list = [None] * len(g_list) | kernel_list = [None] * len(g_list) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', | iterator = get_iters(range(len(g_list)), desc='Computing kernels', | ||||
file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
else: | else: | ||||
iterator = range(len(g_list)) | iterator = range(len(g_list)) | ||||
@@ -148,7 +148,7 @@ class CommonWalk(GraphKernel): | |||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered', | init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered', | ||||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
return kernel_list | return kernel_list | ||||
@@ -35,7 +35,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
def _compute_gm_series(self): | def _compute_gm_series(self): | ||||
self._check_edge_weight(self._graphs, self._verbose) | |||||
self._check_edge_weight(self._graphs, self.verbose) | |||||
self._check_graphs(self._graphs) | self._check_graphs(self._graphs) | ||||
lmda = self._weight | lmda = self._weight | ||||
@@ -44,7 +44,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | ||||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | # Reindex nodes using consecutive integers for the convenience of kernel computation. | ||||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | ||||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | if self._p is None and self._q is None: # p and q are uniform distributions as default. | ||||
@@ -52,7 +52,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
from itertools import combinations_with_replacement | from itertools import combinations_with_replacement | ||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||||
for i, j in iterator: | for i, j in iterator: | ||||
kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) | kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) | ||||
@@ -66,7 +66,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
def _compute_gm_imap_unordered(self): | def _compute_gm_imap_unordered(self): | ||||
self._check_edge_weight(self._graphs, self._verbose) | |||||
self._check_edge_weight(self._graphs, self.verbose) | |||||
self._check_graphs(self._graphs) | self._check_graphs(self._graphs) | ||||
# Compute Gram matrix. | # Compute Gram matrix. | ||||
@@ -74,7 +74,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
# @todo: parallel this. | # @todo: parallel this. | ||||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | # Reindex nodes using consecutive integers for the convenience of kernel computation. | ||||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | ||||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | if self._p is None and self._q is None: # p and q are uniform distributions as default. | ||||
@@ -86,7 +86,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
do_fun = self._wrapper_kernel_do | do_fun = self._wrapper_kernel_do | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
else: # @todo | else: # @todo | ||||
pass | pass | ||||
@@ -95,7 +95,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
def _compute_kernel_list_series(self, g1, g_list): | def _compute_kernel_list_series(self, g1, g_list): | ||||
self._check_edge_weight(g_list + [g1], self._verbose) | |||||
self._check_edge_weight(g_list + [g1], self.verbose) | |||||
self._check_graphs(g_list + [g1]) | self._check_graphs(g_list + [g1]) | ||||
lmda = self._weight | lmda = self._weight | ||||
@@ -105,11 +105,11 @@ class ConjugateGradient(RandomWalkMeta): | |||||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | # Reindex nodes using consecutive integers for the convenience of kernel computation. | ||||
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | ||||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | ||||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | if self._p is None and self._q is None: # p and q are uniform distributions as default. | ||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i in iterator: | for i in iterator: | ||||
kernel = self._kernel_do(g1, g_list[i], lmda) | kernel = self._kernel_do(g1, g_list[i], lmda) | ||||
@@ -122,7 +122,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | def _compute_kernel_list_imap_unordered(self, g1, g_list): | ||||
self._check_edge_weight(g_list + [g1], self._verbose) | |||||
self._check_edge_weight(g_list + [g1], self.verbose) | |||||
self._check_graphs(g_list + [g1]) | self._check_graphs(g_list + [g1]) | ||||
# compute kernel list. | # compute kernel list. | ||||
@@ -131,7 +131,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | # Reindex nodes using consecutive integers for the convenience of kernel computation. | ||||
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | ||||
# @todo: parallel this. | # @todo: parallel this. | ||||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | ||||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | if self._p is None and self._q is None: # p and q are uniform distributions as default. | ||||
@@ -149,7 +149,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | ||||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
else: # @todo | else: # @todo | ||||
pass | pass | ||||
@@ -162,7 +162,7 @@ class ConjugateGradient(RandomWalkMeta): | |||||
def _compute_single_kernel_series(self, g1, g2): | def _compute_single_kernel_series(self, g1, g2): | ||||
self._check_edge_weight([g1] + [g2], self._verbose) | |||||
self._check_edge_weight([g1] + [g2], self.verbose) | |||||
self._check_graphs([g1] + [g2]) | self._check_graphs([g1] + [g2]) | ||||
lmda = self._weight | lmda = self._weight | ||||
@@ -35,7 +35,7 @@ class FixedPoint(RandomWalkMeta): | |||||
def _compute_gm_series(self): | def _compute_gm_series(self): | ||||
self._check_edge_weight(self._graphs, self._verbose) | |||||
self._check_edge_weight(self._graphs, self.verbose) | |||||
self._check_graphs(self._graphs) | self._check_graphs(self._graphs) | ||||
lmda = self._weight | lmda = self._weight | ||||
@@ -44,7 +44,7 @@ class FixedPoint(RandomWalkMeta): | |||||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | ||||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | # Reindex nodes using consecutive integers for the convenience of kernel computation. | ||||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self.verbose >= 2)) | |||||
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | ||||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | if self._p is None and self._q is None: # p and q are uniform distributions as default. | ||||
@@ -52,7 +52,7 @@ class FixedPoint(RandomWalkMeta): | |||||
from itertools import combinations_with_replacement | from itertools import combinations_with_replacement | ||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||||
for i, j in iterator: | for i, j in iterator: | ||||
kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) | kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) | ||||
@@ -66,7 +66,7 @@ class FixedPoint(RandomWalkMeta): | |||||
def _compute_gm_imap_unordered(self): | def _compute_gm_imap_unordered(self): | ||||
self._check_edge_weight(self._graphs, self._verbose) | |||||
self._check_edge_weight(self._graphs, self.verbose) | |||||
self._check_graphs(self._graphs) | self._check_graphs(self._graphs) | ||||
# Compute Gram matrix. | # Compute Gram matrix. | ||||
@@ -74,7 +74,7 @@ class FixedPoint(RandomWalkMeta): | |||||
# @todo: parallel this. | # @todo: parallel this. | ||||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | # Reindex nodes using consecutive integers for the convenience of kernel computation. | ||||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | ||||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | if self._p is None and self._q is None: # p and q are uniform distributions as default. | ||||
@@ -86,7 +86,7 @@ class FixedPoint(RandomWalkMeta): | |||||
do_fun = self._wrapper_kernel_do | do_fun = self._wrapper_kernel_do | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
else: # @todo | else: # @todo | ||||
pass | pass | ||||
@@ -95,7 +95,7 @@ class FixedPoint(RandomWalkMeta): | |||||
def _compute_kernel_list_series(self, g1, g_list): | def _compute_kernel_list_series(self, g1, g_list): | ||||
self._check_edge_weight(g_list + [g1], self._verbose) | |||||
self._check_edge_weight(g_list + [g1], self.verbose) | |||||
self._check_graphs(g_list + [g1]) | self._check_graphs(g_list + [g1]) | ||||
lmda = self._weight | lmda = self._weight | ||||
@@ -105,12 +105,12 @@ class FixedPoint(RandomWalkMeta): | |||||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | # Reindex nodes using consecutive integers for the convenience of kernel computation. | ||||
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | ||||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | ||||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | if self._p is None and self._q is None: # p and q are uniform distributions as default. | ||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i in iterator: | for i in iterator: | ||||
kernel = self._kernel_do(g1, g_list[i], lmda) | kernel = self._kernel_do(g1, g_list[i], lmda) | ||||
@@ -123,7 +123,7 @@ class FixedPoint(RandomWalkMeta): | |||||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | def _compute_kernel_list_imap_unordered(self, g1, g_list): | ||||
self._check_edge_weight(g_list + [g1], self._verbose) | |||||
self._check_edge_weight(g_list + [g1], self.verbose) | |||||
self._check_graphs(g_list + [g1]) | self._check_graphs(g_list + [g1]) | ||||
# compute kernel list. | # compute kernel list. | ||||
@@ -132,7 +132,7 @@ class FixedPoint(RandomWalkMeta): | |||||
# Reindex nodes using consecutive integers for the convenience of kernel computation. | # Reindex nodes using consecutive integers for the convenience of kernel computation. | ||||
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') | ||||
# @todo: parallel this. | # @todo: parallel this. | ||||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] | ||||
if self._p is None and self._q is None: # p and q are uniform distributions as default. | if self._p is None and self._q is None: # p and q are uniform distributions as default. | ||||
@@ -150,7 +150,7 @@ class FixedPoint(RandomWalkMeta): | |||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | ||||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
else: # @todo | else: # @todo | ||||
pass | pass | ||||
@@ -163,7 +163,7 @@ class FixedPoint(RandomWalkMeta): | |||||
def _compute_single_kernel_series(self, g1, g2): | def _compute_single_kernel_series(self, g1, g2): | ||||
self._check_edge_weight([g1] + [g2], self._verbose) | |||||
self._check_edge_weight([g1] + [g2], self.verbose) | |||||
self._check_graphs([g1] + [g2]) | self._check_graphs([g1] + [g2]) | ||||
lmda = self._weight | lmda = self._weight | ||||
@@ -9,55 +9,433 @@ import numpy as np | |||||
import networkx as nx | import networkx as nx | ||||
import multiprocessing | import multiprocessing | ||||
import time | import time | ||||
# from abc import ABC, abstractmethod | |||||
from sklearn.base import BaseEstimator # , TransformerMixin | |||||
from sklearn.utils.validation import check_is_fitted # check_X_y, check_array, | |||||
from sklearn.exceptions import NotFittedError | |||||
from gklearn.utils import normalize_gram_matrix | from gklearn.utils import normalize_gram_matrix | ||||
class GraphKernel(object): | |||||
class GraphKernel(BaseEstimator): #, ABC): | |||||
"""The basic graph kernel class. | |||||
def __init__(self): | |||||
self._graphs = None | |||||
self._parallel = '' | |||||
self._n_jobs = 0 | |||||
self._verbose = None | |||||
self._normalize = True | |||||
self._run_time = 0 | |||||
self._gram_matrix = None | |||||
self._gram_matrix_unnorm = None | |||||
Attributes | |||||
---------- | |||||
_graphs : list | |||||
Stores the input graphs passed to `fit`. | |||||
Default format of the list objects is `NetworkX` graphs. | |||||
**We don't guarantee that the input graphs remain unchanged during the | |||||
computation.** | |||||
References | |||||
---------- | |||||
https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. | |||||
""" | |||||
def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2): | |||||
"""`__init__` for `GraphKernel` object.""" | |||||
# @todo: the default settings of the parameters are different from those in the self.compute method. | |||||
# self._graphs = None | |||||
self.parallel = parallel | |||||
self.n_jobs = n_jobs | |||||
self.chunksize = chunksize | |||||
self.normalize = normalize | |||||
self.verbose = verbose | |||||
# self._run_time = 0 | |||||
# self._gram_matrix = None | |||||
# self._gram_matrix_unnorm = None | |||||
def compute(self, *graphs, **kwargs): | |||||
self._parallel = kwargs.get('parallel', 'imap_unordered') | |||||
self._n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) | |||||
self._normalize = kwargs.get('normalize', True) | |||||
self._verbose = kwargs.get('verbose', 2) | |||||
########################################################################## | |||||
# The following is the 1st paradigm to compute kernel matrix, which is | |||||
# compatible with `scikit-learn`. | |||||
# ------------------------------------------------------------------- | |||||
# Special thanks to the "GraKeL" library for providing an excellent template! | |||||
########################################################################## | |||||
def fit(self, X, y=None): | |||||
"""Fit a graph dataset for a transformer. | |||||
Parameters | |||||
---------- | |||||
X : iterable | |||||
An iterable of input graphs, e.g., a list of `NetworkX` graphs. | |||||
y : None, optional | |||||
There is no need for a target in a transformer, yet the `scikit-learn` | |||||
pipeline API requires this parameter. | |||||
Returns | |||||
------- | |||||
object | |||||
Returns self. | |||||
""" | |||||
# self._is_tranformed = False | |||||
# Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; | |||||
self.clear_attributes() | |||||
# Validate parameters for the transformer. | |||||
self.validate_parameters() | |||||
# Validate the input. | |||||
self._graphs = self.validate_input(X) | |||||
# self._X = X | |||||
# self._kernel = self._get_kernel_instance() | |||||
# Return the transformer. | |||||
return self | |||||
def transform(self, X): | |||||
"""Compute the graph kernel matrix between given and fitted data. | |||||
Parameters | |||||
---------- | |||||
X : list of graphs | |||||
The target graphs to be compared with the fitted graphs. | |||||
Raises | |||||
------ | |||||
ValueError | |||||
If the input is not a non-empty list of graphs. | |||||
Returns | |||||
------- | |||||
kernel_matrix : numpy array, shape = [len(X), len(self._graphs)] | |||||
The computed kernel matrix between X and the fitted graphs. | |||||
""" | |||||
# Check if method "fit" had been called. | |||||
check_is_fitted(self, '_graphs') | |||||
# Validate the input. | |||||
Y = self.validate_input(X) | |||||
# Transform: compute the graph kernel matrix. | |||||
kernel_matrix = self.compute_kernel_matrix(Y) | |||||
self._Y = Y | |||||
# The self-transform flag must be set before the diagonals call used for normalization. | |||||
self._is_transformed = True | |||||
if self.normalize: | |||||
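# Cosine normalization: divide k(y, x) by sqrt(k(y, y) * k(x, x)) using the transformed (Y) and fitted (X) diagonals. | |||||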
X_diag, Y_diag = self.diagonals() | |||||
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||||
try: | |||||
kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) | |||||
except: | |||||
raise | |||||
finally: | |||||
np.seterr(**old_settings) | |||||
return kernel_matrix | |||||
def fit_transform(self, X): | |||||
"""Fit and transform: compute Gram matrix on the same data. | |||||
Parameters | |||||
---------- | |||||
X : list of graphs | |||||
Input graphs. | |||||
Returns | |||||
------- | |||||
gram_matrix : numpy array, shape = [len(X), len(X)] | |||||
The Gram matrix of X. | |||||
""" | |||||
self.fit(X) | |||||
# Transform: compute Gram matrix. | |||||
gram_matrix = self.compute_kernel_matrix() | |||||
# Normalize. | |||||
if self.normalize: | |||||
self._X_diag = np.diagonal(gram_matrix).copy() | |||||
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||||
try: | |||||
gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) | |||||
except: | |||||
raise | |||||
finally: | |||||
np.seterr(**old_settings) | |||||
return gram_matrix | |||||
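A minimal usage sketch of this `scikit-learn`-style paradigm. The subclass below is written here purely for illustration and is not part of the library; it only implements `pairwise_kernel`, so with `parallel=None` the `fit`/`transform` calls go through `_compute_kernel_matrix_series` shown further below.

```python
import networkx as nx
from gklearn.kernels import GraphKernel

class ToySizeKernel(GraphKernel):
    # Toy kernel for illustration only: compares graphs by node and edge counts.
    def pairwise_kernel(self, x, y):
        return float(nx.number_of_nodes(x) * nx.number_of_nodes(y)
                     + nx.number_of_edges(x) * nx.number_of_edges(y))

graphs = [nx.path_graph(4), nx.cycle_graph(5), nx.star_graph(3)]
toy = ToySizeKernel(parallel=None, normalize=False, verbose=0)
toy.fit(graphs)                         # store the reference graphs
k = toy.transform([nx.path_graph(3)])   # kernel matrix of shape (1, 3)
print(k)
```

With `normalize=True`, `transform` additionally divides by the square roots of the self-kernels obtained from `diagonals()`, and `fit_transform` normalizes the Gram matrix in the same way.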
def get_params(self): | |||||
pass | |||||
def set_params(self): | |||||
pass | |||||
def clear_attributes(self): | |||||
if hasattr(self, '_X_diag'): | |||||
delattr(self, '_X_diag') | |||||
if hasattr(self, '_graphs'): | |||||
delattr(self, '_graphs') | |||||
if hasattr(self, '_Y'): | |||||
delattr(self, '_Y') | |||||
if hasattr(self, '_run_time'): | |||||
delattr(self, '_run_time') | |||||
def validate_parameters(self): | |||||
"""Validate all parameters for the transformer. | |||||
Returns | |||||
------- | |||||
None. | |||||
""" | |||||
if self.parallel is not None and self.parallel != 'imap_unordered': | |||||
raise ValueError('Parallel mode is not set correctly.') | |||||
if self.parallel == 'imap_unordered' and self.n_jobs is None: | |||||
self.n_jobs = multiprocessing.cpu_count() | |||||
def validate_input(self, X): | |||||
"""Validate the given input and raise errors if it is invalid. | |||||
Parameters | |||||
---------- | |||||
X : list | |||||
The input to check. Should be a list of graphs. | |||||
Raises | |||||
------ | |||||
ValueError | |||||
Raise if the input is not correct. | |||||
Returns | |||||
------- | |||||
X : list | |||||
The validated input: a list of graphs. | |||||
""" | |||||
if X is None: | |||||
raise ValueError('Please add graphs before computing.') | |||||
elif not isinstance(X, list): | |||||
raise ValueError('Cannot detect graphs.') | |||||
elif len(X) == 0: | |||||
raise ValueError('The graph list given is empty. No computation will be performed.') | |||||
return X | |||||
def compute_kernel_matrix(self, Y=None): | |||||
"""Compute the kernel matrix between a given target graphs (Y) and | |||||
the fitted graphs (X / self._graphs) or the Gram matrix for the fitted | |||||
graphs (X / self._graphs). | |||||
Parameters | |||||
---------- | |||||
Y : list of graphs, optional | |||||
The target graphs. The default is None; in that case the Gram matrix | |||||
of the fitted graphs (X) is computed. | |||||
Returns | |||||
------- | |||||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||||
The computed kernel matrix. | |||||
""" | |||||
if Y is None: | |||||
# Compute Gram matrix for self._graphs (X). | |||||
kernel_matrix = self._compute_gram_matrix() | |||||
# self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||||
else: | |||||
# Compute kernel matrix between Y and self._graphs (X). | |||||
start_time = time.time() | |||||
if self.parallel == 'imap_unordered': | |||||
kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y) | |||||
elif self.parallel is None: | |||||
kernel_matrix = self._compute_kernel_matrix_series(Y) | |||||
self._run_time = time.time() - start_time | |||||
if self.verbose: | |||||
print('Kernel matrix of size (%d, %d) built in %s seconds.' | |||||
% (len(Y), len(self._graphs), self._run_time)) | |||||
return kernel_matrix | |||||
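# Illustrative sketch (assumed names X_train / Y_new and a concrete subclass
# whose pairwise_kernel is implemented): computing the cross kernel matrix
# between new graphs and the fitted ones. This path is serial only, since the
# imap_unordered branch below is not implemented, and no normalization is
# applied here.
kernel.parallel = None  # set directly only for illustration
kernel.fit(X_train)
K_cross = kernel.compute_kernel_matrix(Y_new)  # shape (len(Y_new), len(X_train))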
def _compute_kernel_matrix_series(self, Y): | |||||
"""Compute the kernel matrix between a given target graphs (Y) and | |||||
the fitted graphs (X / self._graphs) without parallelization. | |||||
Parameters | |||||
---------- | |||||
Y : list of graphs
The target graphs. | |||||
Returns | |||||
------- | |||||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||||
The computed kernel matrix. | |||||
""" | |||||
kernel_matrix = np.zeros((len(Y), len(self._graphs))) | |||||
for i_y, g_y in enumerate(Y): | |||||
for i_x, g_x in enumerate(self._graphs): | |||||
kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x) | |||||
return kernel_matrix | |||||
def _compute_kernel_matrix_imap_unordered(self, Y): | |||||
"""Compute the kernel matrix between a given target graphs (Y) and | |||||
the fitted graphs (X / self._graphs) using imap unordered parallelization. | |||||
Parameters | |||||
---------- | |||||
Y : list of graphs
The target graphs. | |||||
Returns | |||||
------- | |||||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||||
The computed kernel matrix. | |||||
""" | |||||
raise Exception('Parallelization for kernel matrix is not implemented.') | |||||
def diagonals(self): | |||||
"""Compute the kernel matrix diagonals of the fit/transformed data. | |||||
Returns | |||||
------- | |||||
X_diag : numpy array | |||||
The diagonal of the kernel matrix of the fitted data.
Each entry is a graph's kernel value with itself.
Y_diag : numpy array
The diagonal of the kernel matrix of the transformed data.
Each entry is a graph's kernel value with itself.
""" | |||||
# Check if method "fit" had been called. | |||||
check_is_fitted(self, ['_graphs']) | |||||
# Check if the diagonals of X exist. | |||||
try: | |||||
check_is_fitted(self, ['_X_diag']) | |||||
except NotFittedError: | |||||
# Compute diagonals of X. | |||||
self._X_diag = np.empty(shape=(len(self._graphs),)) | |||||
for i, x in enumerate(self._graphs): | |||||
self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? | |||||
try: | |||||
# If transform has happened, return both diagonals. | |||||
check_is_fitted(self, ['_Y']) | |||||
self._Y_diag = np.empty(shape=(len(self._Y),)) | |||||
for (i, y) in enumerate(self._Y): | |||||
self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? | |||||
return self._X_diag, self._Y_diag | |||||
except NotFittedError: | |||||
# Otherwise return only the diagonal of X.
return self._X_diag | |||||
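# Illustrative sketch: the diagonals above are what is needed to normalize a
# cross kernel matrix. `K_cross` (shape (len(Y), len(X)), e.g. obtained from a
# transform on Y after a fit on X) is an assumed variable.
import numpy as np

X_diag, Y_diag = kernel.diagonals()
K_norm = K_cross / np.sqrt(np.outer(Y_diag, X_diag))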
# @abstractmethod | |||||
def pairwise_kernel(self, x, y): | |||||
"""Compute pairwise kernel between two graphs. | |||||
Parameters | |||||
---------- | |||||
x, y : NetworkX Graph. | |||||
Graphs between which the kernel is computed.
Returns | |||||
------- | |||||
kernel: float | |||||
The computed kernel. | |||||
Notes
-----
This method is abstract and must be implemented by a subclass.
"""
raise NotImplementedError('Pairwise kernel computation is not implemented!') | |||||
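# Illustrative sketch of a subclass overriding pairwise_kernel. This toy
# "degree histogram" kernel is not one of the kernels shipped with gklearn;
# it only shows the expected signature and return type. Depending on the base
# class, the Gram-matrix paths (_compute_gm_series, etc.) may also need to be
# overridden.
import numpy as np

class DegreeHistogramKernel(GraphKernel):

    def pairwise_kernel(self, x, y):
        # Dot product of the two graphs' degree histograms (an inner product
        # of explicit feature vectors, hence a valid kernel).
        hx = np.bincount(np.array([d for _, d in x.degree()], dtype=int))
        hy = np.bincount(np.array([d for _, d in y.degree()], dtype=int))
        size = max(len(hx), len(hy))
        hx = np.pad(hx, (0, size - len(hx)))
        hy = np.pad(hy, (0, size - len(hy)))
        return float(np.dot(hx, hy))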
########################################################################## | |||||
# The following is the second paradigm for computing the kernel matrix. It is
# simplified and not compatible with `scikit-learn`.
########################################################################## | |||||
def compute(self, *graphs, **kwargs): | |||||
self.parallel = kwargs.get('parallel', 'imap_unordered') | |||||
self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) | |||||
self.normalize = kwargs.get('normalize', True) | |||||
self.verbose = kwargs.get('verbose', 2) | |||||
self.copy_graphs = kwargs.get('copy_graphs', True) | |||||
self.save_unnormed = kwargs.get('save_unnormed', True) | |||||
self.validate_parameters() | |||||
# If the input is a list of graphs.
if len(graphs) == 1: | if len(graphs) == 1: | ||||
if not isinstance(graphs[0], list): | if not isinstance(graphs[0], list): | ||||
raise Exception('Cannot detect graphs.') | raise Exception('Cannot detect graphs.') | ||||
elif len(graphs[0]) == 0: | elif len(graphs[0]) == 0: | ||||
raise Exception('The graph list given is empty. No computation was performed.') | raise Exception('The graph list given is empty. No computation was performed.') | ||||
else: | else: | ||||
self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. | |||||
if self.copy_graphs: | |||||
self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. | |||||
else: | |||||
self._graphs = graphs[0] | |||||
self._gram_matrix = self._compute_gram_matrix() | self._gram_matrix = self._compute_gram_matrix() | ||||
self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||||
if self._normalize: | |||||
if self.save_unnormed: | |||||
self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||||
if self.normalize: | |||||
self._gram_matrix = normalize_gram_matrix(self._gram_matrix) | self._gram_matrix = normalize_gram_matrix(self._gram_matrix) | ||||
return self._gram_matrix, self._run_time | return self._gram_matrix, self._run_time | ||||
elif len(graphs) == 2: | elif len(graphs) == 2: | ||||
# If the inputs are two graphs. | |||||
if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): | if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): | ||||
kernel = self._compute_single_kernel(graphs[0].copy(), graphs[1].copy()) | |||||
if self.copy_graphs: | |||||
G0, G1 = graphs[0].copy(), graphs[1].copy() | |||||
else: | |||||
G0, G1 = graphs[0], graphs[1] | |||||
kernel = self._compute_single_kernel(G0, G1) | |||||
return kernel, self._run_time | return kernel, self._run_time | ||||
# If the inputs are a graph and a list of graphs. | |||||
elif self.is_graph(graphs[0]) and isinstance(graphs[1], list): | elif self.is_graph(graphs[0]) and isinstance(graphs[1], list): | ||||
g1 = graphs[0].copy() | |||||
g_list = [g.copy() for g in graphs[1]] | |||||
kernel_list = self._compute_kernel_list(g1, g_list) | |||||
if self.copy_graphs: | |||||
g1 = graphs[0].copy() | |||||
g_list = [g.copy() for g in graphs[1]] | |||||
kernel_list = self._compute_kernel_list(g1, g_list) | |||||
else: | |||||
kernel_list = self._compute_kernel_list(graphs[0], graphs[1]) | |||||
return kernel_list, self._run_time | return kernel_list, self._run_time | ||||
elif isinstance(graphs[0], list) and self.is_graph(graphs[1]): | elif isinstance(graphs[0], list) and self.is_graph(graphs[1]): | ||||
g1 = graphs[1].copy() | |||||
g_list = [g.copy() for g in graphs[0]] | |||||
kernel_list = self._compute_kernel_list(g1, g_list) | |||||
if self.copy_graphs: | |||||
g1 = graphs[1].copy() | |||||
g_list = [g.copy() for g in graphs[0]] | |||||
kernel_list = self._compute_kernel_list(g1, g_list) | |||||
else: | |||||
kernel_list = self._compute_kernel_list(graphs[1], graphs[0]) | |||||
return kernel_list, self._run_time | return kernel_list, self._run_time | ||||
else: | else: | ||||
raise Exception('Cannot detect graphs.') | raise Exception('Cannot detect graphs.') | ||||
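# Illustrative sketch of the simplified second paradigm above (the kernel
# instance and the graph variables are assumptions for illustration):
gram_matrix, run_time = kernel.compute(graph_list, parallel=None, normalize=True, verbose=0)
k_12, run_time = kernel.compute(g1, g2, parallel=None, verbose=0)
kernels_vs_list, run_time = kernel.compute(g1, graph_list, parallel=None, verbose=0)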
@@ -103,15 +481,15 @@ class GraphKernel(object): | |||||
def _compute_gram_matrix(self): | def _compute_gram_matrix(self): | ||||
start_time = time.time() | start_time = time.time() | ||||
if self._parallel == 'imap_unordered': | |||||
if self.parallel == 'imap_unordered': | |||||
gram_matrix = self._compute_gm_imap_unordered() | gram_matrix = self._compute_gm_imap_unordered() | ||||
elif self._parallel is None: | |||||
elif self.parallel is None: | |||||
gram_matrix = self._compute_gm_series() | gram_matrix = self._compute_gm_series() | ||||
else: | else: | ||||
raise Exception('Parallel mode is not set correctly.') | raise Exception('Parallel mode is not set correctly.') | ||||
self._run_time = time.time() - start_time | self._run_time = time.time() - start_time | ||||
if self._verbose: | |||||
if self.verbose: | |||||
print('Gram matrix of size %d built in %s seconds.' | print('Gram matrix of size %d built in %s seconds.' | ||||
% (len(self._graphs), self._run_time)) | % (len(self._graphs), self._run_time)) | ||||
@@ -129,15 +507,15 @@ class GraphKernel(object): | |||||
def _compute_kernel_list(self, g1, g_list): | def _compute_kernel_list(self, g1, g_list): | ||||
start_time = time.time() | start_time = time.time() | ||||
if self._parallel == 'imap_unordered': | |||||
if self.parallel == 'imap_unordered': | |||||
kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) | kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) | ||||
elif self._parallel is None: | |||||
elif self.parallel is None: | |||||
kernel_list = self._compute_kernel_list_series(g1, g_list) | kernel_list = self._compute_kernel_list_series(g1, g_list) | ||||
else: | else: | ||||
raise Exception('Parallel mode is not set correctly.') | raise Exception('Parallel mode is not set correctly.') | ||||
self._run_time = time.time() - start_time | self._run_time = time.time() - start_time | ||||
if self._verbose: | |||||
if self.verbose: | |||||
print('Graph kernel between a graph and a list of %d graphs built in %s seconds.' | print('Graph kernel between a graph and a list of %d graphs built in %s seconds.' | ||||
% (len(g_list), self._run_time)) | % (len(g_list), self._run_time)) | ||||
@@ -158,7 +536,7 @@ class GraphKernel(object): | |||||
kernel = self._compute_single_kernel_series(g1, g2) | kernel = self._compute_single_kernel_series(g1, g2) | ||||
self._run_time = time.time() - start_time | self._run_time = time.time() - start_time | ||||
if self._verbose: | |||||
if self.verbose: | |||||
print('Graph kernel between two graphs built in %s seconds.' % (self._run_time)) | print('Graph kernel between two graphs built in %s seconds.' % (self._run_time)) | ||||
return kernel | return kernel | ||||
@@ -185,24 +563,24 @@ class GraphKernel(object): | |||||
return self._graphs | return self._graphs | ||||
@property | |||||
def parallel(self): | |||||
return self._parallel | |||||
# @property | |||||
# def parallel(self): | |||||
# return self.parallel | |||||
@property | |||||
def n_jobs(self): | |||||
return self._n_jobs | |||||
# @property | |||||
# def n_jobs(self): | |||||
# return self.n_jobs | |||||
@property | |||||
def verbose(self): | |||||
return self._verbose | |||||
# @property | |||||
# def verbose(self): | |||||
# return self.verbose | |||||
@property | |||||
def normalize(self): | |||||
return self._normalize | |||||
# @property | |||||
# def normalize(self): | |||||
# return self.normalize | |||||
@property | @property | ||||
@@ -46,7 +46,7 @@ class Marginalized(GraphKernel): | |||||
self._add_dummy_labels(self._graphs) | self._add_dummy_labels(self._graphs) | ||||
if self._remove_totters: | if self._remove_totters: | ||||
iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
# @todo: this may not work. | # @todo: this may not work. | ||||
self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] | self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] | ||||
@@ -57,7 +57,7 @@ class Marginalized(GraphKernel): | |||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | ||||
length=len_itr, verbose=(self._verbose >= 2)) | |||||
length=len_itr, verbose=(self.verbose >= 2)) | |||||
for i, j in iterator: | for i, j in iterator: | ||||
kernel = self._kernel_do(self._graphs[i], self._graphs[j]) | kernel = self._kernel_do(self._graphs[i], self._graphs[j]) | ||||
gram_matrix[i][j] = kernel | gram_matrix[i][j] = kernel | ||||
@@ -70,16 +70,16 @@ class Marginalized(GraphKernel): | |||||
self._add_dummy_labels(self._graphs) | self._add_dummy_labels(self._graphs) | ||||
if self._remove_totters: | if self._remove_totters: | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
itr = range(0, len(self._graphs)) | itr = range(0, len(self._graphs)) | ||||
if len(self._graphs) < 100 * self._n_jobs: | |||||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||||
if len(self._graphs) < 100 * self.n_jobs: | |||||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
remove_fun = self._wrapper_untotter | remove_fun = self._wrapper_untotter | ||||
iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), | ||||
desc='removing tottering', file=sys.stdout, | desc='removing tottering', file=sys.stdout, | ||||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||||
for i, g in iterator: | for i, g in iterator: | ||||
self._graphs[i] = g | self._graphs[i] = g | ||||
pool.close() | pool.close() | ||||
@@ -93,7 +93,7 @@ class Marginalized(GraphKernel): | |||||
G_gn = gn_toshare | G_gn = gn_toshare | ||||
do_fun = self._wrapper_kernel_do | do_fun = self._wrapper_kernel_do | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
return gram_matrix | return gram_matrix | ||||
@@ -103,13 +103,13 @@ class Marginalized(GraphKernel): | |||||
if self._remove_totters: | if self._remove_totters: | ||||
g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. | g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. | ||||
iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
# @todo: this may not work. | # @todo: this may not work. | ||||
g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] | g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] | ||||
# compute kernel list. | # compute kernel list. | ||||
kernel_list = [None] * len(g_list) | kernel_list = [None] * len(g_list) | ||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i in iterator: | for i in iterator: | ||||
kernel = self._kernel_do(g1, g_list[i]) | kernel = self._kernel_do(g1, g_list[i]) | ||||
kernel_list[i] = kernel | kernel_list[i] = kernel | ||||
@@ -122,16 +122,16 @@ class Marginalized(GraphKernel): | |||||
if self._remove_totters: | if self._remove_totters: | ||||
g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. | g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
itr = range(0, len(g_list)) | itr = range(0, len(g_list)) | ||||
if len(g_list) < 100 * self._n_jobs: | |||||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||||
if len(g_list) < 100 * self.n_jobs: | |||||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
remove_fun = self._wrapper_untotter | remove_fun = self._wrapper_untotter | ||||
iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), | ||||
desc='removing tottering', file=sys.stdout, | desc='removing tottering', file=sys.stdout, | ||||
length=len(g_list), verbose=(self._verbose >= 2)) | |||||
length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i, g in iterator: | for i, g in iterator: | ||||
g_list[i] = g | g_list[i] = g | ||||
pool.close() | pool.close() | ||||
@@ -151,7 +151,7 @@ class Marginalized(GraphKernel): | |||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | ||||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
return kernel_list | return kernel_list | ||||
@@ -5,23 +5,35 @@ Created on Fri Nov 6 10:11:08 2020 | |||||
@author: ljia | @author: ljia | ||||
""" | """ | ||||
from gklearn.kernels.common_walk import CommonWalk | |||||
from gklearn.kernels.marginalized import Marginalized | |||||
from gklearn.kernels.sylvester_equation import SylvesterEquation | |||||
from gklearn.kernels.conjugate_gradient import ConjugateGradient | |||||
from gklearn.kernels.fixed_point import FixedPoint | |||||
from gklearn.kernels.spectral_decomposition import SpectralDecomposition | |||||
from gklearn.kernels.shortest_path import ShortestPath | |||||
from gklearn.kernels.structural_sp import StructuralSP | |||||
from gklearn.kernels.path_up_to_h import PathUpToH | |||||
from gklearn.kernels.treelet import Treelet | |||||
from gklearn.kernels.weisfeiler_lehman import WLSubtree | |||||
# The metadata of all graph kernels. | # The metadata of all graph kernels. | ||||
GRAPH_KERNELS = { | GRAPH_KERNELS = { | ||||
### based on walks. | ### based on walks. | ||||
'common walk': '', | |||||
'marginalized': '', | |||||
'sylvester equation': '', | |||||
'fixed point': '', | |||||
'conjugate gradient': '', | |||||
'spectral decomposition': '', | |||||
'common walk': CommonWalk, | |||||
'marginalized': Marginalized, | |||||
'sylvester equation': SylvesterEquation, | |||||
'fixed point': FixedPoint, | |||||
'conjugate gradient': ConjugateGradient, | |||||
'spectral decomposition': SpectralDecomposition, | |||||
### based on paths. | ### based on paths. | ||||
'shortest path': '', | |||||
'structural shortest path': '', | |||||
'path up to length h': '', | |||||
'shortest path': ShortestPath, | |||||
'structural shortest path': StructuralSP, | |||||
'path up to length h': PathUpToH, | |||||
### based on non-linear patterns. | ### based on non-linear patterns. | ||||
'weisfeiler-lehman subtree': '', | |||||
'treelet': '', | |||||
'weisfeiler-lehman subtree': WLSubtree, | |||||
'treelet': Treelet, | |||||
} | } | ||||
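# Illustrative sketch: with the registry now mapping names to classes, a
# kernel can be chosen by name and instantiated. Constructor parameters are
# omitted here because each kernel class defines its own.
kernel_class = GRAPH_KERNELS['shortest path']
kernel = kernel_class()  # pass the kernel-specific keyword arguments in practice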
@@ -41,10 +41,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||||
from itertools import combinations_with_replacement | from itertools import combinations_with_replacement | ||||
itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self._verbose >= 2)) | |||||
iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self.verbose >= 2)) | |||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator_kernel = get_iters(itr_kernel, desc='Computing kernels', | iterator_kernel = get_iters(itr_kernel, desc='Computing kernels', | ||||
file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||||
file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | ||||
@@ -69,10 +69,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||||
# get all paths of all graphs before computing kernels to save time, | # get all paths of all graphs before computing kernels to save time, | ||||
# but this may cost a lot of memory for large datasets. | # but this may cost a lot of memory for large datasets. | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
itr = zip(self._graphs, range(0, len(self._graphs))) | itr = zip(self._graphs, range(0, len(self._graphs))) | ||||
if len(self._graphs) < 100 * self._n_jobs: | |||||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||||
if len(self._graphs) < 100 * self.n_jobs: | |||||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
all_paths = [[] for _ in range(len(self._graphs))] | all_paths = [[] for _ in range(len(self._graphs))] | ||||
@@ -84,7 +84,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||||
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) | get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) | ||||
iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), | ||||
desc='getting paths', file=sys.stdout, | desc='getting paths', file=sys.stdout, | ||||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||||
for i, ps in iterator: | for i, ps in iterator: | ||||
all_paths[i] = ps | all_paths[i] = ps | ||||
pool.close() | pool.close() | ||||
@@ -109,7 +109,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||||
G_plist = plist_toshare | G_plist = plist_toshare | ||||
do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this? | do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this? | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(all_paths,), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
return gram_matrix | return gram_matrix | ||||
@@ -117,8 +117,8 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||||
def _compute_kernel_list_series(self, g1, g_list): | def _compute_kernel_list_series(self, g1, g_list): | ||||
self._add_dummy_labels(g_list + [g1]) | self._add_dummy_labels(g_list + [g1]) | ||||
iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
kernel_list = [None] * len(g_list) | kernel_list = [None] * len(g_list) | ||||
@@ -143,10 +143,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||||
# get all paths of all graphs before computing kernels to save time, | # get all paths of all graphs before computing kernels to save time, | ||||
# but this may cost a lot of memory for large datasets. | # but this may cost a lot of memory for large datasets. | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
itr = zip(g_list, range(0, len(g_list))) | itr = zip(g_list, range(0, len(g_list))) | ||||
if len(g_list) < 100 * self._n_jobs: | |||||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||||
if len(g_list) < 100 * self.n_jobs: | |||||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
paths_g_list = [[] for _ in range(len(g_list))] | paths_g_list = [[] for _ in range(len(g_list))] | ||||
@@ -161,7 +161,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||||
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) | get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) | ||||
iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), | ||||
desc='getting paths', file=sys.stdout, | desc='getting paths', file=sys.stdout, | ||||
length=len(g_list), verbose=(self._verbose >= 2)) | |||||
length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i, ps in iterator: | for i, ps in iterator: | ||||
paths_g_list[i] = ps | paths_g_list[i] = ps | ||||
pool.close() | pool.close() | ||||
@@ -180,7 +180,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None | |||||
itr = range(len(g_list)) | itr = range(len(g_list)) | ||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
return kernel_list | return kernel_list | ||||
@@ -38,7 +38,7 @@ class ShortestPath(GraphKernel): | |||||
def _compute_gm_series(self): | def _compute_gm_series(self): | ||||
self._all_graphs_have_edges(self._graphs) | self._all_graphs_have_edges(self._graphs) | ||||
# get shortest path graph of each graph. | # get shortest path graph of each graph. | ||||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] | self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] | ||||
# compute Gram matrix. | # compute Gram matrix. | ||||
@@ -48,7 +48,7 @@ class ShortestPath(GraphKernel): | |||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator = get_iters(itr, desc='Computing kernels', | iterator = get_iters(itr, desc='Computing kernels', | ||||
length=len_itr, file=sys.stdout,verbose=(self._verbose >= 2)) | |||||
length=len_itr, file=sys.stdout,verbose=(self.verbose >= 2)) | |||||
for i, j in iterator: | for i, j in iterator: | ||||
kernel = self._sp_do(self._graphs[i], self._graphs[j]) | kernel = self._sp_do(self._graphs[i], self._graphs[j]) | ||||
gram_matrix[i][j] = kernel | gram_matrix[i][j] = kernel | ||||
@@ -60,16 +60,16 @@ class ShortestPath(GraphKernel): | |||||
def _compute_gm_imap_unordered(self): | def _compute_gm_imap_unordered(self): | ||||
self._all_graphs_have_edges(self._graphs) | self._all_graphs_have_edges(self._graphs) | ||||
# get shortest path graph of each graph. | # get shortest path graph of each graph. | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
get_sp_graphs_fun = self._wrapper_get_sp_graphs | get_sp_graphs_fun = self._wrapper_get_sp_graphs | ||||
itr = zip(self._graphs, range(0, len(self._graphs))) | itr = zip(self._graphs, range(0, len(self._graphs))) | ||||
if len(self._graphs) < 100 * self._n_jobs: | |||||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||||
if len(self._graphs) < 100 * self.n_jobs: | |||||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), | ||||
desc='getting sp graphs', file=sys.stdout, | desc='getting sp graphs', file=sys.stdout, | ||||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||||
for i, g in iterator: | for i, g in iterator: | ||||
self._graphs[i] = g | self._graphs[i] = g | ||||
pool.close() | pool.close() | ||||
@@ -83,7 +83,7 @@ class ShortestPath(GraphKernel): | |||||
G_gs = gs_toshare | G_gs = gs_toshare | ||||
do_fun = self._wrapper_sp_do | do_fun = self._wrapper_sp_do | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
return gram_matrix | return gram_matrix | ||||
@@ -92,12 +92,12 @@ class ShortestPath(GraphKernel): | |||||
self._all_graphs_have_edges([g1] + g_list) | self._all_graphs_have_edges([g1] + g_list) | ||||
# get shortest path graphs of g1 and each graph in g_list. | # get shortest path graphs of g1 and each graph in g_list. | ||||
g1 = getSPGraph(g1, edge_weight=self._edge_weight) | g1 = getSPGraph(g1, edge_weight=self._edge_weight) | ||||
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] | g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] | ||||
# compute kernel list. | # compute kernel list. | ||||
kernel_list = [None] * len(g_list) | kernel_list = [None] * len(g_list) | ||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i in iterator: | for i in iterator: | ||||
kernel = self._sp_do(g1, g_list[i]) | kernel = self._sp_do(g1, g_list[i]) | ||||
kernel_list[i] = kernel | kernel_list[i] = kernel | ||||
@@ -109,16 +109,16 @@ class ShortestPath(GraphKernel): | |||||
self._all_graphs_have_edges([g1] + g_list) | self._all_graphs_have_edges([g1] + g_list) | ||||
# get shortest path graphs of g1 and each graph in g_list. | # get shortest path graphs of g1 and each graph in g_list. | ||||
g1 = getSPGraph(g1, edge_weight=self._edge_weight) | g1 = getSPGraph(g1, edge_weight=self._edge_weight) | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
get_sp_graphs_fun = self._wrapper_get_sp_graphs | get_sp_graphs_fun = self._wrapper_get_sp_graphs | ||||
itr = zip(g_list, range(0, len(g_list))) | itr = zip(g_list, range(0, len(g_list))) | ||||
if len(g_list) < 100 * self._n_jobs: | |||||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||||
if len(g_list) < 100 * self.n_jobs: | |||||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), | ||||
desc='getting sp graphs', file=sys.stdout, | desc='getting sp graphs', file=sys.stdout, | ||||
length=len(g_list), verbose=(self._verbose >= 2)) | |||||
length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i, g in iterator: | for i, g in iterator: | ||||
g_list[i] = g | g_list[i] = g | ||||
pool.close() | pool.close() | ||||
@@ -137,7 +137,7 @@ class ShortestPath(GraphKernel): | |||||
itr = range(len(g_list)) | itr = range(len(g_list)) | ||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
return kernel_list | return kernel_list | ||||
@@ -28,9 +28,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
def _compute_gm_series(self): | def _compute_gm_series(self): | ||||
self._check_edge_weight(self._graphs, self._verbose) | |||||
self._check_edge_weight(self._graphs, self.verbose) | |||||
self._check_graphs(self._graphs) | self._check_graphs(self._graphs) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | warnings.warn('All labels are ignored. Only works for undirected graphs.') | ||||
@@ -41,7 +41,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
# precompute the spectral decomposition of each graph. | # precompute the spectral decomposition of each graph. | ||||
P_list = [] | P_list = [] | ||||
D_list = [] | D_list = [] | ||||
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
for G in iterator: | for G in iterator: | ||||
# don't normalize adjacency matrices if q is a uniform vector. Note | # don't normalize adjacency matrices if q is a uniform vector. Note | ||||
# A actually is the transpose of the adjacency matrix. | # A actually is the transpose of the adjacency matrix. | ||||
@@ -58,7 +58,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
from itertools import combinations_with_replacement | from itertools import combinations_with_replacement | ||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||||
for i, j in iterator: | for i, j in iterator: | ||||
kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel) | kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel) | ||||
@@ -74,9 +74,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
def _compute_gm_imap_unordered(self): | def _compute_gm_imap_unordered(self): | ||||
self._check_edge_weight(self._graphs, self._verbose) | |||||
self._check_edge_weight(self._graphs, self.verbose) | |||||
self._check_graphs(self._graphs) | self._check_graphs(self._graphs) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | warnings.warn('All labels are ignored. Only works for undirected graphs.') | ||||
@@ -87,7 +87,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
# precompute the spectral decomposition of each graph. | # precompute the spectral decomposition of each graph. | ||||
P_list = [] | P_list = [] | ||||
D_list = [] | D_list = [] | ||||
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
for G in iterator: | for G in iterator: | ||||
# don't normalize adjacency matrices if q is a uniform vector. Note | # don't normalize adjacency matrices if q is a uniform vector. Note | ||||
# A actually is the transpose of the adjacency matrix. | # A actually is the transpose of the adjacency matrix. | ||||
@@ -107,7 +107,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
do_fun = self._wrapper_kernel_do | do_fun = self._wrapper_kernel_do | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(q_T_list, P_list, D_list), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
else: # @todo | else: # @todo | ||||
pass | pass | ||||
@@ -118,9 +118,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
def _compute_kernel_list_series(self, g1, g_list): | def _compute_kernel_list_series(self, g1, g_list): | ||||
self._check_edge_weight(g_list + [g1], self._verbose) | |||||
self._check_edge_weight(g_list + [g1], self.verbose) | |||||
self._check_graphs(g_list + [g1]) | self._check_graphs(g_list + [g1]) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | warnings.warn('All labels are ignored. Only works for undirected graphs.') | ||||
@@ -133,7 +133,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
D1, P1 = np.linalg.eig(A1) | D1, P1 = np.linalg.eig(A1) | ||||
P_list = [] | P_list = [] | ||||
D_list = [] | D_list = [] | ||||
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
for G in iterator: | for G in iterator: | ||||
# don't normalize adjacency matrices if q is a uniform vector. Note | # don't normalize adjacency matrices if q is a uniform vector. Note | ||||
# A actually is the transpose of the adjacency matrix. | # A actually is the transpose of the adjacency matrix. | ||||
@@ -145,7 +145,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
if self._p is None: # p is uniform distribution as default. | if self._p is None: # p is uniform distribution as default. | ||||
q_T1 = 1 / nx.number_of_nodes(g1) | q_T1 = 1 / nx.number_of_nodes(g1) | ||||
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] | q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] | ||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i in iterator: | for i in iterator: | ||||
kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel) | kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel) | ||||
@@ -160,9 +160,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | def _compute_kernel_list_imap_unordered(self, g1, g_list): | ||||
self._check_edge_weight(g_list + [g1], self._verbose) | |||||
self._check_edge_weight(g_list + [g1], self.verbose) | |||||
self._check_graphs(g_list + [g1]) | self._check_graphs(g_list + [g1]) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | warnings.warn('All labels are ignored. Only works for undirected graphs.') | ||||
@@ -175,8 +175,8 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
D1, P1 = np.linalg.eig(A1) | D1, P1 = np.linalg.eig(A1) | ||||
P_list = [] | P_list = [] | ||||
D_list = [] | D_list = [] | ||||
if self._verbose >= 2: | |||||
iterator = tqdm(g_list, desc='spectral decompose', file=sys.stdout) | |||||
if self.verbose >= 2: | |||||
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout) | |||||
else: | else: | ||||
iterator = g_list | iterator = g_list | ||||
for G in iterator: | for G in iterator: | ||||
@@ -207,7 +207,7 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
itr = range(len(g_list)) | itr = range(len(g_list)) | ||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
else: # @todo | else: # @todo | ||||
pass | pass | ||||
@@ -222,9 +222,9 @@ class SpectralDecomposition(RandomWalkMeta): | |||||
def _compute_single_kernel_series(self, g1, g2): | def _compute_single_kernel_series(self, g1, g2): | ||||
self._check_edge_weight([g1] + [g2], self._verbose) | |||||
self._check_edge_weight([g1] + [g2], self.verbose) | |||||
self._check_graphs([g1] + [g2]) | self._check_graphs([g1] + [g2]) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored. Only works for undirected graphs.') | warnings.warn('All labels are ignored. Only works for undirected graphs.') | ||||
@@ -41,7 +41,7 @@ class StructuralSP(GraphKernel): | |||||
def _compute_gm_series(self): | def _compute_gm_series(self): | ||||
# get shortest paths of each graph in the graphs. | # get shortest paths of each graph in the graphs. | ||||
splist = [] | splist = [] | ||||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
if self._compute_method == 'trie': | if self._compute_method == 'trie': | ||||
for g in iterator: | for g in iterator: | ||||
splist.append(self._get_sps_as_trie(g)) | splist.append(self._get_sps_as_trie(g)) | ||||
@@ -56,7 +56,7 @@ class StructuralSP(GraphKernel): | |||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | ||||
length=len_itr, verbose=(self._verbose >= 2)) | |||||
length=len_itr, verbose=(self.verbose >= 2)) | |||||
if self._compute_method == 'trie': | if self._compute_method == 'trie': | ||||
for i, j in iterator: | for i, j in iterator: | ||||
kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) | kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) | ||||
@@ -76,10 +76,10 @@ class StructuralSP(GraphKernel): | |||||
def _compute_gm_imap_unordered(self): | def _compute_gm_imap_unordered(self): | ||||
# get shortest paths of each graph in the graphs. | # get shortest paths of each graph in the graphs. | ||||
splist = [None] * len(self._graphs) | splist = [None] * len(self._graphs) | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
itr = zip(self._graphs, range(0, len(self._graphs))) | itr = zip(self._graphs, range(0, len(self._graphs))) | ||||
if len(self._graphs) < 100 * self._n_jobs: | |||||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||||
if len(self._graphs) < 100 * self.n_jobs: | |||||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
# get shortest path graphs of self._graphs | # get shortest path graphs of self._graphs | ||||
@@ -89,7 +89,7 @@ class StructuralSP(GraphKernel): | |||||
get_sps_fun = self._wrapper_get_sps_naive | get_sps_fun = self._wrapper_get_sps_naive | ||||
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | ||||
desc='getting shortest paths', file=sys.stdout, | desc='getting shortest paths', file=sys.stdout, | ||||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||||
for i, sp in iterator: | for i, sp in iterator: | ||||
splist[i] = sp | splist[i] = sp | ||||
pool.close() | pool.close() | ||||
@@ -107,7 +107,7 @@ class StructuralSP(GraphKernel): | |||||
else: | else: | ||||
do_fun = self._wrapper_ssp_do_naive | do_fun = self._wrapper_ssp_do_naive | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(splist, self._graphs), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(splist, self._graphs), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
return gram_matrix | return gram_matrix | ||||
@@ -117,7 +117,7 @@ class StructuralSP(GraphKernel): | |||||
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) | sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) | ||||
splist = [] | splist = [] | ||||
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, | iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, | ||||
verbose=(self._verbose >= 2)) | |||||
verbose=(self.verbose >= 2)) | |||||
if self._compute_method == 'trie': | if self._compute_method == 'trie': | ||||
for g in iterator: | for g in iterator: | ||||
splist.append(self._get_sps_as_trie(g)) | splist.append(self._get_sps_as_trie(g)) | ||||
@@ -128,7 +128,7 @@ class StructuralSP(GraphKernel): | |||||
# compute kernel list. | # compute kernel list. | ||||
kernel_list = [None] * len(g_list) | kernel_list = [None] * len(g_list) | ||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', | iterator = get_iters(range(len(g_list)), desc='Computing kernels', | ||||
file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
if self._compute_method == 'trie': | if self._compute_method == 'trie': | ||||
for i in iterator: | for i in iterator: | ||||
kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i]) | kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i]) | ||||
@@ -145,10 +145,10 @@ class StructuralSP(GraphKernel): | |||||
# get shortest paths of g1 and each graph in g_list. | # get shortest paths of g1 and each graph in g_list. | ||||
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) | sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) | ||||
splist = [None] * len(g_list) | splist = [None] * len(g_list) | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
itr = zip(g_list, range(0, len(g_list))) | itr = zip(g_list, range(0, len(g_list))) | ||||
if len(g_list) < 100 * self._n_jobs: | |||||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||||
if len(g_list) < 100 * self.n_jobs: | |||||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
# get shortest path graphs of g_list | # get shortest path graphs of g_list | ||||
@@ -158,7 +158,7 @@ class StructuralSP(GraphKernel): | |||||
get_sps_fun = self._wrapper_get_sps_naive | get_sps_fun = self._wrapper_get_sps_naive | ||||
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | ||||
desc='getting shortest paths', file=sys.stdout, | desc='getting shortest paths', file=sys.stdout, | ||||
length=len(g_list), verbose=(self._verbose >= 2)) | |||||
length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i, sp in iterator: | for i, sp in iterator: | ||||
splist[i] = sp | splist[i] = sp | ||||
pool.close() | pool.close() | ||||
@@ -182,7 +182,7 @@ class StructuralSP(GraphKernel): | |||||
itr = range(len(g_list)) | itr = range(len(g_list)) | ||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
return kernel_list | return kernel_list | ||||
@@ -14,6 +14,7 @@ import sys | |||||
from gklearn.utils import get_iters | from gklearn.utils import get_iters | ||||
import numpy as np | import numpy as np | ||||
import networkx as nx | import networkx as nx | ||||
from control import dlyap | |||||
from gklearn.utils.parallel import parallel_gm, parallel_me | from gklearn.utils.parallel import parallel_gm, parallel_me | ||||
from gklearn.kernels import RandomWalkMeta | from gklearn.kernels import RandomWalkMeta | ||||
@@ -22,14 +23,13 @@ class SylvesterEquation(RandomWalkMeta): | |||||
def __init__(self, **kwargs): | def __init__(self, **kwargs): | ||||
from control import dlyap | |||||
super().__init__(**kwargs) | super().__init__(**kwargs) | ||||
def _compute_gm_series(self): | def _compute_gm_series(self): | ||||
self._check_edge_weight(self._graphs, self._verbose) | |||||
self._check_edge_weight(self._graphs, self.verbose) | |||||
self._check_graphs(self._graphs) | self._check_graphs(self._graphs) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored.') | warnings.warn('All labels are ignored.') | ||||
@@ -41,7 +41,7 @@ class SylvesterEquation(RandomWalkMeta): | |||||
if self._q is None: | if self._q is None: | ||||
# don't normalize adjacency matrices if q is a uniform vector. Note | # don't normalize adjacency matrices if q is a uniform vector. Note | ||||
# A_wave_list actually contains the transposes of the adjacency matrices. | # A_wave_list actually contains the transposes of the adjacency matrices. | ||||
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] | A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] | ||||
# # normalized adjacency matrices | # # normalized adjacency matrices | ||||
# A_wave_list = [] | # A_wave_list = [] | ||||
@@ -55,7 +55,7 @@ class SylvesterEquation(RandomWalkMeta): | |||||
from itertools import combinations_with_replacement | from itertools import combinations_with_replacement | ||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) | |||||
for i, j in iterator: | for i, j in iterator: | ||||
kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda) | kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda) | ||||
@@ -71,9 +71,9 @@ class SylvesterEquation(RandomWalkMeta): | |||||
def _compute_gm_imap_unordered(self): | def _compute_gm_imap_unordered(self): | ||||
self._check_edge_weight(self._graphs, self._verbose) | |||||
self._check_edge_weight(self._graphs, self.verbose) | |||||
self._check_graphs(self._graphs) | self._check_graphs(self._graphs) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored.') | warnings.warn('All labels are ignored.') | ||||
@@ -83,7 +83,7 @@ class SylvesterEquation(RandomWalkMeta): | |||||
if self._q is None: | if self._q is None: | ||||
# don't normalize adjacency matrices if q is a uniform vector. Note | # don't normalize adjacency matrices if q is a uniform vector. Note | ||||
# A_wave_list actually contains the transposes of the adjacency matrices. | # A_wave_list actually contains the transposes of the adjacency matrices. | ||||
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? | A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? | ||||
if self._p is None: # p is uniform distribution as default. | if self._p is None: # p is uniform distribution as default. | ||||
@@ -94,7 +94,7 @@ class SylvesterEquation(RandomWalkMeta): | |||||
do_fun = self._wrapper_kernel_do | do_fun = self._wrapper_kernel_do | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(A_wave_list,), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
else: # @todo | else: # @todo | ||||
pass | pass | ||||
@@ -105,9 +105,9 @@ class SylvesterEquation(RandomWalkMeta): | |||||
def _compute_kernel_list_series(self, g1, g_list): | def _compute_kernel_list_series(self, g1, g_list): | ||||
self._check_edge_weight(g_list + [g1], self._verbose) | |||||
self._check_edge_weight(g_list + [g1], self.verbose) | |||||
self._check_graphs(g_list + [g1]) | self._check_graphs(g_list + [g1]) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored.') | warnings.warn('All labels are ignored.') | ||||
@@ -120,11 +120,11 @@ class SylvesterEquation(RandomWalkMeta): | |||||
# don't normalize adjacency matrices if q is a uniform vector. Note | # don't normalize adjacency matrices if q is a uniform vector. Note | ||||
# A_wave_list actually contains the transposes of the adjacency matrices. | # A_wave_list actually contains the transposes of the adjacency matrices. | ||||
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | ||||
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] | A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] | ||||
if self._p is None: # p is uniform distribution as default. | if self._p is None: # p is uniform distribution as default. | ||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i in iterator: | for i in iterator: | ||||
kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda) | kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda) | ||||
@@ -139,9 +139,9 @@ class SylvesterEquation(RandomWalkMeta): | |||||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | def _compute_kernel_list_imap_unordered(self, g1, g_list): | ||||
self._check_edge_weight(g_list + [g1], self._verbose) | |||||
self._check_edge_weight(g_list + [g1], self.verbose) | |||||
self._check_graphs(g_list + [g1]) | self._check_graphs(g_list + [g1]) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored.') | warnings.warn('All labels are ignored.') | ||||
@@ -152,7 +152,7 @@ class SylvesterEquation(RandomWalkMeta): | |||||
# don't normalize adjacency matrices if q is a uniform vector. Note | # don't normalize adjacency matrices if q is a uniform vector. Note | ||||
# A_wave_list actually contains the transposes of the adjacency matrices. | # A_wave_list actually contains the transposes of the adjacency matrices. | ||||
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() | ||||
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? | A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? | ||||
if self._p is None: # p is uniform distribution as default. | if self._p is None: # p is uniform distribution as default. | ||||
@@ -169,7 +169,7 @@ class SylvesterEquation(RandomWalkMeta): | |||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', | init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', | ||||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
else: # @todo | else: # @todo | ||||
pass | pass | ||||
@@ -184,9 +184,9 @@ class SylvesterEquation(RandomWalkMeta): | |||||
def _compute_single_kernel_series(self, g1, g2): | def _compute_single_kernel_series(self, g1, g2): | ||||
self._check_edge_weight([g1] + [g2], self._verbose) | |||||
self._check_edge_weight([g1] + [g2], self.verbose) | |||||
self._check_graphs([g1] + [g2]) | self._check_graphs([g1] + [g2]) | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('All labels are ignored.') | warnings.warn('All labels are ignored.') | ||||
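The Sylvester-equation kernel shown above works purely on (transposed) adjacency matrices, which is why the label-ignoring warning is emitted. Below is a minimal, self-contained sketch of building such a transposed adjacency matrix with NetworkX; the toy graph and the use of `weight=None` are hypothetical, whereas the diff passes the stored edge-weight attribute.

```python
import numpy as np
import networkx as nx

# Hypothetical unweighted graph; the kernel code above uses an edge-weight
# attribute instead of weight=None.
G = nx.cycle_graph(4)
A_wave = np.asarray(nx.adjacency_matrix(G, weight=None).todense()).T  # transposed adjacency
print(A_wave.shape)  # (4, 4)
```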
@@ -18,6 +18,8 @@ import numpy as np | |||||
import networkx as nx | import networkx as nx | ||||
from collections import Counter | from collections import Counter | ||||
from itertools import chain | from itertools import chain | ||||
from sklearn.utils.validation import check_is_fitted | |||||
from sklearn.exceptions import NotFittedError | |||||
from gklearn.utils import SpecialLabel | from gklearn.utils import SpecialLabel | ||||
from gklearn.utils.parallel import parallel_gm, parallel_me | from gklearn.utils.parallel import parallel_gm, parallel_me | ||||
from gklearn.utils.utils import find_all_paths, get_mlti_dim_node_attrs | from gklearn.utils.utils import find_all_paths, get_mlti_dim_node_attrs | ||||
@@ -26,14 +28,211 @@ from gklearn.kernels import GraphKernel | |||||
class Treelet(GraphKernel): | class Treelet(GraphKernel): | ||||
def __init__(self, **kwargs): | |||||
GraphKernel.__init__(self) | |||||
self._node_labels = kwargs.get('node_labels', []) | |||||
self._edge_labels = kwargs.get('edge_labels', []) | |||||
self._sub_kernel = kwargs.get('sub_kernel', None) | |||||
self._ds_infos = kwargs.get('ds_infos', {}) | |||||
if self._sub_kernel is None: | |||||
raise Exception('Sub kernel not set.') | |||||
def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs): | |||||
"""Initialise a treelet kernel. | |||||
""" | |||||
super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose) | |||||
self.node_labels = kwargs.get('node_labels', []) | |||||
self.edge_labels = kwargs.get('edge_labels', []) | |||||
self.sub_kernel = kwargs.get('sub_kernel', None) | |||||
self.ds_infos = kwargs.get('ds_infos', {}) | |||||
self.precompute_canonkeys = precompute_canonkeys | |||||
self.save_canonkeys = save_canonkeys | |||||
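A minimal usage sketch of the new constructor, assuming the fit/transform entry points follow the scikit-learn convention referenced in the comment block below; the import path, the label names, the graph lists and the polynomial sub-kernel are all hypothetical.

```python
import numpy as np
# from gklearn.kernels import Treelet  # assumed import path within this repository

def poly_sub_kernel(x, y, d=2, c=1.0):
    """Hypothetical sub-kernel applied to two treelet count vectors."""
    return (np.dot(x, y) + c) ** d

# graphs_train / graphs_test would be lists of labeled NetworkX graphs.
# tl = Treelet(node_labels=['atom'], edge_labels=['bond_type'],
#              sub_kernel=poly_sub_kernel, ds_infos={'directed': False},
#              normalize=True, verbose=0)
# K_train = tl.fit_transform(graphs_train)  # Gram matrix on the fitted graphs
# K_test = tl.transform(graphs_test)        # shape [n_targets, n_inputs]
```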
########################################################################## | |||||
# The following is the 1st paradigm to compute kernel matrix, which is | |||||
# compatible with `scikit-learn`. | |||||
# ------------------------------------------------------------------- | |||||
# Special thanks to the "GraKeL" library for providing an excellent template! | |||||
########################################################################## | |||||
def clear_attributes(self): | |||||
super().clear_attributes() | |||||
if hasattr(self, '_canonkeys'): | |||||
delattr(self, '_canonkeys') | |||||
if hasattr(self, '_Y_canonkeys'): | |||||
delattr(self, '_Y_canonkeys') | |||||
if hasattr(self, '_dummy_labels_considered'): | |||||
delattr(self, '_dummy_labels_considered') | |||||
def validate_parameters(self): | |||||
"""Validate all parameters for the transformer. | |||||
Returns | |||||
------- | |||||
None. | |||||
""" | |||||
super().validate_parameters() | |||||
if self.sub_kernel is None: | |||||
raise ValueError('Sub-kernel not set.') | |||||
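`validate_parameters` only insists that a sub-kernel is supplied; any callable on two aligned count vectors will do. A sketch of a Gaussian (RBF) sub-kernel, one common choice for the treelet kernel, with hypothetical count vectors:

```python
import numpy as np

def gaussian_sub_kernel(x, y, gamma=1.0):
    """RBF kernel on two treelet count vectors."""
    d = x - y
    return float(np.exp(-gamma * np.dot(d, d)))

v1 = np.array([3., 1., 0., 2.])  # hypothetical treelet counts of graph 1
v2 = np.array([2., 1., 1., 2.])  # hypothetical treelet counts of graph 2
print(gaussian_sub_kernel(v1, v2))  # exp(-2) ~= 0.135
```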
def _compute_kernel_matrix_series(self, Y): | |||||
"""Compute the kernel matrix between a given target graphs (Y) and | |||||
the fitted graphs (X / self._graphs) without parallelization. | |||||
Parameters | |||||
---------- | |||||
Y : list of graphs, optional | |||||
The target graphs. | |||||
Returns | |||||
------- | |||||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||||
The computed kernel matrix. | |||||
""" | |||||
# self._add_dummy_labels will modify the input in place. | |||||
self._add_dummy_labels() # For self._graphs | |||||
# Y = [g.copy() for g in Y] # @todo: ? | |||||
self._add_dummy_labels(Y) | |||||
# get all canonical keys of all graphs before computing kernels to save | |||||
# time, but this may cost a lot of memory for large dataset. | |||||
# Canonical keys for self._graphs. | |||||
try: | |||||
check_is_fitted(self, ['_canonkeys']) | |||||
canonkeys_list1 = self._canonkeys | |||||
except NotFittedError: | |||||
canonkeys_list1 = [] | |||||
iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
for g in iterator: | |||||
canonkeys_list1.append(self._get_canonkeys(g)) | |||||
if self.save_canonkeys: | |||||
self._canonkeys = canonkeys_list1 | |||||
# Canonical keys for Y. | |||||
canonkeys_list2 = [] | |||||
iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
for g in iterator: | |||||
canonkeys_list2.append(self._get_canonkeys(g)) | |||||
if self.save_canonkeys: | |||||
self._Y_canonkeys = canonkeys_list2 | |||||
# compute kernel matrix. | |||||
kernel_matrix = np.zeros((len(Y), len(canonkeys_list1))) | |||||
from itertools import product | |||||
itr = product(range(len(Y)), range(len(canonkeys_list1))) | |||||
len_itr = int(len(Y) * len(canonkeys_list1)) | |||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||||
length=len_itr, verbose=(self.verbose >= 2)) | |||||
for i_y, i_x in iterator: | |||||
kernel = self._kernel_do(canonkeys_list2[i_y], canonkeys_list1[i_x]) | |||||
kernel_matrix[i_y][i_x] = kernel | |||||
return kernel_matrix | |||||
def _compute_kernel_matrix_imap_unordered(self, Y): | |||||
"""Compute the kernel matrix between a given target graphs (Y) and | |||||
the fitted graphs (X / self._graphs) using imap unordered parallelization. | |||||
Parameters | |||||
---------- | |||||
Y : list of graphs, optional | |||||
The target graphs. | |||||
Returns | |||||
------- | |||||
kernel_matrix : numpy array, shape = [n_targets, n_inputs] | |||||
The computed kernel matrix. | |||||
""" | |||||
raise Exception('Parallelization for kernel matrix is not implemented.') | |||||
def pairwise_kernel(self, x, y, are_keys=False): | |||||
"""Compute pairwise kernel between two graphs. | |||||
Parameters | |||||
---------- | |||||
x, y : NetworkX Graph. | |||||
Graphs between which the kernel is computed. | |||||
are_keys : boolean, optional | |||||
If `True`, `x` and `y` are canonical keys; otherwise they are graphs. | |||||
The default is False. | |||||
Returns | |||||
------- | |||||
kernel: float | |||||
The computed kernel. | |||||
""" | |||||
if are_keys: | |||||
# x, y are canonical keys. | |||||
kernel = self._kernel_do(x, y) | |||||
else: | |||||
# x, y are graphs. | |||||
kernel = self._compute_single_kernel_series(x, y) | |||||
return kernel | |||||
def diagonals(self): | |||||
"""Compute the kernel matrix diagonals of the fit/transformed data. | |||||
Returns | |||||
------- | |||||
X_diag : numpy array | |||||
The diagonal of the kernel matrix between the fitted data. | |||||
This consists of each element calculated with itself. | |||||
Y_diag : numpy array | |||||
The diagonal of the kernel matrix, of the transform. | |||||
This consists of each element calculated with itself. | |||||
""" | |||||
# Check if method "fit" had been called. | |||||
check_is_fitted(self, ['_graphs']) | |||||
# Check if the diagonals of X exist. | |||||
try: | |||||
check_is_fitted(self, ['_X_diag']) | |||||
except NotFittedError: | |||||
# Compute diagonals of X. | |||||
self._X_diag = np.empty(shape=(len(self._graphs),)) | |||||
try: | |||||
check_is_fitted(self, ['_canonkeys']) | |||||
for i, x in enumerate(self._canonkeys): | |||||
self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=True) # @todo: parallel? | |||||
except NotFittedError: | |||||
for i, x in enumerate(self._graphs): | |||||
self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=False) # @todo: parallel? | |||||
try: | |||||
# If transform has happened, return both diagonals. | |||||
check_is_fitted(self, ['_Y']) | |||||
self._Y_diag = np.empty(shape=(len(self._Y),)) | |||||
try: | |||||
check_is_fitted(self, ['_Y_canonkeys']) | |||||
for (i, y) in enumerate(self._Y_canonkeys): | |||||
self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=True) # @todo: parallel? | |||||
except NotFittedError: | |||||
for (i, y) in enumerate(self._Y): | |||||
self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=False) # @todo: parallel? | |||||
return self._X_diag, self._Y_diag | |||||
except NotFittedError: | |||||
# Else just return X_diag. | |||||
return self._X_diag | |||||
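These diagonals are exactly what a cosine (unit-diagonal) normalization of the kernel matrix needs. A self-contained sketch of that normalization, independent of the class and following the [n_targets, n_inputs] shape convention used above; the library's own normalization step may be implemented differently.

```python
import numpy as np

def cosine_normalize(K, X_diag, Y_diag=None):
    """K[i, j] / sqrt(diag_Y[i] * diag_X[j]); rows index targets, columns index inputs."""
    row_diag = X_diag if Y_diag is None else Y_diag
    return K / np.sqrt(np.outer(row_diag, X_diag))

K = np.array([[4.0, 2.0],
              [2.0, 9.0]])
print(cosine_normalize(K, np.diag(K)))  # unit diagonal, off-diagonal 2/6
```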
########################################################################## | |||||
# The following is the 2nd paradigm to compute kernel matrix. It is | |||||
# simplified and not compatible with `scikit-learn`. | |||||
########################################################################## | |||||
def _compute_gm_series(self): | def _compute_gm_series(self): | ||||
@@ -43,10 +242,13 @@ class Treelet(GraphKernel): | |||||
# time, but this may cost a lot of memory for large dataset. | # time, but this may cost a lot of memory for large dataset. | ||||
canonkeys = [] | canonkeys = [] | ||||
iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout, | iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout, | ||||
verbose=(self._verbose >= 2)) | |||||
verbose=(self.verbose >= 2)) | |||||
for g in iterator: | for g in iterator: | ||||
canonkeys.append(self._get_canonkeys(g)) | canonkeys.append(self._get_canonkeys(g)) | ||||
if self.save_canonkeys: | |||||
self._canonkeys = canonkeys | |||||
# compute Gram matrix. | # compute Gram matrix. | ||||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | ||||
@@ -54,7 +256,7 @@ class Treelet(GraphKernel): | |||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | ||||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | ||||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | ||||
length=len_itr, verbose=(self._verbose >= 2)) | |||||
length=len_itr, verbose=(self.verbose >= 2)) | |||||
for i, j in iterator: | for i, j in iterator: | ||||
kernel = self._kernel_do(canonkeys[i], canonkeys[j]) | kernel = self._kernel_do(canonkeys[i], canonkeys[j]) | ||||
gram_matrix[i][j] = kernel | gram_matrix[i][j] = kernel | ||||
@@ -68,22 +270,25 @@ class Treelet(GraphKernel): | |||||
# get all canonical keys of all graphs before computing kernels to save | # get all canonical keys of all graphs before computing kernels to save | ||||
# time, but this may cost a lot of memory for large dataset. | # time, but this may cost a lot of memory for large dataset. | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
itr = zip(self._graphs, range(0, len(self._graphs))) | itr = zip(self._graphs, range(0, len(self._graphs))) | ||||
if len(self._graphs) < 100 * self._n_jobs: | |||||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||||
if len(self._graphs) < 100 * self.n_jobs: | |||||
chunksize = int(len(self._graphs) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
canonkeys = [[] for _ in range(len(self._graphs))] | canonkeys = [[] for _ in range(len(self._graphs))] | ||||
get_fun = self._wrapper_get_canonkeys | get_fun = self._wrapper_get_canonkeys | ||||
iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), | ||||
desc='getting canonkeys', file=sys.stdout, | desc='getting canonkeys', file=sys.stdout, | ||||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||||
length=len(self._graphs), verbose=(self.verbose >= 2)) | |||||
for i, ck in iterator: | for i, ck in iterator: | ||||
canonkeys[i] = ck | canonkeys[i] = ck | ||||
pool.close() | pool.close() | ||||
pool.join() | pool.join() | ||||
if self.save_canonkeys: | |||||
self._canonkeys = canonkeys | |||||
# compute Gram matrix. | # compute Gram matrix. | ||||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | ||||
@@ -92,25 +297,25 @@ class Treelet(GraphKernel): | |||||
G_canonkeys = canonkeys_toshare | G_canonkeys = canonkeys_toshare | ||||
do_fun = self._wrapper_kernel_do | do_fun = self._wrapper_kernel_do | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(canonkeys,), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
return gram_matrix | return gram_matrix | ||||
def _compute_kernel_list_series(self, g1, g_list): | def _compute_kernel_list_series(self, g1, g_list): | ||||
self._add_dummy_labels(g_list + [g1]) | |||||
# self._add_dummy_labels(g_list + [g1]) | |||||
# get all canonical keys of all graphs before computing kernels to save | # get all canonical keys of all graphs before computing kernels to save | ||||
# time, but this may cost a lot of memory for large dataset. | # time, but this may cost a lot of memory for large dataset. | ||||
canonkeys_1 = self._get_canonkeys(g1) | canonkeys_1 = self._get_canonkeys(g1) | ||||
canonkeys_list = [] | canonkeys_list = [] | ||||
iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self.verbose >= 2)) | |||||
for g in iterator: | for g in iterator: | ||||
canonkeys_list.append(self._get_canonkeys(g)) | canonkeys_list.append(self._get_canonkeys(g)) | ||||
# compute kernel list. | # compute kernel list. | ||||
kernel_list = [None] * len(g_list) | kernel_list = [None] * len(g_list) | ||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i in iterator: | for i in iterator: | ||||
kernel = self._kernel_do(canonkeys_1, canonkeys_list[i]) | kernel = self._kernel_do(canonkeys_1, canonkeys_list[i]) | ||||
kernel_list[i] = kernel | kernel_list[i] = kernel | ||||
@@ -125,16 +330,16 @@ class Treelet(GraphKernel): | |||||
# time, but this may cost a lot of memory for large dataset. | # time, but this may cost a lot of memory for large dataset. | ||||
canonkeys_1 = self._get_canonkeys(g1) | canonkeys_1 = self._get_canonkeys(g1) | ||||
canonkeys_list = [[] for _ in range(len(g_list))] | canonkeys_list = [[] for _ in range(len(g_list))] | ||||
pool = Pool(self._n_jobs) | |||||
pool = Pool(self.n_jobs) | |||||
itr = zip(g_list, range(0, len(g_list))) | itr = zip(g_list, range(0, len(g_list))) | ||||
if len(g_list) < 100 * self._n_jobs: | |||||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||||
if len(g_list) < 100 * self.n_jobs: | |||||
chunksize = int(len(g_list) / self.n_jobs) + 1 | |||||
else: | else: | ||||
chunksize = 100 | chunksize = 100 | ||||
get_fun = self._wrapper_get_canonkeys | get_fun = self._wrapper_get_canonkeys | ||||
iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), | iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), | ||||
desc='getting canonkeys', file=sys.stdout, | desc='getting canonkeys', file=sys.stdout, | ||||
length=len(g_list), verbose=(self._verbose >= 2)) | |||||
length=len(g_list), verbose=(self.verbose >= 2)) | |||||
for i, ck in iterator: | for i, ck in iterator: | ||||
canonkeys_list[i] = ck | canonkeys_list[i] = ck | ||||
pool.close() | pool.close() | ||||
@@ -154,7 +359,7 @@ class Treelet(GraphKernel): | |||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered', | init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered', | ||||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
return kernel_list | return kernel_list | ||||
@@ -164,13 +369,13 @@ class Treelet(GraphKernel): | |||||
def _compute_single_kernel_series(self, g1, g2): | def _compute_single_kernel_series(self, g1, g2): | ||||
self._add_dummy_labels([g1] + [g2]) | |||||
# self._add_dummy_labels([g1] + [g2]) | |||||
canonkeys_1 = self._get_canonkeys(g1) | canonkeys_1 = self._get_canonkeys(g1) | ||||
canonkeys_2 = self._get_canonkeys(g2) | canonkeys_2 = self._get_canonkeys(g2) | ||||
kernel = self._kernel_do(canonkeys_1, canonkeys_2) | kernel = self._kernel_do(canonkeys_1, canonkeys_2) | ||||
return kernel | return kernel | ||||
# @profile | |||||
def _kernel_do(self, canonkey1, canonkey2): | def _kernel_do(self, canonkey1, canonkey2): | ||||
"""Compute treelet graph kernel between 2 graphs. | """Compute treelet graph kernel between 2 graphs. | ||||
@@ -187,7 +392,24 @@ class Treelet(GraphKernel): | |||||
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | ||||
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | ||||
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) | vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) | ||||
kernel = self._sub_kernel(vector1, vector2) | |||||
# vector1, vector2 = [], [] | |||||
# keys1, keys2 = canonkey1, canonkey2 | |||||
# keys_searched = {} | |||||
# for k, v in canonkey1.items(): | |||||
# if k in keys2: | |||||
# vector1.append(v) | |||||
# vector2.append(canonkey2[k]) | |||||
# keys_searched[k] = v | |||||
# for k, v in canonkey2.items(): | |||||
# if k in keys1 and k not in keys_searched: | |||||
# vector1.append(canonkey1[k]) | |||||
# vector2.append(v) | |||||
# vector1, vector2 = np.array(vector1), np.array(vector2) | |||||
kernel = self.sub_kernel(vector1, vector2) | |||||
return kernel | return kernel | ||||
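`_kernel_do` therefore reduces to three steps: intersect the two canonical-key dictionaries, build the aligned count vectors, and apply the sub-kernel. A toy, self-contained restatement; the canonical keys and the linear sub-kernel here are hypothetical.

```python
import numpy as np

def treelet_kernel_from_counts(canonkey1, canonkey2, sub_kernel):
    """Intersect two treelet histograms and apply the sub-kernel to the counts."""
    keys = set(canonkey1) & set(canonkey2)          # treelets present in both graphs
    v1 = np.array([canonkey1[k] for k in keys], dtype=float)
    v2 = np.array([canonkey2[k] for k in keys], dtype=float)
    return sub_kernel(v1, v2)

ck1 = {'0': 5, '1': 4, '3star': 1}                  # hypothetical treelet histograms
ck2 = {'0': 4, '1': 3, '2': 2}
print(treelet_kernel_from_counts(ck1, ck2, lambda a, b: float(np.dot(a, b))))  # 32.0
```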
@@ -223,7 +445,7 @@ class Treelet(GraphKernel): | |||||
patterns['0'] = list(G.nodes()) | patterns['0'] = list(G.nodes()) | ||||
canonkey['0'] = nx.number_of_nodes(G) | canonkey['0'] = nx.number_of_nodes(G) | ||||
for i in range(1, 6): # for i in range(1, 6): | for i in range(1, 6): # for i in range(1, 6): | ||||
patterns[str(i)] = find_all_paths(G, i, self._ds_infos['directed']) | |||||
patterns[str(i)] = find_all_paths(G, i, self.ds_infos['directed']) | |||||
canonkey[str(i)] = len(patterns[str(i)]) | canonkey[str(i)] = len(patterns[str(i)]) | ||||
# n-star patterns | # n-star patterns | ||||
@@ -317,11 +539,11 @@ class Treelet(GraphKernel): | |||||
### pattern obtained in the structural analysis section above, which is a | ### pattern obtained in the structural analysis section above, which is a | ||||
### string corresponding to a unique treelet. A dictionary is built to keep | ### string corresponding to a unique treelet. A dictionary is built to keep | ||||
### track of the amount of every treelet. | ### track of the amount of every treelet. | ||||
if len(self._node_labels) > 0 or len(self._edge_labels) > 0: | |||||
if len(self.node_labels) > 0 or len(self.edge_labels) > 0: | |||||
canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet. | canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet. | ||||
# linear patterns | # linear patterns | ||||
canonkey_t = Counter(get_mlti_dim_node_attrs(G, self._node_labels)) | |||||
canonkey_t = Counter(get_mlti_dim_node_attrs(G, self.node_labels)) | |||||
for key in canonkey_t: | for key in canonkey_t: | ||||
canonkey_l[('0', key)] = canonkey_t[key] | canonkey_l[('0', key)] = canonkey_t[key] | ||||
@@ -330,9 +552,9 @@ class Treelet(GraphKernel): | |||||
for pattern in patterns[str(i)]: | for pattern in patterns[str(i)]: | ||||
canonlist = [] | canonlist = [] | ||||
for idx, node in enumerate(pattern[:-1]): | for idx, node in enumerate(pattern[:-1]): | ||||
canonlist.append(tuple(G.nodes[node][nl] for nl in self._node_labels)) | |||||
canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self._edge_labels)) | |||||
canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self._node_labels)) | |||||
canonlist.append(tuple(G.nodes[node][nl] for nl in self.node_labels)) | |||||
canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self.edge_labels)) | |||||
canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self.node_labels)) | |||||
canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1] | canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1] | ||||
treelet.append(tuple([str(i)] + canonkey_t)) | treelet.append(tuple([str(i)] + canonkey_t)) | ||||
canonkey_l.update(Counter(treelet)) | canonkey_l.update(Counter(treelet)) | ||||
@@ -343,13 +565,13 @@ class Treelet(GraphKernel): | |||||
for pattern in patterns[str(i) + 'star']: | for pattern in patterns[str(i) + 'star']: | ||||
canonlist = [] | canonlist = [] | ||||
for leaf in pattern[1:]: | for leaf in pattern[1:]: | ||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||||
canonlist.append(tuple((nlabels, elabels))) | canonlist.append(tuple((nlabels, elabels))) | ||||
canonlist.sort() | canonlist.sort() | ||||
canonlist = list(chain.from_iterable(canonlist)) | canonlist = list(chain.from_iterable(canonlist)) | ||||
canonkey_t = tuple(['d' if i == 5 else str(i * 2)] + | canonkey_t = tuple(['d' if i == 5 else str(i * 2)] + | ||||
[tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||||
[tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||||
+ canonlist) | + canonlist) | ||||
treelet.append(canonkey_t) | treelet.append(canonkey_t) | ||||
canonkey_l.update(Counter(treelet)) | canonkey_l.update(Counter(treelet)) | ||||
@@ -359,17 +581,17 @@ class Treelet(GraphKernel): | |||||
for pattern in patterns['7']: | for pattern in patterns['7']: | ||||
canonlist = [] | canonlist = [] | ||||
for leaf in pattern[1:3]: | for leaf in pattern[1:3]: | ||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||||
canonlist.append(tuple((nlabels, elabels))) | canonlist.append(tuple((nlabels, elabels))) | ||||
canonlist.sort() | canonlist.sort() | ||||
canonlist = list(chain.from_iterable(canonlist)) | canonlist = list(chain.from_iterable(canonlist)) | ||||
canonkey_t = tuple(['7'] | canonkey_t = tuple(['7'] | ||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist | |||||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] | |||||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]) | |||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist | |||||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] | |||||
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] | |||||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] | |||||
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)]) | |||||
treelet.append(canonkey_t) | treelet.append(canonkey_t) | ||||
canonkey_l.update(Counter(treelet)) | canonkey_l.update(Counter(treelet)) | ||||
@@ -378,38 +600,38 @@ class Treelet(GraphKernel): | |||||
for pattern in patterns['11']: | for pattern in patterns['11']: | ||||
canonlist = [] | canonlist = [] | ||||
for leaf in pattern[1:4]: | for leaf in pattern[1:4]: | ||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||||
canonlist.append(tuple((nlabels, elabels))) | canonlist.append(tuple((nlabels, elabels))) | ||||
canonlist.sort() | canonlist.sort() | ||||
canonlist = list(chain.from_iterable(canonlist)) | canonlist = list(chain.from_iterable(canonlist)) | ||||
canonkey_t = tuple(['b'] | canonkey_t = tuple(['b'] | ||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist | |||||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G[pattern[4]][pattern[0]][el] for el in self._edge_labels)] | |||||
+ [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)]) | |||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist | |||||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] | |||||
+ [tuple(G[pattern[4]][pattern[0]][el] for el in self.edge_labels)] | |||||
+ [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels)] | |||||
+ [tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)]) | |||||
treelet.append(canonkey_t) | treelet.append(canonkey_t) | ||||
canonkey_l.update(Counter(treelet)) | canonkey_l.update(Counter(treelet)) | ||||
# pattern 10 | # pattern 10 | ||||
treelet = [] | treelet = [] | ||||
for pattern in patterns['10']: | for pattern in patterns['10']: | ||||
canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels), | |||||
tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)] | |||||
canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels), | |||||
tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)] | |||||
canonlist = [] | canonlist = [] | ||||
for leaf in pattern[1:3]: | for leaf in pattern[1:3]: | ||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||||
canonlist.append(tuple((nlabels, elabels))) | canonlist.append(tuple((nlabels, elabels))) | ||||
canonlist.sort() | canonlist.sort() | ||||
canonkey0 = list(chain.from_iterable(canonlist)) | canonkey0 = list(chain.from_iterable(canonlist)) | ||||
canonkey_t = tuple(['a'] | canonkey_t = tuple(['a'] | ||||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)] | |||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] | |||||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] | |||||
+ [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] | |||||
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)] | |||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||||
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)] | |||||
+ canonkey4 + canonkey0) | + canonkey4 + canonkey0) | ||||
treelet.append(canonkey_t) | treelet.append(canonkey_t) | ||||
canonkey_l.update(Counter(treelet)) | canonkey_l.update(Counter(treelet)) | ||||
@@ -419,15 +641,15 @@ class Treelet(GraphKernel): | |||||
for pattern in patterns['12']: | for pattern in patterns['12']: | ||||
canonlist0 = [] | canonlist0 = [] | ||||
for leaf in pattern[1:3]: | for leaf in pattern[1:3]: | ||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) | |||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||||
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) | |||||
canonlist0.append(tuple((nlabels, elabels))) | canonlist0.append(tuple((nlabels, elabels))) | ||||
canonlist0.sort() | canonlist0.sort() | ||||
canonlist0 = list(chain.from_iterable(canonlist0)) | canonlist0 = list(chain.from_iterable(canonlist0)) | ||||
canonlist3 = [] | canonlist3 = [] | ||||
for leaf in pattern[4:6]: | for leaf in pattern[4:6]: | ||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) | |||||
elabels = tuple(G[leaf][pattern[3]][el] for el in self._edge_labels) | |||||
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) | |||||
elabels = tuple(G[leaf][pattern[3]][el] for el in self.edge_labels) | |||||
canonlist3.append(tuple((nlabels, elabels))) | canonlist3.append(tuple((nlabels, elabels))) | ||||
canonlist3.sort() | canonlist3.sort() | ||||
canonlist3 = list(chain.from_iterable(canonlist3)) | canonlist3 = list(chain.from_iterable(canonlist3)) | ||||
@@ -435,14 +657,14 @@ class Treelet(GraphKernel): | |||||
# 2 possible key can be generated from 2 nodes with extended label 3, | # 2 possible key can be generated from 2 nodes with extended label 3, | ||||
# select the one with lower lexicographic order. | # select the one with lower lexicographic order. | ||||
canonkey_t1 = tuple(['c'] | canonkey_t1 = tuple(['c'] | ||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0 | |||||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] | |||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist0 | |||||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] | |||||
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] | |||||
+ canonlist3) | + canonlist3) | ||||
canonkey_t2 = tuple(['c'] | canonkey_t2 = tuple(['c'] | ||||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3 | |||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] | |||||
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] + canonlist3 | |||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||||
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)] | |||||
+ canonlist0) | + canonlist0) | ||||
treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) | treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) | ||||
canonkey_l.update(Counter(treelet)) | canonkey_l.update(Counter(treelet)) | ||||
@@ -450,24 +672,24 @@ class Treelet(GraphKernel): | |||||
# pattern 9 | # pattern 9 | ||||
treelet = [] | treelet = [] | ||||
for pattern in patterns['9']: | for pattern in patterns['9']: | ||||
canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels), | |||||
tuple(G[pattern[4]][pattern[2]][el] for el in self._edge_labels)] | |||||
canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels), | |||||
tuple(G[pattern[5]][pattern[3]][el] for el in self._edge_labels)] | |||||
prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self._node_labels), | |||||
tuple(G[pattern[2]][pattern[0]][el] for el in self._edge_labels)] | |||||
prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels), | |||||
tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] | |||||
canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels), | |||||
tuple(G[pattern[4]][pattern[2]][el] for el in self.edge_labels)] | |||||
canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels), | |||||
tuple(G[pattern[5]][pattern[3]][el] for el in self.edge_labels)] | |||||
prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self.node_labels), | |||||
tuple(G[pattern[2]][pattern[0]][el] for el in self.edge_labels)] | |||||
prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels), | |||||
tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] | |||||
if prekey2 + canonkey2 < prekey3 + canonkey3: | if prekey2 + canonkey2 < prekey3 + canonkey3: | ||||
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ | |||||
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ | |||||
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \ | |||||
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \ | |||||
+ prekey2 + prekey3 + canonkey2 + canonkey3 | + prekey2 + prekey3 + canonkey2 + canonkey3 | ||||
else: | else: | ||||
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ | |||||
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ | |||||
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \ | |||||
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \ | |||||
+ prekey3 + prekey2 + canonkey3 + canonkey2 | + prekey3 + prekey2 + canonkey3 + canonkey2 | ||||
treelet.append(tuple(['9'] | treelet.append(tuple(['9'] | ||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] | |||||
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] | |||||
+ canonkey_t)) | + canonkey_t)) | ||||
canonkey_l.update(Counter(treelet)) | canonkey_l.update(Counter(treelet)) | ||||
@@ -482,12 +704,33 @@ class Treelet(GraphKernel): | |||||
return i, self._get_canonkeys(g) | return i, self._get_canonkeys(g) | ||||
def _add_dummy_labels(self, Gn): | |||||
if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): | |||||
for i in range(len(Gn)): | |||||
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||||
self._node_labels = [SpecialLabel.DUMMY] | |||||
if len(self._edge_labels) == 0 or (len(self._edge_labels) == 1 and self._edge_labels[0] == SpecialLabel.DUMMY): | |||||
for i in range(len(Gn)): | |||||
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||||
self._edge_labels = [SpecialLabel.DUMMY] | |||||
def _add_dummy_labels(self, Gn=None): | |||||
def _add_dummy(Gn): | |||||
if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY): | |||||
for i in range(len(Gn)): | |||||
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||||
self.node_labels = [SpecialLabel.DUMMY] | |||||
if len(self.edge_labels) == 0 or (len(self.edge_labels) == 1 and self.edge_labels[0] == SpecialLabel.DUMMY): | |||||
for i in range(len(Gn)): | |||||
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||||
self.edge_labels = [SpecialLabel.DUMMY] | |||||
if Gn is None or Gn is self._graphs: | |||||
# Add dummy labels for the copy of self._graphs. | |||||
try: | |||||
check_is_fitted(self, ['_dummy_labels_considered']) | |||||
if not self._dummy_labels_considered: | |||||
Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs] | |||||
_add_dummy(Gn) | |||||
self._graphs = Gn | |||||
self._dummy_labels_considered = True | |||||
except NotFittedError: | |||||
Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs] | |||||
_add_dummy(Gn) | |||||
self._graphs = Gn | |||||
self._dummy_labels_considered = True | |||||
else: | |||||
# Add dummy labels for the input. | |||||
_add_dummy(Gn) | |||||
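The dummy-label step simply gives every node and edge of an unlabeled graph the same constant label, so the canonical keys stay well defined. A minimal sketch with a stand-in for SpecialLabel.DUMMY:

```python
import networkx as nx

DUMMY = 'dummy'            # stand-in for SpecialLabel.DUMMY

G = nx.path_graph(3)       # a graph with no labels at all
nx.set_node_attributes(G, '0', DUMMY)   # every node gets label '0'
nx.set_edge_attributes(G, '0', DUMMY)   # every edge gets label '0'
print(list(G.nodes(data=True)))  # [(0, {'dummy': '0'}), (1, {'dummy': '0'}), (2, {'dummy': '0'})]
```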
@@ -14,30 +14,48 @@ Created on Tue Apr 14 15:16:34 2020 | |||||
import numpy as np | import numpy as np | ||||
import networkx as nx | import networkx as nx | ||||
import sys | |||||
from collections import Counter | from collections import Counter | ||||
# from functools import partial | # from functools import partial | ||||
from itertools import combinations_with_replacement | |||||
from gklearn.utils import SpecialLabel | from gklearn.utils import SpecialLabel | ||||
from gklearn.utils.parallel import parallel_gm, parallel_me | from gklearn.utils.parallel import parallel_gm, parallel_me | ||||
from gklearn.kernels import GraphKernel | from gklearn.kernels import GraphKernel | ||||
from gklearn.utils.iters import get_iters | |||||
class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | ||||
def __init__(self, **kwargs): | def __init__(self, **kwargs): | ||||
GraphKernel.__init__(self) | GraphKernel.__init__(self) | ||||
self._node_labels = kwargs.get('node_labels', []) | |||||
self._edge_labels = kwargs.get('edge_labels', []) | |||||
self._height = int(kwargs.get('height', 0)) | |||||
self.node_labels = kwargs.get('node_labels', []) | |||||
self.edge_labels = kwargs.get('edge_labels', []) | |||||
self.height = int(kwargs.get('height', 0)) | |||||
self._base_kernel = kwargs.get('base_kernel', 'subtree') | self._base_kernel = kwargs.get('base_kernel', 'subtree') | ||||
self._ds_infos = kwargs.get('ds_infos', {}) | self._ds_infos = kwargs.get('ds_infos', {}) | ||||
########################################################################## | |||||
# The following is the 1st paradigm to compute kernel matrix, which is | |||||
# compatible with `scikit-learn`. | |||||
# ------------------------------------------------------------------- | |||||
# Special thanks to the "GraKeL" library for providing an excellent template! | |||||
########################################################################## | |||||
########################################################################## | |||||
# The following is the 2nd paradigm to compute kernel matrix. It is | |||||
# simplified and not compatible with `scikit-learn`. | |||||
########################################################################## | |||||
def _compute_gm_series(self): | def _compute_gm_series(self): | ||||
# if self._verbose >= 2: | |||||
# if self.verbose >= 2: | |||||
# import warnings | # import warnings | ||||
# warnings.warn('A part of the computation is parallelized.') | # warnings.warn('A part of the computation is parallelized.') | ||||
self._add_dummy_node_labels(self._graphs) | |||||
# self._add_dummy_node_labels(self._graphs) | |||||
# for WL subtree kernel | # for WL subtree kernel | ||||
if self._base_kernel == 'subtree': | if self._base_kernel == 'subtree': | ||||
@@ -59,7 +77,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
def _compute_gm_imap_unordered(self): | def _compute_gm_imap_unordered(self): | ||||
self._add_dummy_node_labels(self._graphs) | |||||
# self._add_dummy_node_labels(self._graphs) | |||||
if self._base_kernel == 'subtree': | if self._base_kernel == 'subtree': | ||||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | ||||
@@ -74,17 +92,17 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
G_gn = gn_toshare | G_gn = gn_toshare | ||||
do_fun = self._wrapper_pairwise | do_fun = self._wrapper_pairwise | ||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | ||||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) | |||||
return gram_matrix | return gram_matrix | ||||
else: | else: | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') | warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') | ||||
return self._compute_gm_series() | return self._compute_gm_series() | ||||
def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better. | def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better. | ||||
# if self._verbose >= 2: | |||||
# if self.verbose >= 2: | |||||
# import warnings | # import warnings | ||||
# warnings.warn('A part of the computation is parallelized.') | # warnings.warn('A part of the computation is parallelized.') | ||||
@@ -126,10 +144,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
len_itr = len(g_list) | len_itr = len(g_list) | ||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | ||||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | ||||
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) | |||||
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) | |||||
return kernel_list | return kernel_list | ||||
else: | else: | ||||
if self._verbose >= 2: | |||||
if self.verbose >= 2: | |||||
import warnings | import warnings | ||||
warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') | warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') | ||||
return self._compute_kernel_list_series(g1, g_list) | return self._compute_kernel_list_series(g1, g_list) | ||||
@@ -160,6 +178,30 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
return gram_matrix[0][1] | return gram_matrix[0][1] | ||||
########################################################################## | |||||
# The following are the methods used by both paradigms. | |||||
########################################################################## | |||||
def validate_parameters(self): | |||||
"""Validate all parameters for the transformer. | |||||
Returns | |||||
------- | |||||
None. | |||||
""" | |||||
super().validate_parameters() | |||||
if len(self.node_labels) == 0: | |||||
if len(self.edge_labels) == 0: | |||||
self._subtree_kernel_do = self._subtree_kernel_do_unlabeled | |||||
else: | |||||
self._subtree_kernel_do = self._subtree_kernel_do_el | |||||
else: | |||||
if len(self.edge_labels) == 0: | |||||
self._subtree_kernel_do = self._subtree_kernel_do_nl | |||||
else: | |||||
self._subtree_kernel_do = self._subtree_kernel_do_labeled | |||||
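Binding one specialized routine per label configuration at validation time keeps the label-presence checks out of the per-node and per-edge hot loops. A standalone sketch of the same dispatch; the routine names are the ones introduced in this diff.

```python
def pick_subtree_routine(node_labels, edge_labels):
    """Mirror of the dispatch above: pick the WL subtree variant by label presence."""
    if len(node_labels) == 0:
        return '_subtree_kernel_do_unlabeled' if len(edge_labels) == 0 else '_subtree_kernel_do_el'
    return '_subtree_kernel_do_nl' if len(edge_labels) == 0 else '_subtree_kernel_do_labeled'

print(pick_subtree_routine([], ['bond_type']))  # _subtree_kernel_do_el
print(pick_subtree_routine(['atom'], []))       # _subtree_kernel_do_nl
```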
def pairwise_kernel(self, g1, g2): | def pairwise_kernel(self, g1, g2): | ||||
Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster! | Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster! | ||||
@@ -172,9 +214,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
for G in Gn: | for G in Gn: | ||||
# set all labels into a tuple. | # set all labels into a tuple. | ||||
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | ||||
G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels) | |||||
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||||
# get the set of original labels | # get the set of original labels | ||||
labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||||
labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||||
# number of occurence of each label in G | # number of occurence of each label in G | ||||
all_num_of_each_label.append(dict(Counter(labels_ori))) | all_num_of_each_label.append(dict(Counter(labels_ori))) | ||||
@@ -182,22 +224,22 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
kernel = self._compute_kernel_itr(kernel, all_num_of_each_label) | kernel = self._compute_kernel_itr(kernel, all_num_of_each_label) | ||||
# iterate each height | # iterate each height | ||||
for h in range(1, self._height + 1): | |||||
for h in range(1, self.height + 1): | |||||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | ||||
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | ||||
# all_labels_ori = set() # all unique orignal labels in all graphs in this iteration | # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration | ||||
all_num_of_each_label = [] # number of occurence of each label in G | all_num_of_each_label = [] # number of occurence of each label in G | ||||
# @todo: parallel this part. | # @todo: parallel this part. | ||||
for idx, G in enumerate(Gn): | |||||
for G in Gn: | |||||
all_multisets = [] | all_multisets = [] | ||||
for node, attrs in G.nodes(data=True): | for node, attrs in G.nodes(data=True): | ||||
# Multiset-label determination. | # Multiset-label determination. | ||||
multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]] | |||||
multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] | |||||
# sorting each multiset | # sorting each multiset | ||||
multiset.sort() | multiset.sort() | ||||
multiset = [attrs['label_tuple']] + multiset # add the prefix | |||||
multiset = [attrs['lt']] + multiset # add the prefix | |||||
all_multisets.append(tuple(multiset)) | all_multisets.append(tuple(multiset)) | ||||
# label compression | # label compression | ||||
@@ -208,19 +250,19 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
# else assign the number of labels occured + 1 as the compressed label. | # else assign the number of labels occured + 1 as the compressed label. | ||||
for value in set_unique: | for value in set_unique: | ||||
if value in all_set_compressed.keys(): | if value in all_set_compressed.keys(): | ||||
set_compressed.update({value: all_set_compressed[value]}) | |||||
set_compressed[value] = all_set_compressed[value] | |||||
else: | else: | ||||
set_compressed.update({value: str(num_of_labels_occured + 1)}) | |||||
set_compressed[value] = str(num_of_labels_occured + 1) | |||||
num_of_labels_occured += 1 | num_of_labels_occured += 1 | ||||
all_set_compressed.update(set_compressed) | all_set_compressed.update(set_compressed) | ||||
# relabel nodes | # relabel nodes | ||||
for idx, node in enumerate(G.nodes()): | for idx, node in enumerate(G.nodes()): | ||||
G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]] | |||||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||||
# get the set of compressed labels | # get the set of compressed labels | ||||
labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||||
# all_labels_ori.update(labels_comp) | # all_labels_ori.update(labels_comp) | ||||
all_num_of_each_label.append(dict(Counter(labels_comp))) | all_num_of_each_label.append(dict(Counter(labels_comp))) | ||||
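One WL iteration, as in the loop above, builds a sorted neighbour-label multiset per node, compresses each distinct multiset to a new short label, and relabels the nodes. A self-contained sketch on a toy labeled path graph; the 'lt' attribute name matches this diff, while the toy labels are hypothetical.

```python
import networkx as nx
from collections import Counter

G = nx.path_graph(3)
nx.set_node_attributes(G, {0: ('C',), 1: ('O',), 2: ('C',)}, 'lt')

compressed, num_seen = {}, 0
multisets = []
for node, attrs in G.nodes(data=True):
    ms = sorted(G.nodes[nb]['lt'] for nb in G[node])   # neighbour labels, sorted
    multisets.append(tuple([attrs['lt']] + ms))         # prefix with the node's own label
for node, ms in zip(G.nodes(), multisets):               # label compression + relabeling
    if ms not in compressed:
        num_seen += 1
        compressed[ms] = str(num_seen)
    G.nodes[node]['lt'] = compressed[ms]

print(Counter(nx.get_node_attributes(G, 'lt').values()))  # Counter({'1': 2, '2': 1})
```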
@@ -249,8 +291,8 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
return kernel | return kernel | ||||
def _subtree_kernel_do(self, Gn): | |||||
"""Compute Weisfeiler-Lehman kernels between graphs. | |||||
def _subtree_kernel_do_nl(self, Gn): | |||||
"""Compute Weisfeiler-Lehman kernels between graphs with node labels. | |||||
Parameters | Parameters | ||||
---------- | ---------- | ||||
@@ -268,12 +310,16 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | ||||
# for each graph | # for each graph | ||||
for G in Gn: | |||||
# set all labels into a tuple. | |||||
if self.verbose >= 2: | |||||
iterator = get_iters(Gn, desc='Setting all labels into a tuple') | |||||
else: | |||||
iterator = Gn | |||||
for G in iterator: | |||||
# set all labels into a tuple. # @todo: remove this original labels or not? | |||||
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | ||||
G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels) | |||||
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||||
# get the set of original labels | # get the set of original labels | ||||
labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||||
labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||||
# number of occurence of each label in G | # number of occurence of each label in G | ||||
all_num_of_each_label.append(dict(Counter(labels_ori))) | all_num_of_each_label.append(dict(Counter(labels_ori))) | ||||
@@ -281,74 +327,398 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | self._compute_gram_itr(gram_matrix, all_num_of_each_label) | ||||
# iterate each height | # iterate each height | ||||
for h in range(1, self._height + 1): | |||||
for h in range(1, self.height + 1): | |||||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | ||||
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | ||||
# all_labels_ori = set() # all unique orignal labels in all graphs in this iteration | # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration | ||||
all_num_of_each_label = [] # number of occurence of each label in G | all_num_of_each_label = [] # number of occurence of each label in G | ||||
# @todo: parallel this part. | # @todo: parallel this part. | ||||
for idx, G in enumerate(Gn): | |||||
# if self.verbose >= 2: | |||||
# iterator = get_iters(enumerate(Gn), desc='Going through iteration ' + str(h), length=len(Gn)) | |||||
# else: | |||||
# iterator = enumerate(Gn) | |||||
for G in Gn: | |||||
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||||
all_multisets = [] | |||||
for node, attrs in G.nodes(data=True): | |||||
# Multiset-label determination. | |||||
multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]] | |||||
# sorting each multiset | |||||
multiset.sort() | |||||
multiset = [attrs['label_tuple']] + multiset # add the prefix | |||||
all_multisets.append(tuple(multiset)) | |||||
# Compute subtree kernel with h iterations and add it to the final kernel | |||||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
# label compression | |||||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||||
# a dictionary mapping original labels to new ones. | |||||
set_compressed = {} | |||||
# if a label occured before, assign its former compressed label, | |||||
# else assign the number of labels occured + 1 as the compressed label. | |||||
for value in set_unique: | |||||
if value in all_set_compressed.keys(): | |||||
set_compressed.update({value: all_set_compressed[value]}) | |||||
else: | |||||
set_compressed.update({value: str(num_of_labels_occured + 1)}) | |||||
num_of_labels_occured += 1 | |||||
return gram_matrix | |||||
all_set_compressed.update(set_compressed) | |||||
# relabel nodes | |||||
for idx, node in enumerate(G.nodes()): | |||||
G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]] | |||||
def _subtree_kernel_do_el(self, Gn): | |||||
"""Compute Weisfeiler-Lehman kernels between graphs with edge labels. | |||||
# get the set of compressed labels | |||||
labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values()) | |||||
# all_labels_ori.update(labels_comp) | |||||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||||
Parameters | |||||
---------- | |||||
Gn : List of NetworkX graph | |||||
List of graphs between which the kernels are computed. | |||||
# Compute subtree kernel with h iterations and add it to the final kernel | |||||
Return | |||||
------ | |||||
gram_matrix : Numpy matrix | |||||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs. | |||||
""" | |||||
gram_matrix = np.zeros((len(Gn), len(Gn))) | |||||
# initial for height = 0 | |||||
all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration | |||||
# Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||||
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||||
for i, j in iterator: | |||||
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||||
gram_matrix[j][i] = gram_matrix[i][j] | |||||
# if h >= 1. | |||||
if self.height > 0: | |||||
# Set all edge labels into a tuple. # @todo: remove this original labels or not? | |||||
if self.verbose >= 2: | |||||
iterator = get_iters(Gn, desc='Setting all labels into a tuple') | |||||
else: | |||||
iterator = Gn | |||||
for G in iterator: | |||||
for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way. | |||||
G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels) | |||||
# When h == 1, compute the kernel. | |||||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||||
all_num_of_each_label = [] # number of occurence of each label in G | |||||
# @todo: parallel this part. | |||||
for G in Gn: | |||||
num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
# Iterate along heights (>= 2). | |||||
for h in range(2, self.height + 1): | |||||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||||
all_num_of_each_label = [] # number of occurence of each label in G | |||||
# @todo: parallel this part. | |||||
for G in Gn: | |||||
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | self._compute_gram_itr(gram_matrix, all_num_of_each_label) | ||||
return gram_matrix | return gram_matrix | ||||
def _subtree_kernel_do_labeled(self, Gn): | |||||
"""Compute Weisfeiler-Lehman kernels between graphs with both node and | |||||
edge labels. | |||||
Parameters | |||||
---------- | |||||
Gn : List of NetworkX graph | |||||
List of graphs between which the kernels are computed. | |||||
Return | |||||
------ | |||||
gram_matrix : Numpy matrix | |||||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
""" | |||||
gram_matrix = np.zeros((len(Gn), len(Gn))) | |||||
# initial for height = 0 | |||||
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||||
# Set all node labels into a tuple and get # of occurence of each label. | |||||
if self.verbose >= 2: | |||||
iterator = get_iters(Gn, desc='Setting all node labels into a tuple') | |||||
else: | |||||
iterator = Gn | |||||
for G in iterator: | |||||
# Set all node labels into a tuple. # @todo: remove this original labels or not? | |||||
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. | |||||
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) | |||||
# Get the set of original labels. | |||||
labels_ori = list(nx.get_node_attributes(G, 'lt').values()) | |||||
# number of occurence of each label in G | |||||
all_num_of_each_label.append(dict(Counter(labels_ori))) | |||||
# Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
# if h >= 1. | |||||
if self.height > 0: | |||||
# Set all edge labels into a tuple. # @todo: remove this original labels or not? | |||||
if self.verbose >= 2: | |||||
iterator = get_iters(Gn, desc='Setting all edge labels into a tuple') | |||||
else: | |||||
iterator = Gn | |||||
for G in iterator: | |||||
for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way. | |||||
G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels) | |||||
# When h == 1, compute the kernel. | |||||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||||
all_num_of_each_label = [] # number of occurence of each label in G | |||||
# @todo: parallel this part. | |||||
for G in Gn: | |||||
num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
# Iterate along heights. | |||||
for h in range(2, self.height + 1): | |||||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||||
all_num_of_each_label = [] # number of occurence of each label in G | |||||
# @todo: parallel this part. | |||||
for G in Gn: | |||||
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
return gram_matrix | |||||
def _subtree_kernel_do_unlabeled(self, Gn): | |||||
"""Compute Weisfeiler-Lehman kernels between graphs without labels. | |||||
Parameters | |||||
---------- | |||||
Gn : List of NetworkX graph | |||||
List of graphs between which the kernels are computed. | |||||
Return | |||||
------ | |||||
gram_matrix : Numpy matrix | |||||
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
""" | |||||
gram_matrix = np.zeros((len(Gn), len(Gn))) | |||||
# initial for height = 0 | |||||
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration | |||||
# Compute subtree kernel with the 0th iteration and add it to the final kernel. | |||||
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) | |||||
for i, j in iterator: | |||||
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) | |||||
gram_matrix[j][i] = gram_matrix[i][j] | |||||
# if h >= 1. | |||||
if self.height > 0: | |||||
# When h == 1, compute the kernel. | |||||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||||
all_num_of_each_label = [] # number of occurence of each label in G | |||||
# @todo: parallel this part. | |||||
for G in Gn: | |||||
num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
# Iterate along heights (>= 2). | |||||
for h in range(2, self.height + 1): | |||||
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration | |||||
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs | |||||
all_num_of_each_label = [] # number of occurence of each label in G | |||||
# @todo: parallel this part. | |||||
for G in Gn: | |||||
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) | |||||
# Compute subtree kernel with h iterations and add it to the final kernel. | |||||
self._compute_gram_itr(gram_matrix, all_num_of_each_label) | |||||
return gram_matrix | |||||
def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||||
all_multisets = [] | |||||
for node, attrs in G.nodes(data=True): | |||||
# Multiset-label determination. | |||||
multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] | |||||
# sorting each multiset | |||||
multiset.sort() | |||||
multiset = [attrs['lt']] + multiset # add the prefix | |||||
all_multisets.append(tuple(multiset)) | |||||
# label compression | |||||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||||
# a dictionary mapping original labels to new ones. | |||||
set_compressed = {} | |||||
# If a label occured before, assign its former compressed label; | |||||
# otherwise assign the number of labels occured + 1 as the | |||||
# compressed label. | |||||
for value in set_unique: | |||||
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||||
set_compressed[value] = all_set_compressed[value] | |||||
else: | |||||
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? and what if num_of_labels_occured is extremely big. | |||||
num_of_labels_occured += 1 | |||||
all_set_compressed.update(set_compressed) | |||||
# Relabel nodes. | |||||
for idx, node in enumerate(G.nodes()): | |||||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||||
# Get the set of compressed labels. | |||||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||||
return num_of_labels_occured | |||||
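# Illustrative example of one relabeling pass above (not part of the library;
# graph and labels chosen only for this sketch): for a path graph 0-1-2 whose
# nodes carry the tuple labels ('C',), ('O',), ('C',), the multisets built
# above are (('C',), ('O',)) for nodes 0 and 2 and (('O',), ('C',), ('C',))
# for node 1, so nodes 0 and 2 receive one compressed label and node 1
# another; all_num_of_each_label then gets a dict of the form {'1': 2, '2': 1}
# (the exact label strings depend on num_of_labels_occured and on set
# iteration order).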
def _subtree_1graph_el(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||||
all_multisets = [] | |||||
# for node, attrs in G.nodes(data=True): | |||||
for node in G.nodes(): | |||||
# Multiset-label determination. | |||||
multiset = [G.edges[(node, neighbors)]['lt'] for neighbors in G[node]] # @todo: check reference for this. | |||||
# sorting each multiset | |||||
multiset.sort() | |||||
# multiset = [attrs['lt']] + multiset # add the prefix | |||||
all_multisets.append(tuple(multiset)) | |||||
# label compression | |||||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||||
# a dictionary mapping original labels to new ones. | |||||
set_compressed = {} | |||||
# If a label occured before, assign its former compressed label; | |||||
# otherwise assign the number of labels occured + 1 as the | |||||
# compressed label. | |||||
for value in set_unique: | |||||
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||||
set_compressed[value] = all_set_compressed[value] | |||||
else: | |||||
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? | |||||
num_of_labels_occured += 1 | |||||
all_set_compressed.update(set_compressed) | |||||
# Relabel nodes. | |||||
for idx, node in enumerate(G.nodes()): | |||||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||||
# Get the set of compressed labels. | |||||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) # @todo: maybe can be faster. | |||||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||||
return num_of_labels_occured | |||||
def _subtree_1graph_labeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||||
all_multisets = [] | |||||
for node, attrs in G.nodes(data=True): | |||||
# Multiset-label determination. | |||||
multiset = [tuple((G.edges[(node, neighbors)]['lt'], G.nodes[neighbors]['lt'])) for neighbors in G[node]] # @todo: check reference for this. | |||||
# sorting each multiset | |||||
multiset.sort() | |||||
multiset = [attrs['lt']] + multiset # add the prefix | |||||
all_multisets.append(tuple(multiset)) | |||||
# label compression | |||||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||||
# a dictionary mapping original labels to new ones. | |||||
set_compressed = {} | |||||
# If a label occured before, assign its former compressed label; | |||||
# otherwise assign the number of labels occured + 1 as the | |||||
# compressed label. | |||||
for value in set_unique: | |||||
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||||
set_compressed[value] = all_set_compressed[value] | |||||
else: | |||||
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? | |||||
num_of_labels_occured += 1 | |||||
all_set_compressed.update(set_compressed) | |||||
# Relabel nodes. | |||||
for idx, node in enumerate(G.nodes()): | |||||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||||
# Get the set of compressed labels. | |||||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||||
return num_of_labels_occured | |||||
def _subtree_1graph_unlabeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): | |||||
# all_multisets = [] | |||||
# for node, attrs in G.nodes(data=True): # @todo: it can be better. | |||||
# # Multiset-label determination. | |||||
# multiset = [0 for neighbors in G[node]] | |||||
# # sorting each multiset | |||||
# multiset.sort() | |||||
# multiset = [0] + multiset # add the prefix | |||||
# all_multisets.append(tuple(multiset)) | |||||
all_multisets = [len(G[node]) for node in G.nodes()] | |||||
# label compression | |||||
set_unique = list(set(all_multisets)) # set of unique multiset labels | |||||
# a dictionary mapping original labels to new ones. | |||||
set_compressed = {} | |||||
# If a label occured before, assign its former compressed label; | |||||
# otherwise assign the number of labels occured + 1 as the | |||||
# compressed label. | |||||
for value in set_unique: | |||||
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? | |||||
set_compressed[value] = all_set_compressed[value] | |||||
else: | |||||
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? | |||||
num_of_labels_occured += 1 | |||||
all_set_compressed.update(set_compressed) | |||||
# Relabel nodes. | |||||
for idx, node in enumerate(G.nodes()): | |||||
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] | |||||
# Get the set of compressed labels. | |||||
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) | |||||
all_num_of_each_label.append(dict(Counter(labels_comp))) | |||||
return num_of_labels_occured | |||||
def _compute_gram_itr(self, gram_matrix, all_num_of_each_label): | def _compute_gram_itr(self, gram_matrix, all_num_of_each_label): | ||||
"""Compute Gram matrix using the base kernel. | """Compute Gram matrix using the base kernel. | ||||
""" | """ | ||||
# if self._parallel == 'imap_unordered': | |||||
# if self.parallel == 'imap_unordered': | |||||
# # compute kernels. | # # compute kernels. | ||||
# def init_worker(alllabels_toshare): | # def init_worker(alllabels_toshare): | ||||
# global G_alllabels | # global G_alllabels | ||||
# G_alllabels = alllabels_toshare | # G_alllabels = alllabels_toshare | ||||
# do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix) | # do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix) | ||||
# parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker, | # parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker, | ||||
# glbv=(all_num_of_each_label,), n_jobs=self.n_jobs, verbose=self.verbose)
# elif self.parallel is None:
itr = combinations_with_replacement(range(0, len(gram_matrix)), 2)
len_itr = int(len(gram_matrix) * (len(gram_matrix) + 1) / 2)
iterator = get_iters(itr, desc='Computing Gram matrix for this iteration', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2))
for i, j in iterator:
# for i in iterator:
# for j in range(i, len(gram_matrix)):
gram_matrix[i][j] += self._compute_subtree_kernel(all_num_of_each_label[i],
all_num_of_each_label[j])
gram_matrix[j][i] = gram_matrix[i][j]
def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2):
"""Compute the subtree kernel. | """Compute the subtree kernel. | ||||
""" | """ | ||||
labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys())) | labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys())) | ||||
@@ -358,7 +728,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
vector2 = np.array([(num_of_each_label2[label] | vector2 = np.array([(num_of_each_label2[label] | ||||
if (label in num_of_each_label2.keys()) else 0) | if (label in num_of_each_label2.keys()) else 0) | ||||
for label in labels]) | for label in labels]) | ||||
kernel += np.dot(vector1, vector2) | |||||
kernel = np.dot(vector1, vector2) | |||||
return kernel | return kernel | ||||
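# A small worked example of the base kernel above (illustrative only): for the
# label histograms num_of_each_label1 = {'1': 2, '2': 1} and
# num_of_each_label2 = {'1': 1, '3': 4}, the union of labels is {'1', '2', '3'},
# the count vectors are [2, 1, 0] and [1, 0, 4], and the returned kernel value
# is 2*1 + 1*0 + 0*4 = 2.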
@@ -426,9 +796,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | ||||
for value in set_unique: | for value in set_unique: | ||||
if value in all_set_compressed.keys(): | if value in all_set_compressed.keys(): | ||||
set_compressed.update({ value : all_set_compressed[value] }) | |||||
set_compressed[value] = all_set_compressed[value] | |||||
else: | else: | ||||
set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||||
set_compressed[value] = str(num_of_labels_occured + 1) | |||||
num_of_labels_occured += 1 | num_of_labels_occured += 1 | ||||
all_set_compressed.update(set_compressed) | all_set_compressed.update(set_compressed) | ||||
@@ -504,9 +874,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | ||||
for value in set_unique: | for value in set_unique: | ||||
if value in all_set_compressed.keys(): | if value in all_set_compressed.keys(): | ||||
set_compressed.update({ value : all_set_compressed[value] }) | |||||
set_compressed[value] = all_set_compressed[value] | |||||
else: | else: | ||||
set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||||
set_compressed[value] = str(num_of_labels_occured + 1) | |||||
num_of_labels_occured += 1 | num_of_labels_occured += 1 | ||||
all_set_compressed.update(set_compressed) | all_set_compressed.update(set_compressed) | ||||
@@ -577,9 +947,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
# if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label | ||||
for value in set_unique: | for value in set_unique: | ||||
if value in all_set_compressed.keys(): | if value in all_set_compressed.keys(): | ||||
set_compressed.update({ value : all_set_compressed[value] }) | |||||
set_compressed[value] = all_set_compressed[value] | |||||
else: | else: | ||||
set_compressed.update({ value : str(num_of_labels_occured + 1) }) | |||||
set_compressed[value] = str(num_of_labels_occured + 1) | |||||
num_of_labels_occured += 1 | num_of_labels_occured += 1 | ||||
all_set_compressed.update(set_compressed) | all_set_compressed.update(set_compressed) | ||||
@@ -595,10 +965,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. | |||||
def _add_dummy_node_labels(self, Gn): | def _add_dummy_node_labels(self, Gn): | ||||
if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): | |||||
if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY): | |||||
for i in range(len(Gn)): | for i in range(len(Gn)): | ||||
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | ||||
self._node_labels = [SpecialLabel.DUMMY] | |||||
self.node_labels = [SpecialLabel.DUMMY] | |||||
class WLSubtree(WeisfeilerLehman): | class WLSubtree(WeisfeilerLehman): | ||||
@@ -0,0 +1,14 @@ | |||||
# -*-coding:utf-8 -*- | |||||
""" | |||||
model learning. | |||||
""" | |||||
# info | |||||
__version__ = "0.2" | |||||
__author__ = "Linlin Jia" | |||||
__date__ = "November 2020" | |||||
from gklearn.model_learning.nested_cv import NestedCV | |||||
from gklearn.model_learning.workflow import Workflow | |||||
from gklearn.model_learning.parameters import dichotomous_permutation |
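# Typical import pattern for this subpackage (illustrative):
# from gklearn.model_learning import NestedCV, Workflow, dichotomous_permutation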
@@ -0,0 +1,714 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Fri Nov 27 18:59:28 2020 | |||||
@author: ljia | |||||
""" | |||||
import os | |||||
import datetime | |||||
import time | |||||
import sys | |||||
from tqdm import tqdm | |||||
from multiprocessing import Pool, Array | |||||
from functools import partial | |||||
import numpy as np | |||||
from matplotlib import pyplot as plt | |||||
from sklearn.model_selection import KFold, train_test_split, ParameterGrid | |||||
from sklearn.kernel_ridge import KernelRidge | |||||
from sklearn.svm import SVC | |||||
from sklearn.metrics import accuracy_score, mean_squared_error | |||||
class NestedCV(object): | |||||
"""Perform model selection, fitting and testing for precomputed kernels | |||||
using nested CV. Print out necessary data during the process and, finally,
the results.
Parameters | |||||
---------- | |||||
datafile : string | |||||
Path of dataset file. | |||||
estimator : function | |||||
Kernel function used as the estimator. This function needs to return a Gram matrix.
param_grid_precomputed : dictionary | |||||
Dictionary with names (string) of parameters used to calculate gram | |||||
matrices as keys and lists of parameter settings to try as values. This | |||||
enables searching over any sequence of parameter settings. Params with | |||||
length 1 will be omitted. | |||||
param_grid : dictionary | |||||
Dictionary with names (string) of parameters used as penalties as keys
and lists of parameter settings to try as values. This enables | |||||
searching over any sequence of parameter settings. Params with length 1 | |||||
will be omitted. | |||||
model_type : string | |||||
Type of the problem, can be 'regression' or 'classification'. | |||||
num_trials : integer
Number of random trials of the outer CV loop. The default is 30. | |||||
datafile_y : string | |||||
Path of file storing y data. This parameter is optional depending on | |||||
the given dataset file. | |||||
extra_params : dict | |||||
Extra parameters for loading dataset. See function gklearn.utils. | |||||
graphfiles.loadDataset for detail. | |||||
ds_name : string | |||||
Name of the dataset. | |||||
n_jobs : int | |||||
Number of jobs for parallelization. | |||||
read_gm_from_file : boolean | |||||
Whether gram matrices are loaded from a file. | |||||
Examples | |||||
-------- | |||||
>>> import numpy as np | |||||
>>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel | |||||
>>> from gklearn.kernels.untilHPathKernel import untilhpathkernel | |||||
>>> | |||||
>>> datafile = '../datasets/MUTAG/MUTAG_A.txt' | |||||
>>> estimator = untilhpathkernel | |||||
>>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func':
['MinMax', 'tanimoto'], 'compute_method': ['trie']}
>>> # 'C' for classification problems and 'alpha' for regression problems.
>>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha':
np.logspace(-10, 10, num=41, base=10)}]
>>>
>>> model_selection_for_precomputed_kernel(datafile, estimator,
param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG')
""" | |||||
def __init__(self, dataset, estimator, param_grid_precomputed=None, param_grid=None, model_type=None, num_trials=30, output_dir=None, n_jobs=1, save_gms=True, save_gm_figs=False, logging=True, verbose=True, **kwargs): | |||||
tqdm.monitor_interval = 0 | |||||
self._ds = dataset | |||||
self._estimator = estimator | |||||
self._num_trials = num_trials | |||||
self._n_jobs = n_jobs | |||||
self._save_gms = save_gms | |||||
self._save_gm_figs = save_gm_figs | |||||
self._logging = logging | |||||
self._verbose = verbose | |||||
self._kwargs = kwargs | |||||
# Set dataset name. | |||||
if self._ds._ds_name is None: | |||||
self._ds_name = 'ds-unknown' | |||||
else: | |||||
self._ds_name = self._ds._ds_name | |||||
# The output directory. | |||||
if output_dir is None: | |||||
self._output_dir = os.path.join('outputs/', estimator.__name__) | |||||
else: | |||||
self._output_dir = output_dir | |||||
os.makedirs(self._output_dir, exist_ok=True) | |||||
# Setup the model type. | |||||
if model_type is None: | |||||
self._model_type = dataset._task_type | |||||
else: | |||||
self._model_type = model_type.lower() | |||||
if self._model_type != 'regression' and self._model_type != 'classification': | |||||
raise Exception('The model type is incorrect! Please choose from regression or classification.') | |||||
# @todo: Set param_grid_precomputed and param_grid. | |||||
self._param_grid_precomputed = param_grid_precomputed | |||||
self._param_grid = param_grid | |||||
if self._verbose: | |||||
print() | |||||
print('--- This is a %s problem ---' % self._model_type) | |||||
# A string to save all the results. | |||||
if self._logging: | |||||
self._str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' | |||||
self._str_fw += '# This file contains results of ' + self._estimator.__name__ + ' on dataset ' + self._ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' | |||||
self._str_fw += 'This is a %s problem.\n' % self._model_type | |||||
self.run() | |||||
def run(self): | |||||
self.fit() | |||||
self.compute_gram_matrices() | |||||
if len(self._gram_matrices) == 0: | |||||
if self._verbose: | |||||
print('All gram matrices are ignored, no results obtained.') | |||||
if self._logging: | |||||
self._str_fw += '\nAll gram matrices are ignored, no results obtained.\n\n' | |||||
else: | |||||
self.do_cv() | |||||
# print out results as table. | |||||
if self._logging: | |||||
self._str_fw += self.printResultsInTable(self._param_list, self._param_list_pre_revised, self._average_val_scores, self._std_val_scores, self._average_perf_scores, self._std_perf_scores, self._average_train_scores, self._std_train_scores, self._gram_matrix_time, self._model_type, self._verbose) | |||||
# open file to save all results for this dataset. | |||||
if not os.path.exists(self._output_dir + '/' + self._ds_name + '.output.txt'): | |||||
with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'w') as f: | |||||
f.write(self._str_fw) | |||||
else: | |||||
with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'r+') as f: | |||||
content = f.read() | |||||
f.seek(0, 0) | |||||
f.write(self._str_fw + '\n\n\n' + content) | |||||
return self._final_performance, self._final_confidence | |||||
def fit(self): | |||||
return | |||||
def compute_gram_matrices(self): | |||||
"""Compute all gram matrices. | |||||
Returns | |||||
------- | |||||
None. | |||||
""" | |||||
# Grid of parameters with a discrete number of values for each. | |||||
self._param_list_precomputed = list(ParameterGrid(self._param_grid_precomputed)) | |||||
self._param_list = list(ParameterGrid(self._param_grid)) | |||||
self._gram_matrices = [ | |||||
] # a list to store gram matrices for all param_grid_precomputed | |||||
self._gram_matrix_time = [ | |||||
] # a list to store time to calculate gram matrices | |||||
self._param_list_pre_revised = [ | |||||
] # list to store param grids precomputed ignoring the useless ones | |||||
if self._verbose: | |||||
print() | |||||
print('\n1. Computing gram matrices. This could take a while...') | |||||
if self._logging: | |||||
self._str_fw += '\nI. Gram matrices.\n\n' | |||||
self._tts = time.time() # start training time | |||||
nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(self._param_list_precomputed): | |||||
y = self._ds.targets[:] | |||||
params_out['n_jobs'] = self._n_jobs | |||||
params_out['verbose'] = self._verbose | |||||
# print(dataset) | |||||
# import networkx as nx | |||||
# nx.draw_networkx(dataset[1]) | |||||
# plt.show() | |||||
rtn_data = self._estimator(self._ds.graphs[:], **params_out) # @todo: Attention! this will not copy the graphs. | |||||
Kmatrix = rtn_data[0] | |||||
current_run_time = rtn_data[1] | |||||
# for some kernels, some graphs in datasets may not meet the | |||||
# kernels' requirements for graph structure. These graphs are trimmed. | |||||
if len(rtn_data) == 3: | |||||
idx_trim = rtn_data[2] # the index of trimmed graph list | |||||
y = [y[idxt] for idxt in idx_trim] # trim y accordingly | |||||
# Kmatrix = np.random.rand(2250, 2250) | |||||
# current_run_time = 0.1 | |||||
# remove graphs whose kernels with themselves are zeros | |||||
# @todo: y not changed accordingly? | |||||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||||
nb_g_ignore = 0 | |||||
for idxk, diag in enumerate(Kmatrix_diag): | |||||
if diag == 0: | |||||
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) | |||||
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) | |||||
nb_g_ignore += 1 | |||||
# normalization | |||||
# @todo: works only for undirected graph? | |||||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||||
for i in range(len(Kmatrix)): | |||||
for j in range(i, len(Kmatrix)): | |||||
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||||
Kmatrix[j][i] = Kmatrix[i][j] | |||||
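# The loop above is the usual cosine normalization of a Gram matrix,
# K'[i][j] = K[i][j] / sqrt(K[i][i] * K[j][j]), computed with the diagonal
# copied beforehand so that all self-kernels end up equal to 1.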
if self._verbose: | |||||
print() | |||||
if params_out == {}: | |||||
if self._verbose: | |||||
print('the gram matrix is: ') | |||||
if self._logging: | |||||
self._str_fw += 'the gram matrix is:\n\n' | |||||
else: | |||||
if self._verbose: | |||||
print('the gram matrix with parameters', params_out, 'is: \n\n') | |||||
if self._logging: | |||||
self._str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out | |||||
if len(Kmatrix) < 2: | |||||
nb_gm_ignore += 1 | |||||
if self._verbose: | |||||
print('ignored, as at most one of its diagonal values is non-zero.')
if self._logging:
self._str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
else: | |||||
if np.isnan(Kmatrix).any( | |||||
): # if the matrix contains elements that are not numbers | |||||
nb_gm_ignore += 1 | |||||
if self._verbose: | |||||
print('ignored, as it contains elements that are not numbers.') | |||||
if self._logging: | |||||
self._str_fw += 'ignored, as it contains elements that are not numbers.\n\n' | |||||
else: | |||||
# print(Kmatrix) | |||||
if self._logging: | |||||
self._str_fw += np.array2string( | |||||
Kmatrix, | |||||
separator=',') + '\n\n' | |||||
# separator=',', | |||||
# threshold=np.inf, | |||||
# floatmode='unique') + '\n\n' | |||||
# Draw and save Gram matrix figures. | |||||
if self._save_gm_figs: | |||||
fig_file_name = self._output_dir + '/GM[ds]' + self._ds_name | |||||
if params_out != {}: | |||||
fig_file_name += '[params]' + str(idx) | |||||
plt.imshow(Kmatrix) | |||||
plt.colorbar() | |||||
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) | |||||
# plt.show() | |||||
plt.clf() | |||||
self._gram_matrices.append(Kmatrix) | |||||
self._gram_matrix_time.append(current_run_time) | |||||
self._param_list_pre_revised.append(params_out) | |||||
if nb_g_ignore > 0: | |||||
if self._verbose: | |||||
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) | |||||
if self._logging: | |||||
self._str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore | |||||
if self._verbose: | |||||
print() | |||||
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(self._param_list_precomputed), nb_gm_ignore)) | |||||
if self._logging: | |||||
self._str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(self._param_list_precomputed), nb_gm_ignore) | |||||
self._str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' | |||||
self._str_fw += ''.join(['{}: {}\n'.format(idx, params_out) for idx, params_out in enumerate(self._param_list_precomputed)]) | |||||
def do_cv(self): | |||||
# save gram matrices to file. | |||||
# np.savez(output_dir + '/' + ds_name + '.gm', | |||||
# gms=gram_matrices, params=param_list_pre_revised, y=y, | |||||
# gmtime=gram_matrix_time) | |||||
if self._verbose: | |||||
print('2. Fitting and predicting using nested cross validation. This could really take a while...') | |||||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||||
# train_pref = [] | |||||
# val_pref = [] | |||||
# test_pref = [] | |||||
# def func_assign(result, var_to_assign): | |||||
# for idx, itm in enumerate(var_to_assign): | |||||
# itm.append(result[idx]) | |||||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) | |||||
# | |||||
# parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, | |||||
# [train_pref, val_pref, test_pref], glbv=gram_matrices, | |||||
# method='imap_unordered', n_jobs=n_jobs, chunksize=1, | |||||
# itr_desc='cross validation') | |||||
def init_worker(gms_toshare): | |||||
global G_gms | |||||
G_gms = gms_toshare | |||||
# gram_matrices = np.array(gram_matrices) | |||||
# gms_shape = gram_matrices.shape | |||||
# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C')) | |||||
# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape)) | |||||
pool = Pool(processes=self._n_jobs, initializer=init_worker, initargs=(self._gram_matrices,)) | |||||
trial_do_partial = partial(self._parallel_trial_do, self._param_list_pre_revised, self._param_list, self._ds.targets[:], self._model_type) # @todo: maybe self._ds.targets[:] should be y. | |||||
train_pref = [] | |||||
val_pref = [] | |||||
test_pref = [] | |||||
# if NUM_TRIALS < 1000 * n_jobs: | |||||
# chunksize = int(NUM_TRIALS / n_jobs) + 1 | |||||
# else: | |||||
# chunksize = 1000 | |||||
chunksize = 1 | |||||
if self._verbose: | |||||
iterator = tqdm(pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize), desc='cross validation', file=sys.stdout) | |||||
else: | |||||
iterator = pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize) | |||||
for o1, o2, o3 in iterator: | |||||
train_pref.append(o1) | |||||
val_pref.append(o2) | |||||
test_pref.append(o3) | |||||
pool.close() | |||||
pool.join() | |||||
# # ---- use pool.map to parallel. ---- | |||||
# pool = Pool(n_jobs) | |||||
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type) | |||||
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) | |||||
# train_pref = [item[0] for item in result_perf] | |||||
# val_pref = [item[1] for item in result_perf] | |||||
# test_pref = [item[2] for item in result_perf] | |||||
# # ---- direct running, normally use a single CPU core. ---- | |||||
# train_pref = [] | |||||
# val_pref = [] | |||||
# test_pref = [] | |||||
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): | |||||
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) | |||||
# train_pref.append(o1) | |||||
# val_pref.append(o2) | |||||
# test_pref.append(o3) | |||||
# print() | |||||
if self._verbose: | |||||
print() | |||||
print('3. Getting final performance...') | |||||
if self._logging: | |||||
self._str_fw += '\nII. Performance.\n\n' | |||||
# averages and confidences of performances on outer trials for each combination of parameters | |||||
self._average_train_scores = np.mean(train_pref, axis=0) | |||||
# print('val_pref: ', val_pref[0][0]) | |||||
self._average_val_scores = np.mean(val_pref, axis=0) | |||||
# print('test_pref: ', test_pref[0][0]) | |||||
self._average_perf_scores = np.mean(test_pref, axis=0) | |||||
# sample std is used here | |||||
self._std_train_scores = np.std(train_pref, axis=0, ddof=1) | |||||
self._std_val_scores = np.std(val_pref, axis=0, ddof=1) | |||||
self._std_perf_scores = np.std(test_pref, axis=0, ddof=1) | |||||
if self._model_type == 'regression': | |||||
best_val_perf = np.amin(self._average_val_scores) | |||||
else: | |||||
best_val_perf = np.amax(self._average_val_scores) | |||||
# print('average_val_scores: ', self._average_val_scores) | |||||
# print('best_val_perf: ', best_val_perf) | |||||
# print() | |||||
best_params_index = np.where(self._average_val_scores == best_val_perf) | |||||
# find smallest val std with best val perf. | |||||
best_val_stds = [ | |||||
self._std_val_scores[value][best_params_index[1][idx]] | |||||
for idx, value in enumerate(best_params_index[0]) | |||||
] | |||||
min_val_std = np.amin(best_val_stds) | |||||
best_params_index = np.where(self._std_val_scores == min_val_std) | |||||
best_params_out = [self._param_list_pre_revised[i] for i in best_params_index[0]] | |||||
best_params_in = [self._param_list[i] for i in best_params_index[1]] | |||||
if self._verbose: | |||||
print('best_params_out: ', best_params_out) | |||||
print('best_params_in: ', best_params_in) | |||||
print() | |||||
print('best_val_perf: ', best_val_perf) | |||||
print('best_val_std: ', min_val_std) | |||||
if self._logging: | |||||
self._str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out | |||||
self._str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in | |||||
self._str_fw += 'best_val_perf: %s\n' % best_val_perf | |||||
self._str_fw += 'best_val_std: %s\n' % min_val_std | |||||
# print(best_params_index) | |||||
# print(best_params_index[0]) | |||||
# print(self._average_perf_scores) | |||||
self._final_performance = [ | |||||
self._average_perf_scores[value][best_params_index[1][idx]] | |||||
for idx, value in enumerate(best_params_index[0]) | |||||
] | |||||
self._final_confidence = [ | |||||
self._std_perf_scores[value][best_params_index[1][idx]] | |||||
for idx, value in enumerate(best_params_index[0]) | |||||
] | |||||
if self._verbose: | |||||
print('final_performance: ', self._final_performance) | |||||
print('final_confidence: ', self._final_confidence) | |||||
if self._logging: | |||||
self._str_fw += 'final_performance: %s\n' % self._final_performance | |||||
self._str_fw += 'final_confidence: %s\n' % self._final_confidence | |||||
train_performance = [ | |||||
self._average_train_scores[value][best_params_index[1][idx]] | |||||
for idx, value in enumerate(best_params_index[0]) | |||||
] | |||||
train_std = [ | |||||
self._std_train_scores[value][best_params_index[1][idx]] | |||||
for idx, value in enumerate(best_params_index[0]) | |||||
] | |||||
if self._verbose: | |||||
print('train_performance: %s' % train_performance) | |||||
print('train_std: ', train_std) | |||||
if self._logging: | |||||
self._str_fw += 'train_performance: %s\n' % train_performance | |||||
self._str_fw += 'train_std: %s\n\n' % train_std | |||||
if self._verbose: | |||||
print() | |||||
tt_total = time.time() - self._tts # training time for all hyper-parameters | |||||
average_gram_matrix_time = np.mean(self._gram_matrix_time) | |||||
std_gram_matrix_time = np.std(self._gram_matrix_time, ddof=1) if len(self._gram_matrix_time) > 1 else 0 | |||||
best_gram_matrix_time = [self._gram_matrix_time[i] for i in best_params_index[0]] | |||||
ave_bgmt = np.mean(best_gram_matrix_time) | |||||
std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 | |||||
if self._verbose: | |||||
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' | |||||
.format(average_gram_matrix_time, std_gram_matrix_time)) | |||||
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( | |||||
ave_bgmt, std_bgmt)) | |||||
print('total training time with all hyper-param choices: {:.2f}s'.format( | |||||
tt_total)) | |||||
if self._logging: | |||||
self._str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) | |||||
self._str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) | |||||
self._str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) | |||||
# # save results to file | |||||
# np.savetxt(results_name_pre + 'average_train_scores.dt', | |||||
# average_train_scores) | |||||
# np.savetxt(results_name_pre + 'average_val_scores', self._average_val_scores) | |||||
# np.savetxt(results_name_pre + 'average_perf_scores.dt', | |||||
# average_perf_scores) | |||||
# np.savetxt(results_name_pre + 'std_train_scores.dt', self._std_train_scores) | |||||
# np.savetxt(results_name_pre + 'std_val_scores.dt', self._std_val_scores) | |||||
# np.savetxt(results_name_pre + 'std_perf_scores.dt', self._std_perf_scores) | |||||
# np.save(results_name_pre + 'best_params_index', best_params_index) | |||||
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out) | |||||
# np.save(results_name_pre + 'best_params_in.dt', best_params_in) | |||||
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) | |||||
# np.save(results_name_pre + 'best_val_std.dt', best_val_std) | |||||
# np.save(results_name_pre + 'final_performance.dt', self._final_performance) | |||||
# np.save(results_name_pre + 'final_confidence.dt', self._final_confidence) | |||||
# np.save(results_name_pre + 'train_performance.dt', train_performance) | |||||
# np.save(results_name_pre + 'train_std.dt', train_std) | |||||
# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) | |||||
# np.save(results_name_pre + 'average_gram_matrix_time.dt', | |||||
# average_gram_matrix_time) | |||||
# np.save(results_name_pre + 'std_gram_matrix_time.dt', | |||||
# std_gram_matrix_time) | |||||
# np.save(results_name_pre + 'best_gram_matrix_time.dt', | |||||
# best_gram_matrix_time) | |||||
def _trial_do(self, param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level
# # get gram matrices from global variables. | |||||
# gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C') | |||||
# Arrays to store scores | |||||
train_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||||
val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||||
test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) | |||||
# Randomness is added to the seeds of the split function below. "high" is "size" times
# 10 so that at least 10 different random outputs will be yielded. Remove
# these lines if identical outputs are required.
rdm_out = np.random.RandomState(seed=None) | |||||
rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10, | |||||
size=len(param_list_pre_revised)) | |||||
# print(trial, rdm_seed_out_l) | |||||
# print() | |||||
# loop for each outer param tuple | |||||
for index_out, params_out in enumerate(param_list_pre_revised): | |||||
# get gram matrices from global variables. | |||||
# gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]] | |||||
# gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C') | |||||
gm_now = gram_matrices[index_out].copy() | |||||
# split gram matrix and y to app and test sets. | |||||
indices = range(len(y)) | |||||
# The argument "random_state" in function "train_test_split" can not be | |||||
# set to None, because it will use RandomState instance used by | |||||
# np.random, which is possible for multiple subprocesses to inherit the | |||||
# same seed if they forked at the same time, leading to identical | |||||
# random variates for different subprocesses. Instead, we use "trial" | |||||
# and "index_out" parameters to generate different seeds for different | |||||
# trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add | |||||
# randomness into seeds, so that it yields a different output every | |||||
# time the program is run. To yield identical outputs every time, | |||||
# remove the second line below. Same method is used to the "KFold" | |||||
# function in the inner loop. | |||||
rdm_seed_out = (trial + 1) * (index_out + 1) | |||||
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1) | |||||
# print(trial, rdm_seed_out) | |||||
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( | |||||
gm_now, y, indices, test_size=0.1, | |||||
random_state=rdm_seed_out, shuffle=True) | |||||
# print(trial, idx_app, idx_test) | |||||
# print() | |||||
X_app = X_app[:, idx_app] | |||||
X_test = X_test[:, idx_app] | |||||
y_app = np.array(y_app) | |||||
y_test = np.array(y_test) | |||||
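# Note on the indexing above (added for clarity): with a precomputed kernel,
# rows index the samples to be fitted or predicted and columns index the
# training samples, so both X_app and X_test are restricted to the columns
# given by idx_app (the app, i.e. train + validation, graphs).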
rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10, | |||||
size=len(param_list)) | |||||
# loop for each inner param tuple | |||||
for index_in, params_in in enumerate(param_list): | |||||
# if trial == 0: | |||||
# print(index_out, index_in) | |||||
# print('params_in: ', params_in) | |||||
# st = time.time() | |||||
rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1) | |||||
# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in) | |||||
rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1) | |||||
# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in) | |||||
inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in) | |||||
current_train_perf = [] | |||||
current_valid_perf = [] | |||||
current_test_perf = [] | |||||
# For regression use the Kernel Ridge method | |||||
# try: | |||||
if self._model_type == 'regression': | |||||
kr = KernelRidge(kernel='precomputed', **params_in) | |||||
# loop for each split on validation set level | |||||
# validation set level | |||||
for train_index, valid_index in inner_cv.split(X_app): | |||||
# print("train_index, valid_index: ", trial, index_in, train_index, valid_index) | |||||
# if trial == 0: | |||||
# print('train_index: ', train_index) | |||||
# print('valid_index: ', valid_index) | |||||
# print('idx_test: ', idx_test) | |||||
# print('y_app[train_index]: ', y_app[train_index]) | |||||
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||||
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||||
kr.fit(X_app[train_index, :][:, train_index], | |||||
y_app[train_index]) | |||||
# predict on the train, validation and test set | |||||
y_pred_train = kr.predict( | |||||
X_app[train_index, :][:, train_index]) | |||||
y_pred_valid = kr.predict( | |||||
X_app[valid_index, :][:, train_index]) | |||||
# if trial == 0: | |||||
# print('y_pred_valid: ', y_pred_valid) | |||||
# print() | |||||
y_pred_test = kr.predict( | |||||
X_test[:, train_index]) | |||||
# root mean squared errors | |||||
current_train_perf.append( | |||||
np.sqrt( | |||||
mean_squared_error( | |||||
y_app[train_index], y_pred_train))) | |||||
current_valid_perf.append( | |||||
np.sqrt( | |||||
mean_squared_error( | |||||
y_app[valid_index], y_pred_valid))) | |||||
# if trial == 0: | |||||
# print(mean_squared_error( | |||||
# y_app[valid_index], y_pred_valid)) | |||||
current_test_perf.append( | |||||
np.sqrt( | |||||
mean_squared_error( | |||||
y_test, y_pred_test))) | |||||
# For classification use SVM
else: | |||||
svc = SVC(kernel='precomputed', cache_size=200, | |||||
verbose=False, **params_in) | |||||
# loop for each split on validation set level | |||||
# validation set level | |||||
for train_index, valid_index in inner_cv.split(X_app): | |||||
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) | |||||
# if trial == 0: | |||||
# print('train_index: ', train_index) | |||||
# print('valid_index: ', valid_index) | |||||
# print('idx_test: ', idx_test) | |||||
# print('y_app[train_index]: ', y_app[train_index]) | |||||
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) | |||||
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) | |||||
svc.fit(X_app[train_index, :][:, train_index], | |||||
y_app[train_index]) | |||||
# predict on the train, validation and test set | |||||
y_pred_train = svc.predict( | |||||
X_app[train_index, :][:, train_index]) | |||||
y_pred_valid = svc.predict( | |||||
X_app[valid_index, :][:, train_index]) | |||||
y_pred_test = svc.predict( | |||||
X_test[:, train_index]) | |||||
# classification accuracies
current_train_perf.append( | |||||
accuracy_score(y_app[train_index], | |||||
y_pred_train)) | |||||
current_valid_perf.append( | |||||
accuracy_score(y_app[valid_index], | |||||
y_pred_valid)) | |||||
current_test_perf.append( | |||||
accuracy_score(y_test, y_pred_test)) | |||||
# except ValueError: | |||||
# print(sys.exc_info()[0]) | |||||
# print(params_out, params_in) | |||||
# average performance on inner splits | |||||
train_pref[index_out][index_in] = np.mean( | |||||
current_train_perf) | |||||
val_pref[index_out][index_in] = np.mean( | |||||
current_valid_perf) | |||||
test_pref[index_out][index_in] = np.mean( | |||||
current_test_perf) | |||||
# print(time.time() - st) | |||||
# if trial == 0: | |||||
# print('val_pref: ', val_pref) | |||||
# print('test_pref: ', test_pref) | |||||
return train_pref, val_pref, test_pref | |||||
def _parallel_trial_do(self, param_list_pre_revised, param_list, y, model_type, trial): | |||||
train_pref, val_pref, test_pref = self._trial_do(param_list_pre_revised, | |||||
param_list, G_gms, y, | |||||
model_type, trial) | |||||
return train_pref, val_pref, test_pref | |||||
def printResultsInTable(self, param_list, param_list_pre_revised, average_val_scores, | |||||
std_val_scores, average_perf_scores, std_perf_scores, | |||||
average_train_scores, std_train_scores, gram_matrix_time, | |||||
model_type, verbose): | |||||
from collections import OrderedDict | |||||
from tabulate import tabulate | |||||
table_dict = {} | |||||
if model_type == 'regression': | |||||
for param_in in param_list: | |||||
param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) | |||||
else: | |||||
for param_in in param_list: | |||||
param_in['C'] = '{:.2e}'.format(param_in['C']) | |||||
table_dict['params'] = [{**param_out, **param_in} | |||||
for param_in in param_list for param_out in param_list_pre_revised] | |||||
table_dict['gram_matrix_time'] = [ | |||||
'{:.2f}'.format(gram_matrix_time[index_out]) | |||||
for param_in in param_list | |||||
for index_out, _ in enumerate(param_list_pre_revised) | |||||
] | |||||
table_dict['valid_perf'] = [ | |||||
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], | |||||
std_val_scores[index_out][index_in]) | |||||
for index_in, _ in enumerate(param_list) | |||||
for index_out, _ in enumerate(param_list_pre_revised) | |||||
] | |||||
table_dict['test_perf'] = [ | |||||
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], | |||||
std_perf_scores[index_out][index_in]) | |||||
for index_in, _ in enumerate(param_list) | |||||
for index_out, _ in enumerate(param_list_pre_revised) | |||||
] | |||||
table_dict['train_perf'] = [ | |||||
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], | |||||
std_train_scores[index_out][index_in]) | |||||
for index_in, _ in enumerate(param_list) | |||||
for index_out, _ in enumerate(param_list_pre_revised) | |||||
] | |||||
keyorder = [ | |||||
'params', 'train_perf', 'valid_perf', 'test_perf', | |||||
'gram_matrix_time' | |||||
] | |||||
if verbose: | |||||
print() | |||||
tb_print = tabulate(OrderedDict(sorted(table_dict.items(), | |||||
key=lambda i: keyorder.index(i[0]))), headers='keys') | |||||
# print(tb_print) | |||||
return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print |
@@ -0,0 +1,89 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Fri May 21 12:18:02 2021 | |||||
@author: ljia | |||||
""" | |||||
def dichotomous_permutation(arr, layer=0): | |||||
import math | |||||
# def seperate_arr(arr, new_arr): | |||||
# if (length % 2) == 0: | |||||
# half = int(length / 2) | |||||
# new_arr += [arr[half - 1], arr[half]] | |||||
# subarr1 = [arr[i] for i in range(1, half - 1)] | |||||
# else: | |||||
# half = math.floor(length / 2) | |||||
# new_arr.append(arr[half]) | |||||
# subarr1 = [arr[i] for i in range(1, half)] | |||||
# subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||||
# subarrs = [subarr1, subarr2] | |||||
# return subarrs | |||||
if layer == 0: | |||||
length = len(arr) | |||||
if length <= 2: | |||||
return arr | |||||
new_arr = [arr[0], arr[-1]] | |||||
if (length % 2) == 0: | |||||
half = int(length / 2) | |||||
new_arr += [arr[half - 1], arr[half]] | |||||
subarr1 = [arr[i] for i in range(1, half - 1)] | |||||
else: | |||||
half = math.floor(length / 2) | |||||
new_arr.append(arr[half]) | |||||
subarr1 = [arr[i] for i in range(1, half)] | |||||
subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||||
subarrs = [subarr1, subarr2] | |||||
# subarrs = seperate_arr(arr, new_arr) | |||||
new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||||
else: | |||||
new_arr = [] | |||||
subarrs = [] | |||||
for a in arr: | |||||
length = len(a) | |||||
if length <= 2: | |||||
new_arr += a | |||||
else: | |||||
# subarrs += seperate_arr(a, new_arr) | |||||
if (length % 2) == 0: | |||||
half = int(length / 2) | |||||
new_arr += [a[half - 1], a[half]] | |||||
subarr1 = [a[i] for i in range(0, half - 1)] | |||||
else: | |||||
half = math.floor(length / 2) | |||||
new_arr.append(a[half]) | |||||
subarr1 = [a[i] for i in range(0, half)] | |||||
subarr2 = [a[i] for i in range(half + 1, length)] | |||||
subarrs += [subarr1, subarr2] | |||||
if len(subarrs) > 0: | |||||
new_arr += dichotomous_permutation(subarrs, layer=layer+1) | |||||
return new_arr | |||||
# length = len(arr) | |||||
# if length <= 2: | |||||
# return arr | |||||
# new_arr = [arr[0], arr[-1]] | |||||
# if (length % 2) == 0: | |||||
# half = int(length / 2) | |||||
# new_arr += [arr[half - 1], arr[half]] | |||||
# subarr1 = [arr[i] for i in range(1, half - 1)] | |||||
# else: | |||||
# half = math.floor(length / 2) | |||||
# new_arr.append(arr[half]) | |||||
# subarr1 = [arr[i] for i in range(1, half)] | |||||
# subarr2 = [arr[i] for i in range(half + 1, length - 1)] | |||||
# if len(subarr1) > 0: | |||||
# new_arr += dichotomous_permutation(subarr1) | |||||
# if len(subarr2) > 0: | |||||
# new_arr += dichotomous_permutation(subarr2) | |||||
# return new_arr |
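# Example of the ordering produced (illustrative, traced by hand from the
# implementation above):
# dichotomous_permutation(list(range(1, 10)))
# returns [1, 9, 5, 3, 7, 2, 4, 6, 8]: the two end points first, then the
# midpoint, then the midpoints of the remaining halves, and so on. This
# coarse-to-fine ordering is useful when scanning a grid of hyper-parameter
# values.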
@@ -0,0 +1,109 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Fri Nov 27 19:33:51 2020 | |||||
@author: ljia | |||||
""" | |||||
import os | |||||
import numpy as np | |||||
import pickle | |||||
from gklearn.dataset import Dataset | |||||
from gklearn.model_learning import NestedCV | |||||
from gklearn.kernels import GRAPH_KERNELS | |||||
class Workflow(object): | |||||
def __init__(self, **kwargs): | |||||
self._job_prefix = kwargs.get('job_prefix', 'gktask') | |||||
self._max_num_running_tasks = kwargs.get('max_num_running_tasks', np.inf) | |||||
self._root_dir = kwargs.get('root_dir', 'outputs/') | |||||
def run(self, tasks): | |||||
### Check inputs. | |||||
if self._check_inputs(tasks): | |||||
self._tasks = tasks | |||||
else: | |||||
raise ValueError('The input "tasks" is not correct.') | |||||
### Sort tasks. | |||||
self.sort_tasks_by_complexity() | |||||
### The main process. | |||||
complete = False | |||||
while not complete: | |||||
self.get_running_tasks() | |||||
if self._num_running_tasks < self._max_num_running_tasks: | |||||
### Load results from table. | |||||
self.load_results_from_table() | |||||
for task in self._tasks: | |||||
state = self.get_task_state(task) | |||||
if state != 'complete' and state != 'running':
self.run_task(task) | |||||
if self._num_running_tasks >= self._max_num_running_tasks: | |||||
break | |||||
### Save results. | |||||
self.save_results() | |||||
complete = self.check_completeness() | |||||
# sleep() | |||||
def _check_inputs(self, tasks): | |||||
if not isinstance(tasks, list): | |||||
return False | |||||
else: | |||||
for i in tasks: | |||||
if not 'kernel' in i or not 'dataset' in i: | |||||
return False | |||||
return True | |||||
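# A valid "tasks" argument is a list of dicts carrying at least the keys
# 'kernel' and 'dataset', e.g. (names purely illustrative):
# tasks = [{'kernel': 'WLSubtree', 'dataset': 'MUTAG'},
#          {'kernel': 'ShortestPath', 'dataset': 'Acyclic'}]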
def sort_tasks_by_complexity(self): | |||||
# @todo: order self._tasks by estimated cost (cheapest first); currently a no-op. | |||||
return | |||||
def get_running_tasks(self): | |||||
command = 'squeue --user $USER --format "%.50j" --noheader' | |||||
stream = os.popen(command) | |||||
output = stream.readlines() | |||||
running_tasks = [o for o in output if o.strip().startswith(self._job_prefix)] | |||||
self._num_running_tasks = len(running_tasks) | |||||
def load_results_from_table(self): | |||||
pass | |||||
def get_task_state(self, task): | |||||
task_dir = os.path.join(self._root_dir, task['kernel'] + '.' + task['dataset'] + '/') | |||||
fn_summary = os.path.join(task_dir, 'results_summary.pkl') | |||||
if os.path.isfile(fn_summary): | |||||
with open(fn_summary, 'rb') as f: | |||||
output = pickle.load(f) | |||||
state = output['state'] | |||||
return state | |||||
else: | |||||
return 'unstarted' | |||||
def run_task(self, task): | |||||
ds_name = task['dataset'] | |||||
k_name = task['kernel'] | |||||
# Get dataset. | |||||
ds = Dataset(ds_name) | |||||
graph_kernel = GRAPH_KERNELS[k_name] | |||||
# Start CV. | |||||
results = NestedCV(ds, graph_kernel) |
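A hedged usage sketch for the Workflow driver above: each task dict only needs the 'kernel' and 'dataset' keys checked by _check_inputs, the constructor kwargs mirror the defaults read in __init__, and a SLURM environment (squeue on the PATH) is assumed. The kernel and dataset names are examples only.

tasks = [
    {'kernel': 'ShortestPath', 'dataset': 'Acyclic'},
    {'kernel': 'Treelet', 'dataset': 'AIDS'},
]
flow = Workflow(job_prefix='gktask', max_num_running_tasks=4, root_dir='outputs/')
flow.run(tasks)  # polls running jobs and (re)submits unfinished tasks until all are complete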
@@ -25,34 +25,40 @@ def chooseDataset(ds_name): | |||||
current_path = os.path.dirname(os.path.realpath(__file__)) + '/' | current_path = os.path.dirname(os.path.realpath(__file__)) + '/' | ||||
root = current_path + '../../datasets/' | root = current_path + '../../datasets/' | ||||
# no node labels (and no edge labels). | |||||
if ds_name == 'Alkane': | |||||
# no labels at all. | |||||
if ds_name == 'Alkane_unlabeled': | |||||
dataset = Dataset('Alkane_unlabeled', root=root) | dataset = Dataset('Alkane_unlabeled', root=root) | ||||
dataset.trim_dataset(edge_required=False) | dataset.trim_dataset(edge_required=False) | ||||
dataset.cut_graphs(range(1, 10)) | dataset.cut_graphs(range(1, 10)) | ||||
# node symbolic labels. | |||||
# node symbolic labels only. | |||||
elif ds_name == 'Acyclic': | elif ds_name == 'Acyclic': | ||||
dataset = Dataset('Acyclic', root=root) | dataset = Dataset('Acyclic', root=root) | ||||
dataset.trim_dataset(edge_required=False) | dataset.trim_dataset(edge_required=False) | ||||
# node non-symbolic labels. | |||||
# node non-symbolic labels only. | |||||
elif ds_name == 'Letter-med': | elif ds_name == 'Letter-med': | ||||
dataset = Dataset('Letter-med', root=root) | dataset = Dataset('Letter-med', root=root) | ||||
dataset.trim_dataset(edge_required=False) | dataset.trim_dataset(edge_required=False) | ||||
# node symbolic and non-symbolic labels (and edge symbolic labels). | |||||
# node symbolic + non-symbolic labels + edge symbolic labels. | |||||
elif ds_name == 'AIDS': | elif ds_name == 'AIDS': | ||||
dataset = Dataset('AIDS', root=root) | dataset = Dataset('AIDS', root=root) | ||||
dataset.trim_dataset(edge_required=False) | dataset.trim_dataset(edge_required=False) | ||||
# edge non-symbolic labels (no node labels). | |||||
elif ds_name == 'Fingerprint_edge': | |||||
# node non-symbolic labels + edge non-symbolic labels. | |||||
elif ds_name == 'Fingerprint': | |||||
dataset = Dataset('Fingerprint', root=root) | dataset = Dataset('Fingerprint', root=root) | ||||
dataset.trim_dataset(edge_required=True) | dataset.trim_dataset(edge_required=True) | ||||
irrelevant_labels = {'edge_attrs': ['orient', 'angle']} | |||||
# edge symbolic labels only. | |||||
elif ds_name == 'MAO': | |||||
dataset = Dataset('MAO', root=root) | |||||
dataset.trim_dataset(edge_required=True) | |||||
irrelevant_labels = {'node_labels': ['atom_symbol'], 'node_attrs': ['x', 'y']} | |||||
dataset.remove_labels(**irrelevant_labels) | dataset.remove_labels(**irrelevant_labels) | ||||
# edge non-symbolic labels (and node non-symbolic labels). | |||||
elif ds_name == 'Fingerprint': | |||||
# edge non-symbolic labels only. | |||||
elif ds_name == 'Fingerprint_edge': | |||||
dataset = Dataset('Fingerprint', root=root) | dataset = Dataset('Fingerprint', root=root) | ||||
dataset.trim_dataset(edge_required=True) | dataset.trim_dataset(edge_required=True) | ||||
# edge symbolic and non-symbolic labels (and node symbolic and non-symbolic labels). | |||||
irrelevant_labels = {'edge_attrs': ['orient', 'angle']} | |||||
dataset.remove_labels(**irrelevant_labels) | |||||
# node symbolic and non-symbolic labels + edge symbolic and non-symbolic labels. | |||||
elif ds_name == 'Cuneiform': | elif ds_name == 'Cuneiform': | ||||
dataset = Dataset('Cuneiform', root=root) | dataset = Dataset('Cuneiform', root=root) | ||||
dataset.trim_dataset(edge_required=True) | dataset.trim_dataset(edge_required=True) | ||||
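Each fixture above follows the same three-step pattern: load the dataset by name, trim graphs that lack the required structure, then strip the label types the test does not need. As a standalone sketch (the dataset name, root path and label names simply mirror the MAO branch above):

from gklearn.dataset import Dataset

dataset = Dataset('MAO', root='../../datasets/')       # 1. load by name
dataset.trim_dataset(edge_required=True)               # 2. drop graphs without edges
dataset.remove_labels(node_labels=['atom_symbol'],     # 3. remove unwanted label types,
                      node_attrs=['x', 'y'])           #    keeping only edge symbolic labels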
@@ -91,7 +97,7 @@ def assert_equality(compute_fun, **kwargs): | |||||
assert np.array_equal(lst[i], lst[i + 1]) | assert np.array_equal(lst[i], lst[i + 1]) | ||||
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||||
@pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) | @pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) | ||||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | ||||
def test_CommonWalk(ds_name, weight, compute_method): | def test_CommonWalk(ds_name, weight, compute_method): | ||||
@@ -126,7 +132,7 @@ def test_CommonWalk(ds_name, weight, compute_method): | |||||
assert_equality(compute, parallel=['imap_unordered', None]) | assert_equality(compute, parallel=['imap_unordered', None]) | ||||
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||||
@pytest.mark.parametrize('remove_totters', [False]) #[True, False]) | @pytest.mark.parametrize('remove_totters', [False]) #[True, False]) | ||||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | ||||
def test_Marginalized(ds_name, remove_totters): | def test_Marginalized(ds_name, remove_totters): | ||||
@@ -319,13 +325,13 @@ def test_SpectralDecomposition(ds_name, sub_kernel): | |||||
# @pytest.mark.parametrize( | # @pytest.mark.parametrize( | ||||
# 'compute_method,ds_name,sub_kernel', | # 'compute_method,ds_name,sub_kernel', | ||||
# [ | # [ | ||||
# ('sylvester', 'Alkane', None), | |||||
# ('conjugate', 'Alkane', None), | |||||
# ('sylvester', 'Alkane_unlabeled', None), | |||||
# ('conjugate', 'Alkane_unlabeled', None), | |||||
# ('conjugate', 'AIDS', None), | # ('conjugate', 'AIDS', None), | ||||
# ('fp', 'Alkane', None), | |||||
# ('fp', 'Alkane_unlabeled', None), | |||||
# ('fp', 'AIDS', None), | # ('fp', 'AIDS', None), | ||||
# ('spectral', 'Alkane', 'exp'), | |||||
# ('spectral', 'Alkane', 'geo'), | |||||
# ('spectral', 'Alkane_unlabeled', 'exp'), | |||||
# ('spectral', 'Alkane_unlabeled', 'geo'), | |||||
# ] | # ] | ||||
# ) | # ) | ||||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | ||||
@@ -365,7 +371,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel): | |||||
# assert False, exception | # assert False, exception | ||||
@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | ||||
def test_ShortestPath(ds_name): | def test_ShortestPath(ds_name): | ||||
"""Test shortest path kernel. | """Test shortest path kernel. | ||||
@@ -401,8 +407,8 @@ def test_ShortestPath(ds_name): | |||||
assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) | assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) | ||||
#@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||||
@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform']) | |||||
#@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) | |||||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform']) | |||||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | ||||
def test_StructuralSP(ds_name): | def test_StructuralSP(ds_name): | ||||
"""Test structural shortest path kernel. | """Test structural shortest path kernel. | ||||
@@ -441,7 +447,7 @@ def test_StructuralSP(ds_name): | |||||
assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) | assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) | ||||
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | ||||
#@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto', None]) | #@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto', None]) | ||||
@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto']) | @pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto']) | ||||
@@ -476,7 +482,7 @@ def test_PathUpToH(ds_name, k_func): | |||||
compute_method=['trie', 'naive']) | compute_method=['trie', 'naive']) | ||||
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | |||||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) | |||||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | ||||
def test_Treelet(ds_name): | def test_Treelet(ds_name): | ||||
"""Test treelet kernel. | """Test treelet kernel. | ||||
@@ -510,7 +516,7 @@ def test_Treelet(ds_name): | |||||
assert_equality(compute, parallel=['imap_unordered', None]) | assert_equality(compute, parallel=['imap_unordered', None]) | ||||
@pytest.mark.parametrize('ds_name', ['Acyclic']) | |||||
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'MAO', 'AIDS']) | |||||
#@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge']) | #@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge']) | ||||
# @pytest.mark.parametrize('base_kernel', ['subtree']) | # @pytest.mark.parametrize('base_kernel', ['subtree']) | ||||
# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | ||||
@@ -540,17 +546,17 @@ def test_WLSubtree(ds_name): | |||||
else: | else: | ||||
return gram_matrix, kernel_list, kernel | return gram_matrix, kernel_list, kernel | ||||
assert_equality(compute, parallel=['imap_unordered', None]) | |||||
assert_equality(compute, parallel=[None, 'imap_unordered']) | |||||
if __name__ == "__main__": | if __name__ == "__main__": | ||||
test_list_graph_kernels() | |||||
# test_spkernel('Alkane', 'imap_unordered') | |||||
# test_ShortestPath('Alkane') | |||||
# test_list_graph_kernels() | |||||
# test_spkernel('Alkane_unlabeled', 'imap_unordered') | |||||
# test_ShortestPath('Alkane_unlabeled') | |||||
# test_StructuralSP('Fingerprint_edge', 'imap_unordered') | # test_StructuralSP('Fingerprint_edge', 'imap_unordered') | ||||
# test_StructuralSP('Acyclic') | # test_StructuralSP('Acyclic') | ||||
# test_StructuralSP('Cuneiform', None) | # test_StructuralSP('Cuneiform', None) | ||||
# test_WLSubtree('Acyclic') | |||||
test_WLSubtree('MAO') # 'Alkane_unlabeled', 'Acyclic', 'AIDS' | |||||
# test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') | # test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') | ||||
# test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') | # test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') | ||||
# test_RandomWalk('Acyclic', 'fp', None, None) | # test_RandomWalk('Acyclic', 'fp', None, None) | ||||
@@ -559,7 +565,7 @@ if __name__ == "__main__": | |||||
# test_Marginalized('Acyclic', False) | # test_Marginalized('Acyclic', False) | ||||
# test_ShortestPath('Acyclic') | # test_ShortestPath('Acyclic') | ||||
# test_PathUpToH('Acyclic', 'MinMax') | # test_PathUpToH('Acyclic', 'MinMax') | ||||
# test_Treelet('Acyclic') | |||||
# test_Treelet('AIDS') | |||||
# test_SylvesterEquation('Acyclic') | # test_SylvesterEquation('Acyclic') | ||||
# test_ConjugateGradient('Acyclic') | # test_ConjugateGradient('Acyclic') | ||||
# test_FixedPoint('Acyclic') | # test_FixedPoint('Acyclic') |
@@ -3,156 +3,230 @@ These kernels are defined between pairs of vectors. | |||||
""" | """ | ||||
import numpy as np | import numpy as np | ||||
def delta_kernel(x, y): | |||||
"""Delta kernel. Return 1 if x == y, 0 otherwise. | |||||
Parameters | |||||
---------- | |||||
x, y : any | |||||
Two parts to compare. | |||||
Return | |||||
------ | |||||
kernel : integer | |||||
Delta kernel. | |||||
References | |||||
---------- | |||||
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||||
labeled graphs. In Proceedings of the 20th International Conference on | |||||
Machine Learning, Washington, DC, United States, 2003. | |||||
""" | |||||
return x == y #(1 if condition else 0) | |||||
def deltakernel(x, y): | def deltakernel(x, y): | ||||
"""Delta kernel. Return 1 if x == y, 0 otherwise. | |||||
return delta_kernel(x, y) | |||||
def gaussian_kernel(x, y, gamma=None): | |||||
"""Gaussian kernel. | |||||
Compute the rbf (gaussian) kernel between x and y: | |||||
Parameters | |||||
---------- | |||||
x, y : any | |||||
Two parts to compare. | |||||
K(x, y) = exp(-gamma ||x-y||^2). | |||||
Return | |||||
------ | |||||
kernel : integer | |||||
Delta kernel. | |||||
Read more in the `User Guide of scikit-learn library <https://scikit-learn.org/stable/modules/metrics.html#rbf-kernel>`__. | |||||
References | |||||
---------- | |||||
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||||
labeled graphs. In Proceedings of the 20th International Conference on | |||||
Machine Learning, Washington, DC, United States, 2003. | |||||
""" | |||||
return x == y #(1 if condition else 0) | |||||
Parameters | |||||
---------- | |||||
x, y : array | |||||
gamma : float, default None | |||||
If None, defaults to 1.0 / n_features | |||||
Returns | |||||
------- | |||||
kernel : float | |||||
""" | |||||
if gamma is None: | |||||
gamma = 1.0 / len(x) | |||||
# xt = np.array([float(itm) for itm in x]) # @todo: move this to dataset or datafile to speed up. | |||||
# yt = np.array([float(itm) for itm in y]) | |||||
# kernel = xt - yt | |||||
# kernel = kernel ** 2 | |||||
# kernel = np.sum(kernel) | |||||
# kernel *= -gamma | |||||
# kernel = np.exp(kernel) | |||||
# return kernel | |||||
return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma) | |||||
def gaussiankernel(x, y, gamma=None): | def gaussiankernel(x, y, gamma=None): | ||||
"""Gaussian kernel. | |||||
Compute the rbf (gaussian) kernel between x and y: | |||||
return gaussian_kernel(x, y, gamma=gamma) | |||||
K(x, y) = exp(-gamma ||x-y||^2). | |||||
Read more in the `User Guide of scikit-learn library <https://scikit-learn.org/stable/modules/metrics.html#rbf-kernel>`__. | |||||
def polynomial_kernel(x, y, gamma=1, coef0=0, d=1): | |||||
return (np.dot(x, y) * gamma + coef0) ** d | |||||
Parameters | |||||
---------- | |||||
x, y : array | |||||
gamma : float, default None | |||||
If None, defaults to 1.0 / n_features | |||||
def highest_polynomial_kernel(x, y, d=1, c=0): | |||||
"""Polynomial kernel. | |||||
Compute the polynomial kernel between x and y: | |||||
Returns | |||||
------- | |||||
kernel : float | |||||
""" | |||||
if gamma is None: | |||||
gamma = 1.0 / len(x) | |||||
K(x, y) = <x, y> ^d + c. | |||||
xt = np.array([float(itm) for itm in x]) # @todo: move this to dataset or datafile to speed up. | |||||
yt = np.array([float(itm) for itm in y]) | |||||
kernel = xt - yt | |||||
kernel = kernel ** 2 | |||||
kernel = np.sum(kernel) | |||||
kernel *= -gamma | |||||
kernel = np.exp(kernel) | |||||
return kernel | |||||
Parameters | |||||
---------- | |||||
x, y : array | |||||
d : integer, default 1 | |||||
c : float, default 0 | |||||
Returns | |||||
------- | |||||
kernel : float | |||||
""" | |||||
return np.dot(x, y) ** d + c | |||||
def polynomialkernel(x, y, d=1, c=0): | def polynomialkernel(x, y, d=1, c=0): | ||||
"""Polynomial kernel. | |||||
Compute the polynomial kernel between x and y: | |||||
return highest_polynomial_kernel(x, y, d=d, c=c) | |||||
K(x, y) = <x, y> ^d + c. | |||||
def linear_kernel(x, y): | |||||
"""Polynomial kernel. | |||||
Compute the polynomial kernel between x and y: | |||||
Parameters | |||||
---------- | |||||
x, y : array | |||||
K(x, y) = <x, y>. | |||||
d : integer, default 1 | |||||
Parameters | |||||
---------- | |||||
x, y : array | |||||
c : float, default 0 | |||||
d : integer, default 1 | |||||
Returns | |||||
------- | |||||
kernel : float | |||||
""" | |||||
return np.dot(x, y) ** d + c | |||||
c : float, default 0 | |||||
Returns | |||||
------- | |||||
kernel : float | |||||
""" | |||||
return np.dot(x, y) | |||||
def linearkernel(x, y): | def linearkernel(x, y): | ||||
"""Polynomial kernel. | |||||
Compute the polynomial kernel between x and y: | |||||
return linear_kernel(x, y) | |||||
def cosine_kernel(x, y): | |||||
return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) | |||||
def sigmoid_kernel(x, y, gamma=None, coef0=1): | |||||
if gamma is None: | |||||
gamma = 1.0 / len(x) | |||||
k = np.dot(x, y) | |||||
k *= gamma | |||||
k += coef0 | |||||
k = np.tanh(k) | |||||
# k = np.tanh(k, k) # compute tanh in-place | |||||
return k | |||||
def laplacian_kernel(x, y, gamma=None): | |||||
if gamma is None: | |||||
gamma = 1.0 / len(x) | |||||
k = -gamma * np.sum(np.abs(np.subtract(x, y))) | |||||
k = np.exp(k) | |||||
return k | |||||
def chi2_kernel(x, y, gamma=1.0): | |||||
k = np.divide(np.subtract(x, y) ** 2, np.add(x, y)) | |||||
k = np.sum(k) | |||||
k *= -gamma | |||||
return np.exp(k) | |||||
def exponential_kernel(x, y, gamma=None): | |||||
if gamma is None: | |||||
gamma = 1.0 / len(x) | |||||
return np.exp(np.dot(x, y) * gamma) | |||||
K(x, y) = <x, y>. | |||||
def intersection_kernel(x, y): | |||||
return np.sum(np.minimum(x, y)) | |||||
Parameters | |||||
---------- | |||||
x, y : array | |||||
d : integer, default 1 | |||||
def multiquadratic_kernel(x, y, c=0): | |||||
return np.sqrt((np.sum(np.subtract(x, y) ** 2)) + c) | |||||
c : float, default 0 | |||||
Returns | |||||
------- | |||||
kernel : float | |||||
""" | |||||
return np.dot(x, y) | |||||
def inverse_multiquadratic_kernel(x, y, c=0): | |||||
return 1 / multiquadratic_kernel(x, y, c=c) | |||||
def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): | def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): | ||||
"""Sum of a pair of kernels. | |||||
"""Sum of a pair of kernels. | |||||
k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||||
k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||||
Parameters | |||||
---------- | |||||
k1, k2 : function | |||||
A pair of kernel functions. | |||||
d11, d12: | |||||
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||||
d21, d22: | |||||
Inputs of k2. | |||||
lamda1, lamda2: float | |||||
Coefficients of the product. | |||||
Parameters | |||||
---------- | |||||
k1, k2 : function | |||||
A pair of kernel functions. | |||||
d11, d12: | |||||
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||||
d21, d22: | |||||
Inputs of k2. | |||||
lamda1, lamda2: float | |||||
Coefficients of the product. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
Return | |||||
------ | |||||
kernel : float | |||||
""" | |||||
if d21 is None or d22 is None: | |||||
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12) | |||||
else: | |||||
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||||
return kernel | |||||
""" | |||||
if d21 is None or d22 is None: | |||||
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12) | |||||
else: | |||||
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) | |||||
return kernel | |||||
def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1): | def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1): | ||||
"""Product of a pair of kernels. | |||||
k = lamda * k1(d11, d12) * k2(d21, d22) | |||||
Parameters | |||||
---------- | |||||
k1, k2 : function | |||||
A pair of kernel functions. | |||||
d11, d12: | |||||
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||||
d21, d22: | |||||
Inputs of k2. | |||||
lamda: float | |||||
Coefficient of the product. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
""" | |||||
if d21 is None or d22 is None: | |||||
kernel = lamda * k1(d11, d12) * k2(d11, d12) | |||||
else: | |||||
kernel = lamda * k1(d11, d12) * k2(d21, d22) | |||||
return kernel | |||||
"""Product of a pair of kernels. | |||||
k = lamda * k1(d11, d12) * k2(d21, d22) | |||||
Parameters | |||||
---------- | |||||
k1, k2 : function | |||||
A pair of kernel functions. | |||||
d11, d12: | |||||
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. | |||||
d21, d22: | |||||
Inputs of k2. | |||||
lamda: float | |||||
Coefficient of the product. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
""" | |||||
if d21 is None or d22 is None: | |||||
kernel = lamda * k1(d11, d12) * k2(d11, d12) | |||||
else: | |||||
kernel = lamda * k1(d11, d12) * k2(d21, d22) | |||||
return kernel | |||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
o = polynomialkernel([1, 2], [3, 4], 2, 3) | |||||
o = polynomialkernel([1, 2], [3, 4], 2, 3) |
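A short usage sketch for the vector kernels above. The import path is an assumption (adjust it to wherever this module lives in your checkout); everything else only uses the signatures defined above.

import numpy as np
from gklearn.utils.kernels import gaussian_kernel, polynomial_kernel, kernelproduct  # assumed path

x, y = np.array([1.0, 2.0]), np.array([3.0, 4.0])
print(gaussian_kernel(x, y, gamma=0.5))                # exp(-0.5 * ((1-3)^2 + (2-4)^2)) = exp(-4)
print(polynomial_kernel(x, y, gamma=1, coef0=3, d=2))  # (1 * <x, y> + 3)^2 = (11 + 3)^2 = 196
print(kernelproduct(gaussian_kernel, polynomial_kernel, x, y))  # product of both kernels on one pair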
@@ -366,19 +366,62 @@ def get_edge_labels(Gn, edge_label): | |||||
def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs): | def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs): | ||||
if len(kwargs) != 0: | if len(kwargs) != 0: | ||||
kernel_options = kwargs | kernel_options = kwargs | ||||
if name == 'Marginalized': | |||||
if name == 'CommonWalk' or name == 'common walk': | |||||
from gklearn.kernels import CommonWalk | |||||
graph_kernel = CommonWalk(node_labels=node_labels, | |||||
edge_labels=edge_labels, | |||||
ds_infos=ds_infos, | |||||
**kernel_options) | |||||
elif name == 'Marginalized' or name == 'marginalized': | |||||
from gklearn.kernels import Marginalized | from gklearn.kernels import Marginalized | ||||
graph_kernel = Marginalized(node_labels=node_labels, | graph_kernel = Marginalized(node_labels=node_labels, | ||||
edge_labels=edge_labels, | edge_labels=edge_labels, | ||||
ds_infos=ds_infos, | ds_infos=ds_infos, | ||||
**kernel_options) | **kernel_options) | ||||
elif name == 'ShortestPath': | |||||
elif name == 'SylvesterEquation' or name == 'sylvester equation': | |||||
from gklearn.kernels import SylvesterEquation | |||||
graph_kernel = SylvesterEquation( | |||||
ds_infos=ds_infos, | |||||
**kernel_options) | |||||
elif name == 'FixedPoint' or name == 'fixed point': | |||||
from gklearn.kernels import FixedPoint | |||||
graph_kernel = FixedPoint(node_labels=node_labels, | |||||
edge_labels=edge_labels, | |||||
node_attrs=node_attrs, | |||||
edge_attrs=edge_attrs, | |||||
ds_infos=ds_infos, | |||||
**kernel_options) | |||||
elif name == 'ConjugateGradient' or name == 'conjugate gradient': | |||||
from gklearn.kernels import ConjugateGradient | |||||
graph_kernel = ConjugateGradient(node_labels=node_labels, | |||||
edge_labels=edge_labels, | |||||
node_attrs=node_attrs, | |||||
edge_attrs=edge_attrs, | |||||
ds_infos=ds_infos, | |||||
**kernel_options) | |||||
elif name == 'SpectralDecomposition' or name == 'spectral decomposition': | |||||
from gklearn.kernels import SpectralDecomposition | |||||
graph_kernel = SpectralDecomposition(node_labels=node_labels, | |||||
edge_labels=edge_labels, | |||||
node_attrs=node_attrs, | |||||
edge_attrs=edge_attrs, | |||||
ds_infos=ds_infos, | |||||
**kernel_options) | |||||
elif name == 'ShortestPath' or name == 'shortest path': | |||||
from gklearn.kernels import ShortestPath | from gklearn.kernels import ShortestPath | ||||
graph_kernel = ShortestPath(node_labels=node_labels, | graph_kernel = ShortestPath(node_labels=node_labels, | ||||
node_attrs=node_attrs, | node_attrs=node_attrs, | ||||
ds_infos=ds_infos, | ds_infos=ds_infos, | ||||
**kernel_options) | **kernel_options) | ||||
elif name == 'StructuralSP': | |||||
elif name == 'StructuralSP' or name == 'structural shortest path': | |||||
from gklearn.kernels import StructuralSP | from gklearn.kernels import StructuralSP | ||||
graph_kernel = StructuralSP(node_labels=node_labels, | graph_kernel = StructuralSP(node_labels=node_labels, | ||||
edge_labels=edge_labels, | edge_labels=edge_labels, | ||||
@@ -386,25 +429,29 @@ def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attr | |||||
edge_attrs=edge_attrs, | edge_attrs=edge_attrs, | ||||
ds_infos=ds_infos, | ds_infos=ds_infos, | ||||
**kernel_options) | **kernel_options) | ||||
elif name == 'PathUpToH': | |||||
elif name == 'PathUpToH' or name == 'path up to length h': | |||||
from gklearn.kernels import PathUpToH | from gklearn.kernels import PathUpToH | ||||
graph_kernel = PathUpToH(node_labels=node_labels, | graph_kernel = PathUpToH(node_labels=node_labels, | ||||
edge_labels=edge_labels, | edge_labels=edge_labels, | ||||
ds_infos=ds_infos, | ds_infos=ds_infos, | ||||
**kernel_options) | **kernel_options) | ||||
elif name == 'Treelet': | |||||
elif name == 'Treelet' or name == 'treelet': | |||||
from gklearn.kernels import Treelet | from gklearn.kernels import Treelet | ||||
graph_kernel = Treelet(node_labels=node_labels, | graph_kernel = Treelet(node_labels=node_labels, | ||||
edge_labels=edge_labels, | edge_labels=edge_labels, | ||||
ds_infos=ds_infos, | ds_infos=ds_infos, | ||||
**kernel_options) | **kernel_options) | ||||
elif name == 'WLSubtree': | |||||
elif name == 'WLSubtree' or name == 'weisfeiler-lehman subtree': | |||||
from gklearn.kernels import WLSubtree | from gklearn.kernels import WLSubtree | ||||
graph_kernel = WLSubtree(node_labels=node_labels, | graph_kernel = WLSubtree(node_labels=node_labels, | ||||
edge_labels=edge_labels, | edge_labels=edge_labels, | ||||
ds_infos=ds_infos, | ds_infos=ds_infos, | ||||
**kernel_options) | **kernel_options) | ||||
elif name == 'WeisfeilerLehman': | |||||
elif name == 'WeisfeilerLehman' or name == 'weisfeiler-lehman': | |||||
from gklearn.kernels import WeisfeilerLehman | from gklearn.kernels import WeisfeilerLehman | ||||
graph_kernel = WeisfeilerLehman(node_labels=node_labels, | graph_kernel = WeisfeilerLehman(node_labels=node_labels, | ||||
edge_labels=edge_labels, | edge_labels=edge_labels, | ||||
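A hedged example of the factory above. The import path, the ds_infos contents, the 'height' option and the compute() call mirror how the kernels are exercised in the tests; treat them as assumptions and adapt them to your gklearn version.

from gklearn.dataset import Dataset
from gklearn.utils import get_graph_kernel_by_name  # assumed import path

dataset = Dataset('Acyclic', root='datasets/')      # example dataset and path
graph_kernel = get_graph_kernel_by_name(
    'WLSubtree',
    node_labels=dataset.node_labels,
    edge_labels=dataset.edge_labels,
    ds_infos={'directed': False},                   # assumption: only the directedness flag is needed
    kernel_options={'height': 2})                   # 'height' assumed to be a WLSubtree option
gram_matrix, run_time = graph_kernel.compute(
    dataset.graphs, parallel='imap_unordered', n_jobs=4, verbose=2)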
@@ -541,10 +588,18 @@ def get_mlti_dim_edge_attrs(G, attr_names): | |||||
def normalize_gram_matrix(gram_matrix): | def normalize_gram_matrix(gram_matrix): | ||||
diag = gram_matrix.diagonal().copy() | diag = gram_matrix.diagonal().copy() | ||||
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. | |||||
for i in range(len(gram_matrix)): | for i in range(len(gram_matrix)): | ||||
for j in range(i, len(gram_matrix)): | for j in range(i, len(gram_matrix)): | ||||
gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) | |||||
gram_matrix[j][i] = gram_matrix[i][j] | |||||
try: | |||||
gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) | |||||
except: | |||||
# Restore the previous floating-point error settings before re-raising. | |||||
np.seterr(**old_settings) | |||||
raise | |||||
else: | |||||
gram_matrix[j][i] = gram_matrix[i][j] | |||||
np.seterr(**old_settings) | |||||
return gram_matrix | return gram_matrix | ||||
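For reference, the loop above is the usual cosine normalization of a Gram matrix, k_norm[i][j] = k[i][j] / sqrt(k[i][i] * k[j][j]), so every diagonal entry becomes 1; the seterr policy turns an invalid value inside np.sqrt into a FloatingPointError instead of a silent NaN. A tiny numeric check, run in the same module:

import numpy as np

K = np.array([[4.0, 2.0],
              [2.0, 9.0]])
print(normalize_gram_matrix(K.copy()))
# [[1.         0.33333333]
#  [0.33333333 1.        ]]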