From 71df9f591f58f98e489c94bd2ab48686aace8688 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 26 Jan 2021 09:45:37 +0100 Subject: [PATCH 01/35] [Update] Move the import of control out of the __init__(). --- gklearn/kernels/sylvester_equation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gklearn/kernels/sylvester_equation.py b/gklearn/kernels/sylvester_equation.py index 03f9a18..9f8fc66 100644 --- a/gklearn/kernels/sylvester_equation.py +++ b/gklearn/kernels/sylvester_equation.py @@ -14,6 +14,7 @@ import sys from gklearn.utils import get_iters import numpy as np import networkx as nx +from control import dlyap from gklearn.utils.parallel import parallel_gm, parallel_me from gklearn.kernels import RandomWalkMeta @@ -22,7 +23,6 @@ class SylvesterEquation(RandomWalkMeta): def __init__(self, **kwargs): - from control import dlyap super().__init__(**kwargs) From e991f59742cd93cb557ec2a26a4e187968b17d1b Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 26 Jan 2021 10:09:54 +0100 Subject: [PATCH 02/35] [Exp] Update exceptions in fcsp exps. --- .../experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py index 4997c41..5568dd7 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py @@ -29,7 +29,8 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('StructuralSP', 'DHFR', 'False'), ('StructuralSP', 'OHSU', 'True'), ('StructuralSP', 'OHSU', 'False'), - ('StructuralSP', 'SYNTHETIC', 'False'), + ('StructuralSP', 'Steroid', 'False'), + ('ShortestPath', 'SYNTHETIC', 'False'), ('StructuralSP', 'SYNTHETIC', 'True'), ('StructuralSP', 'SYNTHETIC', 'False'), ('ShortestPath', 'SYNTHETICnew', 'False'), @@ -47,6 +48,7 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('StructuralSP', 'Mutagenicity', 'False'), ('StructuralSP', 'REDDIT-BINARY', 'True'), ('StructuralSP', 'REDDIT-BINARY', 'False'), + ('StructuralSP', 'Vitamin_D', 'False'), }) OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'), From 07478a571747cecd675e371350baa709ed44a2c4 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 26 Jan 2021 10:10:59 +0100 Subject: [PATCH 03/35] [Exp] Add taskhub to run exps all in once. 
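The hub simply chains 'cd <path>' and 'python3 <file>' shell commands for each experiment script. A minimal sketch of the same idea using subprocess with an explicit working directory instead of the cd round-trips; the paths are the two fcsp entries from this file, and it assumes the hub is launched from gklearn/experiments/ as usual:

    import subprocess

    tasks = [
        {'path': 'thesis/graph_kernels/fcsp', 'file': 'run_jobs_compare_fcsp.py'},
        {'path': 'thesis/graph_kernels/fcsp', 'file': 'run_jobs_compare_fcsp_space.py'},
    ]

    for t in tasks:
        # cwd pins each script to its own directory, so no 'cd ..' bookkeeping is needed.
        subprocess.run(['python3', t['file']], cwd=t['path'])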
--- gklearn/experiments/taskhub.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 gklearn/experiments/taskhub.py diff --git a/gklearn/experiments/taskhub.py b/gklearn/experiments/taskhub.py new file mode 100644 index 0000000..370475d --- /dev/null +++ b/gklearn/experiments/taskhub.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jan 26 09:53:33 2021 + +@author: ljia +""" + +if __name__ == '__main__': + tasks = [ + {'path': 'thesis/graph_kernels/fcsp', + 'file': 'run_jobs_compare_fcsp.py' + }, + {'path': 'thesis/graph_kernels/fcsp', + 'file': 'run_jobs_compare_fcsp_space.py' + }, + {'path': 'ged/stability', + 'file': 'Analysis_stability.ratios.real_data.relative_error.py' + }, + ] + + command = '' + for t in tasks: + command += 'cd ' + t['path'] + '\n' + command += 'python3 ' + t['file'] + '\n' + command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n' From ac78c8517a7c8497899218ff9cf9b43001c977c3 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 26 Jan 2021 10:14:59 +0100 Subject: [PATCH 04/35] [Exp] Add taskhub to run exps all in once. --- gklearn/experiments/taskhub.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gklearn/experiments/taskhub.py b/gklearn/experiments/taskhub.py index 370475d..d31ede8 100644 --- a/gklearn/experiments/taskhub.py +++ b/gklearn/experiments/taskhub.py @@ -24,3 +24,6 @@ if __name__ == '__main__': command += 'cd ' + t['path'] + '\n' command += 'python3 ' + t['file'] + '\n' command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n' + + import os + os.system(command) From 756371aaf1938111f1877b9ed9da047d6a95716f Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 26 Jan 2021 10:16:35 +0100 Subject: [PATCH 05/35] [Exp] Add taskhub to run exps all in once. --- gklearn/experiments/taskhub.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gklearn/experiments/taskhub.py b/gklearn/experiments/taskhub.py index d31ede8..84143aa 100644 --- a/gklearn/experiments/taskhub.py +++ b/gklearn/experiments/taskhub.py @@ -21,6 +21,7 @@ if __name__ == '__main__': command = '' for t in tasks: + print(t['file']) command += 'cd ' + t['path'] + '\n' command += 'python3 ' + t['file'] + '\n' command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n' From 13ac6d9153a04c87593533b817fd8eb697f972db Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 26 Jan 2021 10:20:24 +0100 Subject: [PATCH 06/35] [Exp] Add taskhub to run exps all in once. --- gklearn/experiments/taskhub.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gklearn/experiments/taskhub.py b/gklearn/experiments/taskhub.py index 84143aa..0e0344b 100644 --- a/gklearn/experiments/taskhub.py +++ b/gklearn/experiments/taskhub.py @@ -19,12 +19,11 @@ if __name__ == '__main__': }, ] + import os command = '' for t in tasks: print(t['file']) command += 'cd ' + t['path'] + '\n' command += 'python3 ' + t['file'] + '\n' - command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n' - - import os - os.system(command) +# command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n' + os.system(command) From 74bab60deb5be5ede709a37cb06b2e02d33738bc Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 26 Jan 2021 10:26:24 +0100 Subject: [PATCH 07/35] [Exp] Add taskhub to run exps all in once. 
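Resetting command inside the loop keeps each os.system call down to a single task, instead of replaying every previous task's commands on each iteration. Dropping the trailing 'cd ..' line (commented out in the previous patch) is safe because each os.system call runs in its own subshell; a small sketch, assuming the fcsp directory from the task list exists:

    import os

    # A 'cd' inside one command string never changes the Python process's
    # working directory, so the next call still starts from the original place.
    os.system('cd thesis/graph_kernels/fcsp && pwd')  # prints the task directory
    os.system('pwd')                                  # prints the unchanged original directory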
--- gklearn/experiments/taskhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gklearn/experiments/taskhub.py b/gklearn/experiments/taskhub.py index 0e0344b..b8fc7ac 100644 --- a/gklearn/experiments/taskhub.py +++ b/gklearn/experiments/taskhub.py @@ -20,9 +20,9 @@ if __name__ == '__main__': ] import os - command = '' for t in tasks: print(t['file']) + command = '' command += 'cd ' + t['path'] + '\n' command += 'python3 ' + t['file'] + '\n' # command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n' From 59687d1e87f8de95af102b505798c9161e5c23e6 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 26 Jan 2021 10:31:42 +0100 Subject: [PATCH 08/35] [Exp] Add taskhub to run exps all in once. --- gklearn/experiments/taskhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gklearn/experiments/taskhub.py b/gklearn/experiments/taskhub.py index b8fc7ac..b1a8c8d 100644 --- a/gklearn/experiments/taskhub.py +++ b/gklearn/experiments/taskhub.py @@ -15,7 +15,7 @@ if __name__ == '__main__': 'file': 'run_jobs_compare_fcsp_space.py' }, {'path': 'ged/stability', - 'file': 'Analysis_stability.ratios.real_data.relative_error.py' + 'file': 'run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py' }, ] From cba80472792d383365c0eb0915903acac62fabdf Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 2 Feb 2021 17:13:11 +0100 Subject: [PATCH 09/35] [Exp] Update computation of ged stability. --- .../edit_costs.real_data.nums_sols.ratios.IPFP.py | 34 ++-- gklearn/experiments/ged/stability/utils.py | 177 +++++++++++++++------ 2 files changed, 150 insertions(+), 61 deletions(-) diff --git a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py index 33c6973..aa08579 100644 --- a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py @@ -13,7 +13,7 @@ import pickle import logging from gklearn.ged.util import compute_geds import time -from utils import get_dataset, set_edit_cost_consts +from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation import sys from group_results import group_trials, check_group_existence, update_group_marker @@ -37,7 +37,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): # the distance between non-symbolic node/edge labels is computed by euclidean distance. 'attr_distance': 'euclidean', 'ratio_runs_from_initial_solutions': 0.25, - # parallel threads. Do not work if mpg_options['parallel'] = False. + # parallel threads. Set to 1 automatically if parallel=True in compute_geds(). 'threads': multiprocessing.cpu_count(), 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' } @@ -98,7 +98,7 @@ def save_trials_as_group(dataset, ds_name, num_solutions, ratio): ged_mats.append(ged_mat) runtimes.append(runtime) - # Group trials and Remove single files. + # Group trials and remove single files. # @todo: if the program stops between the following lines, then there may be errors. name_prefix = 'ged_matrix' + name_middle group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) @@ -111,21 +111,25 @@ def results_for_a_dataset(ds_name): """**1. 
Get dataset.**""" dataset = get_dataset(ds_name) - for ratio in ratio_list: + for params in list(param_grid): print() - print('Ratio:', ratio) - for num_solutions in num_solutions_list: - print() - print('# of solutions:', num_solutions) - save_trials_as_group(dataset, ds_name, num_solutions, ratio) + print(params) + save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio']) -def get_param_lists(ds_name, test=False): - if test: - num_solutions_list = [1, 10, 20, 30, 40, 50] +def get_param_lists(ds_name, mode='test'): + if mode == 'test': + num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] ratio_list = [10] return num_solutions_list, ratio_list + elif mode == 'simple': + from sklearn.model_selection import ParameterGrid + param_grid = ParameterGrid([ + {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]}, + {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]) +# print(list(param_grid)) + if ds_name == 'AIDS_symb': num_solutions_list = [1, 20, 40, 60, 80, 100] ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] @@ -133,7 +137,7 @@ def get_param_lists(ds_name, test=False): num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] - return num_solutions_list, ratio_list + return param_grid if __name__ == '__main__': @@ -141,7 +145,7 @@ if __name__ == '__main__': ds_name_list = sys.argv[1:] else: ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] -# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled'] +# ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' @@ -151,5 +155,5 @@ if __name__ == '__main__': for ds_name in ds_name_list: print() print('Dataset:', ds_name) - num_solutions_list, ratio_list = get_param_lists(ds_name, test=False) + param_grid = get_param_lists(ds_name, mode='simple') results_for_a_dataset(ds_name) diff --git a/gklearn/experiments/ged/stability/utils.py b/gklearn/experiments/ged/stability/utils.py index cbb45b1..e743b27 100644 --- a/gklearn/experiments/ged/stability/utils.py +++ b/gklearn/experiments/ged/stability/utils.py @@ -16,12 +16,12 @@ from gklearn.experiments import DATASET_ROOT def get_dataset(ds_name): # The node/edge labels that will not be used in the computation. -# if ds_name == 'MAO': -# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} -# if ds_name == 'Monoterpenoides': -# irrelevant_labels = {'edge_labels': ['valence']} -# elif ds_name == 'MUTAG': -# irrelevant_labels = {'edge_labels': ['label_0']} +# if ds_name == 'MAO': +# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} +# if ds_name == 'Monoterpenoides': +# irrelevant_labels = {'edge_labels': ['valence']} +# elif ds_name == 'MUTAG': +# irrelevant_labels = {'edge_labels': ['label_0']} if ds_name == 'AIDS_symb': irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']} ds_name = 'AIDS' @@ -49,34 +49,36 @@ def set_edit_cost_consts(ratio, node_labeled=True, edge_labeled=True, mode='unif def nested_keys_exists(element, *keys): - ''' - Check if *keys (nested) exists in `element` (dict). 
- ''' - if not isinstance(element, dict): - raise AttributeError('keys_exists() expects dict as first argument.') - if len(keys) == 0: - raise AttributeError('keys_exists() expects at least two arguments, one given.') - - _element = element - for key in keys: - try: - _element = _element[key] - except KeyError: - return False - return True - + ''' + Check if *keys (nested) exists in `element` (dict). + ''' + if not isinstance(element, dict): + raise AttributeError('keys_exists() expects dict as first argument.') + if len(keys) == 0: + raise AttributeError('keys_exists() expects at least two arguments, one given.') + + _element = element + for key in keys: + try: + _element = _element[key] + except KeyError: + return False + return True # Check average relative error along elements in two ged matrices. def matrices_ave_relative_error(m1, m2): - error = 0 - base = 0 - for i in range(m1.shape[0]): - for j in range(m1.shape[1]): - error += np.abs(m1[i, j] - m2[i, j]) - base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2 + error = 0 + base = 0 + for i in range(m1.shape[0]): + for j in range(m1.shape[1]): + error += np.abs(m1[i, j] - m2[i, j]) +# base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) + base += (m1[i, j] + m2[i, j]) # Require only 25% of the time of "base += (np.abs(m1[i, j]) + np.abs(m2[i, j]))". - return error / base + base = base / 2 + + return error / base def compute_relative_error(ged_mats): @@ -92,9 +94,9 @@ def compute_relative_error(ged_mats): errors = [] for i, mat in enumerate(ged_mats): err = matrices_ave_relative_error(mat, ged_mat_s) - # if not per_correct: - # print('matrix # ', str(i)) - # pass + # if not per_correct: + # print('matrix # ', str(i)) + # pass errors.append(err) else: errors = [0] @@ -107,11 +109,11 @@ def parse_group_file_name(fn): key1 = splits_all[1] pos2 = splits_all[2].rfind('_') -# key2 = splits_all[2][:pos2] +# key2 = splits_all[2][:pos2] val2 = splits_all[2][pos2+1:] pos3 = splits_all[3].rfind('_') -# key3 = splits_all[3][:pos3] +# key3 = splits_all[3][:pos3] val3 = splits_all[3][pos3+1:] + '.' 
+ splits_all[4] return key1, val2, val3 @@ -232,7 +234,7 @@ def set_axis_style(ax): ax.tick_params(labelsize=8, color='w', pad=1, grid_color='w') ax.tick_params(axis='x', pad=-2) ax.tick_params(axis='y', labelrotation=-40, pad=-2) -# ax.zaxis._axinfo['juggled'] = (1, 2, 0) +# ax.zaxis._axinfo['juggled'] = (1, 2, 0) ax.set_xlabel(ax.get_xlabel(), fontsize=10, labelpad=-3) ax.set_ylabel(ax.get_ylabel(), fontsize=10, labelpad=-2, rotation=50) ax.set_zlabel(ax.get_zlabel(), fontsize=10, labelpad=-2) @@ -240,16 +242,99 @@ def set_axis_style(ax): return +def dichotomous_permutation(arr, layer=0): + import math + +# def seperate_arr(arr, new_arr): +# if (length % 2) == 0: +# half = int(length / 2) +# new_arr += [arr[half - 1], arr[half]] +# subarr1 = [arr[i] for i in range(1, half - 1)] +# else: +# half = math.floor(length / 2) +# new_arr.append(arr[half]) +# subarr1 = [arr[i] for i in range(1, half)] +# subarr2 = [arr[i] for i in range(half + 1, length - 1)] +# subarrs = [subarr1, subarr2] +# return subarrs + + + if layer == 0: + length = len(arr) + if length <= 2: + return arr + + new_arr = [arr[0], arr[-1]] + if (length % 2) == 0: + half = int(length / 2) + new_arr += [arr[half - 1], arr[half]] + subarr1 = [arr[i] for i in range(1, half - 1)] + else: + half = math.floor(length / 2) + new_arr.append(arr[half]) + subarr1 = [arr[i] for i in range(1, half)] + subarr2 = [arr[i] for i in range(half + 1, length - 1)] + subarrs = [subarr1, subarr2] +# subarrs = seperate_arr(arr, new_arr) + new_arr += dichotomous_permutation(subarrs, layer=layer+1) + + else: + new_arr = [] + subarrs = [] + for a in arr: + length = len(a) + if length <= 2: + new_arr += a + else: +# subarrs += seperate_arr(a, new_arr) + if (length % 2) == 0: + half = int(length / 2) + new_arr += [a[half - 1], a[half]] + subarr1 = [a[i] for i in range(0, half - 1)] + else: + half = math.floor(length / 2) + new_arr.append(a[half]) + subarr1 = [a[i] for i in range(0, half)] + subarr2 = [a[i] for i in range(half + 1, length)] + subarrs += [subarr1, subarr2] + + if len(subarrs) > 0: + new_arr += dichotomous_permutation(subarrs, layer=layer+1) + + return new_arr + +# length = len(arr) +# if length <= 2: +# return arr + +# new_arr = [arr[0], arr[-1]] +# if (length % 2) == 0: +# half = int(length / 2) +# new_arr += [arr[half - 1], arr[half]] +# subarr1 = [arr[i] for i in range(1, half - 1)] +# else: +# half = math.floor(length / 2) +# new_arr.append(arr[half]) +# subarr1 = [arr[i] for i in range(1, half)] +# subarr2 = [arr[i] for i in range(half + 1, length - 1)] +# if len(subarr1) > 0: +# new_arr += dichotomous_permutation(subarr1) +# if len(subarr2) > 0: +# new_arr += dichotomous_permutation(subarr2) + +# return new_arr + + if __name__ == '__main__': root_dir = 'outputs/CRIANN/' -# for dir_ in sorted(os.listdir(root_dir)): -# if os.path.isdir(root_dir): -# full_dir = os.path.join(root_dir, dir_) -# print('---', full_dir,':') -# save_dir = os.path.join(full_dir, 'groups/') -# if os.path.exists(save_dir): -# try: -# get_relative_errors(save_dir) -# except Exception as exp: -# print('An exception occured when running this experiment:') -# print(repr(exp)) \ No newline at end of file +# for dir_ in sorted(os.listdir(root_dir)): +# if os.path.isdir(root_dir): +# full_dir = os.path.join(root_dir, dir_) +# print('---', full_dir,':') +# save_dir = os.path.join(full_dir, 'groups/') +# if os.path.exists(save_dir): +# try: +# get_relative_errors(save_dir) +# except Exception as exp: +# print('An exception occured when running this experiment:') +# 
print(repr(exp)) \ No newline at end of file From 25e10c8d49633ce08ae419e91f19e68dfeca54d1 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 2 Feb 2021 17:14:57 +0100 Subject: [PATCH 10/35] [Exp] Update exceptions in fcsp exps. --- gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py index 5568dd7..ba74045 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py @@ -27,6 +27,7 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('StructuralSP', 'COX2', 'False'), ('ShortestPath', 'DHFR', 'False'), ('StructuralSP', 'DHFR', 'False'), + ('ShortestPath', 'MCF-7', 'True'), ('StructuralSP', 'OHSU', 'True'), ('StructuralSP', 'OHSU', 'False'), ('StructuralSP', 'Steroid', 'False'), @@ -49,6 +50,8 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('StructuralSP', 'REDDIT-BINARY', 'True'), ('StructuralSP', 'REDDIT-BINARY', 'False'), ('StructuralSP', 'Vitamin_D', 'False'), + ('ShortestPath', 'Web', 'True'), + ('ShortestPath', 'Web', 'False'), }) OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'), From 27de2454a932792dddd17b40b6b3ef0ccce0fbbe Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 9 Feb 2021 10:09:30 +0100 Subject: [PATCH 11/35] [Exp] Update exceptions in fcsp exps. --- gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py index ba74045..187f5b4 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py @@ -28,6 +28,7 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('ShortestPath', 'DHFR', 'False'), ('StructuralSP', 'DHFR', 'False'), ('ShortestPath', 'MCF-7', 'True'), + ('ShortestPath', 'MCF-7H', 'True'), ('StructuralSP', 'OHSU', 'True'), ('StructuralSP', 'OHSU', 'False'), ('StructuralSP', 'Steroid', 'False'), From 6e898240e3e75f6e9fa99fbb277a319fe0e206bb Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 15 Feb 2021 11:47:17 +0100 Subject: [PATCH 12/35] [Exp] Update exceptions in fcsp exps. 
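OUT_TIME_LIST is a set of (kernel, dataset, True/False switch) tuples, so the repeated MOLT-4 lines added below collapse to a single entry each. A cut-down sketch of how such a set is presumably consumed, via a plain membership test; the stand-in set, the flag's meaning (taken here to be the FCSP switch) and the skip message are illustrative only:

    # (kernel_name, dataset_name, FCSP flag as a string) per known-too-slow task.
    OUT_TIME_LIST = {('ShortestPath', 'MOLT-4', 'False'),
                     ('StructuralSP', 'MOLT-4', 'True')}

    task = ('StructuralSP', 'MOLT-4', 'True')
    if task in OUT_TIME_LIST:
        print('skip: known to exceed the time limit')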
--- .../experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py index 187f5b4..2de6b4d 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py @@ -46,6 +46,12 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('StructuralSP', 'COIL-DEL', 'False'), ('ShortestPath', 'PROTEINS', 'False'), ('ShortestPath', 'PROTEINS_full', 'False'), + ('ShortestPath', 'MOLT-4', 'False'), + ('StructuralSP', 'MOLT-4', 'True'), + ('StructuralSP', 'MOLT-4', 'False'), + ('ShortestPath', 'MOLT-4', 'True'), + ('ShortestPath', 'MOLT-4', 'False'), + ('StructuralSP', 'MOLT-4', 'True'), ('StructuralSP', 'Mutagenicity', 'True'), ('StructuralSP', 'Mutagenicity', 'False'), ('StructuralSP', 'REDDIT-BINARY', 'True'), From 13c0fb37c1ae6ef6345dd3fce3df67a5043303b7 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 12 Mar 2021 11:37:41 +0100 Subject: [PATCH 13/35] [Exp] Update exceptions in fcsp exps. --- .../thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py index 2de6b4d..5944f58 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py @@ -28,7 +28,13 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('ShortestPath', 'DHFR', 'False'), ('StructuralSP', 'DHFR', 'False'), ('ShortestPath', 'MCF-7', 'True'), + ('ShortestPath', 'MCF-7', 'False'), + ('StructuralSP', 'MCF-7', 'True'), + ('StructuralSP', 'MCF-7', 'False'), ('ShortestPath', 'MCF-7H', 'True'), + ('ShortestPath', 'MCF-7H', 'False'), + ('StructuralSP', 'MCF-7H', 'True'), + ('StructuralSP', 'MCF-7H', 'False'), ('StructuralSP', 'OHSU', 'True'), ('StructuralSP', 'OHSU', 'False'), ('StructuralSP', 'Steroid', 'False'), @@ -49,9 +55,10 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('ShortestPath', 'MOLT-4', 'False'), ('StructuralSP', 'MOLT-4', 'True'), ('StructuralSP', 'MOLT-4', 'False'), - ('ShortestPath', 'MOLT-4', 'True'), - ('ShortestPath', 'MOLT-4', 'False'), - ('StructuralSP', 'MOLT-4', 'True'), + ('ShortestPath', 'MOLT-4H', 'True'), + ('ShortestPath', 'MOLT-4H', 'False'), + ('StructuralSP', 'MOLT-4H', 'True'), + ('StructuralSP', 'MOLT-4H', 'False'), ('StructuralSP', 'Mutagenicity', 'True'), ('StructuralSP', 'Mutagenicity', 'False'), ('StructuralSP', 'REDDIT-BINARY', 'True'), From 29bcfe4dabe0550e4dbf5f8842a345c7640d4de6 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 30 Mar 2021 11:23:56 +0200 Subject: [PATCH 14/35] [Exp] Update exceptions in fcsp exps. 
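Most of the new entries cover every (kernel, dataset, True/False) combination for a given dataset; one way such blocks could be generated instead of written out by hand (a sketch only, with a stand-in for the real set and only the dataset names touched below):

    from itertools import product

    OUT_TIME_LIST = set()   # stand-in for the set defined at the top of the script
    OUT_TIME_LIST |= set(product(['ShortestPath', 'StructuralSP'],
                                 ['NCI-H23', 'NCI-H23H', 'MOLT-4', 'MOLT-4H'],
                                 ['True', 'False']))
    print(len(OUT_TIME_LIST))  # 16 entries, one per combination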
--- .../graph_kernels/fcsp/run_jobs_compare_fcsp.py | 24 +++++++++++++++------- .../fcsp/run_jobs_compare_fcsp_space.py | 2 ++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py index 5944f58..9e9af66 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py @@ -19,7 +19,15 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('StructuralSP', 'NCI1', 'False'), ('ShortestPath', 'NCI109', 'False'), ('StructuralSP', 'NCI109', 'True'), + ('ShortestPath', 'NCI-H23', 'True'), + ('ShortestPath', 'NCI-H23', 'False'), + ('StructuralSP', 'NCI-H23', 'True'), + ('StructuralSP', 'NCI-H23', 'False'), ('StructuralSP', 'NCI109', 'False'), + ('ShortestPath', 'NCI-H23H', 'True'), + ('ShortestPath', 'NCI-H23H', 'False'), + ('StructuralSP', 'NCI-H23H', 'True'), + ('StructuralSP', 'NCI-H23H', 'False'), ('ShortestPath', 'DD', 'True'), ('ShortestPath', 'DD', 'False'), ('StructuralSP', 'BZR', 'False'), @@ -35,8 +43,17 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('ShortestPath', 'MCF-7H', 'False'), ('StructuralSP', 'MCF-7H', 'True'), ('StructuralSP', 'MCF-7H', 'False'), + ('ShortestPath', 'MOLT-4', 'True'), + ('ShortestPath', 'MOLT-4', 'False'), + ('StructuralSP', 'MOLT-4', 'True'), + ('StructuralSP', 'MOLT-4', 'False'), + ('ShortestPath', 'MOLT-4H', 'True'), + ('ShortestPath', 'MOLT-4H', 'False'), + ('StructuralSP', 'MOLT-4H', 'True'), + ('StructuralSP', 'MOLT-4H', 'False'), ('StructuralSP', 'OHSU', 'True'), ('StructuralSP', 'OHSU', 'False'), + ('ShortestPath', 'OVCAR-8', 'True'), ('StructuralSP', 'Steroid', 'False'), ('ShortestPath', 'SYNTHETIC', 'False'), ('StructuralSP', 'SYNTHETIC', 'True'), @@ -52,13 +69,6 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('StructuralSP', 'COIL-DEL', 'False'), ('ShortestPath', 'PROTEINS', 'False'), ('ShortestPath', 'PROTEINS_full', 'False'), - ('ShortestPath', 'MOLT-4', 'False'), - ('StructuralSP', 'MOLT-4', 'True'), - ('StructuralSP', 'MOLT-4', 'False'), - ('ShortestPath', 'MOLT-4H', 'True'), - ('ShortestPath', 'MOLT-4H', 'False'), - ('StructuralSP', 'MOLT-4H', 'True'), - ('StructuralSP', 'MOLT-4H', 'False'), ('StructuralSP', 'Mutagenicity', 'True'), ('StructuralSP', 'Mutagenicity', 'False'), ('StructuralSP', 'REDDIT-BINARY', 'True'), diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py index 35a6d63..c608047 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py @@ -17,6 +17,7 @@ OUT_TIME_LIST = [] OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), ('ShortestPath', 'REDDIT-BINARY', 'False'), ('StructuralSP', 'ENZYMES', 'False'), + ('StructuralSP', 'AIDS', 'False'), ('ShortestPath', 'DD', 'True'), ('ShortestPath', 'DD', 'False'), ('StructuralSP', 'DD', 'True'), @@ -55,6 +56,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), ('ShortestPath', 'P388H', 'False'), ('StructuralSP', 'P388H', 'True'), ('StructuralSP', 'P388H', 'False'), + ('StructuralSP', 'NCI1', 'False'), ('ShortestPath', 'NCI-H23', 'True'), ('ShortestPath', 'NCI-H23', 'False'), ('StructuralSP', 'NCI-H23', 'True'), From 14bd9db5a7c7fd8f7783beb4bc66bfcf3801ab9b Mon 
Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 4 May 2021 11:33:40 +0200 Subject: [PATCH 15/35] [Exp] Update exceptions in fcsp exps. --- .../thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py index 9e9af66..e9ced4d 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py @@ -54,6 +54,16 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), ('StructuralSP', 'OHSU', 'True'), ('StructuralSP', 'OHSU', 'False'), ('ShortestPath', 'OVCAR-8', 'True'), + ('ShortestPath', 'OVCAR-8', 'False'), + ('StructuralSP', 'OVCAR-8', 'True'), + ('StructuralSP', 'OVCAR-8', 'False'), + ('ShortestPath', 'OVCAR-8H', 'True'), + ('ShortestPath', 'OVCAR-8H', 'False'), + ('StructuralSP', 'OVCAR-8H', 'True'), + ('StructuralSP', 'OVCAR-8H', 'False'), + ('ShortestPath', 'P388', 'False'), + ('ShortestPath', 'P388', 'True'), + ('StructuralSP', 'P388', 'True'), ('StructuralSP', 'Steroid', 'False'), ('ShortestPath', 'SYNTHETIC', 'False'), ('StructuralSP', 'SYNTHETIC', 'True'), From 81a881bd79fa1407c79c787f60c45294b53bb040 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 4 May 2021 12:18:13 +0200 Subject: [PATCH 16/35] [Exp] Update exceptions in fcsp exps. --- .../thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py index c608047..9685d12 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py @@ -210,11 +210,12 @@ def check_task_status(save_dir, *params): # Check if the task is already computed. file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl') - if os.path.isfile(file_name): - with open(file_name, 'rb') as f: - data = pickle.load(f) - if data['completed']: - return True + if os.path.getsize(file_name) > 0: + if os.path.isfile(file_name): + with open(file_name, 'rb') as f: + data = pickle.load(f) + if data['completed']: + return True return False From 679953200c8ad1559cc0fe9e245379b611df2a5e Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 10 May 2021 10:39:27 +0200 Subject: [PATCH 17/35] [Exp] Update exceptions in fcsp exps. 
--- .../experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py index 9685d12..228b89b 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py @@ -65,6 +65,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), ('ShortestPath', 'NCI-H23H', 'False'), ('StructuralSP', 'NCI-H23H', 'True'), ('StructuralSP', 'NCI-H23H', 'False'), + ('StructuralSP', 'OHSU', 'False'), ('ShortestPath', 'OVCAR-8', 'True'), ('ShortestPath', 'OVCAR-8', 'False'), ('StructuralSP', 'OVCAR-8', 'True'), From d08eddc69c94dc8bae7adbda8aebc85669a19964 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 21 May 2021 17:32:18 +0200 Subject: [PATCH 18/35] [Enhancement] The global variable now uses kernel classes as values. --- gklearn/kernels/__init__.py | 5 +++-- gklearn/kernels/metadata.py | 34 +++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/gklearn/kernels/__init__.py b/gklearn/kernels/__init__.py index 6ffef06..753715e 100644 --- a/gklearn/kernels/__init__.py +++ b/gklearn/kernels/__init__.py @@ -7,7 +7,6 @@ __version__ = "0.1" __author__ = "Linlin Jia" __date__ = "November 2018" -from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels from gklearn.kernels.graph_kernel import GraphKernel from gklearn.kernels.common_walk import CommonWalk @@ -24,6 +23,8 @@ from gklearn.kernels.path_up_to_h import PathUpToH from gklearn.kernels.treelet import Treelet from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree +from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels + # old version. from gklearn.kernels.commonWalkKernel import commonwalkkernel from gklearn.kernels.marginalizedKernel import marginalizedkernel @@ -32,4 +33,4 @@ from gklearn.kernels.spKernel import spkernel from gklearn.kernels.structuralspKernel import structuralspkernel from gklearn.kernels.untilHPathKernel import untilhpathkernel from gklearn.kernels.treeletKernel import treeletkernel -from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel \ No newline at end of file +from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel diff --git a/gklearn/kernels/metadata.py b/gklearn/kernels/metadata.py index 188fc56..e3a5fde 100644 --- a/gklearn/kernels/metadata.py +++ b/gklearn/kernels/metadata.py @@ -5,23 +5,35 @@ Created on Fri Nov 6 10:11:08 2020 @author: ljia """ +from gklearn.kernels.common_walk import CommonWalk +from gklearn.kernels.marginalized import Marginalized +from gklearn.kernels.sylvester_equation import SylvesterEquation +from gklearn.kernels.conjugate_gradient import ConjugateGradient +from gklearn.kernels.fixed_point import FixedPoint +from gklearn.kernels.spectral_decomposition import SpectralDecomposition +from gklearn.kernels.shortest_path import ShortestPath +from gklearn.kernels.structural_sp import StructuralSP +from gklearn.kernels.path_up_to_h import PathUpToH +from gklearn.kernels.treelet import Treelet +from gklearn.kernels.weisfeiler_lehman import WLSubtree + # The metadata of all graph kernels. GRAPH_KERNELS = { ### based on walks. 
- 'common walk': '', - 'marginalized': '', - 'sylvester equation': '', - 'fixed point': '', - 'conjugate gradient': '', - 'spectral decomposition': '', + 'common walk': CommonWalk, + 'marginalized': Marginalized, + 'sylvester equation': SylvesterEquation, + 'fixed point': FixedPoint, + 'conjugate gradient': ConjugateGradient, + 'spectral decomposition': SpectralDecomposition, ### based on paths. - 'shortest path': '', - 'structural shortest path': '', - 'path up to length h': '', + 'shortest path': ShortestPath, + 'structural shortest path': StructuralSP, + 'path up to length h': PathUpToH, ### based on non-linear patterns. - 'weisfeiler-lehman subtree': '', - 'treelet': '', + 'weisfeiler-lehman subtree': WLSubtree, + 'treelet': Treelet, } From bb36d0f50779dc517fcacc5f74663f5c38dc872f Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 21 May 2021 17:51:33 +0200 Subject: [PATCH 19/35] [Major Feature] Graph kernel classes now can compute kernel matrix between two different list of graphs using fit/transform methods which uses the same scheme as the scikit-learn library! --- gklearn/kernels/common_walk.py | 10 +- gklearn/kernels/conjugate_gradient.py | 26 +- gklearn/kernels/fixed_point.py | 26 +- gklearn/kernels/graph_kernel.py | 413 +++++++++++++++++++++++++++--- gklearn/kernels/marginalized.py | 28 +- gklearn/kernels/path_up_to_h.py | 28 +- gklearn/kernels/shortest_path.py | 28 +- gklearn/kernels/spectral_decomposition.py | 38 +-- gklearn/kernels/structural_sp.py | 28 +- gklearn/kernels/sylvester_equation.py | 36 +-- gklearn/kernels/treelet.py | 400 ++++++++++++++++++++++------- gklearn/kernels/weisfeiler_lehman.py | 18 +- 12 files changed, 825 insertions(+), 254 deletions(-) diff --git a/gklearn/kernels/common_walk.py b/gklearn/kernels/common_walk.py index ac3363e..ea0e59f 100644 --- a/gklearn/kernels/common_walk.py +++ b/gklearn/kernels/common_walk.py @@ -47,7 +47,7 @@ class CommonWalk(GraphKernel): itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, - length=len_itr, verbose=(self._verbose >= 2)) + length=len_itr, verbose=(self.verbose >= 2)) # direct product graph method - exponential if self._compute_method == 'exp': @@ -86,7 +86,7 @@ class CommonWalk(GraphKernel): do_fun = self._wrapper_kernel_do_geo parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -100,9 +100,9 @@ class CommonWalk(GraphKernel): # compute kernel list. 
kernel_list = [None] * len(g_list) - if self._verbose >= 2: + if self.verbose >= 2: iterator = get_iters(range(len(g_list)), desc='Computing kernels', - file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) else: iterator = range(len(g_list)) @@ -148,7 +148,7 @@ class CommonWalk(GraphKernel): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/conjugate_gradient.py b/gklearn/kernels/conjugate_gradient.py index b162f20..eb5e428 100644 --- a/gklearn/kernels/conjugate_gradient.py +++ b/gklearn/kernels/conjugate_gradient.py @@ -35,7 +35,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_gm_series(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) lmda = self._weight @@ -44,7 +44,7 @@ class ConjugateGradient(RandomWalkMeta): gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) # Reindex nodes using consecutive integers for the convenience of kernel computation. - iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. @@ -52,7 +52,7 @@ class ConjugateGradient(RandomWalkMeta): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) - iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) @@ -66,7 +66,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) # Compute Gram matrix. @@ -74,7 +74,7 @@ class ConjugateGradient(RandomWalkMeta): # @todo: parallel this. # Reindex nodes using consecutive integers for the convenience of kernel computation. - iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. 
@@ -86,7 +86,7 @@ class ConjugateGradient(RandomWalkMeta): do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) else: # @todo pass @@ -95,7 +95,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) lmda = self._weight @@ -105,11 +105,11 @@ class ConjugateGradient(RandomWalkMeta): # Reindex nodes using consecutive integers for the convenience of kernel computation. g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') - iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(g1, g_list[i], lmda) @@ -122,7 +122,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) # compute kernel list. @@ -131,7 +131,7 @@ class ConjugateGradient(RandomWalkMeta): # Reindex nodes using consecutive integers for the convenience of kernel computation. g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') # @todo: parallel this. - iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. 
@@ -149,7 +149,7 @@ class ConjugateGradient(RandomWalkMeta): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) else: # @todo pass @@ -162,7 +162,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2], self._verbose) + self._check_edge_weight([g1] + [g2], self.verbose) self._check_graphs([g1] + [g2]) lmda = self._weight diff --git a/gklearn/kernels/fixed_point.py b/gklearn/kernels/fixed_point.py index 12d8fe7..ced5430 100644 --- a/gklearn/kernels/fixed_point.py +++ b/gklearn/kernels/fixed_point.py @@ -35,7 +35,7 @@ class FixedPoint(RandomWalkMeta): def _compute_gm_series(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) lmda = self._weight @@ -44,7 +44,7 @@ class FixedPoint(RandomWalkMeta): gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) # Reindex nodes using consecutive integers for the convenience of kernel computation. - iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self.verbose >= 2)) self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. @@ -52,7 +52,7 @@ class FixedPoint(RandomWalkMeta): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) - iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) @@ -66,7 +66,7 @@ class FixedPoint(RandomWalkMeta): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) # Compute Gram matrix. @@ -74,7 +74,7 @@ class FixedPoint(RandomWalkMeta): # @todo: parallel this. # Reindex nodes using consecutive integers for the convenience of kernel computation. - iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. 
@@ -86,7 +86,7 @@ class FixedPoint(RandomWalkMeta): do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) else: # @todo pass @@ -95,7 +95,7 @@ class FixedPoint(RandomWalkMeta): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) lmda = self._weight @@ -105,12 +105,12 @@ class FixedPoint(RandomWalkMeta): # Reindex nodes using consecutive integers for the convenience of kernel computation. g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') - iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(g1, g_list[i], lmda) @@ -123,7 +123,7 @@ class FixedPoint(RandomWalkMeta): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) # compute kernel list. @@ -132,7 +132,7 @@ class FixedPoint(RandomWalkMeta): # Reindex nodes using consecutive integers for the convenience of kernel computation. g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') # @todo: parallel this. - iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. 
@@ -150,7 +150,7 @@ class FixedPoint(RandomWalkMeta): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) else: # @todo pass @@ -163,7 +163,7 @@ class FixedPoint(RandomWalkMeta): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2], self._verbose) + self._check_edge_weight([g1] + [g2], self.verbose) self._check_graphs([g1] + [g2]) lmda = self._weight diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index 2692713..90a0906 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -9,27 +9,372 @@ import numpy as np import networkx as nx import multiprocessing import time +# from abc import ABC, abstractmethod +from sklearn.base import BaseEstimator # , TransformerMixin +from sklearn.utils.validation import check_is_fitted # check_X_y, check_array, +from sklearn.exceptions import NotFittedError from gklearn.utils import normalize_gram_matrix -class GraphKernel(object): +class GraphKernel(BaseEstimator): #, ABC): + """The basic graph kernel class. - def __init__(self): - self._graphs = None - self._parallel = '' - self._n_jobs = 0 - self._verbose = None - self._normalize = True - self._run_time = 0 - self._gram_matrix = None - self._gram_matrix_unnorm = None + Attributes + ---------- + _graphs : list + Stores the input graphs on fit input data. + Default format of the list objects is `NetworkX` graphs. + **We don't guarantee that the input graphs remain unchanged during the + computation.** + + References + ---------- + https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. + """ + + def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2): + """`__init__` for `GraphKernel` object.""" + # @todo: the default settings of the parameters are different from those in the self.compute method. +# self._graphs = None + self.parallel = parallel + self.n_jobs = n_jobs + self.chunksize = chunksize + self.normalize = normalize + self.verbose = verbose +# self._run_time = 0 +# self._gram_matrix = None +# self._gram_matrix_unnorm = None + + + ########################################################################## + # The following is the 1st paradigm to compute kernel matrix, which is + # compatible with `scikit-learn`. + # ------------------------------------------------------------------- + # Special thanks to the "GraKeL" library for providing an excellent template! + ########################################################################## + + + def fit(self, X, y=None): + """Fit a graph dataset for a transformer. + + Parameters + ---------- + X : iterable + DESCRIPTION. + + y : None, optional + There is no need of a target in a transformer, yet the `scikit-learn` + pipeline API requires this parameter. + + Returns + ------- + object + Returns self. + + """ +# self._is_tranformed = False + + # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; + self.clear_attributes() + +# X = check_array(X, accept_sparse=True) + + # Validate parameters for the transformer. + self.validate_parameters() + + # Validate the input. + self._graphs = self.validate_input(X) + +# self._X = X +# self._kernel = self._get_kernel_instance() + + # Return the transformer. 
+ return self + + + def transform(self, X): + """Compute the graph kernel matrix between given and fitted data. + + Parameters + ---------- + X : TYPE + DESCRIPTION. + + Raises + ------ + ValueError + DESCRIPTION. + + Returns + ------- + None. + + """ + # Check if method "fit" had been called. + check_is_fitted(self, '_graphs') + + # Validate the input. + Y = self.validate_input(X) + + # Transform: compute the graph kernel matrix. + kernel_matrix = self.compute_kernel_matrix(Y) + self._Y = Y + + # Self transform must appear before the diagonal call on normilization. + self._is_transformed = True + if self.normalize: + X_diag, Y_diag = self.diagonals() + kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) + + return kernel_matrix + + + + def fit_transform(self, X): + """Fit and transform: compute Gram matrix on the same data. + + Parameters + ---------- + X : list of graphs + Input graphs. + + Returns + ------- + gram_matrix : numpy array, shape = [len(X), len(X)] + The Gram matrix of X. + + """ + self.fit(X) + + # Transform: compute Gram matrix. + gram_matrix = self.compute_kernel_matrix() + + # Normalize. + self._X_diag = np.diagonal(gram_matrix).copy() + if self.normalize: + gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) + + return gram_matrix + + + def get_params(self): + pass + + + def set_params(self): + pass + + + def clear_attributes(self): + if hasattr(self, '_X_diag'): + delattr(self, '_X_diag') + if hasattr(self, '_graphs'): + delattr(self, '_graphs') + if hasattr(self, '_Y'): + delattr(self, '_Y') + if hasattr(self, '_run_time'): + delattr(self, '_run_time') + + + def validate_parameters(self): + """Validate all parameters for the transformer. + + Returns + ------- + None. + + """ + if self.parallel is not None and self.parallel != 'imap_unordered': + raise ValueError('Parallel mode is not set correctly.') + + if self.parallel == 'imap_unordered' and self.n_jobs is None: + self.n_jobs = multiprocessing.cpu_count() + + + def validate_input(self, X): + """Validate the given input and raise errors if it is invalid. + + Parameters + ---------- + X : list + The input to check. Should be a list of graph. + + Raises + ------ + ValueError + Raise if the input is not correct. + + Returns + ------- + X : list + The input. A list of graph. + + """ + if X is None: + raise ValueError('Please add graphs before computing.') + elif not isinstance(X, list): + raise ValueError('Cannot detect graphs.') + elif len(X) == 0: + raise ValueError('The graph list given is empty. No computation will be performed.') + + return X + + + def compute_kernel_matrix(self, Y=None): + """Compute the kernel matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) or the Gram matrix for the fitted + graphs (X / self._graphs). + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. The default is None. If None kernel is computed + between X and itself. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + if Y is None: + # Compute Gram matrix for self._graphs (X). + kernel_matrix = self._compute_gram_matrix() +# self._gram_matrix_unnorm = np.copy(self._gram_matrix) + + else: + # Compute kernel matrix between Y and self._graphs (X). 
+ start_time = time.time() + + if self.parallel == 'imap_unordered': + kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y) + + elif self.parallel is None: + kernel_matrix = self._compute_kernel_matrix_series(Y) + + self._run_time = time.time() - start_time + if self.verbose: + print('Kernel matrix of size (%d, %d) built in %s seconds.' + % (len(Y), len(self._graphs), self._run_time)) + + return kernel_matrix + + + def _compute_kernel_matrix_series(self, Y): + """Compute the kernel matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) without parallelization. + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + kernel_matrix = np.zeros((len(Y), len(self._graphs))) + + for i_y, g_y in enumerate(Y): + for i_x, g_x in enumerate(self._graphs): + kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x) + + return kernel_matrix + + + def _compute_kernel_matrix_imap_unordered(self, Y): + """Compute the kernel matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) using imap unordered parallelization. + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + raise Exception('Parallelization for kernel matrix is not implemented.') + + + def diagonals(self): + """Compute the kernel matrix diagonals of the fit/transformed data. + + Returns + ------- + X_diag : numpy array + The diagonal of the kernel matrix between the fitted data. + This consists of each element calculated with itself. + + Y_diag : numpy array + The diagonal of the kernel matrix, of the transform. + This consists of each element calculated with itself. + + """ + # Check if method "fit" had been called. + check_is_fitted(self, ['_graphs']) + + # Check if the diagonals of X exist. + try: + check_is_fitted(self, ['_X_diag']) + except NotFittedError: + # Compute diagonals of X. + self._X_diag = np.empty(shape=(len(self._graphs),)) + for i, x in enumerate(self._graphs): + self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? + + try: + # If transform has happened, return both diagonals. + check_is_fitted(self, ['_Y']) + self._Y_diag = np.empty(shape=(len(self._Y),)) + for (i, y) in enumerate(self._Y): + self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? + + return self._X_diag, self._Y_diag + except NotFittedError: + # Else just return both X_diag + return self._X_diag + + +# @abstractmethod + def pairwise_kernel(self, x, y): + """Compute pairwise kernel between two graphs. + + Parameters + ---------- + x, y : NetworkX Graph. + Graphs bewteen which the kernel is computed. + + Returns + ------- + kernel: float + The computed kernel. + +# Notes +# ----- +# This method is abstract and must be implemented by a subclass. + + """ + raise NotImplementedError('Pairwise kernel computation is not implemented!') + + + ########################################################################## + # The following is the 2nd paradigm to compute kernel matrix. It is + # simplified and not compatible with `scikit-learn`. 
+ ########################################################################## def compute(self, *graphs, **kwargs): - self._parallel = kwargs.get('parallel', 'imap_unordered') - self._n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) - self._normalize = kwargs.get('normalize', True) - self._verbose = kwargs.get('verbose', 2) + self.parallel = kwargs.get('parallel', 'imap_unordered') + self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) + self.normalize = kwargs.get('normalize', True) + self.verbose = kwargs.get('verbose', 2) + self.validate_parameters() if len(graphs) == 1: if not isinstance(graphs[0], list): @@ -40,7 +385,7 @@ class GraphKernel(object): self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. self._gram_matrix = self._compute_gram_matrix() self._gram_matrix_unnorm = np.copy(self._gram_matrix) - if self._normalize: + if self.normalize: self._gram_matrix = normalize_gram_matrix(self._gram_matrix) return self._gram_matrix, self._run_time @@ -103,15 +448,15 @@ class GraphKernel(object): def _compute_gram_matrix(self): start_time = time.time() - if self._parallel == 'imap_unordered': + if self.parallel == 'imap_unordered': gram_matrix = self._compute_gm_imap_unordered() - elif self._parallel is None: + elif self.parallel is None: gram_matrix = self._compute_gm_series() else: raise Exception('Parallel mode is not set correctly.') self._run_time = time.time() - start_time - if self._verbose: + if self.verbose: print('Gram matrix of size %d built in %s seconds.' % (len(self._graphs), self._run_time)) @@ -129,15 +474,15 @@ class GraphKernel(object): def _compute_kernel_list(self, g1, g_list): start_time = time.time() - if self._parallel == 'imap_unordered': + if self.parallel == 'imap_unordered': kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) - elif self._parallel is None: + elif self.parallel is None: kernel_list = self._compute_kernel_list_series(g1, g_list) else: raise Exception('Parallel mode is not set correctly.') self._run_time = time.time() - start_time - if self._verbose: + if self.verbose: print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.' % (len(g_list), self._run_time)) @@ -158,7 +503,7 @@ class GraphKernel(object): kernel = self._compute_single_kernel_series(g1, g2) self._run_time = time.time() - start_time - if self._verbose: + if self.verbose: print('Graph kernel bewteen two graphs built in %s seconds.' 
% (self._run_time)) return kernel @@ -185,24 +530,24 @@ class GraphKernel(object): return self._graphs - @property - def parallel(self): - return self._parallel +# @property +# def parallel(self): +# return self.parallel - @property - def n_jobs(self): - return self._n_jobs +# @property +# def n_jobs(self): +# return self.n_jobs - @property - def verbose(self): - return self._verbose +# @property +# def verbose(self): +# return self.verbose - @property - def normalize(self): - return self._normalize +# @property +# def normalize(self): +# return self.normalize @property diff --git a/gklearn/kernels/marginalized.py b/gklearn/kernels/marginalized.py index e3d70c6..d6c203e 100644 --- a/gklearn/kernels/marginalized.py +++ b/gklearn/kernels/marginalized.py @@ -46,7 +46,7 @@ class Marginalized(GraphKernel): self._add_dummy_labels(self._graphs) if self._remove_totters: - iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) # @todo: this may not work. self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] @@ -57,7 +57,7 @@ class Marginalized(GraphKernel): itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, - length=len_itr, verbose=(self._verbose >= 2)) + length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(self._graphs[i], self._graphs[j]) gram_matrix[i][j] = kernel @@ -70,16 +70,16 @@ class Marginalized(GraphKernel): self._add_dummy_labels(self._graphs) if self._remove_totters: - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = range(0, len(self._graphs)) - if len(self._graphs) < 100 * self._n_jobs: - chunksize = int(len(self._graphs) / self._n_jobs) + 1 + if len(self._graphs) < 100 * self.n_jobs: + chunksize = int(len(self._graphs) / self.n_jobs) + 1 else: chunksize = 100 remove_fun = self._wrapper_untotter iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), desc='removing tottering', file=sys.stdout, - length=len(self._graphs), verbose=(self._verbose >= 2)) + length=len(self._graphs), verbose=(self.verbose >= 2)) for i, g in iterator: self._graphs[i] = g pool.close() @@ -93,7 +93,7 @@ class Marginalized(GraphKernel): G_gn = gn_toshare do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -103,13 +103,13 @@ class Marginalized(GraphKernel): if self._remove_totters: g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. - iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) # @todo: this may not work. g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] # compute kernel list. 
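		# One kernel value per graph in g_list, each computed against g1.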
kernel_list = [None] * len(g_list) - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(g1, g_list[i]) kernel_list[i] = kernel @@ -122,16 +122,16 @@ class Marginalized(GraphKernel): if self._remove_totters: g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = range(0, len(g_list)) - if len(g_list) < 100 * self._n_jobs: - chunksize = int(len(g_list) / self._n_jobs) + 1 + if len(g_list) < 100 * self.n_jobs: + chunksize = int(len(g_list) / self.n_jobs) + 1 else: chunksize = 100 remove_fun = self._wrapper_untotter iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), desc='removing tottering', file=sys.stdout, - length=len(g_list), verbose=(self._verbose >= 2)) + length=len(g_list), verbose=(self.verbose >= 2)) for i, g in iterator: g_list[i] = g pool.close() @@ -151,7 +151,7 @@ class Marginalized(GraphKernel): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/path_up_to_h.py b/gklearn/kernels/path_up_to_h.py index afe3859..0c80931 100644 --- a/gklearn/kernels/path_up_to_h.py +++ b/gklearn/kernels/path_up_to_h.py @@ -41,10 +41,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None from itertools import combinations_with_replacement itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2) - iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self._verbose >= 2)) + iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self.verbose >= 2)) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator_kernel = get_iters(itr_kernel, desc='Computing kernels', - file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) @@ -69,10 +69,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None # get all paths of all graphs before computing kernels to save time, # but this may cost a lot of memory for large datasets. 
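		# The chunksize heuristic below sends batches of 100 graphs to each worker
		# for large datasets; smaller datasets are split roughly evenly across workers.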
- pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(self._graphs, range(0, len(self._graphs))) - if len(self._graphs) < 100 * self._n_jobs: - chunksize = int(len(self._graphs) / self._n_jobs) + 1 + if len(self._graphs) < 100 * self.n_jobs: + chunksize = int(len(self._graphs) / self.n_jobs) + 1 else: chunksize = 100 all_paths = [[] for _ in range(len(self._graphs))] @@ -84,7 +84,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), desc='getting paths', file=sys.stdout, - length=len(self._graphs), verbose=(self._verbose >= 2)) + length=len(self._graphs), verbose=(self.verbose >= 2)) for i, ps in iterator: all_paths[i] = ps pool.close() @@ -109,7 +109,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None G_plist = plist_toshare do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this? parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(all_paths,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -117,8 +117,8 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None def _compute_kernel_list_series(self, g1, g_list): self._add_dummy_labels(g_list + [g1]) - iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self._verbose >= 2)) - iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self.verbose >= 2)) + iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) kernel_list = [None] * len(g_list) @@ -143,10 +143,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None # get all paths of all graphs before computing kernels to save time, # but this may cost a lot of memory for large datasets. 
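		# paths_g_list[i] will hold the paths found for g_list[i] by the worker pool below.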
- pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(g_list, range(0, len(g_list))) - if len(g_list) < 100 * self._n_jobs: - chunksize = int(len(g_list) / self._n_jobs) + 1 + if len(g_list) < 100 * self.n_jobs: + chunksize = int(len(g_list) / self.n_jobs) + 1 else: chunksize = 100 paths_g_list = [[] for _ in range(len(g_list))] @@ -161,7 +161,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), desc='getting paths', file=sys.stdout, - length=len(g_list), verbose=(self._verbose >= 2)) + length=len(g_list), verbose=(self.verbose >= 2)) for i, ps in iterator: paths_g_list[i] = ps pool.close() @@ -180,7 +180,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/shortest_path.py b/gklearn/kernels/shortest_path.py index bfea553..0c5fccc 100644 --- a/gklearn/kernels/shortest_path.py +++ b/gklearn/kernels/shortest_path.py @@ -38,7 +38,7 @@ class ShortestPath(GraphKernel): def _compute_gm_series(self): self._all_graphs_have_edges(self._graphs) # get shortest path graph of each graph. - iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] # compute Gram matrix. @@ -48,7 +48,7 @@ class ShortestPath(GraphKernel): itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', - length=len_itr, file=sys.stdout,verbose=(self._verbose >= 2)) + length=len_itr, file=sys.stdout,verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._sp_do(self._graphs[i], self._graphs[j]) gram_matrix[i][j] = kernel @@ -60,16 +60,16 @@ class ShortestPath(GraphKernel): def _compute_gm_imap_unordered(self): self._all_graphs_have_edges(self._graphs) # get shortest path graph of each graph. 
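		# Each graph in self._graphs is replaced in place by its shortest-path graph
		# before the pairwise kernels are computed.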
- pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) get_sp_graphs_fun = self._wrapper_get_sp_graphs itr = zip(self._graphs, range(0, len(self._graphs))) - if len(self._graphs) < 100 * self._n_jobs: - chunksize = int(len(self._graphs) / self._n_jobs) + 1 + if len(self._graphs) < 100 * self.n_jobs: + chunksize = int(len(self._graphs) / self.n_jobs) + 1 else: chunksize = 100 iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), desc='getting sp graphs', file=sys.stdout, - length=len(self._graphs), verbose=(self._verbose >= 2)) + length=len(self._graphs), verbose=(self.verbose >= 2)) for i, g in iterator: self._graphs[i] = g pool.close() @@ -83,7 +83,7 @@ class ShortestPath(GraphKernel): G_gs = gs_toshare do_fun = self._wrapper_sp_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -92,12 +92,12 @@ class ShortestPath(GraphKernel): self._all_graphs_have_edges([g1] + g_list) # get shortest path graphs of g1 and each graph in g_list. g1 = getSPGraph(g1, edge_weight=self._edge_weight) - iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] # compute kernel list. kernel_list = [None] * len(g_list) - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._sp_do(g1, g_list[i]) kernel_list[i] = kernel @@ -109,16 +109,16 @@ class ShortestPath(GraphKernel): self._all_graphs_have_edges([g1] + g_list) # get shortest path graphs of g1 and each graph in g_list. 
g1 = getSPGraph(g1, edge_weight=self._edge_weight) - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) get_sp_graphs_fun = self._wrapper_get_sp_graphs itr = zip(g_list, range(0, len(g_list))) - if len(g_list) < 100 * self._n_jobs: - chunksize = int(len(g_list) / self._n_jobs) + 1 + if len(g_list) < 100 * self.n_jobs: + chunksize = int(len(g_list) / self.n_jobs) + 1 else: chunksize = 100 iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), desc='getting sp graphs', file=sys.stdout, - length=len(g_list), verbose=(self._verbose >= 2)) + length=len(g_list), verbose=(self.verbose >= 2)) for i, g in iterator: g_list[i] = g pool.close() @@ -137,7 +137,7 @@ class ShortestPath(GraphKernel): itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/spectral_decomposition.py b/gklearn/kernels/spectral_decomposition.py index 561f632..bc06e26 100644 --- a/gklearn/kernels/spectral_decomposition.py +++ b/gklearn/kernels/spectral_decomposition.py @@ -28,9 +28,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_gm_series(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') @@ -41,7 +41,7 @@ class SpectralDecomposition(RandomWalkMeta): # precompute the spectral decomposition of each graph. P_list = [] D_list = [] - iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -58,7 +58,7 @@ class SpectralDecomposition(RandomWalkMeta): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) - iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel) @@ -74,9 +74,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') @@ -87,7 +87,7 @@ class SpectralDecomposition(RandomWalkMeta): # precompute the spectral decomposition of each graph. 
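		# P_list / D_list hold the eigenvectors and eigenvalues of each graph's
		# (transposed) adjacency matrix.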
P_list = [] D_list = [] - iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -107,7 +107,7 @@ class SpectralDecomposition(RandomWalkMeta): do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(q_T_list, P_list, D_list), n_jobs=self.n_jobs, verbose=self.verbose) else: # @todo pass @@ -118,9 +118,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') @@ -133,7 +133,7 @@ class SpectralDecomposition(RandomWalkMeta): D1, P1 = np.linalg.eig(A1) P_list = [] D_list = [] - iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -145,7 +145,7 @@ class SpectralDecomposition(RandomWalkMeta): if self._p is None: # p is uniform distribution as default. q_T1 = 1 / nx.number_of_nodes(g1) q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel) @@ -160,9 +160,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. 
Only works for undirected graphs.') @@ -175,8 +175,8 @@ class SpectralDecomposition(RandomWalkMeta): D1, P1 = np.linalg.eig(A1) P_list = [] D_list = [] - if self._verbose >= 2: - iterator = tqdm(g_list, desc='spectral decompose', file=sys.stdout) + if self.verbose >= 2: + iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout) else: iterator = g_list for G in iterator: @@ -207,7 +207,7 @@ class SpectralDecomposition(RandomWalkMeta): itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) else: # @todo pass @@ -222,9 +222,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2], self._verbose) + self._check_edge_weight([g1] + [g2], self.verbose) self._check_graphs([g1] + [g2]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') diff --git a/gklearn/kernels/structural_sp.py b/gklearn/kernels/structural_sp.py index 35ed9d1..1fd68f7 100644 --- a/gklearn/kernels/structural_sp.py +++ b/gklearn/kernels/structural_sp.py @@ -41,7 +41,7 @@ class StructuralSP(GraphKernel): def _compute_gm_series(self): # get shortest paths of each graph in the graphs. splist = [] - iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) if self._compute_method == 'trie': for g in iterator: splist.append(self._get_sps_as_trie(g)) @@ -56,7 +56,7 @@ class StructuralSP(GraphKernel): itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, - length=len_itr, verbose=(self._verbose >= 2)) + length=len_itr, verbose=(self.verbose >= 2)) if self._compute_method == 'trie': for i, j in iterator: kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) @@ -76,10 +76,10 @@ class StructuralSP(GraphKernel): def _compute_gm_imap_unordered(self): # get shortest paths of each graph in the graphs. 
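		# splist[i] will hold the shortest paths of self._graphs[i], filled in by
		# the worker pool below.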
splist = [None] * len(self._graphs) - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(self._graphs, range(0, len(self._graphs))) - if len(self._graphs) < 100 * self._n_jobs: - chunksize = int(len(self._graphs) / self._n_jobs) + 1 + if len(self._graphs) < 100 * self.n_jobs: + chunksize = int(len(self._graphs) / self.n_jobs) + 1 else: chunksize = 100 # get shortest path graphs of self._graphs @@ -89,7 +89,7 @@ class StructuralSP(GraphKernel): get_sps_fun = self._wrapper_get_sps_naive iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), desc='getting shortest paths', file=sys.stdout, - length=len(self._graphs), verbose=(self._verbose >= 2)) + length=len(self._graphs), verbose=(self.verbose >= 2)) for i, sp in iterator: splist[i] = sp pool.close() @@ -107,7 +107,7 @@ class StructuralSP(GraphKernel): else: do_fun = self._wrapper_ssp_do_naive parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(splist, self._graphs), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(splist, self._graphs), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -117,7 +117,7 @@ class StructuralSP(GraphKernel): sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) splist = [] iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, - verbose=(self._verbose >= 2)) + verbose=(self.verbose >= 2)) if self._compute_method == 'trie': for g in iterator: splist.append(self._get_sps_as_trie(g)) @@ -128,7 +128,7 @@ class StructuralSP(GraphKernel): # compute kernel list. kernel_list = [None] * len(g_list) iterator = get_iters(range(len(g_list)), desc='Computing kernels', - file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) if self._compute_method == 'trie': for i in iterator: kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i]) @@ -145,10 +145,10 @@ class StructuralSP(GraphKernel): # get shortest paths of g1 and each graph in g_list. 
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) splist = [None] * len(g_list) - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(g_list, range(0, len(g_list))) - if len(g_list) < 100 * self._n_jobs: - chunksize = int(len(g_list) / self._n_jobs) + 1 + if len(g_list) < 100 * self.n_jobs: + chunksize = int(len(g_list) / self.n_jobs) + 1 else: chunksize = 100 # get shortest path graphs of g_list @@ -158,7 +158,7 @@ class StructuralSP(GraphKernel): get_sps_fun = self._wrapper_get_sps_naive iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), desc='getting shortest paths', file=sys.stdout, - length=len(g_list), verbose=(self._verbose >= 2)) + length=len(g_list), verbose=(self.verbose >= 2)) for i, sp in iterator: splist[i] = sp pool.close() @@ -182,7 +182,7 @@ class StructuralSP(GraphKernel): itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/sylvester_equation.py b/gklearn/kernels/sylvester_equation.py index 9f8fc66..b898ae9 100644 --- a/gklearn/kernels/sylvester_equation.py +++ b/gklearn/kernels/sylvester_equation.py @@ -27,9 +27,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_gm_series(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') @@ -41,7 +41,7 @@ class SylvesterEquation(RandomWalkMeta): if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. - iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # # normalized adjacency matrices # A_wave_list = [] @@ -55,7 +55,7 @@ class SylvesterEquation(RandomWalkMeta): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) - iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda) @@ -71,9 +71,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') @@ -83,7 +83,7 @@ class SylvesterEquation(RandomWalkMeta): if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. 
Note # A_wave_list actually contains the transposes of the adjacency matrices. - iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? if self._p is None: # p is uniform distribution as default. @@ -94,7 +94,7 @@ class SylvesterEquation(RandomWalkMeta): do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(A_wave_list,), n_jobs=self.n_jobs, verbose=self.verbose) else: # @todo pass @@ -105,9 +105,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') @@ -120,11 +120,11 @@ class SylvesterEquation(RandomWalkMeta): # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() - iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] if self._p is None: # p is uniform distribution as default. - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda) @@ -139,9 +139,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') @@ -152,7 +152,7 @@ class SylvesterEquation(RandomWalkMeta): # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() - iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? if self._p is None: # p is uniform distribution as default. 
@@ -169,7 +169,7 @@ class SylvesterEquation(RandomWalkMeta): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) else: # @todo pass @@ -184,9 +184,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2], self._verbose) + self._check_edge_weight([g1] + [g2], self.verbose) self._check_graphs([g1] + [g2]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') diff --git a/gklearn/kernels/treelet.py b/gklearn/kernels/treelet.py index 32cad43..d546e74 100644 --- a/gklearn/kernels/treelet.py +++ b/gklearn/kernels/treelet.py @@ -18,6 +18,8 @@ import numpy as np import networkx as nx from collections import Counter from itertools import chain +from sklearn.utils.validation import check_is_fitted +from sklearn.exceptions import NotFittedError from gklearn.utils import SpecialLabel from gklearn.utils.parallel import parallel_gm, parallel_me from gklearn.utils.utils import find_all_paths, get_mlti_dim_node_attrs @@ -26,14 +28,211 @@ from gklearn.kernels import GraphKernel class Treelet(GraphKernel): - def __init__(self, **kwargs): - GraphKernel.__init__(self) - self._node_labels = kwargs.get('node_labels', []) - self._edge_labels = kwargs.get('edge_labels', []) - self._sub_kernel = kwargs.get('sub_kernel', None) - self._ds_infos = kwargs.get('ds_infos', {}) - if self._sub_kernel is None: - raise Exception('Sub kernel not set.') + def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs): + """Initialise a treelet kernel. + """ + super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose) + self.node_labels = kwargs.get('node_labels', []) + self.edge_labels = kwargs.get('edge_labels', []) + self.sub_kernel = kwargs.get('sub_kernel', None) + self.ds_infos = kwargs.get('ds_infos', {}) + self.precompute_canonkeys = precompute_canonkeys + self.save_canonkeys = save_canonkeys + + + ########################################################################## + # The following is the 1st paradigm to compute kernel matrix, which is + # compatible with `scikit-learn`. + # ------------------------------------------------------------------- + # Special thanks to the "GraKeL" library for providing an excellent template! + ########################################################################## + + + def clear_attributes(self): + super().clear_attributes() + if hasattr(self, '_canonkeys'): + delattr(self, '_canonkeys') + if hasattr(self, '_Y_canonkeys'): + delattr(self, '_Y_canonkeys') + if hasattr(self, '_dummy_labels_considered'): + delattr(self, '_dummy_labels_considered') + + + def validate_parameters(self): + """Validate all parameters for the transformer. + + Returns + ------- + None. + + """ + super().validate_parameters() + if self.sub_kernel is None: + raise ValueError('Sub-kernel not set.') + + + def _compute_kernel_matrix_series(self, Y): + """Compute the kernel matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) without parallelization. + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. 
+ + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + + # self._add_dummy_labels will modify the input in place. + self._add_dummy_labels() # For self._graphs +# Y = [g.copy() for g in Y] # @todo: ? + self._add_dummy_labels(Y) + + # get all canonical keys of all graphs before computing kernels to save + # time, but this may cost a lot of memory for large dataset. + + # Canonical keys for self._graphs. + try: + check_is_fitted(self, ['_canonkeys']) + canonkeys_list1 = self._canonkeys + except NotFittedError: + canonkeys_list1 = [] + iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2)) + for g in iterator: + canonkeys_list1.append(self._get_canonkeys(g)) + + if self.save_canonkeys: + self._canonkeys = canonkeys_list1 + + # Canonical keys for Y. + canonkeys_list2 = [] + iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2)) + for g in iterator: + canonkeys_list2.append(self._get_canonkeys(g)) + + if self.save_canonkeys: + self._Y_canonkeys = canonkeys_list2 + + # compute kernel matrix. + kernel_matrix = np.zeros((len(Y), len(canonkeys_list1))) + + from itertools import product + itr = product(range(len(Y)), range(len(canonkeys_list1))) + len_itr = int(len(Y) * len(canonkeys_list1)) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, + length=len_itr, verbose=(self.verbose >= 2)) + for i_y, i_x in iterator: + kernel = self._kernel_do(canonkeys_list2[i_y], canonkeys_list1[i_x]) + kernel_matrix[i_y][i_x] = kernel + + return kernel_matrix + + + def _compute_kernel_matrix_imap_unordered(self, Y): + """Compute the kernel matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) using imap unordered parallelization. + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + raise Exception('Parallelization for kernel matrix is not implemented.') + + + def pairwise_kernel(self, x, y, are_keys=False): + """Compute pairwise kernel between two graphs. + + Parameters + ---------- + x, y : NetworkX Graph. + Graphs bewteen which the kernel is computed. + + are_keys : boolean, optional + If `True`, `x` and `y` are canonical keys, otherwise are graphs. + The default is False. + + Returns + ------- + kernel: float + The computed kernel. + + """ + if are_keys: + # x, y are canonical keys. + kernel = self._kernel_do(x, y) + + else: + # x, y are graphs. + kernel = self._compute_single_kernel_series(x, y) + + return kernel + + + def diagonals(self): + """Compute the kernel matrix diagonals of the fit/transformed data. + + Returns + ------- + X_diag : numpy array + The diagonal of the kernel matrix between the fitted data. + This consists of each element calculated with itself. + + Y_diag : numpy array + The diagonal of the kernel matrix, of the transform. + This consists of each element calculated with itself. + + """ + # Check if method "fit" had been called. + check_is_fitted(self, ['_graphs']) + + # Check if the diagonals of X exist. + try: + check_is_fitted(self, ['_X_diag']) + except NotFittedError: + # Compute diagonals of X. + self._X_diag = np.empty(shape=(len(self._graphs),)) + try: + check_is_fitted(self, ['_canonkeys']) + for i, x in enumerate(self._canonkeys): + self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=True) # @todo: parallel? 
+ except NotFittedError: + for i, x in enumerate(self._graphs): + self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=False) # @todo: parallel? + + try: + # If transform has happened, return both diagonals. + check_is_fitted(self, ['_Y']) + self._Y_diag = np.empty(shape=(len(self._Y),)) + try: + check_is_fitted(self, ['_Y_canonkeys']) + for (i, y) in enumerate(self._Y_canonkeys): + self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=True) # @todo: parallel? + except NotFittedError: + for (i, y) in enumerate(self._Y): + self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=False) # @todo: parallel? + + return self._X_diag, self._Y_diag + + except NotFittedError: + # Else just return both X_diag + return self._X_diag + + + ########################################################################## + # The following is the 2nd paradigm to compute kernel matrix. It is + # simplified and not compatible with `scikit-learn`. + ########################################################################## def _compute_gm_series(self): @@ -43,10 +242,13 @@ class Treelet(GraphKernel): # time, but this may cost a lot of memory for large dataset. canonkeys = [] iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout, - verbose=(self._verbose >= 2)) + verbose=(self.verbose >= 2)) for g in iterator: canonkeys.append(self._get_canonkeys(g)) + if self.save_canonkeys: + self._canonkeys = canonkeys + # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) @@ -54,7 +256,7 @@ class Treelet(GraphKernel): itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, - length=len_itr, verbose=(self._verbose >= 2)) + length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(canonkeys[i], canonkeys[j]) gram_matrix[i][j] = kernel @@ -68,22 +270,25 @@ class Treelet(GraphKernel): # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(self._graphs, range(0, len(self._graphs))) - if len(self._graphs) < 100 * self._n_jobs: - chunksize = int(len(self._graphs) / self._n_jobs) + 1 + if len(self._graphs) < 100 * self.n_jobs: + chunksize = int(len(self._graphs) / self.n_jobs) + 1 else: chunksize = 100 canonkeys = [[] for _ in range(len(self._graphs))] get_fun = self._wrapper_get_canonkeys iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), desc='getting canonkeys', file=sys.stdout, - length=len(self._graphs), verbose=(self._verbose >= 2)) + length=len(self._graphs), verbose=(self.verbose >= 2)) for i, ck in iterator: canonkeys[i] = ck pool.close() pool.join() + if self.save_canonkeys: + self._canonkeys = canonkeys + # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) @@ -92,7 +297,7 @@ class Treelet(GraphKernel): G_canonkeys = canonkeys_toshare do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(canonkeys,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -104,13 +309,13 @@ class Treelet(GraphKernel): # time, but this may cost a lot of memory for large dataset. 
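		# The canonical keys of each graph are extracted once here and reused for
		# every pairwise kernel computation below.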
canonkeys_1 = self._get_canonkeys(g1) canonkeys_list = [] - iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self.verbose >= 2)) for g in iterator: canonkeys_list.append(self._get_canonkeys(g)) # compute kernel list. kernel_list = [None] * len(g_list) - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(canonkeys_1, canonkeys_list[i]) kernel_list[i] = kernel @@ -125,16 +330,16 @@ class Treelet(GraphKernel): # time, but this may cost a lot of memory for large dataset. canonkeys_1 = self._get_canonkeys(g1) canonkeys_list = [[] for _ in range(len(g_list))] - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(g_list, range(0, len(g_list))) - if len(g_list) < 100 * self._n_jobs: - chunksize = int(len(g_list) / self._n_jobs) + 1 + if len(g_list) < 100 * self.n_jobs: + chunksize = int(len(g_list) / self.n_jobs) + 1 else: chunksize = 100 get_fun = self._wrapper_get_canonkeys iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), desc='getting canonkeys', file=sys.stdout, - length=len(g_list), verbose=(self._verbose >= 2)) + length=len(g_list), verbose=(self.verbose >= 2)) for i, ck in iterator: canonkeys_list[i] = ck pool.close() @@ -154,7 +359,7 @@ class Treelet(GraphKernel): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list @@ -187,7 +392,7 @@ class Treelet(GraphKernel): keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) - kernel = self._sub_kernel(vector1, vector2) + kernel = self.sub_kernel(vector1, vector2) return kernel @@ -223,7 +428,7 @@ class Treelet(GraphKernel): patterns['0'] = list(G.nodes()) canonkey['0'] = nx.number_of_nodes(G) for i in range(1, 6): # for i in range(1, 6): - patterns[str(i)] = find_all_paths(G, i, self._ds_infos['directed']) + patterns[str(i)] = find_all_paths(G, i, self.ds_infos['directed']) canonkey[str(i)] = len(patterns[str(i)]) # n-star patterns @@ -317,11 +522,11 @@ class Treelet(GraphKernel): ### pattern obtained in the structural analysis section above, which is a ### string corresponding to a unique treelet. A dictionary is built to keep ### track of the amount of every treelet. - if len(self._node_labels) > 0 or len(self._edge_labels) > 0: + if len(self.node_labels) > 0 or len(self.edge_labels) > 0: canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet. 
# linear patterns - canonkey_t = Counter(get_mlti_dim_node_attrs(G, self._node_labels)) + canonkey_t = Counter(get_mlti_dim_node_attrs(G, self.node_labels)) for key in canonkey_t: canonkey_l[('0', key)] = canonkey_t[key] @@ -330,9 +535,9 @@ class Treelet(GraphKernel): for pattern in patterns[str(i)]: canonlist = [] for idx, node in enumerate(pattern[:-1]): - canonlist.append(tuple(G.nodes[node][nl] for nl in self._node_labels)) - canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self._edge_labels)) - canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self._node_labels)) + canonlist.append(tuple(G.nodes[node][nl] for nl in self.node_labels)) + canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self.edge_labels)) + canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self.node_labels)) canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1] treelet.append(tuple([str(i)] + canonkey_t)) canonkey_l.update(Counter(treelet)) @@ -343,13 +548,13 @@ class Treelet(GraphKernel): for pattern in patterns[str(i) + 'star']: canonlist = [] for leaf in pattern[1:]: - nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) - elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) + nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) + elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) canonlist.append(tuple((nlabels, elabels))) canonlist.sort() canonlist = list(chain.from_iterable(canonlist)) canonkey_t = tuple(['d' if i == 5 else str(i * 2)] + - [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist) treelet.append(canonkey_t) canonkey_l.update(Counter(treelet)) @@ -359,17 +564,17 @@ class Treelet(GraphKernel): for pattern in patterns['7']: canonlist = [] for leaf in pattern[1:3]: - nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) - elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) + nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) + elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) canonlist.append(tuple((nlabels, elabels))) canonlist.sort() canonlist = list(chain.from_iterable(canonlist)) canonkey_t = tuple(['7'] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist - + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] - + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]) + + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist + + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] + + [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] + + [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] + + [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)]) treelet.append(canonkey_t) canonkey_l.update(Counter(treelet)) @@ -378,38 +583,38 @@ class Treelet(GraphKernel): for pattern in patterns['11']: canonlist = [] for leaf in pattern[1:4]: - nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) - elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) + nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) + elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) canonlist.append(tuple((nlabels, elabels))) canonlist.sort() canonlist = list(chain.from_iterable(canonlist)) canonkey_t = 
tuple(['b'] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist - + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[4]][pattern[0]][el] for el in self._edge_labels)] - + [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)]) + + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist + + [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] + + [tuple(G[pattern[4]][pattern[0]][el] for el in self.edge_labels)] + + [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels)] + + [tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)]) treelet.append(canonkey_t) canonkey_l.update(Counter(treelet)) # pattern 10 treelet = [] for pattern in patterns['10']: - canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels), - tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)] + canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels), + tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)] canonlist = [] for leaf in pattern[1:3]: - nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) - elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) + nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) + elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) canonlist.append(tuple((nlabels, elabels))) canonlist.sort() canonkey0 = list(chain.from_iterable(canonlist)) canonkey_t = tuple(['a'] - + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] - + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] + + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] + + [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)] + + [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)] + + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + + [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)] + canonkey4 + canonkey0) treelet.append(canonkey_t) canonkey_l.update(Counter(treelet)) @@ -419,15 +624,15 @@ class Treelet(GraphKernel): for pattern in patterns['12']: canonlist0 = [] for leaf in pattern[1:3]: - nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) - elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels) + nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) + elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels) canonlist0.append(tuple((nlabels, elabels))) canonlist0.sort() canonlist0 = list(chain.from_iterable(canonlist0)) canonlist3 = [] for leaf in pattern[4:6]: - nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels) - elabels = tuple(G[leaf][pattern[3]][el] for el in self._edge_labels) + nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels) + elabels = tuple(G[leaf][pattern[3]][el] for el in self.edge_labels) canonlist3.append(tuple((nlabels, elabels))) canonlist3.sort() canonlist3 = list(chain.from_iterable(canonlist3)) @@ -435,14 +640,14 @@ class Treelet(GraphKernel): # 2 possible key can be generated from 2 nodes with extended label 3, # select the one with lower lexicographic order. 
canonkey_t1 = tuple(['c'] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0 - + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] + + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist0 + + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] + + [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] + canonlist3) canonkey_t2 = tuple(['c'] - + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3 - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] + + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] + canonlist3 + + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + + [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)] + canonlist0) treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) canonkey_l.update(Counter(treelet)) @@ -450,24 +655,24 @@ class Treelet(GraphKernel): # pattern 9 treelet = [] for pattern in patterns['9']: - canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels), - tuple(G[pattern[4]][pattern[2]][el] for el in self._edge_labels)] - canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels), - tuple(G[pattern[5]][pattern[3]][el] for el in self._edge_labels)] - prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self._node_labels), - tuple(G[pattern[2]][pattern[0]][el] for el in self._edge_labels)] - prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels), - tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] + canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels), + tuple(G[pattern[4]][pattern[2]][el] for el in self.edge_labels)] + canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels), + tuple(G[pattern[5]][pattern[3]][el] for el in self.edge_labels)] + prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self.node_labels), + tuple(G[pattern[2]][pattern[0]][el] for el in self.edge_labels)] + prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels), + tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)] if prekey2 + canonkey2 < prekey3 + canonkey3: - canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ - + [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ + canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \ + + [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \ + prekey2 + prekey3 + canonkey2 + canonkey3 else: - canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ - + [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ + canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \ + + [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \ + prekey3 + prekey2 + canonkey3 + canonkey2 treelet.append(tuple(['9'] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonkey_t)) canonkey_l.update(Counter(treelet)) @@ -482,12 +687,33 @@ class Treelet(GraphKernel): return i, self._get_canonkeys(g) - def _add_dummy_labels(self, Gn): - if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): - for i in range(len(Gn)): - nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) - 
self._node_labels = [SpecialLabel.DUMMY] - if len(self._edge_labels) == 0 or (len(self._edge_labels) == 1 and self._edge_labels[0] == SpecialLabel.DUMMY): - for i in range(len(Gn)): - nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) - self._edge_labels = [SpecialLabel.DUMMY] \ No newline at end of file + def _add_dummy_labels(self, Gn=None): + def _add_dummy(Gn): + if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY): + for i in range(len(Gn)): + nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) + self.node_labels = [SpecialLabel.DUMMY] + if len(self.edge_labels) == 0 or (len(self.edge_labels) == 1 and self.edge_labels[0] == SpecialLabel.DUMMY): + for i in range(len(Gn)): + nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) + self.edge_labels = [SpecialLabel.DUMMY] + + if Gn is None or Gn is self._graphs: + # Add dummy labels for the copy of self._graphs. + try: + check_is_fitted(self, ['_dummy_labels_considered']) + if not self._dummy_labels_considered: + Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs] + _add_dummy(Gn) + self._graphs = Gn + self._dummy_labels_considered = True + except NotFittedError: + Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs] + _add_dummy(Gn) + self._graphs = Gn + self._dummy_labels_considered = True + + else: + # Add dummy labels for the input. + _add_dummy(Gn) + diff --git a/gklearn/kernels/weisfeiler_lehman.py b/gklearn/kernels/weisfeiler_lehman.py index 1f52755..64069b7 100644 --- a/gklearn/kernels/weisfeiler_lehman.py +++ b/gklearn/kernels/weisfeiler_lehman.py @@ -33,7 +33,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. def _compute_gm_series(self): -# if self._verbose >= 2: +# if self.verbose >= 2: # import warnings # warnings.warn('A part of the computation is parallelized.') @@ -74,17 +74,17 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. G_gn = gn_toshare do_fun = self._wrapper_pairwise parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix else: - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') return self._compute_gm_series() def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better. -# if self._verbose >= 2: +# if self.verbose >= 2: # import warnings # warnings.warn('A part of the computation is parallelized.') @@ -126,10 +126,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list else: - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('This base kernel is not parallelized. The serial computation is used instead.') return self._compute_kernel_list_series(g1, g_list) @@ -332,15 +332,15 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. def _compute_gram_itr(self, gram_matrix, all_num_of_each_label): """Compute Gram matrix using the base kernel. 
""" -# if self._parallel == 'imap_unordered': +# if self.parallel == 'imap_unordered': # # compute kernels. # def init_worker(alllabels_toshare): # global G_alllabels # G_alllabels = alllabels_toshare # do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix) # parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker, -# glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose) -# elif self._parallel is None: +# glbv=(all_num_of_each_label,), n_jobs=self.n_jobs, verbose=self.verbose) +# elif self.parallel is None: for i in range(len(gram_matrix)): for j in range(i, len(gram_matrix)): gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i], From ed170105357449d07d5becf701dac7ff0b8029c1 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Sat, 22 May 2021 11:59:05 +0200 Subject: [PATCH 20/35] [Major Feature][Todo] Add model_learning module, including a new NestedCV class and a Workflow class for graph kernel computation. --- gklearn/model_learning/__init__.py | 14 + gklearn/model_learning/nested_cv.py | 714 ++++++++++++++++++++++++++++++++++++ gklearn/model_learning/workflow.py | 109 ++++++ 3 files changed, 837 insertions(+) create mode 100644 gklearn/model_learning/__init__.py create mode 100644 gklearn/model_learning/nested_cv.py create mode 100644 gklearn/model_learning/workflow.py diff --git a/gklearn/model_learning/__init__.py b/gklearn/model_learning/__init__.py new file mode 100644 index 0000000..d4c6aaa --- /dev/null +++ b/gklearn/model_learning/__init__.py @@ -0,0 +1,14 @@ +# -*-coding:utf-8 -*- +""" +model learning. +""" + +# info +__version__ = "0.2" +__author__ = "Linlin Jia" +__date__ = "November 2020" + + +from gklearn.model_learning.nested_cv import NestedCV +from gklearn.model_learning.workflow import Workflow +from gklearn.model_learning.parameters import dichotomous_permutation \ No newline at end of file diff --git a/gklearn/model_learning/nested_cv.py b/gklearn/model_learning/nested_cv.py new file mode 100644 index 0000000..92bc8f9 --- /dev/null +++ b/gklearn/model_learning/nested_cv.py @@ -0,0 +1,714 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Nov 27 18:59:28 2020 + +@author: ljia +""" +import os +import datetime +import time +import sys +from tqdm import tqdm +from multiprocessing import Pool, Array +from functools import partial +import numpy as np +from matplotlib import pyplot as plt +from sklearn.model_selection import KFold, train_test_split, ParameterGrid +from sklearn.kernel_ridge import KernelRidge +from sklearn.svm import SVC +from sklearn.metrics import accuracy_score, mean_squared_error + + +class NestedCV(object): + """Perform model selection, fitting and testing for precomputed kernels + using nested CV. Print out neccessary data during the process then finally + the results. + + Parameters + ---------- + datafile : string + Path of dataset file. + estimator : function + kernel function used to estimate. This function needs to return a gram matrix. + param_grid_precomputed : dictionary + Dictionary with names (string) of parameters used to calculate gram + matrices as keys and lists of parameter settings to try as values. This + enables searching over any sequence of parameter settings. Params with + length 1 will be omitted. + param_grid : dictionary + Dictionary with names (string) of parameters used as penelties as keys + and lists of parameter settings to try as values. This enables + searching over any sequence of parameter settings. 
Params with length 1 + will be omitted. + model_type : string + Type of the problem, can be 'regression' or 'classification'. + NUM_TRIALS : integer + Number of random trials of the outer CV loop. The default is 30. + datafile_y : string + Path of file storing y data. This parameter is optional depending on + the given dataset file. + extra_params : dict + Extra parameters for loading dataset. See function gklearn.utils. + graphfiles.loadDataset for detail. + ds_name : string + Name of the dataset. + n_jobs : int + Number of jobs for parallelization. + read_gm_from_file : boolean + Whether gram matrices are loaded from a file. + + Examples + -------- + >>> import numpy as np + >>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel + >>> from gklearn.kernels.untilHPathKernel import untilhpathkernel + >>> + >>> datafile = '../datasets/MUTAG/MUTAG_A.txt' + >>> estimator = untilhpathkernel + >>> param_grid_precomputed = {’depth’: np.linspace(1, 10, 10), ’k_func’: + [’MinMax’, ’tanimoto’], ’compute_method’: [’trie’]} + >>> # ’C’ for classification problems and ’alpha’ for regression problems. + >>> param_grid = [{’C’: np.logspace(-10, 10, num=41, base=10)}, {’alpha’: + np.logspace(-10, 10, num=41, base=10)}] + >>> + >>> model_selection_for_precomputed_kernel(datafile, estimator, + param_grid_precomputed, param_grid[0], 'classification', ds_name=’MUTAG’) + """ + def __init__(self, dataset, estimator, param_grid_precomputed=None, param_grid=None, model_type=None, num_trials=30, output_dir=None, n_jobs=1, save_gms=True, save_gm_figs=False, logging=True, verbose=True, **kwargs): + tqdm.monitor_interval = 0 + self._ds = dataset + self._estimator = estimator + self._num_trials = num_trials + self._n_jobs = n_jobs + self._save_gms = save_gms + self._save_gm_figs = save_gm_figs + self._logging = logging + self._verbose = verbose + self._kwargs = kwargs + + # Set dataset name. + if self._ds._ds_name is None: + self._ds_name = 'ds-unknown' + else: + self._ds_name = self._ds._ds_name + + # The output directory. + if output_dir is None: + self._output_dir = os.path.join('outputs/', estimator.__name__) + else: + self._output_dir = output_dir + os.makedirs(self._output_dir, exist_ok=True) + + # Setup the model type. + if model_type is None: + self._model_type = dataset._task_type + else: + self._model_type = model_type.lower() + if self._model_type != 'regression' and self._model_type != 'classification': + raise Exception('The model type is incorrect! Please choose from regression or classification.') + + # @todo: Set param_grid_precomputed and param_grid. + self._param_grid_precomputed = param_grid_precomputed + self._param_grid = param_grid + + if self._verbose: + print() + print('--- This is a %s problem ---' % self._model_type) + # A string to save all the results. + if self._logging: + self._str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. 
######################\n\n' + self._str_fw += '# This file contains results of ' + self._estimator.__name__ + ' on dataset ' + self._ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' + self._str_fw += 'This is a %s problem.\n' % self._model_type + + self.run() + + + def run(self): + self.fit() + self.compute_gram_matrices() + if len(self._gram_matrices) == 0: + if self._verbose: + print('All gram matrices are ignored, no results obtained.') + if self._logging: + self._str_fw += '\nAll gram matrices are ignored, no results obtained.\n\n' + else: + self.do_cv() + + # print out results as table. + if self._logging: + self._str_fw += self.printResultsInTable(self._param_list, self._param_list_pre_revised, self._average_val_scores, self._std_val_scores, self._average_perf_scores, self._std_perf_scores, self._average_train_scores, self._std_train_scores, self._gram_matrix_time, self._model_type, self._verbose) + + # open file to save all results for this dataset. + if not os.path.exists(self._output_dir + '/' + self._ds_name + '.output.txt'): + with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'w') as f: + f.write(self._str_fw) + else: + with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'r+') as f: + content = f.read() + f.seek(0, 0) + f.write(self._str_fw + '\n\n\n' + content) + + return self._final_performance, self._final_confidence + + + def fit(self): + return + + + def compute_gram_matrices(self): + """Compute all gram matrices. + + Returns + ------- + None. + + """ + # Grid of parameters with a discrete number of values for each. + self._param_list_precomputed = list(ParameterGrid(self._param_grid_precomputed)) + self._param_list = list(ParameterGrid(self._param_grid)) + + self._gram_matrices = [ + ] # a list to store gram matrices for all param_grid_precomputed + self._gram_matrix_time = [ + ] # a list to store time to calculate gram matrices + self._param_list_pre_revised = [ + ] # list to store param grids precomputed ignoring the useless ones + + if self._verbose: + print() + print('\n1. Computing gram matrices. This could take a while...') + if self._logging: + self._str_fw += '\nI. Gram matrices.\n\n' + self._tts = time.time() # start training time + nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) + for idx, params_out in enumerate(self._param_list_precomputed): + y = self._ds.targets[:] + params_out['n_jobs'] = self._n_jobs + params_out['verbose'] = self._verbose +# print(dataset) +# import networkx as nx +# nx.draw_networkx(dataset[1]) +# plt.show() + rtn_data = self._estimator(self._ds.graphs[:], **params_out) # @todo: Attention! this will not copy the graphs. + Kmatrix = rtn_data[0] + current_run_time = rtn_data[1] + # for some kernels, some graphs in datasets may not meet the + # kernels' requirements for graph structure. These graphs are trimmed. + if len(rtn_data) == 3: + idx_trim = rtn_data[2] # the index of trimmed graph list + y = [y[idxt] for idxt in idx_trim] # trim y accordingly +# Kmatrix = np.random.rand(2250, 2250) +# current_run_time = 0.1 + + # remove graphs whose kernels with themselves are zeros + # @todo: y not changed accordingly? 
+ Kmatrix_diag = Kmatrix.diagonal().copy() + nb_g_ignore = 0 + for idxk, diag in enumerate(Kmatrix_diag): + if diag == 0: + Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) + Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) + nb_g_ignore += 1 + + # normalization + # @todo: works only for undirected graph? + Kmatrix_diag = Kmatrix.diagonal().copy() + for i in range(len(Kmatrix)): + for j in range(i, len(Kmatrix)): + Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) + Kmatrix[j][i] = Kmatrix[i][j] + if self._verbose: + print() + + if params_out == {}: + if self._verbose: + print('the gram matrix is: ') + if self._logging: + self._str_fw += 'the gram matrix is:\n\n' + else: + if self._verbose: + print('the gram matrix with parameters', params_out, 'is: \n\n') + if self._logging: + self._str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out + + if len(Kmatrix) < 2: + nb_gm_ignore += 1 + if self._verbose: + print('ignored, as at most only one of all its diagonal value is non-zero.') + if self._logging: + self._str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' + else: + if np.isnan(Kmatrix).any( + ): # if the matrix contains elements that are not numbers + nb_gm_ignore += 1 + if self._verbose: + print('ignored, as it contains elements that are not numbers.') + if self._logging: + self._str_fw += 'ignored, as it contains elements that are not numbers.\n\n' + else: +# print(Kmatrix) + if self._logging: + self._str_fw += np.array2string( + Kmatrix, + separator=',') + '\n\n' +# separator=',', +# threshold=np.inf, +# floatmode='unique') + '\n\n' + + # Draw and save Gram matrix figures. + if self._save_gm_figs: + fig_file_name = self._output_dir + '/GM[ds]' + self._ds_name + if params_out != {}: + fig_file_name += '[params]' + str(idx) + plt.imshow(Kmatrix) + plt.colorbar() + plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) + # plt.show() + plt.clf() + + self._gram_matrices.append(Kmatrix) + self._gram_matrix_time.append(current_run_time) + self._param_list_pre_revised.append(params_out) + + if nb_g_ignore > 0: + if self._verbose: + print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) + if self._logging: + self._str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore + + if self._verbose: + print() + print('{} gram matrices are calculated, {} of which are ignored.'.format(len(self._param_list_precomputed), nb_gm_ignore)) + if self._logging: + self._str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(self._param_list_precomputed), nb_gm_ignore) + self._str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' + self._str_fw += ''.join(['{}: {}\n'.format(idx, params_out) for idx, params_out in enumerate(self._param_list_precomputed)]) + + + def do_cv(self): +# save gram matrices to file. +# np.savez(output_dir + '/' + ds_name + '.gm', +# gms=gram_matrices, params=param_list_pre_revised, y=y, +# gmtime=gram_matrix_time) + if self._verbose: + print('2. Fitting and predicting using nested cross validation. This could really take a while...') + + # ---- use pool.imap_unordered to parallel and track progress. 
---- +# train_pref = [] +# val_pref = [] +# test_pref = [] +# def func_assign(result, var_to_assign): +# for idx, itm in enumerate(var_to_assign): +# itm.append(result[idx]) +# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) +# +# parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, +# [train_pref, val_pref, test_pref], glbv=gram_matrices, +# method='imap_unordered', n_jobs=n_jobs, chunksize=1, +# itr_desc='cross validation') + + def init_worker(gms_toshare): + global G_gms + G_gms = gms_toshare + +# gram_matrices = np.array(gram_matrices) +# gms_shape = gram_matrices.shape +# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C')) +# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape)) + pool = Pool(processes=self._n_jobs, initializer=init_worker, initargs=(self._gram_matrices,)) + trial_do_partial = partial(self._parallel_trial_do, self._param_list_pre_revised, self._param_list, self._ds.targets[:], self._model_type) # @todo: maybe self._ds.targets[:] should be y. + train_pref = [] + val_pref = [] + test_pref = [] +# if NUM_TRIALS < 1000 * n_jobs: +# chunksize = int(NUM_TRIALS / n_jobs) + 1 +# else: +# chunksize = 1000 + chunksize = 1 + if self._verbose: + iterator = tqdm(pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize), desc='cross validation', file=sys.stdout) + else: + iterator = pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize) + for o1, o2, o3 in iterator: + train_pref.append(o1) + val_pref.append(o2) + test_pref.append(o3) + pool.close() + pool.join() + +# # ---- use pool.map to parallel. ---- +# pool = Pool(n_jobs) +# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type) +# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) +# train_pref = [item[0] for item in result_perf] +# val_pref = [item[1] for item in result_perf] +# test_pref = [item[2] for item in result_perf] + +# # ---- direct running, normally use a single CPU core. ---- +# train_pref = [] +# val_pref = [] +# test_pref = [] +# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): +# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) +# train_pref.append(o1) +# val_pref.append(o2) +# test_pref.append(o3) +# print() + + if self._verbose: + print() + print('3. Getting final performance...') + if self._logging: + self._str_fw += '\nII. Performance.\n\n' + + # averages and confidences of performances on outer trials for each combination of parameters + self._average_train_scores = np.mean(train_pref, axis=0) +# print('val_pref: ', val_pref[0][0]) + self._average_val_scores = np.mean(val_pref, axis=0) +# print('test_pref: ', test_pref[0][0]) + self._average_perf_scores = np.mean(test_pref, axis=0) + # sample std is used here + self._std_train_scores = np.std(train_pref, axis=0, ddof=1) + self._std_val_scores = np.std(val_pref, axis=0, ddof=1) + self._std_perf_scores = np.std(test_pref, axis=0, ddof=1) + + if self._model_type == 'regression': + best_val_perf = np.amin(self._average_val_scores) + else: + best_val_perf = np.amax(self._average_val_scores) +# print('average_val_scores: ', self._average_val_scores) +# print('best_val_perf: ', best_val_perf) +# print() + best_params_index = np.where(self._average_val_scores == best_val_perf) + # find smallest val std with best val perf. 
+ best_val_stds = [ + self._std_val_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + min_val_std = np.amin(best_val_stds) + best_params_index = np.where(self._std_val_scores == min_val_std) + best_params_out = [self._param_list_pre_revised[i] for i in best_params_index[0]] + best_params_in = [self._param_list[i] for i in best_params_index[1]] + + if self._verbose: + print('best_params_out: ', best_params_out) + print('best_params_in: ', best_params_in) + print() + print('best_val_perf: ', best_val_perf) + print('best_val_std: ', min_val_std) + if self._logging: + self._str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out + self._str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in + self._str_fw += 'best_val_perf: %s\n' % best_val_perf + self._str_fw += 'best_val_std: %s\n' % min_val_std + +# print(best_params_index) +# print(best_params_index[0]) +# print(self._average_perf_scores) + self._final_performance = [ + self._average_perf_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + self._final_confidence = [ + self._std_perf_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + + if self._verbose: + print('final_performance: ', self._final_performance) + print('final_confidence: ', self._final_confidence) + if self._logging: + self._str_fw += 'final_performance: %s\n' % self._final_performance + self._str_fw += 'final_confidence: %s\n' % self._final_confidence + + train_performance = [ + self._average_train_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + train_std = [ + self._std_train_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + + if self._verbose: + print('train_performance: %s' % train_performance) + print('train_std: ', train_std) + if self._logging: + self._str_fw += 'train_performance: %s\n' % train_performance + self._str_fw += 'train_std: %s\n\n' % train_std + + if self._verbose: + print() + + tt_total = time.time() - self._tts # training time for all hyper-parameters + average_gram_matrix_time = np.mean(self._gram_matrix_time) + std_gram_matrix_time = np.std(self._gram_matrix_time, ddof=1) if len(self._gram_matrix_time) > 1 else 0 + best_gram_matrix_time = [self._gram_matrix_time[i] for i in best_params_index[0]] + ave_bgmt = np.mean(best_gram_matrix_time) + std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 + + if self._verbose: + print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' + .format(average_gram_matrix_time, std_gram_matrix_time)) + print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( + ave_bgmt, std_bgmt)) + print('total training time with all hyper-param choices: {:.2f}s'.format( + tt_total)) + if self._logging: + self._str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) + self._str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) + self._str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) + + # # save results to file + # np.savetxt(results_name_pre + 'average_train_scores.dt', + # average_train_scores) + # np.savetxt(results_name_pre + 'average_val_scores', self._average_val_scores) + # np.savetxt(results_name_pre + 
'average_perf_scores.dt', + # average_perf_scores) + # np.savetxt(results_name_pre + 'std_train_scores.dt', self._std_train_scores) + # np.savetxt(results_name_pre + 'std_val_scores.dt', self._std_val_scores) + # np.savetxt(results_name_pre + 'std_perf_scores.dt', self._std_perf_scores) + + # np.save(results_name_pre + 'best_params_index', best_params_index) + # np.save(results_name_pre + 'best_params_pre.dt', best_params_out) + # np.save(results_name_pre + 'best_params_in.dt', best_params_in) + # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) + # np.save(results_name_pre + 'best_val_std.dt', best_val_std) + # np.save(results_name_pre + 'final_performance.dt', self._final_performance) + # np.save(results_name_pre + 'final_confidence.dt', self._final_confidence) + # np.save(results_name_pre + 'train_performance.dt', train_performance) + # np.save(results_name_pre + 'train_std.dt', train_std) + + # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) + # np.save(results_name_pre + 'average_gram_matrix_time.dt', + # average_gram_matrix_time) + # np.save(results_name_pre + 'std_gram_matrix_time.dt', + # std_gram_matrix_time) + # np.save(results_name_pre + 'best_gram_matrix_time.dt', + # best_gram_matrix_time) + + + def trial_do(self, param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level + + # # get gram matrices from global variables. + # gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C') + + # Arrays to store scores + train_pref = np.zeros((len(param_list_pre_revised), len(param_list))) + val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) + test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) + + # randomness added to seeds of split function below. "high" is "size" times + # 10 so that at least 10 different random output will be yielded. Remove + # these lines if identical outputs is required. + rdm_out = np.random.RandomState(seed=None) + rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10, + size=len(param_list_pre_revised)) + # print(trial, rdm_seed_out_l) + # print() + # loop for each outer param tuple + for index_out, params_out in enumerate(param_list_pre_revised): + # get gram matrices from global variables. + # gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]] + # gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C') + gm_now = gram_matrices[index_out].copy() + + # split gram matrix and y to app and test sets. + indices = range(len(y)) + # The argument "random_state" in function "train_test_split" can not be + # set to None, because it will use RandomState instance used by + # np.random, which is possible for multiple subprocesses to inherit the + # same seed if they forked at the same time, leading to identical + # random variates for different subprocesses. Instead, we use "trial" + # and "index_out" parameters to generate different seeds for different + # trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add + # randomness into seeds, so that it yields a different output every + # time the program is run. To yield identical outputs every time, + # remove the second line below. Same method is used to the "KFold" + # function in the inner loop. 
+ rdm_seed_out = (trial + 1) * (index_out + 1) + rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1) + # print(trial, rdm_seed_out) + X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( + gm_now, y, indices, test_size=0.1, + random_state=rdm_seed_out, shuffle=True) + # print(trial, idx_app, idx_test) + # print() + X_app = X_app[:, idx_app] + X_test = X_test[:, idx_app] + y_app = np.array(y_app) + y_test = np.array(y_test) + + rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10, + size=len(param_list)) + # loop for each inner param tuple + for index_in, params_in in enumerate(param_list): + # if trial == 0: + # print(index_out, index_in) + # print('params_in: ', params_in) + # st = time.time() + rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1) + # print("rdm_seed_in1: ", trial, index_in, rdm_seed_in) + rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1) + # print("rdm_seed_in2: ", trial, index_in, rdm_seed_in) + inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in) + current_train_perf = [] + current_valid_perf = [] + current_test_perf = [] + + # For regression use the Kernel Ridge method + # try: + if self._model_type == 'regression': + kr = KernelRidge(kernel='precomputed', **params_in) + # loop for each split on validation set level + # validation set level + for train_index, valid_index in inner_cv.split(X_app): + # print("train_index, valid_index: ", trial, index_in, train_index, valid_index) + # if trial == 0: + # print('train_index: ', train_index) + # print('valid_index: ', valid_index) + # print('idx_test: ', idx_test) + # print('y_app[train_index]: ', y_app[train_index]) + # print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) + # print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) + kr.fit(X_app[train_index, :][:, train_index], + y_app[train_index]) + + # predict on the train, validation and test set + y_pred_train = kr.predict( + X_app[train_index, :][:, train_index]) + y_pred_valid = kr.predict( + X_app[valid_index, :][:, train_index]) + # if trial == 0: + # print('y_pred_valid: ', y_pred_valid) + # print() + y_pred_test = kr.predict( + X_test[:, train_index]) + + # root mean squared errors + current_train_perf.append( + np.sqrt( + mean_squared_error( + y_app[train_index], y_pred_train))) + current_valid_perf.append( + np.sqrt( + mean_squared_error( + y_app[valid_index], y_pred_valid))) + # if trial == 0: + # print(mean_squared_error( + # y_app[valid_index], y_pred_valid)) + current_test_perf.append( + np.sqrt( + mean_squared_error( + y_test, y_pred_test))) + # For clcassification use SVM + else: + svc = SVC(kernel='precomputed', cache_size=200, + verbose=False, **params_in) + # loop for each split on validation set level + # validation set level + for train_index, valid_index in inner_cv.split(X_app): + # np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) + # if trial == 0: + # print('train_index: ', train_index) + # print('valid_index: ', valid_index) + # print('idx_test: ', idx_test) + # print('y_app[train_index]: ', y_app[train_index]) + # print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) + # print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) + svc.fit(X_app[train_index, :][:, train_index], + y_app[train_index]) + + # predict on the train, validation and test set + y_pred_train = svc.predict( + 
X_app[train_index, :][:, train_index]) + y_pred_valid = svc.predict( + X_app[valid_index, :][:, train_index]) + y_pred_test = svc.predict( + X_test[:, train_index]) + + # root mean squared errors + current_train_perf.append( + accuracy_score(y_app[train_index], + y_pred_train)) + current_valid_perf.append( + accuracy_score(y_app[valid_index], + y_pred_valid)) + current_test_perf.append( + accuracy_score(y_test, y_pred_test)) + # except ValueError: + # print(sys.exc_info()[0]) + # print(params_out, params_in) + + # average performance on inner splits + train_pref[index_out][index_in] = np.mean( + current_train_perf) + val_pref[index_out][index_in] = np.mean( + current_valid_perf) + test_pref[index_out][index_in] = np.mean( + current_test_perf) + # print(time.time() - st) + # if trial == 0: + # print('val_pref: ', val_pref) + # print('test_pref: ', test_pref) + + return train_pref, val_pref, test_pref + + + def _parallel_trial_do(self, param_list_pre_revised, param_list, y, model_type, trial): + train_pref, val_pref, test_pref = self._trial_do(param_list_pre_revised, + param_list, G_gms, y, + model_type, trial) + return train_pref, val_pref, test_pref + + + def printResultsInTable(self, param_list, param_list_pre_revised, average_val_scores, + std_val_scores, average_perf_scores, std_perf_scores, + average_train_scores, std_train_scores, gram_matrix_time, + model_type, verbose): + from collections import OrderedDict + from tabulate import tabulate + table_dict = {} + if model_type == 'regression': + for param_in in param_list: + param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) + else: + for param_in in param_list: + param_in['C'] = '{:.2e}'.format(param_in['C']) + table_dict['params'] = [{**param_out, **param_in} + for param_in in param_list for param_out in param_list_pre_revised] + table_dict['gram_matrix_time'] = [ + '{:.2f}'.format(gram_matrix_time[index_out]) + for param_in in param_list + for index_out, _ in enumerate(param_list_pre_revised) + ] + table_dict['valid_perf'] = [ + '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], + std_val_scores[index_out][index_in]) + for index_in, _ in enumerate(param_list) + for index_out, _ in enumerate(param_list_pre_revised) + ] + table_dict['test_perf'] = [ + '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], + std_perf_scores[index_out][index_in]) + for index_in, _ in enumerate(param_list) + for index_out, _ in enumerate(param_list_pre_revised) + ] + table_dict['train_perf'] = [ + '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], + std_train_scores[index_out][index_in]) + for index_in, _ in enumerate(param_list) + for index_out, _ in enumerate(param_list_pre_revised) + ] + + keyorder = [ + 'params', 'train_perf', 'valid_perf', 'test_perf', + 'gram_matrix_time' + ] + if verbose: + print() + tb_print = tabulate(OrderedDict(sorted(table_dict.items(), + key=lambda i: keyorder.index(i[0]))), headers='keys') + # print(tb_print) + return 'table of performance v.s. 
hyper-params:\n\n%s\n\n' % tb_print \ No newline at end of file diff --git a/gklearn/model_learning/workflow.py b/gklearn/model_learning/workflow.py new file mode 100644 index 0000000..7f1be1d --- /dev/null +++ b/gklearn/model_learning/workflow.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Nov 27 19:33:51 2020 + +@author: ljia +""" +import os +import numpy as np +import pickle +from gklearn.dataset import Dataset +from gklearn.model_learning import NestedCV +from gklearn.kernels import GRAPH_KERNELS + +class Workflow(object): + + + def __init__(self, **kwargs): + self._job_prefix = kwargs.get('job_prefix', 'gktask') + self._max_num_running_tasks = kwargs.get('max_num_running_tasks', np.inf) + self._root_dir = kwargs.get('root_dir', 'outputs/') + + + def run(self, tasks): + ### Check inputs. + if self._check_inputs(tasks): + self._tasks = tasks + else: + raise ValueError('The input "tasks" is not correct.') + + + ### Sort tasks. + self.sort_tasks_by_complexity() + + + ### The main process. + complete = False + while not complete: + + self.get_running_tasks() + + if self._num_running_tasks < self._max_num_running_tasks: + + ### Load results from table. + self.load_results_from_table() + + for task in self._tasks: + state = self.get_task_state(task) + if state != 'complete' and state != 'runnning': + self.run_task(task) + + if self._num_running_tasks >= self._max_num_running_tasks: + break + + ### Save results. + self.save_results() + + complete = self.check_completeness() + +# sleep() + + + def _check_inputs(self, tasks): + if not isinstance(tasks, list): + return False + else: + for i in tasks: + if not 'kernel' in i or not 'dataset' in i: + return False + return True + + + def sort_tasks_by_complexity(self): + return + + + def get_running_tasks(self): + command = 'squeue --user $USER --format "%.50j" --noheader' + stream = os.popen(command) + output = stream.readlines() + running_tasks = [o for o in output if o.strip().startswith(self._job_prefix)] + self._num_running_tasks = len(running_tasks) + + + def load_results_from_table(self): + pass + + + def get_task_state(self, task): + task_dir = os.path.join(self._root_dir, task['kernel'] + '.' + task['dataset'] + '/') + fn_summary = os.path.join(task_dir, 'results_summary.pkl') + if os.path.isfile(fn_summary): + output = pickle.loads(fn_summary) + state = output['state'] + return state + else: + return 'unstarted' + + + def run_task(self, task): + ds_name = task['dataset'] + k_name = task['kernel'] + + # Get dataset. + ds = Dataset(ds_name) + graph_kernel = GRAPH_KERNELS[k_name] + + # Start CV. + results = NestedCV(ds, graph_kernel) \ No newline at end of file From 325f51cb6f5db23491a3479b21a0ef1e2d4464db Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Sat, 22 May 2021 12:01:39 +0200 Subject: [PATCH 21/35] [Feature] Add the gklearn.model_learning.dichotomous_permutation function. 
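A rough usage sketch of the new helper (illustrative only: the grid values below are made up, and the exact ordering is defined by the implementation added in this patch):

    from gklearn.model_learning import dichotomous_permutation

    # Reorder a sorted grid of candidate hyper-parameter values so that the two
    # extremes and the midpoint are evaluated before the remaining values.
    grid = [1, 2, 3, 4, 5, 6, 7]
    print(dichotomous_permutation(grid))
    # Expected to start with the extremes and the middle, e.g. [1, 7, 4, ...]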
--- gklearn/model_learning/parameters.py | 89 ++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 gklearn/model_learning/parameters.py diff --git a/gklearn/model_learning/parameters.py b/gklearn/model_learning/parameters.py new file mode 100644 index 0000000..b4c1d04 --- /dev/null +++ b/gklearn/model_learning/parameters.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri May 21 12:18:02 2021 + +@author: ljia +""" + +def dichotomous_permutation(arr, layer=0): + import math + +# def seperate_arr(arr, new_arr): +# if (length % 2) == 0: +# half = int(length / 2) +# new_arr += [arr[half - 1], arr[half]] +# subarr1 = [arr[i] for i in range(1, half - 1)] +# else: +# half = math.floor(length / 2) +# new_arr.append(arr[half]) +# subarr1 = [arr[i] for i in range(1, half)] +# subarr2 = [arr[i] for i in range(half + 1, length - 1)] +# subarrs = [subarr1, subarr2] +# return subarrs + + + if layer == 0: + length = len(arr) + if length <= 2: + return arr + + new_arr = [arr[0], arr[-1]] + if (length % 2) == 0: + half = int(length / 2) + new_arr += [arr[half - 1], arr[half]] + subarr1 = [arr[i] for i in range(1, half - 1)] + else: + half = math.floor(length / 2) + new_arr.append(arr[half]) + subarr1 = [arr[i] for i in range(1, half)] + subarr2 = [arr[i] for i in range(half + 1, length - 1)] + subarrs = [subarr1, subarr2] +# subarrs = seperate_arr(arr, new_arr) + new_arr += dichotomous_permutation(subarrs, layer=layer+1) + + else: + new_arr = [] + subarrs = [] + for a in arr: + length = len(a) + if length <= 2: + new_arr += a + else: +# subarrs += seperate_arr(a, new_arr) + if (length % 2) == 0: + half = int(length / 2) + new_arr += [a[half - 1], a[half]] + subarr1 = [a[i] for i in range(0, half - 1)] + else: + half = math.floor(length / 2) + new_arr.append(a[half]) + subarr1 = [a[i] for i in range(0, half)] + subarr2 = [a[i] for i in range(half + 1, length)] + subarrs += [subarr1, subarr2] + + if len(subarrs) > 0: + new_arr += dichotomous_permutation(subarrs, layer=layer+1) + + return new_arr + +# length = len(arr) +# if length <= 2: +# return arr + +# new_arr = [arr[0], arr[-1]] +# if (length % 2) == 0: +# half = int(length / 2) +# new_arr += [arr[half - 1], arr[half]] +# subarr1 = [arr[i] for i in range(1, half - 1)] +# else: +# half = math.floor(length / 2) +# new_arr.append(arr[half]) +# subarr1 = [arr[i] for i in range(1, half)] +# subarr2 = [arr[i] for i in range(half + 1, length - 1)] +# if len(subarr1) > 0: +# new_arr += dichotomous_permutation(subarr1) +# if len(subarr2) > 0: +# new_arr += dichotomous_permutation(subarr2) + +# return new_arr \ No newline at end of file From ce6399593da6ef925ec8b2c9d6eb07a5b5f6f044 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Sat, 22 May 2021 12:09:00 +0200 Subject: [PATCH 22/35] [CI] Ignore downloaded datasets. --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 522b2f6..33bffb7 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,9 @@ outputs/ # pyCharm. .idea/ + +# tests. +gklearn/tests/datasets/ + +# Experiments. +gklearn/experiments/datasets/ From b843707e9c88e693d9bec05e137cf917f55bfe1f Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Sat, 22 May 2021 12:12:58 +0200 Subject: [PATCH 23/35] [Enhancement] gklearn.dataset.Dataset class can now automatically get the task type of the given dataset (regression or classification). 
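A minimal sketch of the intended behaviour (hedged: 'MUTAG' is only an example name here and is assumed to carry a 'task_type' entry in DATASET_META):

    from gklearn.dataset import Dataset

    ds = Dataset('MUTAG')
    # While loading, _get_task_type('MUTAG') reads DATASET_META['MUTAG']['task_type'];
    # the task type (and, for classification, the class number when provided) thus
    # becomes available without being passed in by the user.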
---
 gklearn/dataset/dataset.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/gklearn/dataset/dataset.py b/gklearn/dataset/dataset.py
index 75684c2..595826c 100644
--- a/gklearn/dataset/dataset.py
+++ b/gklearn/dataset/dataset.py
@@ -40,6 +40,7 @@ class Dataset(object):
 		self._edge_attr_dim = None
 		self._class_number = None
 		self._ds_name = None
+		self._task_type = None
 
 		if inputs is None:
 			self._graphs = None
@@ -117,11 +118,16 @@ class Dataset(object):
 		ds_file = [os.path.join(path, fn) for fn in load_files[0]]
 		fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None
 
+		# Get extra_params.
 		if 'extra_params' in DATASET_META[ds_name]:
 			kwargs = DATASET_META[ds_name]['extra_params']
 		else:
 			kwargs = {}
 
+		# Get the task type that is associated with the dataset. If it is classification, get the number of classes.
+		self._get_task_type(ds_name)
+
+
 		self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets, **kwargs).data
 
 		self._node_labels = label_names['node_labels']
@@ -276,7 +282,8 @@ class Dataset(object):
 			'edge_attr_dim',
 			'class_number',
 			'all_degree_entropy',
-			'ave_degree_entropy'
+			'ave_degree_entropy',
+			'class_type'
 		]
 
 		# dataset size
@@ -408,7 +415,7 @@ class Dataset(object):
 
 		if 'class_number' in keys:
 			if self._class_number is None:
-				self._class_number = self._get_class_number()
+				self._class_number = self._get_class_num()
 			infos['class_number'] = self._class_number
 
 		if 'node_attr_dim' in keys:
@@ -437,6 +444,11 @@ class Dataset(object):
 				base = None
 			infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))
 
+		if 'task_type' in keys:
+			if self._task_type is None:
+				self._task_type = self._get_task_type()
+			infos['task_type'] = self._task_type
+
 		return infos
 
 
@@ -790,6 +802,13 @@ class Dataset(object):
 		return degree_entropy
 
 
+	def _get_task_type(self, ds_name):
+		if 'task_type' in DATASET_META[ds_name]:
+			self._task_type = DATASET_META[ds_name]['task_type']
+			if self._task_type == 'classification' and self._class_number is None and 'class_number' in DATASET_META[ds_name]:
+				self._class_number = DATASET_META[ds_name]['class_number']
+
+
 	@property
 	def graphs(self):
 		return self._graphs

From 70a11dfd1ba25e74a244466efbefa0b17154ceef Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Sat, 22 May 2021 12:26:40 +0200
Subject: [PATCH 24/35] [Enhancement] gklearn.utils.normalize_gram_matrix
 function now raises an exception if the diagonal includes negative values.
 [Enhancement] gklearn.utils.get_graph_kernel_by_name function now accepts
 strings of kernel names the same as the keys of gklearn.kernels.GRAPH_KERNELS
 and supports more graph kernels.
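A hedged sketch of the more permissive kernel-name matching (the node label name, the ds_infos content and the 'height' option below are illustrative assumptions, not part of this patch):

    from gklearn.utils.utils import get_graph_kernel_by_name

    # Both spellings now resolve to the same WeisfeilerLehman class.
    for name in ['WeisfeilerLehman', 'weisfeiler-lehman']:
        gk = get_graph_kernel_by_name(name,
                                      node_labels=['atom'], edge_labels=[],
                                      ds_infos={'directed': False},
                                      kernel_options={'height': 2})

On the normalization side, normalize_gram_matrix now switches NumPy to np.seterr(invalid='raise') around the division, so an invalid square root of the diagonal entries raises immediately instead of silently producing NaNs.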
--- gklearn/utils/utils.py | 73 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 9 deletions(-) diff --git a/gklearn/utils/utils.py b/gklearn/utils/utils.py index fca19dd..5758291 100644 --- a/gklearn/utils/utils.py +++ b/gklearn/utils/utils.py @@ -366,19 +366,62 @@ def get_edge_labels(Gn, edge_label): def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs): if len(kwargs) != 0: kernel_options = kwargs - if name == 'Marginalized': + + if name == 'CommonWalk' or name == 'common walk': + from gklearn.kernels import CommonWalk + graph_kernel = CommonWalk(node_labels=node_labels, + edge_labels=edge_labels, + ds_infos=ds_infos, + **kernel_options) + + elif name == 'Marginalized' or name == 'marginalized': from gklearn.kernels import Marginalized graph_kernel = Marginalized(node_labels=node_labels, edge_labels=edge_labels, ds_infos=ds_infos, **kernel_options) - elif name == 'ShortestPath': + + elif name == 'SylvesterEquation' or name == 'sylvester equation': + from gklearn.kernels import SylvesterEquation + graph_kernel = SylvesterEquation( + ds_infos=ds_infos, + **kernel_options) + + elif name == 'FixedPoint' or name == 'fixed point': + from gklearn.kernels import FixedPoint + graph_kernel = FixedPoint(node_labels=node_labels, + edge_labels=edge_labels, + node_attrs=node_attrs, + edge_attrs=edge_attrs, + ds_infos=ds_infos, + **kernel_options) + + elif name == 'ConjugateGradient' or name == 'conjugate gradient': + from gklearn.kernels import ConjugateGradient + graph_kernel = ConjugateGradient(node_labels=node_labels, + edge_labels=edge_labels, + node_attrs=node_attrs, + edge_attrs=edge_attrs, + ds_infos=ds_infos, + **kernel_options) + + elif name == 'SpectralDecomposition' or name == 'spectral decomposition': + from gklearn.kernels import SpectralDecomposition + graph_kernel = SpectralDecomposition(node_labels=node_labels, + edge_labels=edge_labels, + node_attrs=node_attrs, + edge_attrs=edge_attrs, + ds_infos=ds_infos, + **kernel_options) + + elif name == 'ShortestPath' or name == 'shortest path': from gklearn.kernels import ShortestPath graph_kernel = ShortestPath(node_labels=node_labels, node_attrs=node_attrs, ds_infos=ds_infos, **kernel_options) - elif name == 'StructuralSP': + + elif name == 'StructuralSP' or name == 'structural shortest path': from gklearn.kernels import StructuralSP graph_kernel = StructuralSP(node_labels=node_labels, edge_labels=edge_labels, @@ -386,25 +429,29 @@ def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attr edge_attrs=edge_attrs, ds_infos=ds_infos, **kernel_options) - elif name == 'PathUpToH': + + elif name == 'PathUpToH' or name == 'path up to length h': from gklearn.kernels import PathUpToH graph_kernel = PathUpToH(node_labels=node_labels, edge_labels=edge_labels, ds_infos=ds_infos, **kernel_options) - elif name == 'Treelet': + + elif name == 'Treelet' or name == 'treelet': from gklearn.kernels import Treelet graph_kernel = Treelet(node_labels=node_labels, edge_labels=edge_labels, ds_infos=ds_infos, **kernel_options) - elif name == 'WLSubtree': + + elif name == 'WLSubtree' or name == 'weisfeiler-lehman subtree': from gklearn.kernels import WLSubtree graph_kernel = WLSubtree(node_labels=node_labels, edge_labels=edge_labels, ds_infos=ds_infos, **kernel_options) - elif name == 'WeisfeilerLehman': + + elif name == 'WeisfeilerLehman' or name == 'weisfeiler-lehman': from gklearn.kernels import WeisfeilerLehman 
graph_kernel = WeisfeilerLehman(node_labels=node_labels, edge_labels=edge_labels, @@ -541,10 +588,18 @@ def get_mlti_dim_edge_attrs(G, attr_names): def normalize_gram_matrix(gram_matrix): diag = gram_matrix.diagonal().copy() + old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. for i in range(len(gram_matrix)): for j in range(i, len(gram_matrix)): - gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) - gram_matrix[j][i] = gram_matrix[i][j] + try: + gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) + except: +# rollback() + np.seterr(**old_settings) + raise + else: + gram_matrix[j][i] = gram_matrix[i][j] + np.seterr(**old_settings) return gram_matrix From 1800923c45eb722c19a6c4315d1923d20318008b Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Sat, 22 May 2021 12:30:49 +0200 Subject: [PATCH 25/35] [API change] gklearn.utils.kernels.gaussiankernel function no longer transforms inputs to numpy.array of float values. This may be faster, but may cause some errors to the previous codes. --- gklearn/utils/kernels.py | 225 ++++++++++++++++++++++++----------------------- 1 file changed, 114 insertions(+), 111 deletions(-) diff --git a/gklearn/utils/kernels.py b/gklearn/utils/kernels.py index 5bd7e4d..afcfb0c 100644 --- a/gklearn/utils/kernels.py +++ b/gklearn/utils/kernels.py @@ -4,155 +4,158 @@ These kernels are defined between pairs of vectors. import numpy as np def deltakernel(x, y): - """Delta kernel. Return 1 if x == y, 0 otherwise. + """Delta kernel. Return 1 if x == y, 0 otherwise. - Parameters - ---------- - x, y : any - Two parts to compare. + Parameters + ---------- + x, y : any + Two parts to compare. - Return - ------ - kernel : integer - Delta kernel. + Return + ------ + kernel : integer + Delta kernel. - References - ---------- - [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between - labeled graphs. In Proceedings of the 20th International Conference on - Machine Learning, Washington, DC, United States, 2003. - """ - return x == y #(1 if condition else 0) + References + ---------- + [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between + labeled graphs. In Proceedings of the 20th International Conference on + Machine Learning, Washington, DC, United States, 2003. + """ + return x == y #(1 if condition else 0) def gaussiankernel(x, y, gamma=None): - """Gaussian kernel. - Compute the rbf (gaussian) kernel between x and y: + """Gaussian kernel. + Compute the rbf (gaussian) kernel between x and y: - K(x, y) = exp(-gamma ||x-y||^2). + K(x, y) = exp(-gamma ||x-y||^2). - Read more in the `User Guide of scikit-learn library `__. + Read more in the `User Guide of scikit-learn library `__. - Parameters - ---------- - x, y : array + Parameters + ---------- + x, y : array - gamma : float, default None - If None, defaults to 1.0 / n_features + gamma : float, default None + If None, defaults to 1.0 / n_features - Returns - ------- - kernel : float - """ - if gamma is None: - gamma = 1.0 / len(x) + Returns + ------- + kernel : float + """ + if gamma is None: + gamma = 1.0 / len(x) + + # xt = np.array([float(itm) for itm in x]) # @todo: move this to dataset or datafile to speed up. + # yt = np.array([float(itm) for itm in y]) +# kernel = xt - yt +# kernel = kernel ** 2 +# kernel = np.sum(kernel) +# kernel *= -gamma +# kernel = np.exp(kernel) +# return kernel + + return np.exp((np.sum((x - y) ** 2)) * -gamma) - xt = np.array([float(itm) for itm in x]) # @todo: move this to dataset or datafile to speed up. 
- yt = np.array([float(itm) for itm in y]) - kernel = xt - yt - kernel = kernel ** 2 - kernel = np.sum(kernel) - kernel *= -gamma - kernel = np.exp(kernel) - return kernel def polynomialkernel(x, y, d=1, c=0): - """Polynomial kernel. - Compute the polynomial kernel between x and y: + """Polynomial kernel. + Compute the polynomial kernel between x and y: - K(x, y) = ^d + c. + K(x, y) = ^d + c. - Parameters - ---------- - x, y : array + Parameters + ---------- + x, y : array - d : integer, default 1 + d : integer, default 1 - c : float, default 0 + c : float, default 0 - Returns - ------- - kernel : float - """ - return np.dot(x, y) ** d + c + Returns + ------- + kernel : float + """ + return np.dot(x, y) ** d + c def linearkernel(x, y): - """Polynomial kernel. - Compute the polynomial kernel between x and y: + """Polynomial kernel. + Compute the polynomial kernel between x and y: - K(x, y) = . + K(x, y) = . - Parameters - ---------- - x, y : array + Parameters + ---------- + x, y : array - d : integer, default 1 + d : integer, default 1 - c : float, default 0 + c : float, default 0 - Returns - ------- - kernel : float - """ - return np.dot(x, y) + Returns + ------- + kernel : float + """ + return np.dot(x, y) def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): - """Sum of a pair of kernels. + """Sum of a pair of kernels. - k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) + k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) - Parameters - ---------- - k1, k2 : function - A pair of kernel functions. - d11, d12: - Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. - d21, d22: - Inputs of k2. - lamda1, lamda2: float - Coefficients of the product. + Parameters + ---------- + k1, k2 : function + A pair of kernel functions. + d11, d12: + Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. + d21, d22: + Inputs of k2. + lamda1, lamda2: float + Coefficients of the product. - Return - ------ - kernel : integer + Return + ------ + kernel : integer - """ - if d21 == None or d22 == None: - kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12) - else: - kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) - return kernel + """ + if d21 == None or d22 == None: + kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12) + else: + kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22) + return kernel def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1): - """Product of a pair of kernels. - - k = lamda * k1(d11, d12) * k2(d21, d22) - - Parameters - ---------- - k1, k2 : function - A pair of kernel functions. - d11, d12: - Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. - d21, d22: - Inputs of k2. - lamda: float - Coefficient of the product. - - Return - ------ - kernel : integer - """ - if d21 == None or d22 == None: - kernel = lamda * k1(d11, d12) * k2(d11, d12) - else: - kernel = lamda * k1(d11, d12) * k2(d21, d22) - return kernel + """Product of a pair of kernels. + + k = lamda * k1(d11, d12) * k2(d21, d22) + + Parameters + ---------- + k1, k2 : function + A pair of kernel functions. + d11, d12: + Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2. + d21, d22: + Inputs of k2. + lamda: float + Coefficient of the product. 
+ + Return + ------ + kernel : integer + """ + if d21 == None or d22 == None: + kernel = lamda * k1(d11, d12) * k2(d11, d12) + else: + kernel = lamda * k1(d11, d12) * k2(d21, d22) + return kernel if __name__ == '__main__': - o = polynomialkernel([1, 2], [3, 4], 2, 3) + o = polynomialkernel([1, 2], [3, 4], 2, 3) From 4fd314d37420c9530c03f4dd0adba21272fbf22e Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Sat, 22 May 2021 23:26:51 +0200 Subject: [PATCH 26/35] [Fix] Fixed the bug that gklearn.utils.kernels.gaussiankernel can not handle lists as inputs. --- gklearn/utils/kernels.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gklearn/utils/kernels.py b/gklearn/utils/kernels.py index afcfb0c..fe78ac8 100644 --- a/gklearn/utils/kernels.py +++ b/gklearn/utils/kernels.py @@ -3,6 +3,7 @@ These kernels are defined between pairs of vectors. """ import numpy as np + def deltakernel(x, y): """Delta kernel. Return 1 if x == y, 0 otherwise. @@ -56,7 +57,7 @@ def gaussiankernel(x, y, gamma=None): # kernel = np.exp(kernel) # return kernel - return np.exp((np.sum((x - y) ** 2)) * -gamma) + return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma) From 23a938482abdf4412d94c3f4ea560d8083704906 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 25 May 2021 14:19:37 +0200 Subject: [PATCH 27/35] =?UTF-8?q?[[Enhancement]=20gklearn.utils.normalize?= =?UTF-8?q?=5Fgram=5Fmatrix=20function=20now=20raises=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- gklearn/kernels/graph_kernel.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index 90a0906..6d9517f 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -124,7 +124,13 @@ class GraphKernel(BaseEstimator): #, ABC): self._is_transformed = True if self.normalize: X_diag, Y_diag = self.diagonals() - kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) + old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. + try: + kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) + except: + raise + finally: + np.seterr(**old_settings) return kernel_matrix @@ -150,9 +156,15 @@ class GraphKernel(BaseEstimator): #, ABC): gram_matrix = self.compute_kernel_matrix() # Normalize. - self._X_diag = np.diagonal(gram_matrix).copy() if self.normalize: - gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) + self._X_diag = np.diagonal(gram_matrix).copy() + old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. + try: + gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) + except: + raise + finally: + np.seterr(**old_settings) return gram_matrix From 231c050f88c07382784200e922335103c8414d39 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 3 Jun 2021 01:29:26 +0200 Subject: [PATCH 28/35] [To finish] Remove dummy labels in treelet kernel (for the contest.) 
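For reference, a standalone sketch of the counting step inside _kernel_do that this change keeps (pure NumPy; the counts are made up and np.dot only stands in for the configured sub_kernel):

    import numpy as np

    canonkey1 = {'0': 12, '1': 8, '3': 2}   # canonical treelet key -> count in graph 1
    canonkey2 = {'0': 10, '1': 5, '2': 4}   # canonical treelet key -> count in graph 2
    keys = set(canonkey1.keys()) & set(canonkey2.keys())  # keys present in both graphs
    vector1 = np.array([canonkey1[k] for k in keys])
    vector2 = np.array([canonkey2[k] for k in keys])
    kernel = np.dot(vector1, vector2)  # stand-in for self.sub_kernel(vector1, vector2)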
--- gklearn/kernels/treelet.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/gklearn/kernels/treelet.py b/gklearn/kernels/treelet.py index d546e74..08be407 100644 --- a/gklearn/kernels/treelet.py +++ b/gklearn/kernels/treelet.py @@ -369,13 +369,13 @@ class Treelet(GraphKernel): def _compute_single_kernel_series(self, g1, g2): - self._add_dummy_labels([g1] + [g2]) +# self._add_dummy_labels([g1] + [g2]) canonkeys_1 = self._get_canonkeys(g1) canonkeys_2 = self._get_canonkeys(g2) kernel = self._kernel_do(canonkeys_1, canonkeys_2) return kernel - +# @profile def _kernel_do(self, canonkey1, canonkey2): """Compute treelet graph kernel between 2 graphs. @@ -392,6 +392,23 @@ class Treelet(GraphKernel): keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) + +# vector1, vector2 = [], [] +# keys1, keys2 = canonkey1, canonkey2 +# keys_searched = {} +# for k, v in canonkey1.items(): +# if k in keys2: +# vector1.append(v) +# vector2.append(canonkey2[k]) +# keys_searched[k] = v + +# for k, v in canonkey2.items(): +# if k in keys1 and k not in keys_searched: +# vector1.append(canonkey1[k]) +# vector2.append(v) + +# vector1, vector2 = np.array(vector1), np.array(vector2) + kernel = self.sub_kernel(vector1, vector2) return kernel From 785a5c1926502cd65280fafc4c1383f33c029659 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 4 Jun 2021 22:31:26 +0200 Subject: [PATCH 29/35] [To finish] Remove dummy labels in treelet kernel (for the contest.) --- gklearn/kernels/treelet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gklearn/kernels/treelet.py b/gklearn/kernels/treelet.py index 08be407..e42142b 100644 --- a/gklearn/kernels/treelet.py +++ b/gklearn/kernels/treelet.py @@ -303,7 +303,7 @@ class Treelet(GraphKernel): def _compute_kernel_list_series(self, g1, g_list): - self._add_dummy_labels(g_list + [g1]) +# self._add_dummy_labels(g_list + [g1]) # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. From 609c8c15183a9fd752cc2ccd36680520e4b22b52 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Wed, 9 Jun 2021 17:16:51 +0200 Subject: [PATCH 30/35] [Enhancement] Allow deciding whether or not to make a copy of input graphs in GraphKernel class. --- gklearn/kernels/graph_kernel.py | 43 ++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index 6d9517f..1db38b3 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -77,8 +77,6 @@ class GraphKernel(BaseEstimator): #, ABC): # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; self.clear_attributes() -# X = check_array(X, accept_sparse=True) - # Validate parameters for the transformer. self.validate_parameters() @@ -386,35 +384,58 @@ class GraphKernel(BaseEstimator): #, ABC): self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) self.normalize = kwargs.get('normalize', True) self.verbose = kwargs.get('verbose', 2) + self.copy_graphs = kwargs.get('copy_graphs', True) + self.save_unnormed = kwargs.get('save_unnormed', True) self.validate_parameters() + # If the inputs is a list of graphs. 
if len(graphs) == 1: if not isinstance(graphs[0], list): raise Exception('Cannot detect graphs.') elif len(graphs[0]) == 0: raise Exception('The graph list given is empty. No computation was performed.') else: - self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. + if self.copy_graphs: + self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. + else: + self._graphs = graphs self._gram_matrix = self._compute_gram_matrix() - self._gram_matrix_unnorm = np.copy(self._gram_matrix) + + if self.save_unnormed: + self._gram_matrix_unnorm = np.copy(self._gram_matrix) if self.normalize: self._gram_matrix = normalize_gram_matrix(self._gram_matrix) return self._gram_matrix, self._run_time elif len(graphs) == 2: + # If the inputs are two graphs. if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): - kernel = self._compute_single_kernel(graphs[0].copy(), graphs[1].copy()) + if self.copy_graphs: + G0, G1 = graphs[0].copy(), graphs[1].copy() + else: + G0, G1 = graphs[0], graphs[1] + kernel = self._compute_single_kernel(G0, G1) return kernel, self._run_time + + # If the inputs are a graph and a list of graphs. elif self.is_graph(graphs[0]) and isinstance(graphs[1], list): - g1 = graphs[0].copy() - g_list = [g.copy() for g in graphs[1]] - kernel_list = self._compute_kernel_list(g1, g_list) + if self.copy_graphs: + g1 = graphs[0].copy() + g_list = [g.copy() for g in graphs[1]] + kernel_list = self._compute_kernel_list(g1, g_list) + else: + kernel_list = self._compute_kernel_list(graphs[0], graphs[1]) return kernel_list, self._run_time + elif isinstance(graphs[0], list) and self.is_graph(graphs[1]): - g1 = graphs[1].copy() - g_list = [g.copy() for g in graphs[0]] - kernel_list = self._compute_kernel_list(g1, g_list) + if self.copy_graphs: + g1 = graphs[1].copy() + g_list = [g.copy() for g in graphs[0]] + kernel_list = self._compute_kernel_list(g1, g_list) + else: + kernel_list = self._compute_kernel_list(graphs[1], graphs[0]) return kernel_list, self._run_time + else: raise Exception('Cannot detect graphs.') From bd8bf2b2347de7c1c005d78aa14ec3e3a7ed02f3 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Wed, 9 Jun 2021 17:25:27 +0200 Subject: [PATCH 31/35] [Enhancement] Add more verbose for the WL kernel. --- gklearn/kernels/weisfeiler_lehman.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/gklearn/kernels/weisfeiler_lehman.py b/gklearn/kernels/weisfeiler_lehman.py index 64069b7..aeca3ea 100644 --- a/gklearn/kernels/weisfeiler_lehman.py +++ b/gklearn/kernels/weisfeiler_lehman.py @@ -14,11 +14,14 @@ Created on Tue Apr 14 15:16:34 2020 import numpy as np import networkx as nx +import sys from collections import Counter # from functools import partial +from itertools import combinations_with_replacement from gklearn.utils import SpecialLabel from gklearn.utils.parallel import parallel_gm, parallel_me from gklearn.kernels import GraphKernel +from gklearn.utils.iters import get_iters class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. @@ -268,7 +271,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration # for each graph - for G in Gn: + if self.verbose >= 2: + iterator = get_iters(Gn, desc='Setting all labels into a tuple') + else: + iterator = Gn + for G in iterator: # set all labels into a tuple. for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. 
G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels) @@ -288,6 +295,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. all_num_of_each_label = [] # number of occurence of each label in G # @todo: parallel this part. +# if self.verbose >= 2: +# iterator = get_iters(enumerate(Gn), desc='Going through iteration ' + str(h), length=len(Gn)) +# else: +# iterator = enumerate(Gn) for idx, G in enumerate(Gn): all_multisets = [] @@ -341,11 +352,15 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. # parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker, # glbv=(all_num_of_each_label,), n_jobs=self.n_jobs, verbose=self.verbose) # elif self.parallel is None: - for i in range(len(gram_matrix)): - for j in range(i, len(gram_matrix)): - gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i], - all_num_of_each_label[j], gram_matrix[i][j]) - gram_matrix[j][i] = gram_matrix[i][j] + itr = combinations_with_replacement(range(0, len(gram_matrix)), 2) + len_itr = int(len(gram_matrix) * (len(gram_matrix) + 1) / 2) + iterator = get_iters(itr, desc='Computing Gram matrix for this iteration', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) + for i, j in iterator: +# for i in iterator: +# for j in range(i, len(gram_matrix)): + gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i], + all_num_of_each_label[j], gram_matrix[i][j]) + gram_matrix[j][i] = gram_matrix[i][j] def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel): From 2e0105c4884a03b76d3335f052a190b4d1c26d9a Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 25 Jun 2021 18:28:50 +0200 Subject: [PATCH 32/35] [Fix] Typos in README.md. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0380c68..3bf2b57 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ The docs of the library can be found [here](https://graphkit-learn.readthedocs.i * [The common walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/common_walk.py) [1] * Exponential * Geometric - * [The marginalized kenrel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py) + * [The marginalized kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py) * With tottering [2] * Without tottering [7] * [The generalized random walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/random_walk.py) [3] From 310b01c3477ca6ed490f38f017fa5149944edaec Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 25 Jun 2021 18:36:20 +0200 Subject: [PATCH 33/35] [Major Feature] WLSubtree kernel can now deal with symbolic edge labels (for Gram matrix computation without parallelization). Meanwhile, the dummy labels are no longer added a priori. --- gklearn/kernels/weisfeiler_lehman.py | 481 ++++++++++++++++++++++++++++++----- gklearn/tests/test_graph_kernels.py | 66 ++--- 2 files changed, 454 insertions(+), 93 deletions(-) diff --git a/gklearn/kernels/weisfeiler_lehman.py b/gklearn/kernels/weisfeiler_lehman.py index aeca3ea..f02926e 100644 --- a/gklearn/kernels/weisfeiler_lehman.py +++ b/gklearn/kernels/weisfeiler_lehman.py @@ -26,21 +26,36 @@ from gklearn.utils.iters import get_iters class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. 
+ def __init__(self, **kwargs): GraphKernel.__init__(self) - self._node_labels = kwargs.get('node_labels', []) - self._edge_labels = kwargs.get('edge_labels', []) - self._height = int(kwargs.get('height', 0)) + self.node_labels = kwargs.get('node_labels', []) + self.edge_labels = kwargs.get('edge_labels', []) + self.height = int(kwargs.get('height', 0)) self._base_kernel = kwargs.get('base_kernel', 'subtree') self._ds_infos = kwargs.get('ds_infos', {}) + ########################################################################## + # The following is the 1st paradigm to compute kernel matrix, which is + # compatible with `scikit-learn`. + # ------------------------------------------------------------------- + # Special thanks to the "GraKeL" library for providing an excellent template! + ########################################################################## + + + ########################################################################## + # The following is the 2nd paradigm to compute kernel matrix. It is + # simplified and not compatible with `scikit-learn`. + ########################################################################## + + def _compute_gm_series(self): # if self.verbose >= 2: # import warnings # warnings.warn('A part of the computation is parallelized.') - self._add_dummy_node_labels(self._graphs) +# self._add_dummy_node_labels(self._graphs) # for WL subtree kernel if self._base_kernel == 'subtree': @@ -62,7 +77,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. def _compute_gm_imap_unordered(self): - self._add_dummy_node_labels(self._graphs) +# self._add_dummy_node_labels(self._graphs) if self._base_kernel == 'subtree': gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) @@ -163,6 +178,30 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. return gram_matrix[0][1] + ########################################################################## + # The following are the methods used by both diagrams. + ########################################################################## + + def validate_parameters(self): + """Validate all parameters for the transformer. + + Returns + ------- + None. + + """ + super().validate_parameters() + if len(self.node_labels) == 0: + if len(self.edge_labels) == 0: + self._subtree_kernel_do = self._subtree_kernel_do_unlabeled + else: + self._subtree_kernel_do = self._subtree_kernel_do_el + else: + if len(self.edge_labels) == 0: + self._subtree_kernel_do = self._subtree_kernel_do_nl + else: + self._subtree_kernel_do = self._subtree_kernel_do_labeled + def pairwise_kernel(self, g1, g2): Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster! @@ -175,9 +214,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. for G in Gn: # set all labels into a tuple. for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. - G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels) + G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) # get the set of original labels - labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values()) + labels_ori = list(nx.get_node_attributes(G, 'lt').values()) # number of occurence of each label in G all_num_of_each_label.append(dict(Counter(labels_ori))) @@ -185,22 +224,22 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. 
kernel = self._compute_kernel_itr(kernel, all_num_of_each_label) # iterate each height - for h in range(1, self._height + 1): + for h in range(1, self.height + 1): all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration all_num_of_each_label = [] # number of occurence of each label in G # @todo: parallel this part. - for idx, G in enumerate(Gn): + for G in Gn: all_multisets = [] for node, attrs in G.nodes(data=True): # Multiset-label determination. - multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]] + multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] # sorting each multiset multiset.sort() - multiset = [attrs['label_tuple']] + multiset # add the prefix + multiset = [attrs['lt']] + multiset # add the prefix all_multisets.append(tuple(multiset)) # label compression @@ -211,19 +250,19 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. # else assign the number of labels occured + 1 as the compressed label. for value in set_unique: if value in all_set_compressed.keys(): - set_compressed.update({value: all_set_compressed[value]}) + set_compressed[value] = all_set_compressed[value] else: - set_compressed.update({value: str(num_of_labels_occured + 1)}) + set_compressed[value] = str(num_of_labels_occured + 1) num_of_labels_occured += 1 all_set_compressed.update(set_compressed) # relabel nodes for idx, node in enumerate(G.nodes()): - G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]] + G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] # get the set of compressed labels - labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values()) + labels_comp = list(nx.get_node_attributes(G, 'lt').values()) # all_labels_ori.update(labels_comp) all_num_of_each_label.append(dict(Counter(labels_comp))) @@ -252,8 +291,8 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. return kernel - def _subtree_kernel_do(self, Gn): - """Compute Weisfeiler-Lehman kernels between graphs. + def _subtree_kernel_do_nl(self, Gn): + """Compute Weisfeiler-Lehman kernels between graphs with node labels. Parameters ---------- @@ -276,11 +315,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. else: iterator = Gn for G in iterator: - # set all labels into a tuple. + # set all labels into a tuple. # @todo: remove this original labels or not? for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. - G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels) + G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) # get the set of original labels - labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values()) + labels_ori = list(nx.get_node_attributes(G, 'lt').values()) # number of occurence of each label in G all_num_of_each_label.append(dict(Counter(labels_ori))) @@ -288,7 +327,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. 
self._compute_gram_itr(gram_matrix, all_num_of_each_label) # iterate each height - for h in range(1, self._height + 1): + for h in range(1, self.height + 1): all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration @@ -299,47 +338,363 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. # iterator = get_iters(enumerate(Gn), desc='Going through iteration ' + str(h), length=len(Gn)) # else: # iterator = enumerate(Gn) - for idx, G in enumerate(Gn): + for G in Gn: + num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) - all_multisets = [] - for node, attrs in G.nodes(data=True): - # Multiset-label determination. - multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]] - # sorting each multiset - multiset.sort() - multiset = [attrs['label_tuple']] + multiset # add the prefix - all_multisets.append(tuple(multiset)) + # Compute subtree kernel with h iterations and add it to the final kernel + self._compute_gram_itr(gram_matrix, all_num_of_each_label) - # label compression - set_unique = list(set(all_multisets)) # set of unique multiset labels - # a dictionary mapping original labels to new ones. - set_compressed = {} - # if a label occured before, assign its former compressed label, - # else assign the number of labels occured + 1 as the compressed label. - for value in set_unique: - if value in all_set_compressed.keys(): - set_compressed.update({value: all_set_compressed[value]}) - else: - set_compressed.update({value: str(num_of_labels_occured + 1)}) - num_of_labels_occured += 1 + return gram_matrix - all_set_compressed.update(set_compressed) - # relabel nodes - for idx, node in enumerate(G.nodes()): - G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]] + def _subtree_kernel_do_el(self, Gn): + """Compute Weisfeiler-Lehman kernels between graphs with edge labels. - # get the set of compressed labels - labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values()) - # all_labels_ori.update(labels_comp) - all_num_of_each_label.append(dict(Counter(labels_comp))) + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are computed. - # Compute subtree kernel with h iterations and add it to the final kernel + Return + ------ + gram_matrix : Numpy matrix + Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. + """ + gram_matrix = np.zeros((len(Gn), len(Gn))) + + # initial for height = 0 + all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration + + # Compute subtree kernel with the 0th iteration and add it to the final kernel. + iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) + for i, j in iterator: + gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) + gram_matrix[j][i] = gram_matrix[i][j] + + + # if h >= 1. + if self.height > 0: + # Set all edge labels into a tuple. # @todo: remove this original labels or not? + if self.verbose >= 2: + iterator = get_iters(Gn, desc='Setting all labels into a tuple') + else: + iterator = Gn + for G in iterator: + for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way. 
+ G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels) + + # When h == 1, compute the kernel. + all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration + num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs + all_num_of_each_label = [] # number of occurence of each label in G + + # @todo: parallel this part. + for G in Gn: + num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) + + # Compute subtree kernel with h iterations and add it to the final kernel. + self._compute_gram_itr(gram_matrix, all_num_of_each_label) + + + # Iterate along heights (>= 2). + for h in range(2, self.height + 1): + all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration + num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs + all_num_of_each_label = [] # number of occurence of each label in G + + # @todo: parallel this part. + for G in Gn: + num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) + + # Compute subtree kernel with h iterations and add it to the final kernel. + self._compute_gram_itr(gram_matrix, all_num_of_each_label) + + return gram_matrix + + + def _subtree_kernel_do_labeled(self, Gn): + """Compute Weisfeiler-Lehman kernels between graphs with both node and + edge labels. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are computed. + + Return + ------ + gram_matrix : Numpy matrix + Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. + """ + gram_matrix = np.zeros((len(Gn), len(Gn))) + + # initial for height = 0 + all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration + + # Set all node labels into a tuple and get # of occurence of each label. + if self.verbose >= 2: + iterator = get_iters(Gn, desc='Setting all node labels into a tuple') + else: + iterator = Gn + for G in iterator: + # Set all node labels into a tuple. # @todo: remove this original labels or not? + for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. + G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) + # Get the set of original labels. + labels_ori = list(nx.get_node_attributes(G, 'lt').values()) + # number of occurence of each label in G + all_num_of_each_label.append(dict(Counter(labels_ori))) + + # Compute subtree kernel with the 0th iteration and add it to the final kernel. + self._compute_gram_itr(gram_matrix, all_num_of_each_label) + + + # if h >= 1. + if self.height > 0: + # Set all edge labels into a tuple. # @todo: remove this original labels or not? + if self.verbose >= 2: + iterator = get_iters(Gn, desc='Setting all edge labels into a tuple') + else: + iterator = Gn + for G in iterator: + for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way. + G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels) + + # When h == 1, compute the kernel. 
+ all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration + num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs + all_num_of_each_label = [] # number of occurence of each label in G + + # @todo: parallel this part. + for G in Gn: + num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) + + # Compute subtree kernel with h iterations and add it to the final kernel. + self._compute_gram_itr(gram_matrix, all_num_of_each_label) + + + # Iterate along heights. + for h in range(2, self.height + 1): + all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration + num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs + all_num_of_each_label = [] # number of occurence of each label in G + + # @todo: parallel this part. + for G in Gn: + num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) + + # Compute subtree kernel with h iterations and add it to the final kernel. self._compute_gram_itr(gram_matrix, all_num_of_each_label) return gram_matrix + def _subtree_kernel_do_unlabeled(self, Gn): + """Compute Weisfeiler-Lehman kernels between graphs without labels. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are computed. + + Return + ------ + gram_matrix : Numpy matrix + Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. + """ + gram_matrix = np.zeros((len(Gn), len(Gn))) + + # initial for height = 0 + all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration + + # Compute subtree kernel with the 0th iteration and add it to the final kernel. + iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) + for i, j in iterator: + gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) + gram_matrix[j][i] = gram_matrix[i][j] + + + # if h >= 1. + if self.height > 0: + # When h == 1, compute the kernel. + all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration + num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs + all_num_of_each_label = [] # number of occurence of each label in G + + # @todo: parallel this part. + for G in Gn: + num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) + + # Compute subtree kernel with h iterations and add it to the final kernel. + self._compute_gram_itr(gram_matrix, all_num_of_each_label) + + + # Iterate along heights (>= 2). + for h in range(2, self.height + 1): + all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration + num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs + all_num_of_each_label = [] # number of occurence of each label in G + + # @todo: parallel this part. + for G in Gn: + num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) + + # Compute subtree kernel with h iterations and add it to the final kernel. 
+ self._compute_gram_itr(gram_matrix, all_num_of_each_label) + + return gram_matrix + + + def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): + all_multisets = [] + for node, attrs in G.nodes(data=True): + # Multiset-label determination. + multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] + # sorting each multiset + multiset.sort() + multiset = [attrs['lt']] + multiset # add the prefix + all_multisets.append(tuple(multiset)) + + # label compression + set_unique = list(set(all_multisets)) # set of unique multiset labels + # a dictionary mapping original labels to new ones. + set_compressed = {} + # If a label occured before, assign its former compressed label; + # otherwise assign the number of labels occured + 1 as the + # compressed label. + for value in set_unique: + if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? + set_compressed[value] = all_set_compressed[value] + else: + set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? and what if num_of_labels_occured is extremely big. + num_of_labels_occured += 1 + + all_set_compressed.update(set_compressed) + + # Relabel nodes. + for idx, node in enumerate(G.nodes()): + G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] + + # Get the set of compressed labels. + labels_comp = list(nx.get_node_attributes(G, 'lt').values()) + all_num_of_each_label.append(dict(Counter(labels_comp))) + + return num_of_labels_occured + + + def _subtree_1graph_el(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): + all_multisets = [] +# for node, attrs in G.nodes(data=True): + for node in G.nodes(): + # Multiset-label determination. + multiset = [G.edges[(node, neighbors)]['lt'] for neighbors in G[node]] # @todo: check reference for this. + # sorting each multiset + multiset.sort() +# multiset = [attrs['lt']] + multiset # add the prefix + all_multisets.append(tuple(multiset)) + + # label compression + set_unique = list(set(all_multisets)) # set of unique multiset labels + # a dictionary mapping original labels to new ones. + set_compressed = {} + # If a label occured before, assign its former compressed label; + # otherwise assign the number of labels occured + 1 as the + # compressed label. + for value in set_unique: + if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? + set_compressed[value] = all_set_compressed[value] + else: + set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? + num_of_labels_occured += 1 + + all_set_compressed.update(set_compressed) + + # Relabel nodes. + for idx, node in enumerate(G.nodes()): + G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] + + # Get the set of compressed labels. + labels_comp = list(nx.get_node_attributes(G, 'lt').values()) # @todo: maybe can be faster. + all_num_of_each_label.append(dict(Counter(labels_comp))) + + return num_of_labels_occured + + + def _subtree_1graph_labeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): + all_multisets = [] + for node, attrs in G.nodes(data=True): + # Multiset-label determination. + multiset = [tuple((G.edges[(node, neighbors)]['lt'], G.nodes[neighbors]['lt'])) for neighbors in G[node]] # @todo: check reference for this. 
+ # sorting each multiset + multiset.sort() + multiset = [attrs['lt']] + multiset # add the prefix + all_multisets.append(tuple(multiset)) + + # label compression + set_unique = list(set(all_multisets)) # set of unique multiset labels + # a dictionary mapping original labels to new ones. + set_compressed = {} + # If a label occured before, assign its former compressed label; + # otherwise assign the number of labels occured + 1 as the + # compressed label. + for value in set_unique: + if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? + set_compressed[value] = all_set_compressed[value] + else: + set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? + num_of_labels_occured += 1 + + all_set_compressed.update(set_compressed) + + # Relabel nodes. + for idx, node in enumerate(G.nodes()): + G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] + + # Get the set of compressed labels. + labels_comp = list(nx.get_node_attributes(G, 'lt').values()) + all_num_of_each_label.append(dict(Counter(labels_comp))) + + return num_of_labels_occured + + + def _subtree_1graph_unlabeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): +# all_multisets = [] +# for node, attrs in G.nodes(data=True): # @todo: it can be better. +# # Multiset-label determination. +# multiset = [0 for neighbors in G[node]] +# # sorting each multiset +# multiset.sort() +# multiset = [0] + multiset # add the prefix +# all_multisets.append(tuple(multiset)) + all_multisets = [len(G[node]) for node in G.nodes()] + + # label compression + set_unique = list(set(all_multisets)) # set of unique multiset labels + # a dictionary mapping original labels to new ones. + set_compressed = {} + # If a label occured before, assign its former compressed label; + # otherwise assign the number of labels occured + 1 as the + # compressed label. + for value in set_unique: + if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop? + set_compressed[value] = all_set_compressed[value] + else: + set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? + num_of_labels_occured += 1 + + all_set_compressed.update(set_compressed) + + # Relabel nodes. + for idx, node in enumerate(G.nodes()): + G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] + + # Get the set of compressed labels. + labels_comp = list(nx.get_node_attributes(G, 'lt').values()) + all_num_of_each_label.append(dict(Counter(labels_comp))) + + return num_of_labels_occured + + def _compute_gram_itr(self, gram_matrix, all_num_of_each_label): """Compute Gram matrix using the base kernel. """ @@ -358,12 +713,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. for i, j in iterator: # for i in iterator: # for j in range(i, len(gram_matrix)): - gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i], - all_num_of_each_label[j], gram_matrix[i][j]) + gram_matrix[i][j] += self._compute_subtree_kernel(all_num_of_each_label[i], + all_num_of_each_label[j]) gram_matrix[j][i] = gram_matrix[i][j] - def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel): + def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2): """Compute the subtree kernel. """ labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys())) @@ -373,7 +728,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. 
vector2 = np.array([(num_of_each_label2[label] if (label in num_of_each_label2.keys()) else 0) for label in labels]) - kernel += np.dot(vector1, vector2) + kernel = np.dot(vector1, vector2) return kernel @@ -441,9 +796,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label for value in set_unique: if value in all_set_compressed.keys(): - set_compressed.update({ value : all_set_compressed[value] }) + set_compressed[value] = all_set_compressed[value] else: - set_compressed.update({ value : str(num_of_labels_occured + 1) }) + set_compressed[value] = str(num_of_labels_occured + 1) num_of_labels_occured += 1 all_set_compressed.update(set_compressed) @@ -519,9 +874,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label for value in set_unique: if value in all_set_compressed.keys(): - set_compressed.update({ value : all_set_compressed[value] }) + set_compressed[value] = all_set_compressed[value] else: - set_compressed.update({ value : str(num_of_labels_occured + 1) }) + set_compressed[value] = str(num_of_labels_occured + 1) num_of_labels_occured += 1 all_set_compressed.update(set_compressed) @@ -592,9 +947,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label for value in set_unique: if value in all_set_compressed.keys(): - set_compressed.update({ value : all_set_compressed[value] }) + set_compressed[value] = all_set_compressed[value] else: - set_compressed.update({ value : str(num_of_labels_occured + 1) }) + set_compressed[value] = str(num_of_labels_occured + 1) num_of_labels_occured += 1 all_set_compressed.update(set_compressed) @@ -610,10 +965,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. def _add_dummy_node_labels(self, Gn): - if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): + if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY): for i in range(len(Gn)): nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) - self._node_labels = [SpecialLabel.DUMMY] + self.node_labels = [SpecialLabel.DUMMY] class WLSubtree(WeisfeilerLehman): diff --git a/gklearn/tests/test_graph_kernels.py b/gklearn/tests/test_graph_kernels.py index 8c593f1..85ffe0b 100644 --- a/gklearn/tests/test_graph_kernels.py +++ b/gklearn/tests/test_graph_kernels.py @@ -25,34 +25,40 @@ def chooseDataset(ds_name): current_path = os.path.dirname(os.path.realpath(__file__)) + '/' root = current_path + '../../datasets/' - # no node labels (and no edge labels). - if ds_name == 'Alkane': + # no labels at all. + if ds_name == 'Alkane_unlabeled': dataset = Dataset('Alkane_unlabeled', root=root) dataset.trim_dataset(edge_required=False) dataset.cut_graphs(range(1, 10)) - # node symbolic labels. + # node symbolic labels only. elif ds_name == 'Acyclic': dataset = Dataset('Acyclic', root=root) dataset.trim_dataset(edge_required=False) - # node non-symbolic labels. + # node non-symbolic labels only. 
elif ds_name == 'Letter-med': dataset = Dataset('Letter-med', root=root) dataset.trim_dataset(edge_required=False) - # node symbolic and non-symbolic labels (and edge symbolic labels). + # node symbolic + non-symbolic labels + edge symbolic labels. elif ds_name == 'AIDS': dataset = Dataset('AIDS', root=root) dataset.trim_dataset(edge_required=False) - # edge non-symbolic labels (no node labels). - elif ds_name == 'Fingerprint_edge': + # node non-symbolic labels + edge non-symbolic labels. + elif ds_name == 'Fingerprint': dataset = Dataset('Fingerprint', root=root) dataset.trim_dataset(edge_required=True) - irrelevant_labels = {'edge_attrs': ['orient', 'angle']} + # edge symbolic only. + elif ds_name == 'MAO': + dataset = Dataset('MAO', root=root) + dataset.trim_dataset(edge_required=True) + irrelevant_labels = {'node_labels': ['atom_symbol'], 'node_attrs': ['x', 'y']} dataset.remove_labels(**irrelevant_labels) - # edge non-symbolic labels (and node non-symbolic labels). - elif ds_name == 'Fingerprint': + # edge non-symbolic labels only. + elif ds_name == 'Fingerprint_edge': dataset = Dataset('Fingerprint', root=root) dataset.trim_dataset(edge_required=True) - # edge symbolic and non-symbolic labels (and node symbolic and non-symbolic labels). + irrelevant_labels = {'edge_attrs': ['orient', 'angle']} + dataset.remove_labels(**irrelevant_labels) + # node symbolic and non-symbolic labels + edge symbolic and non-symbolic labels. elif ds_name == 'Cuneiform': dataset = Dataset('Cuneiform', root=root) dataset.trim_dataset(edge_required=True) @@ -91,7 +97,7 @@ def assert_equality(compute_fun, **kwargs): assert np.array_equal(lst[i], lst[i + 1]) -@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) +@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) @pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_CommonWalk(ds_name, weight, compute_method): @@ -126,7 +132,7 @@ def test_CommonWalk(ds_name, weight, compute_method): assert_equality(compute, parallel=['imap_unordered', None]) -@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) +@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) @pytest.mark.parametrize('remove_totters', [False]) #[True, False]) # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_Marginalized(ds_name, remove_totters): @@ -319,13 +325,13 @@ def test_SpectralDecomposition(ds_name, sub_kernel): # @pytest.mark.parametrize( # 'compute_method,ds_name,sub_kernel', # [ -# ('sylvester', 'Alkane', None), -# ('conjugate', 'Alkane', None), +# ('sylvester', 'Alkane_unlabeled', None), +# ('conjugate', 'Alkane_unlabeled', None), # ('conjugate', 'AIDS', None), -# ('fp', 'Alkane', None), +# ('fp', 'Alkane_unlabeled', None), # ('fp', 'AIDS', None), -# ('spectral', 'Alkane', 'exp'), -# ('spectral', 'Alkane', 'geo'), +# ('spectral', 'Alkane_unlabeled', 'exp'), +# ('spectral', 'Alkane_unlabeled', 'geo'), # ] # ) # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) @@ -365,7 +371,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel): # assert False, exception -@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) +@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_ShortestPath(ds_name): """Test shortest path kernel. 
@@ -401,8 +407,8 @@ def test_ShortestPath(ds_name): assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) -#@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) -@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform']) +#@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) +@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform']) # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_StructuralSP(ds_name): """Test structural shortest path kernel. @@ -441,7 +447,7 @@ def test_StructuralSP(ds_name): assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False]) -@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) +@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) #@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto', None]) @pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto']) @@ -476,7 +482,7 @@ def test_PathUpToH(ds_name, k_func): compute_method=['trie', 'naive']) -@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) +@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS']) # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_Treelet(ds_name): """Test treelet kernel. @@ -510,7 +516,7 @@ def test_Treelet(ds_name): assert_equality(compute, parallel=['imap_unordered', None]) -@pytest.mark.parametrize('ds_name', ['Acyclic']) +@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'MAO', 'AIDS']) #@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge']) # @pytest.mark.parametrize('base_kernel', ['subtree']) # @pytest.mark.parametrize('parallel', ['imap_unordered', None]) @@ -540,17 +546,17 @@ def test_WLSubtree(ds_name): else: return gram_matrix, kernel_list, kernel - assert_equality(compute, parallel=['imap_unordered', None]) + assert_equality(compute, parallel=[None, 'imap_unordered']) if __name__ == "__main__": - test_list_graph_kernels() -# test_spkernel('Alkane', 'imap_unordered') - # test_ShortestPath('Alkane') + # test_list_graph_kernels() +# test_spkernel('Alkane_unlabeled', 'imap_unordered') + # test_ShortestPath('Alkane_unlabeled') # test_StructuralSP('Fingerprint_edge', 'imap_unordered') # test_StructuralSP('Acyclic') # test_StructuralSP('Cuneiform', None) - # test_WLSubtree('Acyclic') + test_WLSubtree('MAO') # 'Alkane_unlabeled', 'Acyclic', 'AIDS' # test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') # test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') # test_RandomWalk('Acyclic', 'fp', None, None) @@ -559,7 +565,7 @@ if __name__ == "__main__": # test_Marginalized('Acyclic', False) # test_ShortestPath('Acyclic') # test_PathUpToH('Acyclic', 'MinMax') -# test_Treelet('Acyclic') + # test_Treelet('AIDS') # test_SylvesterEquation('Acyclic') # test_ConjugateGradient('Acyclic') # test_FixedPoint('Acyclic') From 92fd38c4d608afa1b06f16afb2b1d8b845dd6d1a Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 25 Jun 2021 18:58:41 +0200 Subject: [PATCH 34/35] [CI] Migrate from travis.org to travis.com. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3bf2b57..1ff792a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # graphkit-learn -[![Build Status](https://travis-ci.org/jajupmochi/graphkit-learn.svg?branch=master)](https://travis-ci.org/jajupmochi/graphkit-learn) +[![Build Status](https://travis-ci.com/jajupmochi/graphkit-learn.svg?branch=master)](https://travis-ci.com/jajupmochi/graphkit-learn) [![Build status](https://ci.appveyor.com/api/projects/status/bdxsolk0t1uji9rd?svg=true)](https://ci.appveyor.com/project/jajupmochi/graphkit-learn) [![codecov](https://codecov.io/gh/jajupmochi/graphkit-learn/branch/master/graph/badge.svg)](https://codecov.io/gh/jajupmochi/graphkit-learn) [![Documentation Status](https://readthedocs.org/projects/graphkit-learn/badge/?version=master)](https://graphkit-learn.readthedocs.io/en/master/?badge=master) From 7d3e929b3620261571bbd76f731e482384c821f9 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 30 Nov 2021 15:48:52 +0100 Subject: [PATCH 35/35] [Feature] Add several new kernels between vectors. --- gklearn/utils/kernels.py | 78 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/gklearn/utils/kernels.py b/gklearn/utils/kernels.py index fe78ac8..c500097 100644 --- a/gklearn/utils/kernels.py +++ b/gklearn/utils/kernels.py @@ -4,7 +4,7 @@ These kernels are defined between pairs of vectors. import numpy as np -def deltakernel(x, y): +def delta_kernel(x, y): """Delta kernel. Return 1 if x == y, 0 otherwise. Parameters @@ -26,7 +26,11 @@ def deltakernel(x, y): return x == y #(1 if condition else 0) -def gaussiankernel(x, y, gamma=None): +def deltakernel(x, y): + return delta_kernel(x, y) + + +def gaussian_kernel(x, y, gamma=None): """Gaussian kernel. Compute the rbf (gaussian) kernel between x and y: @@ -60,8 +64,15 @@ def gaussiankernel(x, y, gamma=None): return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma) +def gaussiankernel(x, y, gamma=None): + return gaussian_kernel(x, y, gamma=gamma) -def polynomialkernel(x, y, d=1, c=0): + +def polynomial_kernel(x, y, gamma=1, coef0=0, d=1): + return (np.dot(x, y) * gamma + coef0) ** d + + +def highest_polynomial_kernel(x, y, d=1, c=0): """Polynomial kernel. Compute the polynomial kernel between x and y: @@ -82,7 +93,11 @@ def polynomialkernel(x, y, d=1, c=0): return np.dot(x, y) ** d + c -def linearkernel(x, y): +def polynomialkernel(x, y, d=1, c=0): + return highest_polynomial_kernel(x, y, d=d, c=c) + + +def linear_kernel(x, y): """Polynomial kernel. 
 	Compute the polynomial kernel between x and y:
 
@@ -103,6 +118,61 @@ def linearkernel(x, y):
 	return np.dot(x, y)
 
 
+def linearkernel(x, y):
+	return linear_kernel(x, y)
+
+
+def cosine_kernel(x, y):
+	return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))  # <x, y> / (||x|| * ||y||)
+
+
+def sigmoid_kernel(x, y, gamma=None, coef0=1):
+	if gamma is None:
+		gamma = 1.0 / len(x)
+
+	k = np.dot(x, y)
+	k *= gamma
+	k += coef0
+	k = np.tanh(k)
+#	k = np.tanh(k, k) # compute tanh in-place
+	return k
+
+
+def laplacian_kernel(x, y, gamma=None):
+	if gamma is None:
+		gamma = 1.0 / len(x)
+
+	k = -gamma * np.sum(np.abs(np.subtract(x, y)))  # -gamma * ||x - y||_1
+	k = np.exp(k)
+	return k
+
+
+def chi2_kernel(x, y, gamma=1.0):
+	k = np.divide(np.subtract(x, y) ** 2, np.add(x, y))
+	k = np.sum(k)
+	k *= -gamma
+	return np.exp(k)
+
+
+def exponential_kernel(x, y, gamma=None):
+	if gamma is None:
+		gamma = 1.0 / len(x)
+
+	return np.exp(np.dot(x, y) * gamma)
+
+
+def intersection_kernel(x, y):
+	return np.sum(np.minimum(x, y))
+
+
+def multiquadratic_kernel(x, y, c=0):
+	return np.sqrt((np.sum(np.subtract(x, y) ** 2)) + c)
+
+
+def inverse_multiquadratic_kernel(x, y, c=0):
+	return 1 / multiquadratic_kernel(x, y, c=c)
+
+
 def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1):
 	"""Sum of a pair of kernels.