From dd58f602ecafb4caa5dea7a5709c2d645f57ae6b Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 2 Dec 2021 16:29:57 +0100 Subject: [PATCH 01/11] [Feature] Add kronecker_delta_kernel function between a pair of numpy vectors. --- gklearn/utils/kernels.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gklearn/utils/kernels.py b/gklearn/utils/kernels.py index c500097..c35cc2f 100644 --- a/gklearn/utils/kernels.py +++ b/gklearn/utils/kernels.py @@ -4,7 +4,7 @@ These kernels are defined between pairs of vectors. import numpy as np -def delta_kernel(x, y): +def kronecker_delta_kernel(x, y): """Delta kernel. Return 1 if x == y, 0 otherwise. Parameters @@ -23,6 +23,10 @@ def delta_kernel(x, y): labeled graphs. In Proceedings of the 20th International Conference on Machine Learning, Washington, DC, United States, 2003. """ + return (1 if np.array_equal(x, y) else 0) + + +def delta_kernel(x, y): return x == y #(1 if condition else 0) From 42acfd02364095f29e85a62d039f91bbe66a1100 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 2 Dec 2021 16:52:41 +0100 Subject: [PATCH 02/11] [Fix] Fix laplacian_kernel and cosine_kernel between pairs of numpy vectors. --- gklearn/utils/kernels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gklearn/utils/kernels.py b/gklearn/utils/kernels.py index c35cc2f..182668b 100644 --- a/gklearn/utils/kernels.py +++ b/gklearn/utils/kernels.py @@ -127,7 +127,7 @@ def linearkernel(x, y): def cosine_kernel(x, y): - return np.dot(x, y) / (np.abs(x) * np.abs(y)) + return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) def sigmoid_kernel(x, y, gamma=None, coef0=1): @@ -146,7 +146,7 @@ def laplacian_kernel(x, y, gamma=None): if gamma is None: gamma = 1.0 / len(x) - k = -gamma * np.abs(np.subtract(x, y)) + k = -gamma * np.linalg.norm(np.subtract(x, y)) k = np.exp(k) return k From 45747641df470837b2e95dd866f93accfa922491 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 14 Feb 2022 20:20:19 +0100 Subject: [PATCH 03/11] [CI] Fix travis badge link. --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ff792a..2abcec7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # graphkit-learn -[![Build Status](https://travis-ci.com/jajupmochi/graphkit-learn.svg?branch=master)](https://travis-ci.com/jajupmochi/graphkit-learn) + +[![Build Status](https://app.travis-ci.com/jajupmochi/graphkit-learn.svg?branch=master)](https://app.travis-ci.com/jajupmochi/graphkit-learn) [![Build status](https://ci.appveyor.com/api/projects/status/bdxsolk0t1uji9rd?svg=true)](https://ci.appveyor.com/project/jajupmochi/graphkit-learn) [![codecov](https://codecov.io/gh/jajupmochi/graphkit-learn/branch/master/graph/badge.svg)](https://codecov.io/gh/jajupmochi/graphkit-learn) [![Documentation Status](https://readthedocs.org/projects/graphkit-learn/badge/?version=master)](https://graphkit-learn.readthedocs.io/en/master/?badge=master) From 08ee17d1538692b58606babebdcc12374f8b30bf Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 14 Feb 2022 20:23:41 +0100 Subject: [PATCH 04/11] [CI] Add tests on Python 3.9. 
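Before the CI changes below, a minimal usage sketch of the kernel helpers touched by PATCH 01 and PATCH 02 above (the toy vectors and the values in the comments are illustrative only; it assumes the module is importable as gklearn.utils.kernels):

    import numpy as np
    from gklearn.utils.kernels import (kronecker_delta_kernel, delta_kernel,
                                       cosine_kernel, laplacian_kernel)

    x = np.array([1.0, 0.0, 2.0])
    y = np.array([1.0, 0.0, 2.0])

    # kronecker_delta_kernel compares the whole vectors and returns a scalar 0/1.
    print(kronecker_delta_kernel(x, y))   # 1
    # delta_kernel keeps the element-wise behaviour (x == y).
    print(delta_kernel(x, y))             # [ True  True  True]
    # cosine_kernel now divides by the Euclidean norms, so collinear vectors give ~1.0.
    print(cosine_kernel(x, 2 * x))        # ~1.0
    # laplacian_kernel now uses the norm of the difference vector.
    print(laplacian_kernel(x, y))         # exp(0) = 1.0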
--- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index b40cbcb..a569f8e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,8 @@ python: - '3.6' - '3.7' - '3.8' +- '3.9' +#- '3.10' before_install: - python --version From 1946d469643d618d9c381354f25933af2da2aed0 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 14 Mar 2022 15:49:27 +0100 Subject: [PATCH 05/11] [Feature] Allow to permutate nodes in graphs when using bipartite to estimate GED. This feature is implementated in the method in Python, which invokes GEDLIB in C++ by Cython. --- .../edit_costs.max_num_sols.ratios.bipartite.py | 147 ------- .../edit_costs.real_data.nums_sols.ratios.IPFP.py | 11 +- ...t_costs.real_data.nums_sols.ratios.bipartite.py | 172 ++++++++ gklearn/experiments/ged/stability/group_results.py | 1 + ..._costs.real_data.nums_sols.ratios.bipartite.py} | 27 +- gklearn/experiments/ged/stability/utils.py | 18 +- gklearn/ged/util/util.py | 210 +++++++-- gklearn/utils/utils.py | 471 ++++++++++++--------- 8 files changed, 665 insertions(+), 392 deletions(-) delete mode 100644 gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py create mode 100644 gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py rename gklearn/experiments/ged/stability/{run_job_edit_costs.max_nums_sols.ratios.bipartite.py => run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py} (52%) diff --git a/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py deleted file mode 100644 index 1f01fd5..0000000 --- a/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Mon Nov 2 16:17:01 2020 - -@author: ljia -""" -# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from given numbers of repeats are computed. - -import os -import multiprocessing -import pickle -import logging -from gklearn.ged.util import compute_geds -import time -from utils import get_dataset -import sys -from group_results import group_trials - - -def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): - - save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) - - # Return if the file exists. - if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): - return None, None - - """**2. Set parameters.**""" - - # Parameters for GED computation. - ged_options = {'method': 'BIPARTITE', # use BIPARTITE huristic. - # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) - 'lsape_model': 'ECBP', # - # ??when bigger than 1, then the method is considered mIPFP. - # the actual number of computed solutions might be smaller than the specified value - 'max_num_solutions': max_num_solutions, - 'edit_cost': 'CONSTANT', # use CONSTANT cost. - 'greedy_method': 'BASIC', # - # the distance between non-symbolic node/edge labels is computed by euclidean distance. - 'attr_distance': 'euclidean', - 'optimal': True, # if TRUE, the option --greedy-method has no effect - # parallel threads. Do not work if mpg_options['parallel'] = False. 
- 'threads': multiprocessing.cpu_count(), - 'centrality_method': 'NONE', - 'centrality_weight': 0.7, - 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' - } - - edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] -# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] -# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) - - options = ged_options.copy() - options['edit_cost_constants'] = edit_cost_constants - options['node_labels'] = dataset.node_labels - options['edge_labels'] = dataset.edge_labels - options['node_attrs'] = dataset.node_attrs - options['edge_attrs'] = dataset.edge_attrs - parallel = True # if num_solutions == 1 else False - - """**5. Compute GED matrix.**""" - ged_mat = 'error' - runtime = 0 - try: - time0 = time.time() - ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) - runtime = time.time() - time0 - except Exception as exp: - print('An exception occured when running this experiment:') - LOG_FILENAME = save_dir + 'error.txt' - logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) - logging.exception(save_file_suffix) - print(repr(exp)) - - """**6. Get results.**""" - - with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: - pickle.dump(ged_mat, f) - with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: - pickle.dump(runtime, f) - - return ged_mat, runtime - - -def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): - # Return if the group file exists. - name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' - name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' - if os.path.isfile(name_group): - return - - ged_mats = [] - runtimes = [] - for trial in range(1, 101): - print() - print('Trial:', trial) - ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial) - ged_mats.append(ged_mat) - runtimes.append(runtime) - - # Group trials and Remove single files. - name_prefix = 'ged_matrix' + name_middle - group_trials(save_dir, name_prefix, True, True, False) - name_prefix = 'runtime' + name_middle - group_trials(save_dir, name_prefix, True, True, False) - - -def results_for_a_dataset(ds_name): - """**1. 
Get dataset.**""" - dataset = get_dataset(ds_name) - - for max_num_solutions in mnum_solutions_list: - print() - print('Max # of solutions:', max_num_solutions) - for ratio in ratio_list: - print() - print('Ratio:', ratio) - save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) - - -def get_param_lists(ds_name): - if ds_name == 'AIDS_symb': - mnum_solutions_list = [1, 20, 40, 60, 80, 100] - ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] - else: - mnum_solutions_list = [1, 20, 40, 60, 80, 100] - ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] - - return mnum_solutions_list, ratio_list - - -if __name__ == '__main__': - if len(sys.argv) > 1: - ds_name_list = sys.argv[1:] - else: - ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - - save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' - os.makedirs(save_dir, exist_ok=True) - os.makedirs(save_dir + 'groups/', exist_ok=True) - - for ds_name in ds_name_list: - print() - print('Dataset:', ds_name) - mnum_solutions_list, ratio_list = get_param_lists(ds_name) - results_for_a_dataset(ds_name) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py index aa08579..82b6604 100644 --- a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py @@ -13,7 +13,7 @@ import pickle import logging from gklearn.ged.util import compute_geds import time -from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation +from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids import sys from group_results import group_trials, check_group_existence, update_group_marker @@ -125,9 +125,10 @@ def get_param_lists(ds_name, mode='test'): elif mode == 'simple': from sklearn.model_selection import ParameterGrid - param_grid = ParameterGrid([ - {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]}, - {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]) + param_grid = mix_param_grids([list(ParameterGrid([ + {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])), + list(ParameterGrid([ + {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))]) # print(list(param_grid)) if ds_name == 'AIDS_symb': @@ -148,7 +149,7 @@ if __name__ == '__main__': # ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' + save_dir = 'outputs/CRIANN/edit_costs.real_data.num_sols.ratios.IPFP/' os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir + 'groups/', exist_ok=True) diff --git a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py new file mode 100644 index 0000000..f450c1e --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Nov 2 16:17:01 2020 + +@author: ljia +""" +# 
This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from given numbers of repeats are computed. + +import os +import multiprocessing +import pickle +import logging +from gklearn.ged.util import compute_geds +import time +from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids +import sys +from group_results import group_trials, check_group_existence, update_group_marker + + +def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): + + save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + # Return if the file exists. + if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): + return None, None + + """**2. Set parameters.**""" + + # Parameters for GED computation. + ged_options = {'method': 'BIPARTITE', # use BIPARTITE huristic. + # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) + 'lsape_model': 'ECBP', # + # ??when bigger than 1, then the method is considered mIPFP. + # the actual number of computed solutions might be smaller than the specified value + 'max_num_solutions': 1, # @ max_num_solutions, + 'edit_cost': 'CONSTANT', # use CONSTANT cost. + 'greedy_method': 'BASIC', # + # the distance between non-symbolic node/edge labels is computed by euclidean distance. + 'attr_distance': 'euclidean', + 'optimal': True, # if TRUE, the option --greedy-method has no effect + # parallel threads. Do not work if mpg_options['parallel'] = False. + 'threads': multiprocessing.cpu_count(), + 'centrality_method': 'NONE', + 'centrality_weight': 0.7, + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' + } + + edit_cost_constants = set_edit_cost_consts(ratio, + node_labeled=len(dataset.node_labels), + edge_labeled=len(dataset.edge_labels), + mode='uniform') +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) + + + options = ged_options.copy() + options['edit_cost_constants'] = edit_cost_constants + options['node_labels'] = dataset.node_labels + options['edge_labels'] = dataset.edge_labels + options['node_attrs'] = dataset.node_attrs + options['edge_attrs'] = dataset.edge_attrs + parallel = True # if num_solutions == 1 else False + + """**5. Compute GED matrix.**""" + ged_mat = 'error' + runtime = 0 + try: + time0 = time.time() + ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, + options=options, + repeats=num_solutions, + permute_nodes=True, + random_state=None, + parallel=parallel, + verbose=True) + runtime = time.time() - time0 + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception(save_file_suffix) + print(repr(exp)) + + """**6. Get results.**""" + + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(ged_mat, f) + with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + return ged_mat, runtime + + +def save_trials_as_group(dataset, ds_name, num_solutions, ratio): + # Return if the group file exists. + name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' 
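+	# For illustration, hypothetical values ds_name='MUTAG', num_solutions=10 and
+	# ratio=10 would give name_middle == '.MUTAG.num_sols_10.ratio_10.00.'.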
+ name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' + if check_group_existence(name_group): + return + + ged_mats = [] + runtimes = [] + num_trials = 100 + for trial in range(1, num_trials + 1): + print() + print('Trial:', trial) + ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial) + ged_mats.append(ged_mat) + runtimes.append(runtime) + + # Group trials and remove single files. + # @todo: if the program stops between the following lines, then there may be errors. + name_prefix = 'ged_matrix' + name_middle + group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) + name_prefix = 'runtime' + name_middle + group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) + update_group_marker(name_group) + + +def results_for_a_dataset(ds_name): + """**1. Get dataset.**""" + dataset = get_dataset(ds_name) + + for params in list(param_grid): + print() + print(params) + save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio']) + + +def get_param_lists(ds_name, mode='test'): + if mode == 'test': + num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] + ratio_list = [10] + return num_solutions_list, ratio_list + + elif mode == 'simple': + from sklearn.model_selection import ParameterGrid + param_grid = mix_param_grids([list(ParameterGrid([ + {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])), + list(ParameterGrid([ + {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))]) +# print(list(param_grid)) + + if ds_name == 'AIDS_symb': + num_solutions_list = [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] + else: + num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] + + return param_grid + + +if __name__ == '__main__': + if len(sys.argv) > 1: + ds_name_list = sys.argv[1:] + else: + ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] +# ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] +# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] + + save_dir = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' + os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir + 'groups/', exist_ok=True) + + for ds_name in ds_name_list: + print() + print('Dataset:', ds_name) + param_grid = get_param_lists(ds_name, mode='simple') + results_for_a_dataset(ds_name) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/group_results.py b/gklearn/experiments/ged/stability/group_results.py index bdbe89f..10f930c 100644 --- a/gklearn/experiments/ged/stability/group_results.py +++ b/gklearn/experiments/ged/stability/group_results.py @@ -32,6 +32,7 @@ def check_group_existence(file_name): def update_group_marker(file_name): + # @todo: possible error when seveal tasks are using this file at the same time. 
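+	# One possible mitigation (not implemented here): serialize access to the
+	# marker file with an OS-level lock such as fcntl.flock, or write to a
+	# temporary file and swap it in atomically with os.replace.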
path, name = os.path.split(file_name) marker_fn = os.path.join(path, 'group_names_finished.pkl') if os.path.isfile(marker_fn): diff --git a/gklearn/experiments/ged/stability/run_job_edit_costs.max_nums_sols.ratios.bipartite.py b/gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py similarity index 52% rename from gklearn/experiments/ged/stability/run_job_edit_costs.max_nums_sols.ratios.bipartite.py rename to gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py index 276a1a5..a33a9c1 100644 --- a/gklearn/experiments/ged/stability/run_job_edit_costs.max_nums_sols.ratios.bipartite.py +++ b/gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py @@ -9,36 +9,45 @@ import os import re +cur_path = os.path.dirname(os.path.abspath(__file__)) + + def get_job_script(arg): script = r""" #!/bin/bash #SBATCH --exclusive #SBATCH --job-name="st.""" + arg + r""".bp" -#SBATCH --partition=tlong +#SBATCH --partition=court #SBATCH --mail-type=ALL #SBATCH --mail-user=jajupmochi@gmail.com -#SBATCH --output="outputs/output_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" -#SBATCH --error="errors/error_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" +#SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt" +#SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt" # #SBATCH --ntasks=1 #SBATCH --nodes=1 #SBATCH --cpus-per-task=1 -#SBATCH --time=300:00:00 +#SBATCH --time=48:00:00 #SBATCH --mem-per-cpu=4000 srun hostname -srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability -srun python3 edit_costs.max_nums_sols.ratios.bipartite.py """ + arg +cd """ + cur_path + r""" +echo Working directory : $PWD +echo Local work dir : $LOCAL_WORK_DIR +python3 edit_costs.real_data.nums_sols.ratios.bipartite.py """ + arg script = script.strip() script = re.sub('\n\t+', '\n', script) script = re.sub('\n +', '\n', script) - + return script if __name__ == '__main__': - ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]: + + os.makedirs('outputs/', exist_ok=True) + os.makedirs('errors/', exist_ok=True) + + ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] + for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]: job_script = get_job_script(ds_name) command = 'sbatch < 0: + for g_idx, grid in enumerate(list_of_grids): + if idx < len(grid): + mixed_grids.append(grid[idx]) + else: + not_finished[g_idx] = False + idx += 1 + + return mixed_grids + + + if __name__ == '__main__': root_dir = 'outputs/CRIANN/' # for dir_ in sorted(os.listdir(root_dir)): @@ -337,4 +353,4 @@ if __name__ == '__main__': # get_relative_errors(save_dir) # except Exception as exp: # print('An exception occured when running this experiment:') -# print(repr(exp)) \ No newline at end of file +# print(repr(exp)) diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index a5a5ac5..d75939a 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -64,10 +64,12 @@ def pairwise_ged(g1, g2, options={}, sort=True, repeats=1, parallel=False, verbo g = listID[0] h = listID[1] dis_min = np.inf +# print('------------------------------------------') for i in range(0, repeats): ged_env.run_method(g, h) upper = ged_env.get_upper_bound(g, h) dis = upper +# print(dis) if dis < dis_min: dis_min = dis pi_forward = 
ged_env.get_forward_map(g, h) @@ -169,12 +171,100 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True return ged_vec, ged_mat, n_edit_operations -def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, n_jobs=None, verbose=True): +#%% + + +def compute_geds(graphs, + options={}, + sort=True, + repeats=1, + permute_nodes=False, + random_state=None, + parallel=False, + n_jobs=None, + verbose=True): + """Compute graph edit distance matrix using GEDLIB. + """ + if permute_nodes: + return _compute_geds_with_permutation(graphs, + options=options, + sort=sort, + repeats=repeats, + random_state=random_state, + parallel=parallel, + n_jobs=n_jobs, + verbose=verbose) + else: + return _compute_geds_without_permutation(graphs, + options=options, + sort=sort, + repeats=repeats, + parallel=parallel, + n_jobs=n_jobs, + verbose=verbose) + + +#%% + + +def _compute_geds_with_permutation(graphs, + options={}, + sort=True, + repeats=1, + random_state=None, + parallel=False, + n_jobs=None, + verbose=True): + + from gklearn.utils.utils import nx_permute_nodes + + # Initialze variables. + ged_mat_optim = np.full((len(graphs), len(graphs)), np.inf) + np.fill_diagonal(ged_mat_optim, 0) + len_itr = int(len(graphs) * (len(graphs) - 1) / 2) + ged_vec = [0] * len_itr + n_edit_operations = [0] * len_itr + + # for each repeats: + for i in range(0, repeats): + # Permutate nodes. + graphs_pmut = [nx_permute_nodes(g, random_state=random_state) for g in graphs] + + out = _compute_geds_without_permutation(graphs_pmut, + options=options, + sort=sort, + repeats=1, + parallel=parallel, + n_jobs=n_jobs, + verbose=verbose) + + # Compare current results with the best one. + idx_cnt = 0 + for i in range(len(graphs)): + for j in range(i + 1, len(graphs)): + if out[1][i, j] < ged_mat_optim[i ,j]: + ged_mat_optim[i, j] = out[1][i, j] + ged_mat_optim[j, i] = out[1][j, i] + ged_vec[idx_cnt] = out[0][idx_cnt] + n_edit_operations[idx_cnt] = out[2][idx_cnt] + idx_cnt += 1 + + return ged_vec, ged_mat_optim, n_edit_operations + + +def _compute_geds_without_permutation(graphs, + options={}, + sort=True, + repeats=1, + parallel=False, + n_jobs=None, + verbose=True): from gklearn.gedlib import librariesImport, gedlibpy # initialize ged env. ged_env = gedlibpy.GEDEnv() ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) + for g in graphs: ged_env.add_nx_graph(g, '') listID = ged_env.get_all_graph_ids() @@ -266,6 +356,11 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats): dis = upper # make the map label correct (label remove map as np.inf) + # Attention: using node indices instead of NetworkX node labels (as + # implemented here) may cause several issues: + # - Fail if NetworkX node labels are not consecutive integers; + # - Return wrong mappings if nodes are permutated (e.g., by using + # `gklearn.utis.utils.nx_permute_nodes()`.) nodes1 = [n for n in g1.nodes()] nodes2 = [n for n in g2.nodes()] nb1 = nx.number_of_nodes(g1) @@ -278,46 +373,57 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats): pi_forward_min = pi_forward pi_backward_min = pi_backward +# print('-----') +# print(pi_forward_min) +# print(pi_backward_min) + return dis_min, pi_forward_min, pi_backward_min -def label_costs_to_matrix(costs, nb_labels): - """Reform a label cost vector to a matrix. 
+#%% + + +def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs): + """Calculate the numbers of the occurence of each edit operation in a given + edit path. Parameters ---------- - costs : numpy.array - The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitition costs, edge insertion costs, edge deletion costs, edge substitition costs. - nb_labels : integer - Number of labels. + g1 : TYPE + DESCRIPTION. + g2 : TYPE + DESCRIPTION. + forward_map : TYPE + DESCRIPTION. + backward_map : TYPE + DESCRIPTION. + edit_cost : TYPE, optional + DESCRIPTION. The default is None. + is_cml : TYPE, optional + DESCRIPTION. The default is False. + **kwargs : TYPE + DESCRIPTION. + + Raises + ------ + Exception + DESCRIPTION. Returns ------- - cost_matrix : numpy.array. - The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData. + TYPE + DESCRIPTION. + + Notes + ----- + Attention: when implementing a function to get the numbers of edit + operations, make sure that: + - It does not fail if NetworkX node labels are not consecutive integers; + - It returns correct results if nodes are permutated (e.g., by using + `gklearn.utis.utils.nx_permute_nodes()`.) + Generally speaking, it means you need to distinguish the NetworkX label of + a node from the position (index) of that node in the node list. """ - # Initialize label cost matrix. - cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1)) - i = 0 - # Costs of insertions. - for col in range(1, nb_labels + 1): - cost_matrix[0, col] = costs[i] - i += 1 - # Costs of deletions. - for row in range(1, nb_labels + 1): - cost_matrix[row, 0] = costs[i] - i += 1 - # Costs of substitutions. - for row in range(1, nb_labels + 1): - for col in range(row + 1, nb_labels + 1): - cost_matrix[row, col] = costs[i] - cost_matrix[col, row] = costs[i] - i += 1 - - return cost_matrix - - -def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs): if is_cml: if edit_cost == 'CONSTANT': node_labels = kwargs.get('node_labels', []) @@ -611,6 +717,48 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es +#%% + + +def label_costs_to_matrix(costs, nb_labels): + """Reform a label cost vector to a matrix. + + Parameters + ---------- + costs : numpy.array + The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitition costs, edge insertion costs, edge deletion costs, edge substitition costs. + nb_labels : integer + Number of labels. + + Returns + ------- + cost_matrix : numpy.array. + The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData. + """ + # Initialize label cost matrix. + cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1)) + i = 0 + # Costs of insertions. + for col in range(1, nb_labels + 1): + cost_matrix[0, col] = costs[i] + i += 1 + # Costs of deletions. + for row in range(1, nb_labels + 1): + cost_matrix[row, 0] = costs[i] + i += 1 + # Costs of substitutions. 
+ for row in range(1, nb_labels + 1): + for col in range(row + 1, nb_labels + 1): + cost_matrix[row, col] = costs[i] + cost_matrix[col, row] = costs[i] + i += 1 + + return cost_matrix + + +#%% + + def ged_options_to_string(options): opt_str = ' ' for key, val in options.items(): diff --git a/gklearn/utils/utils.py b/gklearn/utils/utils.py index 5758291..f0e49fd 100644 --- a/gklearn/utils/utils.py +++ b/gklearn/utils/utils.py @@ -7,6 +7,9 @@ from enum import Enum, unique # from tqdm import tqdm +#%% + + def getSPLengths(G1): sp = nx.shortest_path(G1) distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes())) @@ -286,81 +289,146 @@ def direct_product_graph(G1, G2, node_labels, edge_labels): return gt -def graph_deepcopy(G): - """Deep copy a graph, including deep copy of all nodes, edges and - attributes of the graph, nodes and edges. +def find_paths(G, source_node, length): + """Find all paths with a certain length those start from a source node. + A recursive depth first search is applied. - Note - ---- - It is the same as the NetworkX function graph.copy(), as far as I know. + Parameters + ---------- + G : NetworkX graphs + The graph in which paths are searched. + source_node : integer + The number of the node from where all paths start. + length : integer + The length of paths. + + Return + ------ + path : list of list + List of paths retrieved, where each path is represented by a list of nodes. """ - # add graph attributes. - labels = {} - for k, v in G.graph.items(): - labels[k] = deepcopy(v) - if G.is_directed(): - G_copy = nx.DiGraph(**labels) - else: - G_copy = nx.Graph(**labels) + if length == 0: + return [[source_node]] + path = [[source_node] + path for neighbor in G[source_node] \ + for path in find_paths(G, neighbor, length - 1) if source_node not in path] + return path - # add nodes - for nd, attrs in G.nodes(data=True): - labels = {} - for k, v in attrs.items(): - labels[k] = deepcopy(v) - G_copy.add_node(nd, **labels) - # add edges. - for nd1, nd2, attrs in G.edges(data=True): - labels = {} - for k, v in attrs.items(): - labels[k] = deepcopy(v) - G_copy.add_edge(nd1, nd2, **labels) +def find_all_paths(G, length, is_directed): + """Find all paths with a certain length in a graph. A recursive depth first + search is applied. - return G_copy + Parameters + ---------- + G : NetworkX graphs + The graph in which paths are searched. + length : integer + The length of paths. + Return + ------ + path : list of list + List of paths retrieved, where each path is represented by a list of nodes. + """ + all_paths = [] + for node in G: + all_paths.extend(find_paths(G, node, length)) -def graph_isIdentical(G1, G2): - """Check if two graphs are identical, including: same nodes, edges, node - labels/attributes, edge labels/attributes. + if not is_directed: + # For each path, two presentations are retrieved from its two extremities. + # Remove one of them. + all_paths_r = [path[::-1] for path in all_paths] + for idx, path in enumerate(all_paths[:-1]): + for path2 in all_paths_r[idx+1::]: + if path == path2: + all_paths[idx] = [] + break + all_paths = list(filter(lambda a: a != [], all_paths)) - Notes - ----- - 1. The type of graphs has to be the same. + return all_paths - 2. Global/Graph attributes are neglected as they may contain names for graphs. - """ - # check nodes. - nlist1 = [n for n in G1.nodes(data=True)] - nlist2 = [n for n in G2.nodes(data=True)] - if not nlist1 == nlist2: - return False - # check edges. 
- elist1 = [n for n in G1.edges(data=True)] - elist2 = [n for n in G2.edges(data=True)] - if not elist1 == elist2: - return False - # check graph attributes. - return True +# @todo: use it in ShortestPath. +def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]): + """Compute kernels between each pair of vertices in two graphs. + Parameters + ---------- + g1, g2 : NetworkX graph + The kernels bewteen pairs of vertices in these two graphs are computed. + node_kernels : dict + A dictionary of kernel functions for nodes, including 3 items: 'symb' + for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' + for both labels. The first 2 functions take two node labels as + parameters, and the 'mix' function takes 4 parameters, a symbolic and a + non-symbolic label for each the two nodes. Each label is in form of 2-D + dimension array (n_samples, n_features). Each function returns a number + as the kernel value. Ignored when nodes are unlabeled. This argument + is designated to conjugate gradient method and fixed-point iterations. + node_labels : list, optional + The list of the name strings of the node labels. The default is []. + node_attrs : list, optional + The list of the name strings of the node attributes. The default is []. -def get_node_labels(Gn, node_label): - """Get node labels of dataset Gn. - """ - nl = set() - for G in Gn: - nl = nl | set(nx.get_node_attributes(G, node_label).values()) - return nl + Returns + ------- + vk_dict : dict + Vertex kernels keyed by vertices. + Notes + ----- + This function is used by ``gklearn.kernels.FixedPoint'' and + ``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1]. -def get_edge_labels(Gn, edge_label): - """Get edge labels of dataset Gn. + References + ---------- + .. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang. + Parallelization of shortest path graph kernels on multi-core cpus and gpus. + Proceedings of the Programmability Issues for Heterogeneous Multicores + (MultiProg), Vienna, Austria, 2014. """ - el = set() - for G in Gn: - el = el | set(nx.get_edge_attributes(G, edge_label).values()) - return el + vk_dict = {} # shortest path matrices dict + if len(node_labels) > 0: + # node symb and non-synb labeled + if len(node_attrs) > 0: + kn = node_kernels['mix'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_labels = [n1[1][nl] for nl in node_labels] + n2_labels = [n2[1][nl] for nl in node_labels] + n1_attrs = [n1[1][na] for na in node_attrs] + n2_attrs = [n2[1][na] for na in node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) + # node symb labeled + else: + kn = node_kernels['symb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_labels = [n1[1][nl] for nl in node_labels] + n2_labels = [n2[1][nl] for nl in node_labels] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) + else: + # node non-synb labeled + if len(node_attrs) > 0: + kn = node_kernels['nsymb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_attrs = [n1[1][na] for na in node_attrs] + n2_attrs = [n2[1][na] for na in node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) + # node unlabeled + else: + pass # @todo: add edge weights. 
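+			# (One possible extension, not implemented: compare an edge weight
+			# attribute here; the commented lines below sketch this.)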
+# for e1 in g1.edges(data=True): +# for e2 in g2.edges(data=True): +# if e1[2]['cost'] == e2[2]['cost']: +# kernel += 1 +# return kernel + + return vk_dict + + +#%% def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs): @@ -513,79 +581,6 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d print('\ncomplete.') -def find_paths(G, source_node, length): - """Find all paths with a certain length those start from a source node. - A recursive depth first search is applied. - - Parameters - ---------- - G : NetworkX graphs - The graph in which paths are searched. - source_node : integer - The number of the node from where all paths start. - length : integer - The length of paths. - - Return - ------ - path : list of list - List of paths retrieved, where each path is represented by a list of nodes. - """ - if length == 0: - return [[source_node]] - path = [[source_node] + path for neighbor in G[source_node] \ - for path in find_paths(G, neighbor, length - 1) if source_node not in path] - return path - - -def find_all_paths(G, length, is_directed): - """Find all paths with a certain length in a graph. A recursive depth first - search is applied. - - Parameters - ---------- - G : NetworkX graphs - The graph in which paths are searched. - length : integer - The length of paths. - - Return - ------ - path : list of list - List of paths retrieved, where each path is represented by a list of nodes. - """ - all_paths = [] - for node in G: - all_paths.extend(find_paths(G, node, length)) - - if not is_directed: - # For each path, two presentations are retrieved from its two extremities. - # Remove one of them. - all_paths_r = [path[::-1] for path in all_paths] - for idx, path in enumerate(all_paths[:-1]): - for path2 in all_paths_r[idx+1::]: - if path == path2: - all_paths[idx] = [] - break - all_paths = list(filter(lambda a: a != [], all_paths)) - - return all_paths - - -def get_mlti_dim_node_attrs(G, attr_names): - attributes = [] - for nd, attrs in G.nodes(data=True): - attributes.append(tuple(attrs[aname] for aname in attr_names)) - return attributes - - -def get_mlti_dim_edge_attrs(G, attr_names): - attributes = [] - for ed, attrs in G.edges(data=True): - attributes.append(tuple(attrs[aname] for aname in attr_names)) - return attributes - - def normalize_gram_matrix(gram_matrix): diag = gram_matrix.diagonal().copy() old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. @@ -621,84 +616,162 @@ def compute_distance_matrix(gram_matrix): return dis_mat, dis_max, dis_min, dis_mean -# @todo: use it in ShortestPath. -def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]): - """Compute kernels between each pair of vertices in two graphs. +#%% + + +def graph_deepcopy(G): + """Deep copy a graph, including deep copy of all nodes, edges and + attributes of the graph, nodes and edges. + + Note + ---- + - It is the same as the NetworkX function graph.copy(), as far as I know. + + - This function only supports Networkx.Graph and Networkx.DiGraph. + """ + # add graph attributes. + labels = {} + for k, v in G.graph.items(): + labels[k] = deepcopy(v) + if G.is_directed(): + G_copy = nx.DiGraph(**labels) + else: + G_copy = nx.Graph(**labels) + + # add nodes + for nd, attrs in G.nodes(data=True): + labels = {} + for k, v in attrs.items(): + labels[k] = deepcopy(v) + G_copy.add_node(nd, **labels) + + # add edges. 
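+	# (Edge attribute values are deep-copied as well, so the copied graph
+	# shares no mutable state with G.)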
+ for nd1, nd2, attrs in G.edges(data=True): + labels = {} + for k, v in attrs.items(): + labels[k] = deepcopy(v) + G_copy.add_edge(nd1, nd2, **labels) + + return G_copy + + +def graph_isIdentical(G1, G2): + """Check if two graphs are identical, including: same nodes, edges, node + labels/attributes, edge labels/attributes. + + Notes + ----- + 1. The type of graphs has to be the same. + + 2. Global/Graph attributes are neglected as they may contain names for graphs. + """ + # check nodes. + nlist1 = [n for n in G1.nodes(data=True)] + nlist2 = [n for n in G2.nodes(data=True)] + if not nlist1 == nlist2: + return False + # check edges. + elist1 = [n for n in G1.edges(data=True)] + elist2 = [n for n in G2.edges(data=True)] + if not elist1 == elist2: + return False + # check graph attributes. + + return True + + +def get_node_labels(Gn, node_label): + """Get node labels of dataset Gn. + """ + nl = set() + for G in Gn: + nl = nl | set(nx.get_node_attributes(G, node_label).values()) + return nl + + +def get_edge_labels(Gn, edge_label): + """Get edge labels of dataset Gn. + """ + el = set() + for G in Gn: + el = el | set(nx.get_edge_attributes(G, edge_label).values()) + return el + + +def get_mlti_dim_node_attrs(G, attr_names): + attributes = [] + for nd, attrs in G.nodes(data=True): + attributes.append(tuple(attrs[aname] for aname in attr_names)) + return attributes + + +def get_mlti_dim_edge_attrs(G, attr_names): + attributes = [] + for ed, attrs in G.edges(data=True): + attributes.append(tuple(attrs[aname] for aname in attr_names)) + return attributes + + +def nx_permute_nodes(G, random_state=None): + """Permute node indices in a NetworkX graph. Parameters ---------- - g1, g2 : NetworkX graph - The kernels bewteen pairs of vertices in these two graphs are computed. - node_kernels : dict - A dictionary of kernel functions for nodes, including 3 items: 'symb' - for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' - for both labels. The first 2 functions take two node labels as - parameters, and the 'mix' function takes 4 parameters, a symbolic and a - non-symbolic label for each the two nodes. Each label is in form of 2-D - dimension array (n_samples, n_features). Each function returns a number - as the kernel value. Ignored when nodes are unlabeled. This argument - is designated to conjugate gradient method and fixed-point iterations. - node_labels : list, optional - The list of the name strings of the node labels. The default is []. - node_attrs : list, optional - The list of the name strings of the node attributes. The default is []. + G : TYPE + DESCRIPTION. + random_state : TYPE, optional + DESCRIPTION. The default is None. Returns ------- - vk_dict : dict - Vertex kernels keyed by vertices. + G_new : TYPE + DESCRIPTION. Notes ----- - This function is used by ``gklearn.kernels.FixedPoint'' and - ``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1]. - - References - ---------- - .. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang. - Parallelization of shortest path graph kernels on multi-core cpus and gpus. - Proceedings of the Programmability Issues for Heterogeneous Multicores - (MultiProg), Vienna, Austria, 2014. + - This function only supports Networkx.Graph and Networkx.DiGraph. 
""" - vk_dict = {} # shortest path matrices dict - if len(node_labels) > 0: - # node symb and non-synb labeled - if len(node_attrs) > 0: - kn = node_kernels['mix'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_labels = [n1[1][nl] for nl in node_labels] - n2_labels = [n2[1][nl] for nl in node_labels] - n1_attrs = [n1[1][na] for na in node_attrs] - n2_attrs = [n2[1][na] for na in node_attrs] - vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) - # node symb labeled - else: - kn = node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_labels = [n1[1][nl] for nl in node_labels] - n2_labels = [n2[1][nl] for nl in node_labels] - vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) + # @todo: relabel node with integers? (in case something went wrong...) + # Add graph attributes. + labels = {} + for k, v in G.graph.items(): + labels[k] = deepcopy(v) + if G.is_directed(): + G_new = nx.DiGraph(**labels) else: - # node non-synb labeled - if len(node_attrs) > 0: - kn = node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_attrs = [n1[1][na] for na in node_attrs] - n2_attrs = [n2[1][na] for na in node_attrs] - vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) - # node unlabeled - else: - pass # @todo: add edge weights. -# for e1 in g1.edges(data=True): -# for e2 in g2.edges(data=True): -# if e1[2]['cost'] == e2[2]['cost']: -# kernel += 1 -# return kernel + G_new = nx.Graph(**labels) - return vk_dict + # Create a random mapping old node indices <-> new indices. + nb_nodes = nx.number_of_nodes(G) + indices_orig = range(nb_nodes) + idx_mapping = np.random.RandomState(seed=random_state).permutation(indices_orig) + + # Add nodes. + nodes_orig = list(G.nodes) + for i_orig in range(nb_nodes): + i_new = idx_mapping[i_orig] + labels = {} + for k, v in G.nodes[nodes_orig[i_new]].items(): + labels[k] = deepcopy(v) + G_new.add_node(nodes_orig[i_new], **labels) + + # Add edges. + for nd1, nd2, attrs in G.edges(data=True): + labels = {} + for k, v in attrs.items(): + labels[k] = deepcopy(v) + G_new.add_edge(nd1, nd2, **labels) + + +# # create a random mapping old label -> new label +# node_mapping = dict(zip(G.nodes(), np.random.RandomState(seed=random_state).permutation(G.nodes()))) +# # build a new graph +# G_new = nx.relabel_nodes(G, node_mapping) + + return G_new + + +#%% def dummy_node(): From a7e189134d5cc3541ab272d77a6bf3668a216ac1 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 18 Mar 2022 10:24:21 +0100 Subject: [PATCH 06/11] [Fix] Change the output directory name. 
--- .../ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py index f450c1e..d2d4db5 100644 --- a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py +++ b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py @@ -161,7 +161,7 @@ if __name__ == '__main__': # ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - save_dir = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' + save_dir = 'outputs/CRIANN/edit_costs.real_data.nums_sols.ratios.bipartite/' os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir + 'groups/', exist_ok=True) @@ -169,4 +169,4 @@ if __name__ == '__main__': print() print('Dataset:', ds_name) param_grid = get_param_lists(ds_name, mode='simple') - results_for_a_dataset(ds_name) \ No newline at end of file + results_for_a_dataset(ds_name) From 5e37d4447f96953ffd7004d0e695dd30d53f9242 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 6 May 2022 14:12:31 +0200 Subject: [PATCH 07/11] [Major Features] Add GEDModel which is compatibale with . --- gklearn/ged/__init__.py | 1 + gklearn/ged/model/distances.py | 43 +++ gklearn/ged/model/ged_com.py | 97 ++++++ gklearn/ged/model/ged_model.py | 724 +++++++++++++++++++++++++++++++++++++++ gklearn/ged/model/optim_costs.py | 149 ++++++++ 5 files changed, 1014 insertions(+) create mode 100644 gklearn/ged/model/distances.py create mode 100644 gklearn/ged/model/ged_com.py create mode 100644 gklearn/ged/model/ged_model.py create mode 100644 gklearn/ged/model/optim_costs.py diff --git a/gklearn/ged/__init__.py b/gklearn/ged/__init__.py index e69de29..8696f76 100644 --- a/gklearn/ged/__init__.py +++ b/gklearn/ged/__init__.py @@ -0,0 +1 @@ +from gklearn.ged.model.ged_model import GEDModel \ No newline at end of file diff --git a/gklearn/ged/model/distances.py b/gklearn/ged/model/distances.py new file mode 100644 index 0000000..3e27eb3 --- /dev/null +++ b/gklearn/ged/model/distances.py @@ -0,0 +1,43 @@ +import numpy as np + + +def sum_squares(a, b): + """ + Return the sum of squares of the difference between a and b, aka MSE + """ + return np.sum([(a[i] - b[i])**2 for i in range(len(a))]) + + +def euclid_d(x, y): + """ + 1D euclidean distance + """ + return np.sqrt((x-y)**2) + + +def man_d(x, y): + """ + 1D manhattan distance + """ + return np.abs((x-y)) + + +def classif_d(x, y): + """ + Function adapted to classification problems + """ + return np.array(0 if x == y else 1) + + +def rmse(pred, ground_truth): + import numpy as np + return np.sqrt(sum_squares(pred, ground_truth)/len(ground_truth)) + + +def accuracy(pred, ground_truth): + import numpy as np + return np.mean([a == b for a, b in zip(pred, ground_truth)]) + + +def rbf_k(D, sigma=1): + return np.exp(-(D**2)/sigma) diff --git a/gklearn/ged/model/ged_com.py b/gklearn/ged/model/ged_com.py new file mode 100644 index 0000000..9da5f87 --- /dev/null +++ b/gklearn/ged/model/ged_com.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu May 5 14:02:17 2022 + +@author: ljia +""" +import sys +from gklearn.ged.model.distances import euclid_d +from gklearn.ged.util import pairwise_ged, get_nb_edit_operations +from gklearn.utils import get_iters + + +def compute_ged(Gi, 
Gj, edit_cost, method='BIPARTITE', **kwargs): + """ + Compute GED between two graph according to edit_cost + """ + ged_options = {'edit_cost': 'CONSTANT', + 'method': method, + 'edit_cost_constants': edit_cost} + node_labels = kwargs.get('node_labels', []) + edge_labels = kwargs.get('edge_labels', []) + dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10) + n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward, edit_cost='CONSTANT', node_labels=node_labels, edge_labels=edge_labels) + return dis, n_eo_tmp + + +def compute_ged_all_dataset(Gn, edit_cost, ed_method, **kwargs): + N = len(Gn) + G_pairs = [] + for i in range(N): + for j in range(i, N): + G_pairs.append([i, j]) + return compute_geds(G_pairs, Gn, edit_cost, ed_method, **kwargs) + + +def compute_geds(G_pairs, Gn, edit_cost, ed_method, verbose=True, **kwargs): + """ + Compute GED between all indexes in G_pairs given edit_cost + :return: ged_vec : the list of computed distances, n_edit_operations : the list of edit operations + """ + ged_vec = [] + n_edit_operations = [] + for k in get_iters(range(len(G_pairs)), desc='Computing GED', file=sys.stdout, length=len(G_pairs), verbose=verbose): + [i, j] = G_pairs[k] + dis, n_eo_tmp = compute_ged( + Gn[i], Gn[j], edit_cost=edit_cost, method=ed_method, **kwargs) + ged_vec.append(dis) + n_edit_operations.append(n_eo_tmp) + + return ged_vec, n_edit_operations + + +def compute_D(G_app, edit_cost, G_test=None, ed_method='BIPARTITE', **kwargs): + import numpy as np + N = len(G_app) + D_app = np.zeros((N, N)) + + for i, G1 in get_iters(enumerate(G_app), desc='Computing D - app', file=sys.stdout, length=N): + for j, G2 in enumerate(G_app[i+1:], i+1): + D_app[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs) + D_app[j, i] = D_app[i, j] + if (G_test is None): + return D_app, edit_cost + else: + D_test = np.zeros((len(G_test), N)) + for i, G1 in get_iters(enumerate(G_test), desc='Computing D - test', file=sys.stdout, length=len(G_test)): + for j, G2 in enumerate(G_app): + D_test[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs) + return D_app, D_test, edit_cost + + +def compute_D_random(G_app, G_test=None, ed_method='BIPARTITE', **kwargs): + import numpy as np + edit_costs = np.random.rand(6) + return compute_D(G_app, edit_costs, G_test, ed_method=ed_method, **kwargs) + + +def compute_D_expert(G_app, G_test=None, ed_method='BIPARTITE', **kwargs): + edit_cost = [3, 3, 1, 3, 3, 1] + return compute_D(G_app, edit_cost, G_test, ed_method=ed_method, **kwargs) + + +def compute_D_fitted(G_app, y_app, G_test=None, y_distance=euclid_d, + mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs): + from gklearn.ged.models.optim_costs import compute_optimal_costs + + costs_optim = compute_optimal_costs( + G_app, y_app, y_distance=y_distance, + mode=mode, unlabeled=unlabeled, ed_method=ed_method, **kwargs) + return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs) + + +def compute_D_GH2020(G_app, G_test=None, ed_method='BIPARTITE', **kwargs): + from gklearn.ged.optim_costs import get_optimal_costs_GH2020 + costs_optim = get_optimal_costs_GH2020(**kwargs) + return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs) diff --git a/gklearn/ged/model/ged_model.py b/gklearn/ged/model/ged_model.py new file mode 100644 index 0000000..9bdbc90 --- /dev/null +++ b/gklearn/ged/model/ged_model.py @@ -0,0 +1,724 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu May 5 09:42:30 2022 + +@author: 
ljia +""" +import sys +import multiprocessing +import time +import numpy as np +import networkx as nx + +# from abc import ABC, abstractmethod +from sklearn.base import BaseEstimator # , TransformerMixin +from sklearn.utils.validation import check_is_fitted # check_X_y, check_array, +from sklearn.exceptions import NotFittedError + +from gklearn.ged.model.distances import euclid_d +from gklearn.ged.util import pairwise_ged, get_nb_edit_operations +# from gklearn.utils import normalize_gram_matrix +from gklearn.utils import get_iters + + +class GEDModel(BaseEstimator): #, ABC): + """The graph edit distance model class compatible with `scikit-learn`. + + Attributes + ---------- + _graphs : list + Stores the input graphs on fit input data. + Default format of the list objects is `NetworkX` graphs. + **We don't guarantee that the input graphs remain unchanged during the + computation.** + + References + ---------- + https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. + """ + + def __init__(self, + ed_method='BIPARTITE', + edit_cost_fun='CONSTANT', + init_edit_cost_constants=[3, 3, 1, 3, 3, 1], + optim_method='init', + optim_options={'y_distance': euclid_d, 'mode': 'reg'}, + node_labels=[], + edge_labels=[], + parallel=None, + n_jobs=None, + chunksize=None, +# normalize=True, + copy_graphs=True, # make sure it is a full deep copy. and faster! + verbose=2): + """`__init__` for `GEDModel` object.""" + # @todo: the default settings of the parameters are different from those in the self.compute method. +# self._graphs = None + self.ed_method = ed_method + self.edit_cost_fun = edit_cost_fun + self.init_edit_cost_constants = init_edit_cost_constants + self.optim_method=optim_method + self.optim_options=optim_options + self.node_labels=node_labels + self.edge_labels=edge_labels + self.parallel = parallel + self.n_jobs = n_jobs + self.chunksize = chunksize +# self.normalize = normalize + self.copy_graphs = copy_graphs + self.verbose = verbose +# self._run_time = 0 +# self._gram_matrix = None +# self._gram_matrix_unnorm = None + + + ########################################################################## + # The following is the 1st paradigm to compute GED distance matrix, which is + # compatible with `scikit-learn`. + ########################################################################## + + + def fit(self, X, y=None): + """Fit a graph dataset for a transformer. + + Parameters + ---------- + X : iterable + DESCRIPTION. + + y : None, optional + There is no need of a target in a transformer, yet the `scikit-learn` + pipeline API requires this parameter. + + Returns + ------- + object + Returns self. + + """ +# self._is_tranformed = False + + # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; + self.clear_attributes() + + # Validate parameters for the transformer. + self.validate_parameters() + + # Validate the input. + self._graphs = self.validate_input(X) + if y is not None: + self._targets = y + # self._targets = self.validate_input(y) + +# self._X = X +# self._kernel = self._get_kernel_instance() + + # Return the transformer. + return self + + + def transform(self, X=None, return_dm_train=False): + """Compute the graph kernel matrix between given and fitted data. + + Parameters + ---------- + X : TYPE + DESCRIPTION. + + Raises + ------ + ValueError + DESCRIPTION. + + Returns + ------- + None. + + """ + # If `return_dm_train`, return the fitted GED distance matrix of training data. 
+ if return_dm_train: + check_is_fitted(self, '_dm_train') + self._is_transformed = True + return self._dm_train # @todo: copy or not? + + # Check if method "fit" had been called. + check_is_fitted(self, '_graphs') + + # Validate the input. + Y = self.validate_input(X) + + # Transform: compute the graph kernel matrix. + dis_matrix = self.compute_distance_matrix(Y) + self._Y = Y + + # Self transform must appear before the diagonal call on normilization. + self._is_transformed = True +# if self.normalize: +# X_diag, Y_diag = self.diagonals() +# old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. +# try: +# kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) +# except: +# raise +# finally: +# np.seterr(**old_settings) + + return dis_matrix + + + def fit_transform(self, X, y=None, save_dm_train=False): + """Fit and transform: compute GED distance matrix on the same data. + + Parameters + ---------- + X : list of graphs + Input graphs. + + Returns + ------- + dis_matrix : numpy array, shape = [len(X), len(X)] + The distance matrix of X. + + """ + self.fit(X, y) + + # Compute edit cost constants. + self.compute_edit_costs() + + # Transform: compute Gram matrix. + dis_matrix = self.compute_distance_matrix() + +# # Normalize. +# if self.normalize: +# self._X_diag = np.diagonal(gram_matrix).copy() +# old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. +# try: +# gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) +# except: +# raise +# finally: +# np.seterr(**old_settings) + + if save_dm_train: + self._dm_train = dis_matrix + + return dis_matrix + + + def get_params(self): + pass + + + def set_params(self): + pass + + + def clear_attributes(self): # @todo: update +# if hasattr(self, '_X_diag'): +# delattr(self, '_X_diag') + if hasattr(self, '_graphs'): + delattr(self, '_graphs') + if hasattr(self, '_Y'): + delattr(self, '_Y') + if hasattr(self, '_run_time'): + delattr(self, '_run_time') + + + def validate_parameters(self): + """Validate all parameters for the transformer. + + Returns + ------- + None. + + """ + if self.parallel is not None and self.parallel != 'imap_unordered': + raise ValueError('Parallel mode is not set correctly.') + + if self.parallel == 'imap_unordered' and self.n_jobs is None: + self.n_jobs = multiprocessing.cpu_count() + + + def validate_input(self, X): + """Validate the given input and raise errors if it is invalid. + + Parameters + ---------- + X : list + The input to check. Should be a list of graph. + + Raises + ------ + ValueError + Raise if the input is not correct. + + Returns + ------- + X : list + The input. A list of graph. + + """ + if X is None: + raise ValueError('Please add graphs before computing.') + elif not isinstance(X, list): + raise ValueError('Cannot detect graphs. The input must be a list.') + elif len(X) == 0: + raise ValueError('The graph list given is empty. No computation will be performed.') + + return X + + + def compute_distance_matrix(self, Y=None): + """Compute the distance matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) or the distance matrix for the fitted + graphs (X / self._graphs). + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. The default is None. If None kernel is computed + between X and itself. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. 
+ + """ + if Y is None: + # Compute Gram matrix for self._graphs (X). + dis_matrix = self._compute_X_distance_matrix() +# self._gram_matrix_unnorm = np.copy(self._gram_matrix) + + else: + # Compute kernel matrix between Y and self._graphs (X). + start_time = time.time() + + if self.parallel == 'imap_unordered': + dis_matrix = self._compute_distance_matrix_imap_unordered(Y) + + elif self.parallel is None: + Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y) + graphs_copy = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + dis_matrix = self._compute_distance_matrix_series(Y_copy, graphs_copy) + + self._run_time = time.time() - start_time + if self.verbose: + print('Distance matrix of size (%d, %d) built in %s seconds.' + % (len(Y), len(self._graphs), self._run_time)) + + return dis_matrix + + + def _compute_distance_matrix_series(self, X, Y): + """Compute the GED distance matrix between two sets of graphs (X and Y) + without parallelization. + + Parameters + ---------- + X, Y : list of graphs + The input graphs. + + Returns + ------- + dis_matrix : numpy array, shape = [n_X, n_Y] + The computed distance matrix. + + """ + dis_matrix = np.zeros((len(X), len(Y))) + + for i_x, g_x in enumerate(X): + for i_y, g_y in enumerate(Y): + dis_matrix[i_x, i_y], _ = self.compute_ged(g_x, g_y) + + return dis_matrix + + + def _compute_kernel_matrix_imap_unordered(self, Y): + """Compute the kernel matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) using imap unordered parallelization. + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + raise Exception('Parallelization for kernel matrix is not implemented.') + + + def diagonals(self): + """Compute the kernel matrix diagonals of the fit/transformed data. + + Returns + ------- + X_diag : numpy array + The diagonal of the kernel matrix between the fitted data. + This consists of each element calculated with itself. + + Y_diag : numpy array + The diagonal of the kernel matrix, of the transform. + This consists of each element calculated with itself. + + """ + # Check if method "fit" had been called. + check_is_fitted(self, ['_graphs']) + + # Check if the diagonals of X exist. + try: + check_is_fitted(self, ['_X_diag']) + except NotFittedError: + # Compute diagonals of X. + self._X_diag = np.empty(shape=(len(self._graphs),)) + graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + for i, x in enumerate(graphs): + self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? + + try: + # If transform has happened, return both diagonals. + check_is_fitted(self, ['_Y']) + self._Y_diag = np.empty(shape=(len(self._Y),)) + Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y) + for (i, y) in enumerate(Y): + self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? + + return self._X_diag, self._Y_diag + except NotFittedError: + # Else just return both X_diag + return self._X_diag + + +# @abstractmethod + def pairwise_distance(self, x, y): + """Compute pairwise kernel between two graphs. + + Parameters + ---------- + x, y : NetworkX Graph. + Graphs bewteen which the kernel is computed. + + Returns + ------- + kernel: float + The computed kernel. + +# Notes +# ----- +# This method is abstract and must be implemented by a subclass. 
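# --- Illustrative sketch (editor's example, not part of the patch) -------------
# The `diagonals` method above uses `check_is_fitted` inside try/except
# `NotFittedError` as a compute-once cache for derived attributes. The same pattern
# in isolation, on a hypothetical toy estimator:
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted


class CachedValue(BaseEstimator):
    def fit(self, X=None, y=None):
        return self

    def value(self):
        try:
            check_is_fitted(self, ['_cache'])
        except NotFittedError:
            print('computing once...')
            self._cache = 42  # an expensive computation would go here
        return self._cache


est = CachedValue()
print(est.value())  # triggers the computation
print(est.value())  # served from the cached attribute
# --- end example ----------------------------------------------------------------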
+ + """ + raise NotImplementedError('Pairwise kernel computation is not implemented!') + + + + def compute_edit_costs(self, Y=None, Y_targets=None): + """Compute edit cost constants. When optimizing method is `fiited`, + apply Jia2021's metric learning method by using a given target graphs (Y) + the fitted graphs (X / self._graphs). + + Parameters + ---------- + Y : TYPE, optional + DESCRIPTION. The default is None. + + Returns + ------- + None. + + """ + # Get or compute. + if self.optim_method == 'random': + self._edit_cost_constants = np.random.rand(6) + + elif self.optim_method == 'init': + self._edit_cost_constants = self.init_edit_cost_constants + + + elif self.optim_method == 'expert': + self._edit_cost_constants = [3, 3, 1, 3, 3, 1] + + + elif self.optim_method == 'fitted': # Jia2021 method + # Get proper inputs. + if Y is None: + check_is_fitted(self, ['_graphs']) + check_is_fitted(self, ['_targets']) + graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + targets = self._targets + else: + graphs = ([g.copy() for g in Y] if self.copy_graphs else Y) + targets = Y_targets + + # Get optimization options. + node_labels = self.node_labels + edge_labels = self.edge_labels + unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0) + from gklearn.ged.model.optim_costs import compute_optimal_costs + self._edit_cost_constants = compute_optimal_costs( + graphs, targets, + node_labels=node_labels, edge_labels=edge_labels, + unlabeled=unlabeled, ed_method=self.ed_method, + verbose=(self.verbose >= 2), + **self.optim_options) + + + ########################################################################## + # The following is the 2nd paradigm to compute kernel matrix. It is + # simplified and not compatible with `scikit-learn`. + ########################################################################## + + +# def compute(self, *graphs, **kwargs): +# self.parallel = kwargs.get('parallel', 'imap_unordered') +# self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) +# self.normalize = kwargs.get('normalize', True) +# self.verbose = kwargs.get('verbose', 2) +# self.copy_graphs = kwargs.get('copy_graphs', True) +# self.save_unnormed = kwargs.get('save_unnormed', True) +# self.validate_parameters() + +# # If the inputs is a list of graphs. +# if len(graphs) == 1: +# if not isinstance(graphs[0], list): +# raise Exception('Cannot detect graphs.') +# elif len(graphs[0]) == 0: +# raise Exception('The graph list given is empty. No computation was performed.') +# else: +# if self.copy_graphs: +# self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. +# else: +# self._graphs = graphs +# self._gram_matrix = self._compute_gram_matrix() + +# if self.save_unnormed: +# self._gram_matrix_unnorm = np.copy(self._gram_matrix) +# if self.normalize: +# self._gram_matrix = normalize_gram_matrix(self._gram_matrix) +# return self._gram_matrix, self._run_time + +# elif len(graphs) == 2: +# # If the inputs are two graphs. +# if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): +# if self.copy_graphs: +# G0, G1 = graphs[0].copy(), graphs[1].copy() +# else: +# G0, G1 = graphs[0], graphs[1] +# kernel = self._compute_single_kernel(G0, G1) +# return kernel, self._run_time + +# # If the inputs are a graph and a list of graphs. 
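# --- Illustrative sketch (editor's example, not part of the patch) -------------
# Rough summary of the `optim_method` branches in `compute_edit_costs` above, as a
# plain function. The six values are the CONSTANT edit-cost constants (three
# node-operation costs followed by three edge-operation costs); the 'fitted' branch
# is the data-driven optimisation and needs graphs plus targets, so it is only
# named here.
import numpy as np

def choose_edit_costs(optim_method, init_costs=(3, 3, 1, 3, 3, 1)):
    if optim_method == 'random':
        return np.random.rand(6)        # six random constants in [0, 1)
    if optim_method == 'init':
        return list(init_costs)         # whatever was passed at construction
    if optim_method == 'expert':
        return [3, 3, 1, 3, 3, 1]       # fixed expert values
    raise ValueError("'fitted' requires graphs and targets (see compute_optimal_costs).")

print(choose_edit_costs('expert'))
# --- end example ----------------------------------------------------------------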
+# elif self.is_graph(graphs[0]) and isinstance(graphs[1], list): +# if self.copy_graphs: +# g1 = graphs[0].copy() +# g_list = [g.copy() for g in graphs[1]] +# kernel_list = self._compute_kernel_list(g1, g_list) +# else: +# kernel_list = self._compute_kernel_list(graphs[0], graphs[1]) +# return kernel_list, self._run_time + +# elif isinstance(graphs[0], list) and self.is_graph(graphs[1]): +# if self.copy_graphs: +# g1 = graphs[1].copy() +# g_list = [g.copy() for g in graphs[0]] +# kernel_list = self._compute_kernel_list(g1, g_list) +# else: +# kernel_list = self._compute_kernel_list(graphs[1], graphs[0]) +# return kernel_list, self._run_time + +# else: +# raise Exception('Cannot detect graphs.') + +# elif len(graphs) == 0 and self._graphs is None: +# raise Exception('Please add graphs before computing.') + +# else: +# raise Exception('Cannot detect graphs.') + + +# def normalize_gm(self, gram_matrix): +# import warnings +# warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning) + +# diag = gram_matrix.diagonal().copy() +# for i in range(len(gram_matrix)): +# for j in range(i, len(gram_matrix)): +# gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) +# gram_matrix[j][i] = gram_matrix[i][j] +# return gram_matrix + + +# def compute_distance_matrix(self): +# if self._gram_matrix is None: +# raise Exception('Please compute the Gram matrix before computing distance matrix.') +# dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix))) +# for i in range(len(self._gram_matrix)): +# for j in range(i, len(self._gram_matrix)): +# dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j] +# if dis < 0: +# if dis > -1e-10: +# dis = 0 +# else: +# raise ValueError('The distance is negative.') +# dis_mat[i, j] = np.sqrt(dis) +# dis_mat[j, i] = dis_mat[i, j] +# dis_max = np.max(np.max(dis_mat)) +# dis_min = np.min(np.min(dis_mat[dis_mat != 0])) +# dis_mean = np.mean(np.mean(dis_mat)) +# return dis_mat, dis_max, dis_min, dis_mean + + + def _compute_X_distance_matrix(self): + start_time = time.time() + + if self.parallel == 'imap_unordered': + dis_matrix = self._compute_X_dm_imap_unordered() + elif self.parallel is None: + graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + dis_matrix = self._compute_X_dm_series(graphs) + else: + raise Exception('Parallel mode is not set correctly.') + + self._run_time = time.time() - start_time + if self.verbose: + print('Distance matrix of size %d built in %s seconds.' + % (len(self._graphs), self._run_time)) + + return dis_matrix + + + def _compute_X_dm_series(self, graphs): + N = len(graphs) + dis_matrix = np.zeros((N, N)) + + for i, G1 in get_iters(enumerate(graphs), desc='Computing distance matrix', file=sys.stdout, verbose=(self.verbose >= 2)): + for j, G2 in enumerate(graphs[i+1:], i+1): + dis_matrix[i, j], _ = self.compute_ged(G1, G2) + dis_matrix[j, i] = dis_matrix[i, j] + return dis_matrix + + + def _compute_X_dm_imap_unordered(self, graphs): + pass + + + def compute_ged(self, Gi, Gj, **kwargs): + """ + Compute GED between two graph according to edit_cost. 
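# --- Illustrative sketch (editor's example, not part of the patch) -------------
# `_compute_X_dm_series` above fills only the upper triangle (j > i) and mirrors
# it, leaving the diagonal at zero. The same pattern on plain numbers:
import numpy as np

def symmetric_matrix(items, dist):
    n = len(items)
    mat = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            mat[i, j] = dist(items[i], items[j])
            mat[j, i] = mat[i, j]  # mirror the value; the diagonal stays 0
    return mat

print(symmetric_matrix([1, 4, 6], lambda a, b: abs(a - b)))
# --- end example ----------------------------------------------------------------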
+ """ + ged_options = {'edit_cost': self.edit_cost_fun, + 'method': self.ed_method, + 'edit_cost_constants': self._edit_cost_constants} + dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10) + n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward, + edit_cost=self.edit_cost_fun, + node_labels=self.node_labels, + edge_labels=self.edge_labels) + return dis, n_eo_tmp + + +# def _compute_kernel_list(self, g1, g_list): +# start_time = time.time() + +# if self.parallel == 'imap_unordered': +# kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) +# elif self.parallel is None: +# kernel_list = self._compute_kernel_list_series(g1, g_list) +# else: +# raise Exception('Parallel mode is not set correctly.') + +# self._run_time = time.time() - start_time +# if self.verbose: +# print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.' +# % (len(g_list), self._run_time)) + +# return kernel_list + + +# def _compute_kernel_list_series(self, g1, g_list): +# pass + + +# def _compute_kernel_list_imap_unordered(self, g1, g_list): +# pass + + +# def _compute_single_kernel(self, g1, g2): +# start_time = time.time() + +# kernel = self._compute_single_kernel_series(g1, g2) + +# self._run_time = time.time() - start_time +# if self.verbose: +# print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time)) + +# return kernel + + +# def _compute_single_kernel_series(self, g1, g2): +# pass + + + def is_graph(self, graph): + if isinstance(graph, nx.Graph): + return True + if isinstance(graph, nx.DiGraph): + return True + if isinstance(graph, nx.MultiGraph): + return True + if isinstance(graph, nx.MultiDiGraph): + return True + return False + + + @property + def graphs(self): + return self._graphs + + +# @property +# def parallel(self): +# return self.parallel + + +# @property +# def n_jobs(self): +# return self.n_jobs + + +# @property +# def verbose(self): +# return self.verbose + + +# @property +# def normalize(self): +# return self.normalize + + + @property + def run_time(self): + return self._run_time + + + @property + def dis_matrix(self): + return self._dis_matrix + + @dis_matrix.setter + def dis_matrix(self, value): + self._dis_matrix = value + + +# @property +# def gram_matrix_unnorm(self): +# return self._gram_matrix_unnorm + +# @gram_matrix_unnorm.setter +# def gram_matrix_unnorm(self, value): +# self._gram_matrix_unnorm = value \ No newline at end of file diff --git a/gklearn/ged/model/optim_costs.py b/gklearn/ged/model/optim_costs.py new file mode 100644 index 0000000..1e23732 --- /dev/null +++ b/gklearn/ged/model/optim_costs.py @@ -0,0 +1,149 @@ +import numpy as np + +from gklearn.ged.model.distances import sum_squares, euclid_d +from gklearn.ged.model.ged_com import compute_geds + + +def optimize_costs_unlabeled(nb_cost_mat, dis_k_vec): + """ + Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat + ! 
take care that nb_cost_mat do not contains 0 lines + :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graph + :param dis_k_vec: The N distances to fit + """ + import cvxpy as cp + import numpy as np + MAX_SAMPLE = 1000 + nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]] for x in nb_cost_mat]) + dis_k_vec = np.array(dis_k_vec) + # dis_k_vec_norm = dis_k_vec/np.max(dis_k_vec) + + # import pickle + # pickle.dump([nb_cost_mat, dis_k_vec], open('debug', 'wb')) + N = nb_cost_mat_m.shape[0] + sub_sample = np.random.permutation(np.arange(N)) + sub_sample = sub_sample[:MAX_SAMPLE] + + x = cp.Variable(nb_cost_mat_m.shape[1]) + cost = cp.sum_squares((nb_cost_mat_m[sub_sample, :] @ x) - dis_k_vec[sub_sample]) + prob = cp.Problem(cp.Minimize(cost), [x >= 0]) + prob.solve() + edit_costs_new = [x.value[0], x.value[1], 0, x.value[2], x.value[3], 0] + edit_costs_new = [xi if xi > 0 else 0 for xi in edit_costs_new] + residual = prob.value + return edit_costs_new, residual + + +def optimize_costs_classif_unlabeled(nb_cost_mat, Y): + """ + Optimize edit costs to fit dis_k_vec according to edit operations in + nb_cost_mat + ! take care that nb_cost_mat do not contains 0 lines + :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit + operations for each pair of graph + :param dis_k_vec: {-1,1}^N vector of common classes + """ + # import cvxpy as cp + from ml import reg_log + # import pickle + # pickle.dump([nb_cost_mat, Y], open('debug', 'wb')) + nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]] + for x in nb_cost_mat]) + w, J, _ = reg_log(nb_cost_mat_m, Y, pos_contraint=True) + edit_costs_new = [w[0], w[1], 0, w[2], w[3], 0] + residual = J[-1] + + return edit_costs_new, residual + + +def optimize_costs_classif(nb_cost_mat, Y): + """ + Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat + ! take care that nb_cost_mat do not contains 0 lines + :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graph + :param dis_k_vec: {-1,1}^N vector of common classes + """ + #import pickle + # pickle.dump([nb_cost_mat, Y], open("test.pickle", "wb")) + from ml import reg_log + w, J, _ = reg_log(nb_cost_mat, Y, pos_contraint=True) + return w, J[-1] + + +def optimize_costs(nb_cost_mat, dis_k_vec): + """ + Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat + ! 
take care that nb_cost_mat do not contains 0 lines + :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graph + :param dis_k_vec: The N distances to fit + """ + import cvxpy as cp + x = cp.Variable(nb_cost_mat.shape[1]) + cost = cp.sum_squares((nb_cost_mat @ x) - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost), constraints) + prob.solve() + edit_costs_new = x.value + residual = prob.value + + return edit_costs_new, residual + + +def compute_optimal_costs(G, y, init_costs=[3, 3, 1, 3, 3, 1], + y_distance=euclid_d, + mode='reg', unlabeled=False, + ed_method='BIPARTITE', + verbose=True, + **kwargs): + N = len(y) + + G_pairs = [] + distances_vec = [] + + for i in range(N): + for j in range(i+1, N): + G_pairs.append([i, j]) + distances_vec.append(y_distance(y[i], y[j])) + ged_vec_init, n_edit_operations = compute_geds(G_pairs, G, init_costs, ed_method, + verbose=verbose, **kwargs) + + residual_list = [sum_squares(ged_vec_init, distances_vec)] + + if (mode == 'reg'): + if unlabeled: + method_optim = optimize_costs_unlabeled + else: + method_optim = optimize_costs + + elif (mode == 'classif'): + if unlabeled: + method_optim = optimize_costs_classif_unlabeled + else: + method_optim = optimize_costs_classif + + ite_max = 5 + for i in range(ite_max): + if verbose: + print('ite', i + 1, '/', ite_max, ':') + # compute GEDs and numbers of edit operations. + edit_costs_new, residual = method_optim( + np.array(n_edit_operations), distances_vec) + ged_vec, n_edit_operations = compute_geds(G_pairs, G, edit_costs_new, ed_method, + verbose=verbose, **kwargs) + residual_list.append(sum_squares(ged_vec, distances_vec)) + + return edit_costs_new + + +def get_optimal_costs_GH2020(**kwargs): + import pickle + import os + dir_root = 'cj/output/' + ds_name = kwargs.get('ds_name') + nb_trial = kwargs.get('nb_trial') + file_name = os.path.join(dir_root, 'costs.' + ds_name + '.' + str(nb_trial) + '.pkl') + with open(file_name, 'rb') as f: + edit_costs = pickle.load(f) + return edit_costs From a76335ed16c0d635a636c477dbe4d179981b4452 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 6 May 2022 14:15:03 +0200 Subject: [PATCH 08/11] [Features][API Changes] Update kernel classes. --- gklearn/kernels/graph_kernel.py | 59 ++++++++----- gklearn/kernels/treelet.py | 66 +++++++++------ gklearn/kernels/weisfeiler_lehman.py | 155 ++++++++++++----------------------- gklearn/utils/kernels.py | 5 ++ 4 files changed, 135 insertions(+), 150 deletions(-) diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index 1db38b3..c7a5718 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -32,7 +32,13 @@ class GraphKernel(BaseEstimator): #, ABC): https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. """ - def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2): + def __init__(self, + parallel=None, + n_jobs=None, + chunksize=None, + normalize=True, + copy_graphs=True, # make sure it is a full deep copy. and faster! + verbose=2): """`__init__` for `GraphKernel` object.""" # @todo: the default settings of the parameters are different from those in the self.compute method. 
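# --- Illustrative sketch (editor's example, not part of the patch) -------------
# The cost-fitting functions in optim_costs.py above all solve the same kind of
# problem: with CONSTANT costs, a GED value is (for the found edit path) the dot
# product of the edit-operation counts with the cost vector, so fitting the costs
# to target distances is a constrained least-squares problem. A self-contained
# cvxpy sketch with made-up counts and targets:
import numpy as np
import cvxpy as cp

A = np.array([[2., 1., 0., 1., 0., 1.],
              [1., 0., 2., 0., 1., 1.],
              [0., 2., 1., 1., 1., 0.]])  # edit-operation counts, one row per graph pair
b = np.array([9., 7., 8.])                # target distances to fit

x = cp.Variable(A.shape[1])
prob = cp.Problem(cp.Minimize(cp.sum_squares(A @ x - b)), [x >= 0.01])
prob.solve()
print(np.round(x.value, 3), 'residual:', round(prob.value, 3))
# optimize_costs() above additionally constrains each substitution cost to be at
# most the corresponding insertion cost plus deletion cost.
# --- end example ----------------------------------------------------------------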
# self._graphs = None @@ -40,6 +46,7 @@ class GraphKernel(BaseEstimator): #, ABC): self.n_jobs = n_jobs self.chunksize = chunksize self.normalize = normalize + self.copy_graphs = copy_graphs self.verbose = verbose # self._run_time = 0 # self._gram_matrix = None @@ -90,7 +97,7 @@ class GraphKernel(BaseEstimator): #, ABC): return self - def transform(self, X): + def transform(self, X=None, load_gm_train=False): """Compute the graph kernel matrix between given and fitted data. Parameters @@ -108,6 +115,12 @@ class GraphKernel(BaseEstimator): #, ABC): None. """ + # If `load_gm_train`, load Gram matrix of training data. + if load_gm_train: + check_is_fitted(self, '_gm_train') + self._is_transformed = True + return self._gm_train # @todo: copy or not? + # Check if method "fit" had been called. check_is_fitted(self, '_graphs') @@ -133,8 +146,7 @@ class GraphKernel(BaseEstimator): #, ABC): return kernel_matrix - - def fit_transform(self, X): + def fit_transform(self, X, save_gm_train=False): """Fit and transform: compute Gram matrix on the same data. Parameters @@ -164,6 +176,9 @@ class GraphKernel(BaseEstimator): #, ABC): finally: np.seterr(**old_settings) + if save_gm_train: + self._gm_train = gram_matrix + return gram_matrix @@ -260,7 +275,9 @@ class GraphKernel(BaseEstimator): #, ABC): kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y) elif self.parallel is None: - kernel_matrix = self._compute_kernel_matrix_series(Y) + Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y) + graphs_copy = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + kernel_matrix = self._compute_kernel_matrix_series(Y_copy, graphs_copy) self._run_time = time.time() - start_time if self.verbose: @@ -270,26 +287,25 @@ class GraphKernel(BaseEstimator): #, ABC): return kernel_matrix - def _compute_kernel_matrix_series(self, Y): - """Compute the kernel matrix between a given target graphs (Y) and - the fitted graphs (X / self._graphs) without parallelization. + def _compute_kernel_matrix_series(self, X, Y): + """Compute the kernel matrix between two sets of graphs (X and Y) without parallelization. Parameters ---------- - Y : list of graphs, optional - The target graphs. + X, Y : list of graphs + The input graphs. Returns ------- - kernel_matrix : numpy array, shape = [n_targets, n_inputs] + kernel_matrix : numpy array, shape = [n_X, n_Y] The computed kernel matrix. """ - kernel_matrix = np.zeros((len(Y), len(self._graphs))) + kernel_matrix = np.zeros((len(X), len(Y))) - for i_y, g_y in enumerate(Y): - for i_x, g_x in enumerate(self._graphs): - kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x) + for i_x, g_x in enumerate(X): + for i_y, g_y in enumerate(Y): + kernel_matrix[i_x, i_y] = self.pairwise_kernel(g_x, g_y) return kernel_matrix @@ -335,14 +351,16 @@ class GraphKernel(BaseEstimator): #, ABC): except NotFittedError: # Compute diagonals of X. self._X_diag = np.empty(shape=(len(self._graphs),)) - for i, x in enumerate(self._graphs): + graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + for i, x in enumerate(graphs): self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? try: # If transform has happened, return both diagonals. check_is_fitted(self, ['_Y']) self._Y_diag = np.empty(shape=(len(self._Y),)) - for (i, y) in enumerate(self._Y): + Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y) + for (i, y) in enumerate(Y): self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? 
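# --- Illustrative sketch (editor's example, not part of the patch) -------------
# With `normalize=True`, GraphKernel divides each kernel value k(x, y) by
# sqrt(k(x, x) * k(y, y)) using the diagonals computed above, so every normalised
# self-similarity becomes 1. Stand-alone numeric sketch of that step:
import numpy as np

K = np.array([[4., 2.],
              [2., 9.]])                       # toy Gram matrix
diag = np.diagonal(K).copy()
K_norm = K / np.sqrt(np.outer(diag, diag))     # [[1., 1/3], [1/3, 1.]]
print(K_norm)
# --- end example ----------------------------------------------------------------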
return self._X_diag, self._Y_diag @@ -484,7 +502,8 @@ class GraphKernel(BaseEstimator): #, ABC): if self.parallel == 'imap_unordered': gram_matrix = self._compute_gm_imap_unordered() elif self.parallel is None: - gram_matrix = self._compute_gm_series() + graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + gram_matrix = self._compute_gm_series(graphs) else: raise Exception('Parallel mode is not set correctly.') @@ -496,11 +515,11 @@ class GraphKernel(BaseEstimator): #, ABC): return gram_matrix - def _compute_gm_series(self): + def _compute_gm_series(self, graphs): pass - def _compute_gm_imap_unordered(self): + def _compute_gm_imap_unordered(self, graphs): pass diff --git a/gklearn/kernels/treelet.py b/gklearn/kernels/treelet.py index e42142b..c981bdf 100644 --- a/gklearn/kernels/treelet.py +++ b/gklearn/kernels/treelet.py @@ -28,16 +28,16 @@ from gklearn.kernels import GraphKernel class Treelet(GraphKernel): - def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs): + def __init__(self, **kwargs): """Initialise a treelet kernel. """ - super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose) + GraphKernel.__init__(self, **{k: kwargs.get(k) for k in ['parallel', 'n_jobs', 'chunksize', 'normalize', 'copy_graphs', 'verbose'] if k in kwargs}) self.node_labels = kwargs.get('node_labels', []) self.edge_labels = kwargs.get('edge_labels', []) self.sub_kernel = kwargs.get('sub_kernel', None) self.ds_infos = kwargs.get('ds_infos', {}) - self.precompute_canonkeys = precompute_canonkeys - self.save_canonkeys = save_canonkeys + self.precompute_canonkeys = kwargs.get('precompute_canonkeys', True) + self.save_canonkeys = kwargs.get('save_canonkeys', True) ########################################################################## @@ -71,7 +71,7 @@ class Treelet(GraphKernel): raise ValueError('Sub-kernel not set.') - def _compute_kernel_matrix_series(self, Y): + def _compute_kernel_matrix_series(self, Y, X=None, load_canonkeys=True): """Compute the kernel matrix between a given target graphs (Y) and the fitted graphs (X / self._graphs) without parallelization. @@ -86,36 +86,45 @@ class Treelet(GraphKernel): The computed kernel matrix. """ + if_comp_X_canonkeys = True + + # if load saved canonkeys of X from the instance: + if load_canonkeys: + # Canonical keys for self._graphs. + try: + check_is_fitted(self, ['_canonkeys']) + canonkeys_list1 = self._canonkeys + if_comp_X_canonkeys = False + except NotFittedError: + import warnings + warnings.warn('The canonkeys of self._graphs are not computed/saved. The keys of `X` is computed instead.') + if_comp_X_canonkeys = True - # self._add_dummy_labels will modify the input in place. - self._add_dummy_labels() # For self._graphs -# Y = [g.copy() for g in Y] # @todo: ? - self._add_dummy_labels(Y) # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. - # Canonical keys for self._graphs. - try: - check_is_fitted(self, ['_canonkeys']) - canonkeys_list1 = self._canonkeys - except NotFittedError: + # Compute the canonical keys of X. + if if_comp_X_canonkeys: + if X is None: + raise('X can not be None.') + # self._add_dummy_labels will modify the input in place. 
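# --- Illustrative sketch (editor's example, not part of the patch) -------------
# The updated Treelet constructor above forwards only the base-class keywords out
# of **kwargs and keeps the rest for itself. The same filtering pattern in
# isolation, with a hypothetical base initialiser:
def base_init(parallel=None, n_jobs=None, chunksize=None, normalize=True,
              copy_graphs=True, verbose=2):
    return locals()

kwargs = {'n_jobs': 4, 'sub_kernel': 'gaussian'}  # mix of base and subclass options
base_keys = ['parallel', 'n_jobs', 'chunksize', 'normalize', 'copy_graphs', 'verbose']
print(base_init(**{k: kwargs[k] for k in base_keys if k in kwargs}))  # only n_jobs forwarded
# --- end example ----------------------------------------------------------------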
+ self._add_dummy_labels(X) # for X canonkeys_list1 = [] - iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2)) + iterator = get_iters(self._graphs, desc='Getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2)) for g in iterator: canonkeys_list1.append(self._get_canonkeys(g)) - if self.save_canonkeys: - self._canonkeys = canonkeys_list1 - # Canonical keys for Y. +# Y = [g.copy() for g in Y] # @todo: ? + self._add_dummy_labels(Y) canonkeys_list2 = [] - iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2)) + iterator = get_iters(Y, desc='Getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2)) for g in iterator: canonkeys_list2.append(self._get_canonkeys(g)) - if self.save_canonkeys: - self._Y_canonkeys = canonkeys_list2 +# if self.save_canonkeys: +# self._Y_canonkeys = canonkeys_list2 # compute kernel matrix. kernel_matrix = np.zeros((len(Y), len(canonkeys_list1))) @@ -235,13 +244,13 @@ class Treelet(GraphKernel): ########################################################################## - def _compute_gm_series(self): - self._add_dummy_labels(self._graphs) + def _compute_gm_series(self, graphs): + self._add_dummy_labels(graphs) # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. canonkeys = [] - iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout, + iterator = get_iters(graphs, desc='getting canonkeys', file=sys.stdout, verbose=(self.verbose >= 2)) for g in iterator: canonkeys.append(self._get_canonkeys(g)) @@ -250,11 +259,11 @@ class Treelet(GraphKernel): self._canonkeys = canonkeys # compute Gram matrix. - gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + gram_matrix = np.zeros((len(graphs), len(graphs))) from itertools import combinations_with_replacement - itr = combinations_with_replacement(range(0, len(self._graphs)), 2) - len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) + itr = combinations_with_replacement(range(0, len(graphs)), 2) + len_itr = int(len(graphs) * (len(graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: @@ -390,6 +399,9 @@ class Treelet(GraphKernel): Treelet kernel between 2 graphs. """ keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs + if len(keys) == 0: # There is nothing in common... + return 0 + vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) diff --git a/gklearn/kernels/weisfeiler_lehman.py b/gklearn/kernels/weisfeiler_lehman.py index f02926e..905b31f 100644 --- a/gklearn/kernels/weisfeiler_lehman.py +++ b/gklearn/kernels/weisfeiler_lehman.py @@ -28,7 +28,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. def __init__(self, **kwargs): - GraphKernel.__init__(self) + GraphKernel.__init__(self, **{k: kwargs.get(k) for k in ['parallel', 'n_jobs', 'chunksize', 'normalize', 'copy_graphs', 'verbose'] if k in kwargs}) self.node_labels = kwargs.get('node_labels', []) self.edge_labels = kwargs.get('edge_labels', []) self.height = int(kwargs.get('height', 0)) @@ -50,7 +50,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. 
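# --- Illustrative sketch (editor's example, not part of the patch) -------------
# The treelet pairwise kernel above only compares canonical keys that occur in both
# graphs, and the new early return covers the case where nothing is shared. Sketch
# with toy canonical-key count dictionaries and a Gaussian sub-kernel on the counts:
import numpy as np

def toy_treelet_kernel(canonkey1, canonkey2, gamma=0.5):
    keys = set(canonkey1) & set(canonkey2)
    if len(keys) == 0:                 # no treelet in common
        return 0
    v1 = np.array([canonkey1[k] for k in keys])
    v2 = np.array([canonkey2[k] for k in keys])
    return np.exp(-gamma * np.sum((v1 - v2) ** 2))

print(toy_treelet_kernel({'0': 3, '1': 2}, {'0': 1, '2': 4}))  # only key '0' shared
print(toy_treelet_kernel({'0': 3}, {'2': 4}))                  # nothing shared -> 0
# --- end example ----------------------------------------------------------------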
########################################################################## - def _compute_gm_series(self): + def _compute_gm_series(self, graphs): # if self.verbose >= 2: # import warnings # warnings.warn('A part of the computation is parallelized.') @@ -59,19 +59,19 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. # for WL subtree kernel if self._base_kernel == 'subtree': - gram_matrix = self._subtree_kernel_do(self._graphs) + gram_matrix = self._subtree_kernel_do(graphs) # for WL shortest path kernel elif self._base_kernel == 'sp': - gram_matrix = self._sp_kernel_do(self._graphs) + gram_matrix = self._sp_kernel_do(graphs) # for WL edge kernel elif self._base_kernel == 'edge': - gram_matrix = self._edge_kernel_do(self._graphs) + gram_matrix = self._edge_kernel_do(graphs) # for user defined base kernel else: - gram_matrix = self._user_kernel_do(self._graphs) + gram_matrix = self._user_kernel_do(graphs) return gram_matrix @@ -204,70 +204,13 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. def pairwise_kernel(self, g1, g2): - Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster! - kernel = 0 - - # initial for height = 0 - all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration - - # for each graph - for G in Gn: - # set all labels into a tuple. - for nd, attrs in G.nodes(data=True): # @todo: there may be a better way. - G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels) - # get the set of original labels - labels_ori = list(nx.get_node_attributes(G, 'lt').values()) - # number of occurence of each label in G - all_num_of_each_label.append(dict(Counter(labels_ori))) - - # Compute subtree kernel with the 0th iteration and add it to the final kernel. - kernel = self._compute_kernel_itr(kernel, all_num_of_each_label) - - # iterate each height - for h in range(1, self.height + 1): - all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration - num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs - # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration - all_num_of_each_label = [] # number of occurence of each label in G - - # @todo: parallel this part. - for G in Gn: - - all_multisets = [] - for node, attrs in G.nodes(data=True): - # Multiset-label determination. - multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]] - # sorting each multiset - multiset.sort() - multiset = [attrs['lt']] + multiset # add the prefix - all_multisets.append(tuple(multiset)) - - # label compression - set_unique = list(set(all_multisets)) # set of unique multiset labels - # a dictionary mapping original labels to new ones. - set_compressed = {} - # if a label occured before, assign its former compressed label, - # else assign the number of labels occured + 1 as the compressed label. 
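# --- Illustrative sketch (editor's example, not part of the patch) -------------
# One Weisfeiler-Lehman refinement step, as performed by the relabelling code
# above: each node's new label is its old label plus the sorted multiset of its
# neighbours' labels, compressed to a short id. Self-contained sketch on a small
# labelled graph:
import networkx as nx

def wl_refine(G, labels):
    """One WL iteration; `labels` maps node -> current label string."""
    compressed, new_labels = {}, {}
    for v in G.nodes():
        multiset = sorted(labels[u] for u in G[v])
        signature = (labels[v], tuple(multiset))
        if signature not in compressed:        # label compression
            compressed[signature] = str(len(compressed))
        new_labels[v] = compressed[signature]
    return new_labels

G = nx.path_graph(4)                           # 0 - 1 - 2 - 3
labels = {v: 'a' for v in G.nodes()}
print(wl_refine(G, labels))                    # end nodes and middle nodes get split
# --- end example ----------------------------------------------------------------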
- for value in set_unique: - if value in all_set_compressed.keys(): - set_compressed[value] = all_set_compressed[value] - else: - set_compressed[value] = str(num_of_labels_occured + 1) - num_of_labels_occured += 1 - - all_set_compressed.update(set_compressed) - - # relabel nodes - for idx, node in enumerate(G.nodes()): - G.nodes[node]['lt'] = set_compressed[all_multisets[idx]] - - # get the set of compressed labels - labels_comp = list(nx.get_node_attributes(G, 'lt').values()) - # all_labels_ori.update(labels_comp) - all_num_of_each_label.append(dict(Counter(labels_comp))) +# Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster! + Gn = [g1, g2] + # for WL subtree kernel + if self._base_kernel == 'subtree': + kernel = self._subtree_kernel_do(Gn, return_mat=False) - # Compute subtree kernel with h iterations and add it to the final kernel - kernel = self._compute_kernel_itr(kernel, all_num_of_each_label) + # @todo: other subkernels. return kernel @@ -291,7 +234,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. return kernel - def _subtree_kernel_do_nl(self, Gn): + def _subtree_kernel_do_nl(self, Gn, return_mat=True): """Compute Weisfeiler-Lehman kernels between graphs with node labels. Parameters @@ -301,10 +244,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. Return ------ - gram_matrix : Numpy matrix + kernel_matrix : Numpy matrix / float Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. """ - gram_matrix = np.zeros((len(Gn), len(Gn))) + kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0) + gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr) # initial for height = 0 all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration @@ -324,7 +268,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. all_num_of_each_label.append(dict(Counter(labels_ori))) # Compute subtree kernel with the 0th iteration and add it to the final kernel. - self._compute_gram_itr(gram_matrix, all_num_of_each_label) + kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) # iterate each height for h in range(1, self.height + 1): @@ -342,12 +286,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) # Compute subtree kernel with h iterations and add it to the final kernel - self._compute_gram_itr(gram_matrix, all_num_of_each_label) + kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) - return gram_matrix + return kernel_matrix - def _subtree_kernel_do_el(self, Gn): + def _subtree_kernel_do_el(self, Gn, return_mat=True): """Compute Weisfeiler-Lehman kernels between graphs with edge labels. Parameters @@ -357,19 +301,20 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. Return ------ - gram_matrix : Numpy matrix + kernel_matrix : Numpy matrix Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. """ - gram_matrix = np.zeros((len(Gn), len(Gn))) + kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0) + gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr) # initial for height = 0 all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration # Compute subtree kernel with the 0th iteration and add it to the final kernel. 
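# --- Illustrative sketch (editor's example, not part of the patch) -------------
# In the standard WL subtree formulation, each iteration adds to a graph pair's
# kernel the inner product of their label-count histograms; that is what the
# per-iteration accumulation above does for every pair. Tiny numeric sketch:
from collections import Counter

counts_g1 = Counter({'0': 2, '1': 2})           # label counts of graph 1 at this height
counts_g2 = Counter({'0': 1, '1': 3, '2': 1})   # label counts of graph 2 at this height
contribution = sum(counts_g1[lbl] * counts_g2[lbl]
                   for lbl in counts_g1.keys() & counts_g2.keys())
print(contribution)                             # 2*1 + 2*3 = 8
# --- end example ----------------------------------------------------------------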
- iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) - for i, j in iterator: - gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) - gram_matrix[j][i] = gram_matrix[i][j] + iterator = combinations_with_replacement(range(0, len(kernel_matrix)), 2) + for i, j in iterator: # @todo: not correct if return_mat == False. + kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) + kernel_matrix[j][i] = kernel_matrix[i][j] # if h >= 1. @@ -393,7 +338,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) # Compute subtree kernel with h iterations and add it to the final kernel. - self._compute_gram_itr(gram_matrix, all_num_of_each_label) + kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) # Iterate along heights (>= 2). @@ -407,12 +352,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) # Compute subtree kernel with h iterations and add it to the final kernel. - self._compute_gram_itr(gram_matrix, all_num_of_each_label) + kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) - return gram_matrix + return kernel_matrix - def _subtree_kernel_do_labeled(self, Gn): + def _subtree_kernel_do_labeled(self, Gn, return_mat=True): """Compute Weisfeiler-Lehman kernels between graphs with both node and edge labels. @@ -423,10 +368,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. Return ------ - gram_matrix : Numpy matrix + kernel_matrix : Numpy matrix Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. """ - gram_matrix = np.zeros((len(Gn), len(Gn))) + kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0) + gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr) # initial for height = 0 all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration @@ -446,10 +392,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. all_num_of_each_label.append(dict(Counter(labels_ori))) # Compute subtree kernel with the 0th iteration and add it to the final kernel. - self._compute_gram_itr(gram_matrix, all_num_of_each_label) + kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) - # if h >= 1. + # if h >= 1: if self.height > 0: # Set all edge labels into a tuple. # @todo: remove this original labels or not? if self.verbose >= 2: @@ -470,7 +416,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) # Compute subtree kernel with h iterations and add it to the final kernel. - self._compute_gram_itr(gram_matrix, all_num_of_each_label) + kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) # Iterate along heights. @@ -484,12 +430,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) # Compute subtree kernel with h iterations and add it to the final kernel. 
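# --- Illustrative sketch (editor's example, not part of the patch) -------------
# At height 0 with no informative node labels, every node carries the same dummy
# label, so the histogram inner product collapses to the product of the node
# counts, which the loop above adds directly:
import networkx as nx

G1, G2 = nx.path_graph(3), nx.cycle_graph(5)
print(nx.number_of_nodes(G1) * nx.number_of_nodes(G2))  # 15 = 3 * 5
# --- end example ----------------------------------------------------------------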
- self._compute_gram_itr(gram_matrix, all_num_of_each_label) + kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) - return gram_matrix + return kernel_matrix - def _subtree_kernel_do_unlabeled(self, Gn): + def _subtree_kernel_do_unlabeled(self, Gn, return_mat=True): """Compute Weisfeiler-Lehman kernels between graphs without labels. Parameters @@ -499,19 +445,20 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. Return ------ - gram_matrix : Numpy matrix + kernel_matrix : Numpy matrix Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. """ - gram_matrix = np.zeros((len(Gn), len(Gn))) + kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0) + gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr) # initial for height = 0 all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration # Compute subtree kernel with the 0th iteration and add it to the final kernel. - iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2) - for i, j in iterator: - gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) - gram_matrix[j][i] = gram_matrix[i][j] + iterator = combinations_with_replacement(range(0, len(kernel_matrix)), 2) + for i, j in iterator: # @todo: not correct if return_mat == False. + kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j]) + kernel_matrix[j][i] = kernel_matrix[i][j] # if h >= 1. @@ -526,7 +473,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) # Compute subtree kernel with h iterations and add it to the final kernel. - self._compute_gram_itr(gram_matrix, all_num_of_each_label) + kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) # Iterate along heights (>= 2). @@ -540,9 +487,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) # Compute subtree kernel with h iterations and add it to the final kernel. - self._compute_gram_itr(gram_matrix, all_num_of_each_label) + kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label) - return gram_matrix + return kernel_matrix def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): @@ -717,6 +664,8 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel. all_num_of_each_label[j]) gram_matrix[j][i] = gram_matrix[i][j] + return gram_matrix + def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2): """Compute the subtree kernel. diff --git a/gklearn/utils/kernels.py b/gklearn/utils/kernels.py index 182668b..1e1ea52 100644 --- a/gklearn/utils/kernels.py +++ b/gklearn/utils/kernels.py @@ -68,6 +68,11 @@ def gaussian_kernel(x, y, gamma=None): return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma) +def tanimoto_kernel(x, y): + xy = np.dot(x, y) + return xy / (np.dot(x, x) + np.dot(y, y) - xy) + + def gaussiankernel(x, y, gamma=None): return gaussian_kernel(x, y, gamma=gamma) From 5eb90655a4986b301ccb2a4988ec4d3ddf651bf3 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 24 Jun 2022 17:45:16 +0200 Subject: [PATCH 09/11] [Features] Add model seletion methods with validation set: , , . 
Required version of scikit-learn is upgraded to 1.1.0, to support the argument of used in . --- gklearn/model_selection/__init__.py | 25 ++++ gklearn/model_selection/_split.py | 285 ++++++++++++++++++++++++++++++++++++ requirements.txt | 2 +- requirements_pypi.txt | 4 +- 4 files changed, 313 insertions(+), 3 deletions(-) create mode 100644 gklearn/model_selection/__init__.py create mode 100644 gklearn/model_selection/_split.py diff --git a/gklearn/model_selection/__init__.py b/gklearn/model_selection/__init__.py new file mode 100644 index 0000000..661478b --- /dev/null +++ b/gklearn/model_selection/__init__.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 24 14:25:57 2022 + +@author: ljia +""" + +from ._split import BaseCrossValidatorWithValid +# from ._split import BaseShuffleSplit +from ._split import KFoldWithValid +# from ._split import GroupKFold +# from ._split import StratifiedKFoldWithValid +# from ._split import TimeSeriesSplit +# from ._split import LeaveOneGroupOut +# from ._split import LeaveOneOut +# from ._split import LeavePGroupsOut +# from ._split import LeavePOut +from ._split import RepeatedKFoldWithValid +# from ._split import RepeatedStratifiedKFold +# from ._split import ShuffleSplit +# from ._split import GroupShuffleSplit +# from ._split import StratifiedShuffleSplit +# from ._split import StratifiedGroupKFold +# from ._split import PredefinedSplit \ No newline at end of file diff --git a/gklearn/model_selection/_split.py b/gklearn/model_selection/_split.py new file mode 100644 index 0000000..a982fec --- /dev/null +++ b/gklearn/model_selection/_split.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 24 11:13:26 2022 + +@author: ljia +""" +from abc import abstractmethod +import numbers +import warnings +import numpy as np +from sklearn.utils import check_random_state, check_array, column_or_1d, indexable +from sklearn.utils.validation import _num_samples +from sklearn.utils.multiclass import type_of_target + + +class BaseCrossValidatorWithValid(object): + """Base class for all cross-validators. + Implementations must define `_iter_valid_test_masks` or `_iter_valid_stest_indices`. + """ + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training, valid, and test set. + + Parameters + ---------- + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + valid : ndarray + The valid set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + indices = np.arange(_num_samples(X)) + for valid_index, test_index in self._iter_valid_test_masks(X, y, groups): + train_index = indices[np.logical_not(np.logical_or(valid_index, test_index))] + valid_index = indices[valid_index] + test_index = indices[test_index] + yield train_index, valid_index, test_index + + + # Since subclasses must implement either _iter_valid_test_masks or + # _iter_valid_test_indices, neither can be abstract. 
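# --- Illustrative sketch (editor's example, not part of the patch) -------------
# In `split` above, the training indices are simply everything not covered by the
# valid or test masks. The same index arithmetic on plain numpy arrays:
import numpy as np

n_samples = 10
valid_mask = np.zeros(n_samples, dtype=bool)
valid_mask[0:3] = True
test_mask = np.zeros(n_samples, dtype=bool)
test_mask[3:6] = True

indices = np.arange(n_samples)
train_index = indices[np.logical_not(np.logical_or(valid_mask, test_mask))]
print(train_index)   # [6 7 8 9]
# --- end example ----------------------------------------------------------------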
+ def _iter_valid_test_masks(self, X=None, y=None, groups=None): + """Generates boolean masks corresponding to valid and test sets. + By default, delegates to _iter_valid_test_indices(X, y, groups) + """ + for valid_index, test_index in self._iter_valid_test_indices(X, y, groups): + valid_mask = np.zeros(_num_samples(X), dtype=bool) + test_mask = np.zeros(_num_samples(X), dtype=bool) + valid_mask[valid_index] = True + test_mask[test_index] = True + yield valid_mask, test_mask + + + def _iter_valid_test_indices(self, X=None, y=None, groups=None): + """Generates integer indices corresponding to valid and test sets.""" + raise NotImplementedError + + + @abstractmethod + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator""" + + + def __repr__(self): + return _build_repr(self) + + +class _BaseKFoldWithValid(BaseCrossValidatorWithValid): + """Base class for KFold, GroupKFold, and StratifiedKFold""" + + @abstractmethod + def __init__(self, n_splits, *, stratify, shuffle, random_state): + if not isinstance(n_splits, numbers.Integral): + raise ValueError( + 'The number of folds must be of Integral type. ' + '%s of type %s was passed.' % (n_splits, type(n_splits)) + ) + n_splits = int(n_splits) + + if n_splits <= 2: + raise ValueError( + 'k-fold cross-validation requires at least one' + ' train/valid/test split by setting n_splits=3 or more,' + ' got n_splits={0}.'.format(n_splits) + ) + + if not isinstance(shuffle, bool): + raise TypeError('shuffle must be True or False; got {0}'.format(shuffle)) + + if not shuffle and random_state is not None: # None is the default + raise ValueError( + 'Setting a random_state has no effect since shuffle is ' + 'False. You should leave ' + 'random_state to its default (None), or set shuffle=True.', + ) + + self.n_splits = n_splits + self.stratify = stratify + self.shuffle = shuffle + self.random_state = random_state + + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training, valid and test set.""" + X, y, groups = indexable(X, y, groups) + n_samples = _num_samples(X) + if self.n_splits > n_samples: + raise ValueError( + ( + 'Cannot have number of splits n_splits={0} greater' + ' than the number of samples: n_samples={1}.' + ).format(self.n_splits, n_samples) + ) + + for train, valid, test in super().split(X, y, groups): + yield train, valid, test + + +class KFoldWithValid(_BaseKFoldWithValid): + + + def __init__( + self, + n_splits=5, + *, + stratify=False, + shuffle=False, + random_state=None + ): + super().__init__( + n_splits=n_splits, + stratify=stratify, + shuffle=shuffle, + random_state=random_state + ) + + + def _make_valid_test_folds(self, X, y=None): + rng = check_random_state(self.random_state) + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ('binary', 'multiclass') + if type_of_target_y not in allowed_target_types: + raise ValueError( + 'Supported target types are: {}. Got {!r} instead.'.format( + allowed_target_types, type_of_target_y + ) + ) + + y = column_or_1d(y) + + _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True) + # y_inv encodes y according to lexicographic order. We invert y_idx to + # map the classes so that they are encoded by order of appearance: + # 0 represents the first label appearing in y, 1 the second, etc. 
+ _, class_perm = np.unique(y_idx, return_inverse=True) + y_encoded = class_perm[y_inv] + + n_classes = len(y_idx) + y_counts = np.bincount(y_encoded) + min_groups = np.min(y_counts) + if np.all(self.n_splits > y_counts): + raise ValueError( + "n_splits=%d cannot be greater than the" + " number of members in each class." % (self.n_splits) + ) + if self.n_splits > min_groups: + warnings.warn( + "The least populated class in y has only %d" + " members, which is less than n_splits=%d." + % (min_groups, self.n_splits), + UserWarning, + ) + + # Determine the optimal number of samples from each class in each fold, + # using round robin over the sorted y. (This can be done direct from + # counts, but that code is unreadable.) + y_order = np.sort(y_encoded) + allocation = np.asarray( + [ + np.bincount(y_order[i :: self.n_splits], minlength=n_classes) + for i in range(self.n_splits) + ] + ) + + # To maintain the data order dependencies as best as possible within + # the stratification constraint, we assign samples from each class in + # blocks (and then mess that up when shuffle=True). + test_folds = np.empty(len(y), dtype='i') + for k in range(n_classes): + # since the kth column of allocation stores the number of samples + # of class k in each test set, this generates blocks of fold + # indices corresponding to the allocation for class k. + folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k]) + if self.shuffle: + rng.shuffle(folds_for_class) + test_folds[y_encoded == k] = folds_for_class + return test_folds + + + def _iter_valid_test_masks(self, X, y=None, groups=None): + test_folds = self._make_valid_test_folds(X, y) + for i in range(self.n_splits): + if i + 1 < self.n_splits: + j = i + 1 + else: + j = 0 + yield test_folds == i, test_folds == j + + + def split(self, X, y, groups=None): + y = check_array(y, input_name='y', ensure_2d=False, dtype=None) + return super().split(X, y, groups) + + +class _RepeatedSplitsWithValid(object): + + + def __init__( + self, + cv, + *, + n_repeats=10, + random_state=None, + **cvargs + ): + if not isinstance(n_repeats, int): + raise ValueError('Number of repetitions must be of integer type.') + + if n_repeats <= 0: + raise ValueError('Number of repetitions must be greater than 0.') + + self.cv = cv + self.n_repeats = n_repeats + self.random_state = random_state + self.cvargs = cvargs + + + def split(self, X, y=None, groups=None): + n_repeats = self.n_repeats + rng = check_random_state(self.random_state) + + for idx in range(n_repeats): + cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) + for train_index, valid_index, test_index in cv.split(X, y, groups): + yield train_index, valid_index, test_index + + +class RepeatedKFoldWithValid(_RepeatedSplitsWithValid): + + + def __init__( + self, + *, + n_splits=5, + n_repeats=10, + stratify=False, + random_state=None + ): + super().__init__( + KFoldWithValid, + n_repeats=n_repeats, + stratify=stratify, + random_state=random_state, + n_splits=n_splits, + ) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4b25bb3..da822f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ numpy>=1.16.2 scipy>=1.1.0 matplotlib>=3.1.0 networkx>=2.2 -scikit-learn>=0.20.0 +scikit-learn>=1.1.0 tabulate>=0.8.2 tqdm>=4.26.0 control>=0.8.2 # for generalized random walk kernels only. 
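# --- Illustrative sketch (editor's example, not part of the patch) -------------
# Usage sketch of the validation-set splitters added above: every split yields
# disjoint train / valid / test index arrays, where valid and test are single folds
# and train is the rest. The data below are placeholders.
import numpy as np
from gklearn.model_selection import KFoldWithValid

X = np.arange(12).reshape(-1, 1)
y = np.array([0, 1] * 6)     # a target is required for the fold assignment
cv = KFoldWithValid(n_splits=3, shuffle=True, random_state=0)
for train, valid, test in cv.split(X, y):
    print(len(train), len(valid), len(test))   # 4 4 4 for each of the 3 splits
# --- end example ----------------------------------------------------------------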
diff --git a/requirements_pypi.txt b/requirements_pypi.txt index 3c68618..d1718a0 100644 --- a/requirements_pypi.txt +++ b/requirements_pypi.txt @@ -1,8 +1,8 @@ numpy>=1.16.2 scipy>=1.1.0 -matplotlib>=3.0.0 +matplotlib>=3.1.0 networkx>=2.2 -scikit-learn>=0.20.0 +scikit-learn>=1.1.0 tabulate>=0.8.2 tqdm>=4.26.0 control>=0.8.2 # for generalized random walk kernels only. From b36eaae177f84168fad15ffd105f5cb70d4c5c1d Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 24 Jun 2022 17:46:28 +0200 Subject: [PATCH 10/11] [Features] Add model seletion methods with validation set: KFoldWithValid, RepeatedKFoldWithValid, BaseCrossValidatorWithValid. Required version of scikit-learn is upgraded to 1.1.0, to support the input_name argument of sklearn.utils.check_array used in gklearn.model_selection._split.py. --- gklearn/model_selection/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gklearn/model_selection/__init__.py b/gklearn/model_selection/__init__.py index 661478b..23f5f67 100644 --- a/gklearn/model_selection/__init__.py +++ b/gklearn/model_selection/__init__.py @@ -5,7 +5,6 @@ Created on Fri Jun 24 14:25:57 2022 @author: ljia """ - from ._split import BaseCrossValidatorWithValid # from ._split import BaseShuffleSplit from ._split import KFoldWithValid From ed78e65dd8492c17edbb72f04c93bd0917cd5c9d Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 24 Jun 2022 17:48:16 +0200 Subject: [PATCH 11/11] [Features] Add model seletion methods with validation set: KFoldWithValid, RepeatedKFoldWithValid, BaseCrossValidatorWithValid. Required version of scikit-learn is upgraded to 1.1.0, to support the input_name argument of sklearn.utils.check_array used in gklearn.model_selection._split.py. --- gklearn/model_selection/_split.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gklearn/model_selection/_split.py b/gklearn/model_selection/_split.py index a982fec..10d5cc8 100644 --- a/gklearn/model_selection/_split.py +++ b/gklearn/model_selection/_split.py @@ -4,6 +4,8 @@ Created on Fri Jun 24 11:13:26 2022 @author: ljia + +Reference: scikit-learn. """ from abc import abstractmethod import numbers @@ -85,7 +87,7 @@ class BaseCrossValidatorWithValid(object): class _BaseKFoldWithValid(BaseCrossValidatorWithValid): - """Base class for KFold, GroupKFold, and StratifiedKFold""" + """Base class for KFoldWithValid, GroupKFoldWithValid, and StratifiedKFoldWithValid""" @abstractmethod def __init__(self, n_splits, *, stratify, shuffle, random_state):