From 1946d469643d618d9c381354f25933af2da2aed0 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Mon, 14 Mar 2022 15:49:27 +0100
Subject: [PATCH] [Feature] Allow permuting nodes in graphs when using
 bipartite to estimate GED. This feature is implemented in the Python method,
 which invokes GEDLIB in C++ via Cython.

---
 .../edit_costs.max_num_sols.ratios.bipartite.py    | 147 -------
 .../edit_costs.real_data.nums_sols.ratios.IPFP.py  |  11 +-
 ...t_costs.real_data.nums_sols.ratios.bipartite.py | 172 ++++++++
 gklearn/experiments/ged/stability/group_results.py |   1 +
 ..._costs.real_data.nums_sols.ratios.bipartite.py} |  27 +-
 gklearn/experiments/ged/stability/utils.py         |  18 +-
 gklearn/ged/util/util.py                           | 210 +++++++--
 gklearn/utils/utils.py                             | 471 ++++++++++++---------
 8 files changed, 665 insertions(+), 392 deletions(-)
 delete mode 100644 gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py
 create mode 100644 gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py
 rename gklearn/experiments/ged/stability/{run_job_edit_costs.max_nums_sols.ratios.bipartite.py => run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py} (52%)

diff --git a/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py
deleted file mode 100644
index 1f01fd5..0000000
--- a/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Nov 2 16:17:01 2020
-
-@author: ljia
-"""
-# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from given numbers of repeats are computed.
-
-import os
-import multiprocessing
-import pickle
-import logging
-from gklearn.ged.util import compute_geds
-import time
-from utils import get_dataset
-import sys
-from group_results import group_trials
-
-
-def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial):
-
-	save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
-
-	# Return if the file exists.
-	if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
-		return None, None
-
-	"""**2. Set parameters.**"""
-
-	# Parameters for GED computation.
-	ged_options = {'method': 'BIPARTITE', # use BIPARTITE huristic.
-				   # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
-				   'lsape_model': 'ECBP', #
-				   # ??when bigger than 1, then the method is considered mIPFP.
-				   # the actual number of computed solutions might be smaller than the specified value
-				   'max_num_solutions': max_num_solutions,
-				   'edit_cost': 'CONSTANT', # use CONSTANT cost.
-				   'greedy_method': 'BASIC', #
-				   # the distance between non-symbolic node/edge labels is computed by euclidean distance.
-				   'attr_distance': 'euclidean',
-				   'optimal': True, # if TRUE, the option --greedy-method has no effect
-				   # parallel threads. Do not work if mpg_options['parallel'] = False.
- 'threads': multiprocessing.cpu_count(), - 'centrality_method': 'NONE', - 'centrality_weight': 0.7, - 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' - } - - edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] -# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] -# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) - - options = ged_options.copy() - options['edit_cost_constants'] = edit_cost_constants - options['node_labels'] = dataset.node_labels - options['edge_labels'] = dataset.edge_labels - options['node_attrs'] = dataset.node_attrs - options['edge_attrs'] = dataset.edge_attrs - parallel = True # if num_solutions == 1 else False - - """**5. Compute GED matrix.**""" - ged_mat = 'error' - runtime = 0 - try: - time0 = time.time() - ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) - runtime = time.time() - time0 - except Exception as exp: - print('An exception occured when running this experiment:') - LOG_FILENAME = save_dir + 'error.txt' - logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) - logging.exception(save_file_suffix) - print(repr(exp)) - - """**6. Get results.**""" - - with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: - pickle.dump(ged_mat, f) - with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: - pickle.dump(runtime, f) - - return ged_mat, runtime - - -def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): - # Return if the group file exists. - name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' - name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' - if os.path.isfile(name_group): - return - - ged_mats = [] - runtimes = [] - for trial in range(1, 101): - print() - print('Trial:', trial) - ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial) - ged_mats.append(ged_mat) - runtimes.append(runtime) - - # Group trials and Remove single files. - name_prefix = 'ged_matrix' + name_middle - group_trials(save_dir, name_prefix, True, True, False) - name_prefix = 'runtime' + name_middle - group_trials(save_dir, name_prefix, True, True, False) - - -def results_for_a_dataset(ds_name): - """**1. 
Get dataset.**""" - dataset = get_dataset(ds_name) - - for max_num_solutions in mnum_solutions_list: - print() - print('Max # of solutions:', max_num_solutions) - for ratio in ratio_list: - print() - print('Ratio:', ratio) - save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) - - -def get_param_lists(ds_name): - if ds_name == 'AIDS_symb': - mnum_solutions_list = [1, 20, 40, 60, 80, 100] - ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] - else: - mnum_solutions_list = [1, 20, 40, 60, 80, 100] - ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] - - return mnum_solutions_list, ratio_list - - -if __name__ == '__main__': - if len(sys.argv) > 1: - ds_name_list = sys.argv[1:] - else: - ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - - save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' - os.makedirs(save_dir, exist_ok=True) - os.makedirs(save_dir + 'groups/', exist_ok=True) - - for ds_name in ds_name_list: - print() - print('Dataset:', ds_name) - mnum_solutions_list, ratio_list = get_param_lists(ds_name) - results_for_a_dataset(ds_name) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py index aa08579..82b6604 100644 --- a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py @@ -13,7 +13,7 @@ import pickle import logging from gklearn.ged.util import compute_geds import time -from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation +from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids import sys from group_results import group_trials, check_group_existence, update_group_marker @@ -125,9 +125,10 @@ def get_param_lists(ds_name, mode='test'): elif mode == 'simple': from sklearn.model_selection import ParameterGrid - param_grid = ParameterGrid([ - {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]}, - {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]) + param_grid = mix_param_grids([list(ParameterGrid([ + {'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])), + list(ParameterGrid([ + {'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))]) # print(list(param_grid)) if ds_name == 'AIDS_symb': @@ -148,7 +149,7 @@ if __name__ == '__main__': # ds_name_list = ['MUTAG'] # 'Alkane_unlabeled'] # ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' + save_dir = 'outputs/CRIANN/edit_costs.real_data.num_sols.ratios.IPFP/' os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir + 'groups/', exist_ok=True) diff --git a/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py new file mode 100644 index 0000000..f450c1e --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Nov 2 16:17:01 2020 + +@author: ljia +""" +# 
This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution over a given number of repeats is computed.
+
+import os
+import multiprocessing
+import pickle
+import logging
+from gklearn.ged.util import compute_geds
+import time
+from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids
+import sys
+from group_results import group_trials, check_group_existence, update_group_marker
+
+
+def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
+
+	save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
+
+	# Return if the file exists.
+	if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
+		return None, None
+
+	"""**2. Set parameters.**"""
+
+	# Parameters for GED computation.
+	ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic.
+				   # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
+				   'lsape_model': 'ECBP', #
+				   # when bigger than 1, the method is considered mIPFP.
+				   # the actual number of computed solutions might be smaller than the specified value.
+				   'max_num_solutions': 1, # fixed to 1 here; the varying num_solutions is passed as `repeats` to compute_geds() below.
+				   'edit_cost': 'CONSTANT', # use CONSTANT cost.
+				   'greedy_method': 'BASIC', #
+				   # the distance between non-symbolic node/edge labels is computed by euclidean distance.
+				   'attr_distance': 'euclidean',
+				   'optimal': True, # if TRUE, the option --greedy-method has no effect.
+				   # parallel threads. Does not work if mpg_options['parallel'] = False.
+				   'threads': multiprocessing.cpu_count(),
+				   'centrality_method': 'NONE',
+				   'centrality_weight': 0.7,
+				   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
+				   }
+
+	edit_cost_constants = set_edit_cost_consts(ratio,
+											   node_labeled=len(dataset.node_labels),
+											   edge_labeled=len(dataset.edge_labels),
+											   mode='uniform')
+#	edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
+#	pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))
+
+
+	options = ged_options.copy()
+	options['edit_cost_constants'] = edit_cost_constants
+	options['node_labels'] = dataset.node_labels
+	options['edge_labels'] = dataset.edge_labels
+	options['node_attrs'] = dataset.node_attrs
+	options['edge_attrs'] = dataset.edge_attrs
+	parallel = True # if num_solutions == 1 else False
+
+	"""**5. Compute GED matrix.**"""
+	ged_mat = 'error'
+	runtime = 0
+	try:
+		time0 = time.time()
+		ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs,
+																options=options,
+																repeats=num_solutions,
+																permute_nodes=True,
+																random_state=None,
+																parallel=parallel,
+																verbose=True)
+		runtime = time.time() - time0
+	except Exception as exp:
+		print('An exception occurred when running this experiment:')
+		LOG_FILENAME = save_dir + 'error.txt'
+		logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+		logging.exception(save_file_suffix)
+		print(repr(exp))
+
+	"""**6. Get results.**"""
+
+	with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
+		pickle.dump(ged_mat, f)
+	with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
+		pickle.dump(runtime, f)
+
+	return ged_mat, runtime
+
+
+def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
+	# Return if the group file exists.
+	name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
+	name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
+	if check_group_existence(name_group):
+		return
+
+	ged_mats = []
+	runtimes = []
+	num_trials = 100
+	for trial in range(1, num_trials + 1):
+		print()
+		print('Trial:', trial)
+		ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
+		ged_mats.append(ged_mat)
+		runtimes.append(runtime)
+
+	# Group trials and remove single files.
+	# @todo: if the program is interrupted between the following calls, the group files may be left in an inconsistent state.
+	name_prefix = 'ged_matrix' + name_middle
+	group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
+	name_prefix = 'runtime' + name_middle
+	group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
+	update_group_marker(name_group)
+
+
+def results_for_a_dataset(ds_name):
+	"""**1. Get dataset.**"""
+	dataset = get_dataset(ds_name)
+
+	for params in list(param_grid):
+		print()
+		print(params)
+		save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio'])
+
+
+def get_param_lists(ds_name, mode='test'):
+	if mode == 'test':
+		num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
+		ratio_list = [10]
+		return num_solutions_list, ratio_list
+
+	elif mode == 'simple':
+		from sklearn.model_selection import ParameterGrid
+		param_grid = mix_param_grids([list(ParameterGrid([
+			{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])),
+			list(ParameterGrid([
+			{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))])
+#		print(list(param_grid))
+
+		if ds_name == 'AIDS_symb':
+			num_solutions_list = [1, 20, 40, 60, 80, 100]
+			ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
+		else:
+			num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100]
+			ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1]
+
+		return param_grid
+
+
+if __name__ == '__main__':
+	if len(sys.argv) > 1:
+		ds_name_list = sys.argv[1:]
+	else:
+		ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
+#		ds_name_list = ['MUTAG'] # 'Alkane_unlabeled']
+#		ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
+
+	save_dir = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/'
+	os.makedirs(save_dir, exist_ok=True)
+	os.makedirs(save_dir + 'groups/', exist_ok=True)
+
+	for ds_name in ds_name_list:
+		print()
+		print('Dataset:', ds_name)
+		param_grid = get_param_lists(ds_name, mode='simple')
+		results_for_a_dataset(ds_name)
\ No newline at end of file
diff --git a/gklearn/experiments/ged/stability/group_results.py b/gklearn/experiments/ged/stability/group_results.py
index bdbe89f..10f930c 100644
--- a/gklearn/experiments/ged/stability/group_results.py
+++ b/gklearn/experiments/ged/stability/group_results.py
@@ -32,6 +32,7 @@ def check_group_existence(file_name):
 
 
 def update_group_marker(file_name):
+	# @todo: possible error when several tasks are using this file at the same time.
 	path, name = os.path.split(file_name)
 	marker_fn = os.path.join(path, 'group_names_finished.pkl')
 	if os.path.isfile(marker_fn):
diff --git a/gklearn/experiments/ged/stability/run_job_edit_costs.max_nums_sols.ratios.bipartite.py b/gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py
similarity index 52%
rename from gklearn/experiments/ged/stability/run_job_edit_costs.max_nums_sols.ratios.bipartite.py
rename to gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py
index 276a1a5..a33a9c1 100644
--- a/gklearn/experiments/ged/stability/run_job_edit_costs.max_nums_sols.ratios.bipartite.py
+++ b/gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py
@@ -9,36 +9,45 @@
 import os
 import re
 
+cur_path = os.path.dirname(os.path.abspath(__file__))
+
+
 def get_job_script(arg):
 	script = r"""
 #!/bin/bash
 #SBATCH --exclusive
 #SBATCH --job-name="st.""" + arg + r""".bp"
-#SBATCH --partition=tlong
+#SBATCH --partition=court
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user=jajupmochi@gmail.com
-#SBATCH --output="outputs/output_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt"
-#SBATCH --error="errors/error_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt"
+#SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt"
+#SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt"
 #
 #SBATCH --ntasks=1
 #SBATCH --nodes=1
 #SBATCH --cpus-per-task=1
-#SBATCH --time=300:00:00
+#SBATCH --time=48:00:00
 #SBATCH --mem-per-cpu=4000
 
 srun hostname
-srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability
-srun python3 edit_costs.max_nums_sols.ratios.bipartite.py """ + arg
+cd """ + cur_path + r"""
+echo Working directory : $PWD
+echo Local work dir : $LOCAL_WORK_DIR
+python3 edit_costs.real_data.nums_sols.ratios.bipartite.py """ + arg
 
 	script = script.strip()
 	script = re.sub('\n\t+', '\n', script)
 	script = re.sub('\n +', '\n', script)
-
+
 	return script
 
 
 if __name__ == '__main__':
-	ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
-	for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]:
+
+	os.makedirs('outputs/', exist_ok=True)
+	os.makedirs('errors/', exist_ok=True)
+
+	ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
+	for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]:
 		job_script = get_job_script(ds_name)
 		command = 'sbatch <<EOF\n' + job_script + '\nEOF'
 		os.system(command)
diff --git a/gklearn/experiments/ged/stability/utils.py b/gklearn/experiments/ged/stability/utils.py
--- a/gklearn/experiments/ged/stability/utils.py
+++ b/gklearn/experiments/ged/stability/utils.py
@@ ... @@
+def mix_param_grids(list_of_grids):
+	mixed_grids = []
+	not_finished = [True] * len(list_of_grids)
+	idx = 0
+	while sum(not_finished) > 0:
+		for g_idx, grid in enumerate(list_of_grids):
+			if idx < len(grid):
+				mixed_grids.append(grid[idx])
+			else:
+				not_finished[g_idx] = False
+		idx += 1
+
+	return mixed_grids
+
+
+
 if __name__ == '__main__':
 	root_dir = 'outputs/CRIANN/'
 #	for dir_ in sorted(os.listdir(root_dir)):
@@ -337,4 +353,4 @@
 #			get_relative_errors(save_dir)
 #		except Exception as exp:
 #			print('An exception occured when running this experiment:')
-#			print(repr(exp))
\ No newline at end of file
+#			print(repr(exp))
diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py
index a5a5ac5..d75939a 100644
--- a/gklearn/ged/util/util.py
+++ b/gklearn/ged/util/util.py
@@ -64,10 +64,12 @@ def pairwise_ged(g1, g2, options={}, sort=True, repeats=1, parallel=False, verbo
 	g = listID[0]
 	h = listID[1]
 	dis_min = np.inf
+#	print('------------------------------------------')
 	for i in range(0, repeats):
 		ged_env.run_method(g, h)
 		upper = ged_env.get_upper_bound(g, h)
 		dis = upper
+#		print(dis)
 		if dis < dis_min:
 			dis_min = dis
 			pi_forward = ged_env.get_forward_map(g, h)
@@ -169,12 +171,100 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
 	return ged_vec, ged_mat, n_edit_operations
 
 
-def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, n_jobs=None, verbose=True):
+#%%
+
+
+def compute_geds(graphs,
+				 options={},
+				 sort=True,
+				 repeats=1,
+				 permute_nodes=False,
+				 random_state=None,
+				 parallel=False,
+				 n_jobs=None,
+				 verbose=True):
+	"""Compute graph edit distance matrix using GEDLIB.
+	"""
+	if permute_nodes:
+		return _compute_geds_with_permutation(graphs,
+											  options=options,
+											  sort=sort,
+											  repeats=repeats,
+											  random_state=random_state,
+											  parallel=parallel,
+											  n_jobs=n_jobs,
+											  verbose=verbose)
+	else:
+		return _compute_geds_without_permutation(graphs,
+												 options=options,
+												 sort=sort,
+												 repeats=repeats,
+												 parallel=parallel,
+												 n_jobs=n_jobs,
+												 verbose=verbose)
+
+
+#%%
+
+
+def _compute_geds_with_permutation(graphs,
+								   options={},
+								   sort=True,
+								   repeats=1,
+								   random_state=None,
+								   parallel=False,
+								   n_jobs=None,
+								   verbose=True):
+
+	from gklearn.utils.utils import nx_permute_nodes
+
+	# Initialize variables.
+	ged_mat_optim = np.full((len(graphs), len(graphs)), np.inf)
+	np.fill_diagonal(ged_mat_optim, 0)
+	len_itr = int(len(graphs) * (len(graphs) - 1) / 2)
+	ged_vec = [0] * len_itr
+	n_edit_operations = [0] * len_itr
+
+	# For each repeat: (use `_` so the pair loop below does not shadow the loop variable.)
+	for _ in range(repeats):
+		# Permute nodes.
+		graphs_pmut = [nx_permute_nodes(g, random_state=random_state) for g in graphs]
+
+		out = _compute_geds_without_permutation(graphs_pmut,
+												options=options,
+												sort=sort,
+												repeats=1,
+												parallel=parallel,
+												n_jobs=n_jobs,
+												verbose=verbose)
+
+		# Compare current results with the best one.
+		idx_cnt = 0
+		for i in range(len(graphs)):
+			for j in range(i + 1, len(graphs)):
+				if out[1][i, j] < ged_mat_optim[i, j]:
+					ged_mat_optim[i, j] = out[1][i, j]
+					ged_mat_optim[j, i] = out[1][j, i]
+					ged_vec[idx_cnt] = out[0][idx_cnt]
+					n_edit_operations[idx_cnt] = out[2][idx_cnt]
+				idx_cnt += 1
+
+	return ged_vec, ged_mat_optim, n_edit_operations
+
+
+def _compute_geds_without_permutation(graphs,
+									  options={},
+									  sort=True,
+									  repeats=1,
+									  parallel=False,
+									  n_jobs=None,
+									  verbose=True):
 	from gklearn.gedlib import librariesImport, gedlibpy
 
 	# initialize ged env.
 	ged_env = gedlibpy.GEDEnv()
 	ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants'])
+
 	for g in graphs:
 		ged_env.add_nx_graph(g, '')
 	listID = ged_env.get_all_graph_ids()
@@ -266,6 +356,11 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats):
 			dis = upper
 
 	# make the map label correct (label remove map as np.inf)
+	# Attention: using node indices instead of NetworkX node labels (as
+	# implemented here) may cause several issues:
+	# - Fail if NetworkX node labels are not consecutive integers;
+	# - Return wrong mappings if nodes are permuted (e.g., by using
+	#   `gklearn.utils.utils.nx_permute_nodes()`).
 	nodes1 = [n for n in g1.nodes()]
 	nodes2 = [n for n in g2.nodes()]
 	nb1 = nx.number_of_nodes(g1)
@@ -278,46 +373,57 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats):
 		pi_forward_min = pi_forward
 		pi_backward_min = pi_backward
 
+#	print('-----')
+#	print(pi_forward_min)
+#	print(pi_backward_min)
+
 	return dis_min, pi_forward_min, pi_backward_min
 
 
-def label_costs_to_matrix(costs, nb_labels):
-	"""Reform a label cost vector to a matrix.
+#%%
+
+
+def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
+	"""Calculate the number of occurrences of each edit operation in a given
+	edit path.
 
 	Parameters
 	----------
-	costs : numpy.array
-		The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitition costs, edge insertion costs, edge deletion costs, edge substitition costs.
-	nb_labels : integer
-		Number of labels.
+	g1 : NetworkX graph
+		The first graph of the edit path.
+	g2 : NetworkX graph
+		The second graph of the edit path.
+	forward_map : list
+		The node map from `g1` to `g2`, where removed nodes are mapped to
+		np.inf.
+	backward_map : list
+		The node map from `g2` to `g1`, where inserted nodes are mapped to
+		np.inf.
+	edit_cost : string, optional
+		The name of the edit cost function, e.g., 'CONSTANT'. The default is
+		None.
+	is_cml : boolean, optional
+		Whether the edit costs are given as a cost matrix over labels. The
+		default is False.
+	**kwargs
+		Auxiliary data such as the `node_labels` and `edge_labels` name lists,
+		depending on the edit cost function.
+
+	Raises
+	------
+	Exception
+		If the given edit cost function is not supported.
 
 	Returns
 	-------
-	cost_matrix : numpy.array.
-		The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
+	tuple
+		The numbers of occurrences (or summed distances) of the edit
+		operations, as returned by the corresponding
+		`get_nb_edit_operations_*` function.
+
+	Notes
+	-----
+	Attention: when implementing a function to get the numbers of edit
+	operations, make sure that:
+	- It does not fail if NetworkX node labels are not consecutive integers;
+	- It returns correct results if nodes are permuted (e.g., by using
+	  `gklearn.utils.utils.nx_permute_nodes()`).
+	Generally speaking, it means you need to distinguish the NetworkX label of
+	a node from the position (index) of that node in the node list.
 	"""
-	# Initialize label cost matrix.
-	cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
-	i = 0
-	# Costs of insertions.
-	for col in range(1, nb_labels + 1):
-		cost_matrix[0, col] = costs[i]
-		i += 1
-	# Costs of deletions.
-	for row in range(1, nb_labels + 1):
-		cost_matrix[row, 0] = costs[i]
-		i += 1
-	# Costs of substitutions.
-	for row in range(1, nb_labels + 1):
-		for col in range(row + 1, nb_labels + 1):
-			cost_matrix[row, col] = costs[i]
-			cost_matrix[col, row] = costs[i]
-			i += 1
-
-	return cost_matrix
-
-
-def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
 	if is_cml:
 		if edit_cost == 'CONSTANT':
 			node_labels = kwargs.get('node_labels', [])
@@ -611,6 +717,48 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
 	return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es
 
 
+#%%
+
+
+def label_costs_to_matrix(costs, nb_labels):
+	"""Reform a label cost vector to a matrix.
+
+	Parameters
+	----------
+	costs : numpy.array
+		The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs.
+	nb_labels : integer
+		Number of labels.
+
+	Returns
+	-------
+	cost_matrix : numpy.array.
+		The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
+	"""
+	# Initialize label cost matrix.
+	cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
+	i = 0
+	# Costs of insertions.
+	for col in range(1, nb_labels + 1):
+		cost_matrix[0, col] = costs[i]
+		i += 1
+	# Costs of deletions.
+	for row in range(1, nb_labels + 1):
+		cost_matrix[row, 0] = costs[i]
+		i += 1
+	# Costs of substitutions.
+ for row in range(1, nb_labels + 1): + for col in range(row + 1, nb_labels + 1): + cost_matrix[row, col] = costs[i] + cost_matrix[col, row] = costs[i] + i += 1 + + return cost_matrix + + +#%% + + def ged_options_to_string(options): opt_str = ' ' for key, val in options.items(): diff --git a/gklearn/utils/utils.py b/gklearn/utils/utils.py index 5758291..f0e49fd 100644 --- a/gklearn/utils/utils.py +++ b/gklearn/utils/utils.py @@ -7,6 +7,9 @@ from enum import Enum, unique # from tqdm import tqdm +#%% + + def getSPLengths(G1): sp = nx.shortest_path(G1) distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes())) @@ -286,81 +289,146 @@ def direct_product_graph(G1, G2, node_labels, edge_labels): return gt -def graph_deepcopy(G): - """Deep copy a graph, including deep copy of all nodes, edges and - attributes of the graph, nodes and edges. +def find_paths(G, source_node, length): + """Find all paths with a certain length those start from a source node. + A recursive depth first search is applied. - Note - ---- - It is the same as the NetworkX function graph.copy(), as far as I know. + Parameters + ---------- + G : NetworkX graphs + The graph in which paths are searched. + source_node : integer + The number of the node from where all paths start. + length : integer + The length of paths. + + Return + ------ + path : list of list + List of paths retrieved, where each path is represented by a list of nodes. """ - # add graph attributes. - labels = {} - for k, v in G.graph.items(): - labels[k] = deepcopy(v) - if G.is_directed(): - G_copy = nx.DiGraph(**labels) - else: - G_copy = nx.Graph(**labels) + if length == 0: + return [[source_node]] + path = [[source_node] + path for neighbor in G[source_node] \ + for path in find_paths(G, neighbor, length - 1) if source_node not in path] + return path - # add nodes - for nd, attrs in G.nodes(data=True): - labels = {} - for k, v in attrs.items(): - labels[k] = deepcopy(v) - G_copy.add_node(nd, **labels) - # add edges. - for nd1, nd2, attrs in G.edges(data=True): - labels = {} - for k, v in attrs.items(): - labels[k] = deepcopy(v) - G_copy.add_edge(nd1, nd2, **labels) +def find_all_paths(G, length, is_directed): + """Find all paths with a certain length in a graph. A recursive depth first + search is applied. - return G_copy + Parameters + ---------- + G : NetworkX graphs + The graph in which paths are searched. + length : integer + The length of paths. + Return + ------ + path : list of list + List of paths retrieved, where each path is represented by a list of nodes. + """ + all_paths = [] + for node in G: + all_paths.extend(find_paths(G, node, length)) -def graph_isIdentical(G1, G2): - """Check if two graphs are identical, including: same nodes, edges, node - labels/attributes, edge labels/attributes. + if not is_directed: + # For each path, two presentations are retrieved from its two extremities. + # Remove one of them. + all_paths_r = [path[::-1] for path in all_paths] + for idx, path in enumerate(all_paths[:-1]): + for path2 in all_paths_r[idx+1::]: + if path == path2: + all_paths[idx] = [] + break + all_paths = list(filter(lambda a: a != [], all_paths)) - Notes - ----- - 1. The type of graphs has to be the same. + return all_paths - 2. Global/Graph attributes are neglected as they may contain names for graphs. - """ - # check nodes. - nlist1 = [n for n in G1.nodes(data=True)] - nlist2 = [n for n in G2.nodes(data=True)] - if not nlist1 == nlist2: - return False - # check edges. 
- elist1 = [n for n in G1.edges(data=True)] - elist2 = [n for n in G2.edges(data=True)] - if not elist1 == elist2: - return False - # check graph attributes. - return True +# @todo: use it in ShortestPath. +def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]): + """Compute kernels between each pair of vertices in two graphs. + Parameters + ---------- + g1, g2 : NetworkX graph + The kernels bewteen pairs of vertices in these two graphs are computed. + node_kernels : dict + A dictionary of kernel functions for nodes, including 3 items: 'symb' + for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' + for both labels. The first 2 functions take two node labels as + parameters, and the 'mix' function takes 4 parameters, a symbolic and a + non-symbolic label for each the two nodes. Each label is in form of 2-D + dimension array (n_samples, n_features). Each function returns a number + as the kernel value. Ignored when nodes are unlabeled. This argument + is designated to conjugate gradient method and fixed-point iterations. + node_labels : list, optional + The list of the name strings of the node labels. The default is []. + node_attrs : list, optional + The list of the name strings of the node attributes. The default is []. -def get_node_labels(Gn, node_label): - """Get node labels of dataset Gn. - """ - nl = set() - for G in Gn: - nl = nl | set(nx.get_node_attributes(G, node_label).values()) - return nl + Returns + ------- + vk_dict : dict + Vertex kernels keyed by vertices. + Notes + ----- + This function is used by ``gklearn.kernels.FixedPoint'' and + ``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1]. -def get_edge_labels(Gn, edge_label): - """Get edge labels of dataset Gn. + References + ---------- + .. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang. + Parallelization of shortest path graph kernels on multi-core cpus and gpus. + Proceedings of the Programmability Issues for Heterogeneous Multicores + (MultiProg), Vienna, Austria, 2014. """ - el = set() - for G in Gn: - el = el | set(nx.get_edge_attributes(G, edge_label).values()) - return el + vk_dict = {} # shortest path matrices dict + if len(node_labels) > 0: + # node symb and non-synb labeled + if len(node_attrs) > 0: + kn = node_kernels['mix'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_labels = [n1[1][nl] for nl in node_labels] + n2_labels = [n2[1][nl] for nl in node_labels] + n1_attrs = [n1[1][na] for na in node_attrs] + n2_attrs = [n2[1][na] for na in node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) + # node symb labeled + else: + kn = node_kernels['symb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_labels = [n1[1][nl] for nl in node_labels] + n2_labels = [n2[1][nl] for nl in node_labels] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) + else: + # node non-synb labeled + if len(node_attrs) > 0: + kn = node_kernels['nsymb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_attrs = [n1[1][na] for na in node_attrs] + n2_attrs = [n2[1][na] for na in node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) + # node unlabeled + else: + pass # @todo: add edge weights. 
+# for e1 in g1.edges(data=True): +# for e2 in g2.edges(data=True): +# if e1[2]['cost'] == e2[2]['cost']: +# kernel += 1 +# return kernel + + return vk_dict + + +#%% def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs): @@ -513,79 +581,6 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d print('\ncomplete.') -def find_paths(G, source_node, length): - """Find all paths with a certain length those start from a source node. - A recursive depth first search is applied. - - Parameters - ---------- - G : NetworkX graphs - The graph in which paths are searched. - source_node : integer - The number of the node from where all paths start. - length : integer - The length of paths. - - Return - ------ - path : list of list - List of paths retrieved, where each path is represented by a list of nodes. - """ - if length == 0: - return [[source_node]] - path = [[source_node] + path for neighbor in G[source_node] \ - for path in find_paths(G, neighbor, length - 1) if source_node not in path] - return path - - -def find_all_paths(G, length, is_directed): - """Find all paths with a certain length in a graph. A recursive depth first - search is applied. - - Parameters - ---------- - G : NetworkX graphs - The graph in which paths are searched. - length : integer - The length of paths. - - Return - ------ - path : list of list - List of paths retrieved, where each path is represented by a list of nodes. - """ - all_paths = [] - for node in G: - all_paths.extend(find_paths(G, node, length)) - - if not is_directed: - # For each path, two presentations are retrieved from its two extremities. - # Remove one of them. - all_paths_r = [path[::-1] for path in all_paths] - for idx, path in enumerate(all_paths[:-1]): - for path2 in all_paths_r[idx+1::]: - if path == path2: - all_paths[idx] = [] - break - all_paths = list(filter(lambda a: a != [], all_paths)) - - return all_paths - - -def get_mlti_dim_node_attrs(G, attr_names): - attributes = [] - for nd, attrs in G.nodes(data=True): - attributes.append(tuple(attrs[aname] for aname in attr_names)) - return attributes - - -def get_mlti_dim_edge_attrs(G, attr_names): - attributes = [] - for ed, attrs in G.edges(data=True): - attributes.append(tuple(attrs[aname] for aname in attr_names)) - return attributes - - def normalize_gram_matrix(gram_matrix): diag = gram_matrix.diagonal().copy() old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. @@ -621,84 +616,162 @@ def compute_distance_matrix(gram_matrix): return dis_mat, dis_max, dis_min, dis_mean -# @todo: use it in ShortestPath. -def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]): - """Compute kernels between each pair of vertices in two graphs. +#%% + + +def graph_deepcopy(G): + """Deep copy a graph, including deep copy of all nodes, edges and + attributes of the graph, nodes and edges. + + Note + ---- + - It is the same as the NetworkX function graph.copy(), as far as I know. + + - This function only supports Networkx.Graph and Networkx.DiGraph. + """ + # add graph attributes. + labels = {} + for k, v in G.graph.items(): + labels[k] = deepcopy(v) + if G.is_directed(): + G_copy = nx.DiGraph(**labels) + else: + G_copy = nx.Graph(**labels) + + # add nodes + for nd, attrs in G.nodes(data=True): + labels = {} + for k, v in attrs.items(): + labels[k] = deepcopy(v) + G_copy.add_node(nd, **labels) + + # add edges. 
+	for nd1, nd2, attrs in G.edges(data=True):
+		labels = {}
+		for k, v in attrs.items():
+			labels[k] = deepcopy(v)
+		G_copy.add_edge(nd1, nd2, **labels)
+
+	return G_copy
+
+
+def graph_isIdentical(G1, G2):
+	"""Check if two graphs are identical, including: same nodes, edges, node
+	labels/attributes, edge labels/attributes.
+
+	Notes
+	-----
+	1. The type of graphs has to be the same.
+
+	2. Global/Graph attributes are neglected as they may contain names for graphs.
+	"""
+	# check nodes.
+	nlist1 = [n for n in G1.nodes(data=True)]
+	nlist2 = [n for n in G2.nodes(data=True)]
+	if not nlist1 == nlist2:
+		return False
+	# check edges.
+	elist1 = [n for n in G1.edges(data=True)]
+	elist2 = [n for n in G2.edges(data=True)]
+	if not elist1 == elist2:
+		return False
+	# check graph attributes.
+
+	return True
+
+
+def get_node_labels(Gn, node_label):
+	"""Get node labels of dataset Gn.
+	"""
+	nl = set()
+	for G in Gn:
+		nl = nl | set(nx.get_node_attributes(G, node_label).values())
+	return nl
+
+
+def get_edge_labels(Gn, edge_label):
+	"""Get edge labels of dataset Gn.
+	"""
+	el = set()
+	for G in Gn:
+		el = el | set(nx.get_edge_attributes(G, edge_label).values())
+	return el
+
+
+def get_mlti_dim_node_attrs(G, attr_names):
+	attributes = []
+	for nd, attrs in G.nodes(data=True):
+		attributes.append(tuple(attrs[aname] for aname in attr_names))
+	return attributes
+
+
+def get_mlti_dim_edge_attrs(G, attr_names):
+	attributes = []
+	for ed, attrs in G.edges(data=True):
+		attributes.append(tuple(attrs[aname] for aname in attr_names))
+	return attributes
+
+
+def nx_permute_nodes(G, random_state=None):
+	"""Permute node indices in a NetworkX graph.
 
 	Parameters
 	----------
-	g1, g2 : NetworkX graph
-		The kernels bewteen pairs of vertices in these two graphs are computed.
-	node_kernels : dict
-		A dictionary of kernel functions for nodes, including 3 items: 'symb'
-		for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
-		for both labels. The first 2 functions take two node labels as
-		parameters, and the 'mix' function takes 4 parameters, a symbolic and a
-		non-symbolic label for each the two nodes. Each label is in form of 2-D
-		dimension array (n_samples, n_features). Each function returns a number
-		as the kernel value. Ignored when nodes are unlabeled. This argument
-		is designated to conjugate gradient method and fixed-point iterations.
-	node_labels : list, optional
-		The list of the name strings of the node labels. The default is [].
-	node_attrs : list, optional
-		The list of the name strings of the node attributes. The default is [].
+	G : NetworkX graph
+		The graph whose node order is to be permuted.
+	random_state : int, optional
+		The seed of the random permutation. The default is None.
 
 	Returns
 	-------
-	vk_dict : dict
-		Vertex kernels keyed by vertices.
+	G_new : NetworkX graph
+		A copy of G with the same nodes, edges and attributes, but with nodes
+		inserted in a randomly permuted order.
 
 	Notes
 	-----
-	This function is used by ``gklearn.kernels.FixedPoint'' and
-	``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1].
-
-	References
-	----------
-	.. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang.
-	Parallelization of shortest path graph kernels on multi-core cpus and gpus.
-	Proceedings of the Programmability Issues for Heterogeneous Multicores
-	(MultiProg), Vienna, Austria, 2014.
+	- This function only supports Networkx.Graph and Networkx.DiGraph.
""" - vk_dict = {} # shortest path matrices dict - if len(node_labels) > 0: - # node symb and non-synb labeled - if len(node_attrs) > 0: - kn = node_kernels['mix'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_labels = [n1[1][nl] for nl in node_labels] - n2_labels = [n2[1][nl] for nl in node_labels] - n1_attrs = [n1[1][na] for na in node_attrs] - n2_attrs = [n2[1][na] for na in node_attrs] - vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) - # node symb labeled - else: - kn = node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_labels = [n1[1][nl] for nl in node_labels] - n2_labels = [n2[1][nl] for nl in node_labels] - vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) + # @todo: relabel node with integers? (in case something went wrong...) + # Add graph attributes. + labels = {} + for k, v in G.graph.items(): + labels[k] = deepcopy(v) + if G.is_directed(): + G_new = nx.DiGraph(**labels) else: - # node non-synb labeled - if len(node_attrs) > 0: - kn = node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_attrs = [n1[1][na] for na in node_attrs] - n2_attrs = [n2[1][na] for na in node_attrs] - vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) - # node unlabeled - else: - pass # @todo: add edge weights. -# for e1 in g1.edges(data=True): -# for e2 in g2.edges(data=True): -# if e1[2]['cost'] == e2[2]['cost']: -# kernel += 1 -# return kernel + G_new = nx.Graph(**labels) - return vk_dict + # Create a random mapping old node indices <-> new indices. + nb_nodes = nx.number_of_nodes(G) + indices_orig = range(nb_nodes) + idx_mapping = np.random.RandomState(seed=random_state).permutation(indices_orig) + + # Add nodes. + nodes_orig = list(G.nodes) + for i_orig in range(nb_nodes): + i_new = idx_mapping[i_orig] + labels = {} + for k, v in G.nodes[nodes_orig[i_new]].items(): + labels[k] = deepcopy(v) + G_new.add_node(nodes_orig[i_new], **labels) + + # Add edges. + for nd1, nd2, attrs in G.edges(data=True): + labels = {} + for k, v in attrs.items(): + labels[k] = deepcopy(v) + G_new.add_edge(nd1, nd2, **labels) + + +# # create a random mapping old label -> new label +# node_mapping = dict(zip(G.nodes(), np.random.RandomState(seed=random_state).permutation(G.nodes()))) +# # build a new graph +# G_new = nx.relabel_nodes(G, node_mapping) + + return G_new + + +#%% def dummy_node():