diff --git a/README.md b/README.md index 91215c8..0380c68 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,20 @@ Fork the library and open a pull request! Make your own contribute to the commun ## Citation -Still waiting... +If you have used `graphkit-learn` in your publication, please cite the the following paper: +``` +@article{JIA2021, + title = "graphkit-learn: A Python Library for Graph Kernels Based on Linear Patterns", + journal = "Pattern Recognition Letters", + year = "2021", + issn = "0167-8655", + doi = "https://doi.org/10.1016/j.patrec.2021.01.003", + url = "http://www.sciencedirect.com/science/article/pii/S0167865521000131", + author = "Linlin Jia and Benoit Gaüzère and Paul Honeine", + keywords = "Graph Kernels, Linear Patterns, Python Implementation", + abstract = "This paper presents graphkit-learn, the first Python library for efficient computation of graph kernels based on linear patterns, able to address various types of graphs. Graph kernels based on linear patterns are thoroughly implemented, each with specific computing methods, as well as two well-known graph kernels based on non-linear patterns for comparative analysis. Since computational complexity is an Achilles’ heel of graph kernels, we provide several strategies to address this critical issue, including parallelization, the trie data structure, and the FCSP method that we extend to other kernels and edge comparison. All proposed strategies save orders of magnitudes of computing time and memory usage. Moreover, all the graph kernels can be simply computed with a single Python statement, thus are appealing to researchers and practitioners. For the convenience of use, an advanced model selection procedure is provided for both regression and classification problems. Experiments on synthesized datasets and 11 real-world benchmark datasets show the relevance of the proposed library." +} +``` ## Acknowledgments diff --git a/gklearn/dataset/dataset.py b/gklearn/dataset/dataset.py index faca89b..75684c2 100644 --- a/gklearn/dataset/dataset.py +++ b/gklearn/dataset/dataset.py @@ -14,7 +14,7 @@ from gklearn.dataset import DATASET_META, DataFetcher, DataLoader class Dataset(object): - def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs): + def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', remove_null_graphs=True, clean_labels=True, reload=False, verbose=False, **kwargs): self._substructures = None self._node_label_dim = None self._edge_label_dim = None @@ -82,6 +82,8 @@ class Dataset(object): else: raise TypeError('The "inputs" argument cannot be recognized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.') + if remove_null_graphs: + self.trim_dataset(edge_required=False) def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs): @@ -537,7 +539,7 @@ class Dataset(object): def trim_dataset(self, edge_required=False): - if edge_required: + if edge_required: # @todo: there is a possibility that some node labels will be removed. 
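    # (Illustrating the @todo above: a node label that occurs only in graphs
    # without edges would disappear from the dataset's label sets once those
    # graphs are trimmed with edge_required=True.)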
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)] else: trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0] diff --git a/gklearn/dataset/file_managers.py b/gklearn/dataset/file_managers.py index 9a804f5..df00d4c 100644 --- a/gklearn/dataset/file_managers.py +++ b/gklearn/dataset/file_managers.py @@ -332,7 +332,8 @@ class DataLoader(): content_targets = ga.read().splitlines() # targets (regression) targets = [int(i) for i in content_targets] else: - raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.') + exp_msg = 'Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.' + raise Exception(exp_msg) if class_label_map is not None: targets = [class_label_map[t] for t in targets] diff --git a/gklearn/experiments/__init__.py b/gklearn/experiments/__init__.py new file mode 100644 index 0000000..7564765 --- /dev/null +++ b/gklearn/experiments/__init__.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Dec 15 18:22:34 2020 + +@author: ljia +""" + +import os +EXP_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/' +DATASET_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/datasets/' \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py similarity index 72% rename from gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py rename to gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py index 710213a..33c6973 100644 --- a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py @@ -4,7 +4,7 @@ Created on Wed Oct 20 11:48:02 2020 @author: ljia -""" +""" # This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. import os @@ -13,15 +13,15 @@ import pickle import logging from gklearn.ged.util import compute_geds import time -from utils import get_dataset +from utils import get_dataset, set_edit_cost_consts import sys -from group_results import group_trials +from group_results import group_trials, check_group_existence, update_group_marker def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) - + # Return if the file exists. 
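    # For illustration: with ds_name='MUTAG', num_solutions=40, ratio=0.3 and
    # trial=7, save_file_suffix is '.MUTAG.num_sols_40.ratio_0.30.trial_7', so
    # the check below skips any trial whose 'ged_matrix...pkl' file already exists.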
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): return None, None @@ -41,8 +41,11 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): 'threads': multiprocessing.cpu_count(), 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' } - - edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] + + edit_cost_constants = set_edit_cost_consts(ratio, + node_labeled=len(dataset.node_labels), + edge_labeled=len(dataset.edge_labels), + mode='uniform') # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) @@ -53,7 +56,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): options['node_attrs'] = dataset.node_attrs options['edge_attrs'] = dataset.edge_attrs parallel = True # if num_solutions == 1 else False - + """**5. Compute GED matrix.**""" ged_mat = 'error' runtime = 0 @@ -67,9 +70,9 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception(save_file_suffix) print(repr(exp)) - + """**6. Get results.**""" - + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: pickle.dump(ged_mat, f) with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: @@ -77,66 +80,76 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): return ged_mat, runtime - + def save_trials_as_group(dataset, ds_name, num_solutions, ratio): # Return if the group file exists. name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' - if os.path.isfile(name_group): + if check_group_existence(name_group): return - + ged_mats = [] runtimes = [] - for trial in range(1, 101): + num_trials = 100 + for trial in range(1, num_trials + 1): print() print('Trial:', trial) ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial) ged_mats.append(ged_mat) runtimes.append(runtime) - + # Group trials and Remove single files. + # @todo: if the program stops between the following lines, then there may be errors. name_prefix = 'ged_matrix' + name_middle - group_trials(save_dir, name_prefix, True, True, False) + group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) name_prefix = 'runtime' + name_middle - group_trials(save_dir, name_prefix, True, True, False) + group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) + update_group_marker(name_group) def results_for_a_dataset(ds_name): """**1. 
Get dataset.**""" dataset = get_dataset(ds_name) - - for num_solutions in num_solutions_list: + + for ratio in ratio_list: print() - print('# of solutions:', num_solutions) - for ratio in ratio_list: + print('Ratio:', ratio) + for num_solutions in num_solutions_list: print() - print('Ratio:', ratio) + print('# of solutions:', num_solutions) save_trials_as_group(dataset, ds_name, num_solutions, ratio) - - -def get_param_lists(ds_name): + + +def get_param_lists(ds_name, test=False): + if test: + num_solutions_list = [1, 10, 20, 30, 40, 50] + ratio_list = [10] + return num_solutions_list, ratio_list + if ds_name == 'AIDS_symb': num_solutions_list = [1, 20, 40, 60, 80, 100] ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] else: - num_solutions_list = [1, 20, 40, 60, 80, 100] - ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] - + num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] + return num_solutions_list, ratio_list - + if __name__ == '__main__': if len(sys.argv) > 1: ds_name_list = sys.argv[1:] else: - ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - - save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' + ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] +# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled'] +# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] + + save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir + 'groups/', exist_ok=True) - + for ds_name in ds_name_list: print() print('Dataset:', ds_name) - num_solutions_list, ratio_list = get_param_lists(ds_name) + num_solutions_list, ratio_list = get_param_lists(ds_name, test=False) results_for_a_dataset(ds_name) diff --git a/gklearn/experiments/ged/stability/group_results.py b/gklearn/experiments/ged/stability/group_results.py index e1f999e..bdbe89f 100644 --- a/gklearn/experiments/ged/stability/group_results.py +++ b/gklearn/experiments/ged/stability/group_results.py @@ -5,7 +5,7 @@ Created on Thu Oct 29 17:26:43 2020 @author: ljia -This script groups results together into a single file for the sake of faster +This script groups results together into a single file for the sake of faster searching and loading. """ import os @@ -16,9 +16,55 @@ from tqdm import tqdm import sys +def check_group_existence(file_name): + path, name = os.path.split(file_name) + marker_fn = os.path.join(path, 'group_names_finished.pkl') + if os.path.isfile(marker_fn): + with open(marker_fn, 'rb') as f: + fns = pickle.load(f) + if name in fns: + return True + + if os.path.isfile(file_name): + return True + + return False + + +def update_group_marker(file_name): + path, name = os.path.split(file_name) + marker_fn = os.path.join(path, 'group_names_finished.pkl') + if os.path.isfile(marker_fn): + with open(marker_fn, 'rb') as f: + fns = pickle.load(f) + if name in fns: + return + else: + fns.add(name) + else: + fns = set({name}) + with open(marker_fn, 'wb') as f: + pickle.dump(fns, f) + + +def create_group_marker_file(dir_folder, overwrite=True): + if not overwrite: + return + + fns = set() + for file in sorted(os.listdir(dir_folder)): + if os.path.isfile(os.path.join(dir_folder, file)): + if file.endswith('.npy'): + fns.add(file) + + marker_fn = os.path.join(dir_folder, 'group_names_finished.pkl') + with open(marker_fn, 'wb') as f: + pickle.dump(fns, f) + + # This function is used by other scripts. 
Modify it carefully. -def group_trials(dir_folder, name_prefix, override, clear, backup): - +def group_trials(dir_folder, name_prefix, overwrite, clear, backup, num_trials=100): + # Get group name. label_name = name_prefix.split('.')[0] if label_name == 'ged_matrix': @@ -33,10 +79,10 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): else: name_group = dir_folder + 'groups/' + group_label + name_suffix + 'pkl' - if not override and os.path.isfile(name_group): + if not overwrite and os.path.isfile(name_group): # Check if all trial files exist. trials_complete = True - for trial in range(1, 101): + for trial in range(1, num_trials + 1): file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' if not os.path.isfile(file_name): trials_complete = False @@ -44,7 +90,7 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): else: # Get data. data_group = [] - for trial in range(1, 101): + for trial in range(1, num_trials + 1): file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' if os.path.isfile(file_name): with open(file_name, 'rb') as f: @@ -64,7 +110,7 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): else: # Not all trials are completed. return - + # Write groups. if label_name == 'ged_matrix': data_group = np.array(data_group) @@ -73,31 +119,31 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): else: with open(name_group, 'wb') as f: pickle.dump(data_group, f) - + trials_complete = True if trials_complete: # Backup. if backup: - for trial in range(1, 101): + for trial in range(1, num_trials + 1): src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' dst = dir_folder + 'backups/' + name_prefix + 'trial_' + str(trial) + '.pkl' copyfile(src, dst) - + # Clear. if clear: - for trial in range(1, 101): + for trial in range(1, num_trials + 1): src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' os.remove(src) -def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): - +def group_all_in_folder(dir_folder, overwrite=False, clear=True, backup=True): + # Create folders. os.makedirs(dir_folder + 'groups/', exist_ok=True) if backup: os.makedirs(dir_folder + 'backups', exist_ok=True) - + # Iterate all files. 
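    # Files are listed in sorted order, so all trial files of one experiment
    # (sharing a common name prefix) appear consecutively; group_trials() is
    # invoked once each time the prefix changes in the loop below.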
cur_file_prefix = '' for file in tqdm(sorted(os.listdir(dir_folder)), desc='Grouping', file=sys.stdout): @@ -106,20 +152,23 @@ def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): # print(name) # print(name_prefix) if name_prefix != cur_file_prefix: - group_trials(dir_folder, name_prefix, override, clear, backup) + group_trials(dir_folder, name_prefix, overwrite, clear, backup) cur_file_prefix = name_prefix - - + + if __name__ == '__main__': - dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/' - group_all_in_folder(dir_folder) - - dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' - group_all_in_folder(dir_folder) - - dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' - group_all_in_folder(dir_folder) - - dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' - group_all_in_folder(dir_folder) \ No newline at end of file + # dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/' + # group_all_in_folder(dir_folder) + + # dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' + # group_all_in_folder(dir_folder) + + # dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' + # group_all_in_folder(dir_folder) + + # dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' + # group_all_in_folder(dir_folder) + + dir_folder = 'outputs/CRIANN/edit_costs.real_data.num_sols.ratios.IPFP/groups/' + create_group_marker_file(dir_folder) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/run_job_edit_costs.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py similarity index 62% rename from gklearn/experiments/ged/stability/run_job_edit_costs.nums_sols.ratios.IPFP.py rename to gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py index 6939a06..7ab72b2 100644 --- a/gklearn/experiments/ged/stability/run_job_edit_costs.nums_sols.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py @@ -15,30 +15,30 @@ def get_job_script(arg): #SBATCH --exclusive #SBATCH --job-name="st.""" + arg + r""".IPFP" -#SBATCH --partition=tlong +#SBATCH --partition=court #SBATCH --mail-type=ALL #SBATCH --mail-user=jajupmochi@gmail.com -#SBATCH --output="outputs/output_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" -#SBATCH --error="errors/error_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" +#SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.IPFP.""" + arg + """.txt" +#SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.IPFP.""" + arg + """.txt" # #SBATCH --ntasks=1 #SBATCH --nodes=1 #SBATCH --cpus-per-task=1 -#SBATCH --time=300:00:00 +#SBATCH --time=48:00:00 #SBATCH --mem-per-cpu=4000 srun hostname srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability -srun python3 edit_costs.nums_sols.ratios.IPFP.py """ + arg +srun python3 edit_costs.real_data.nums_sols.ratios.IPFP.py """ + arg script = script.strip() script = re.sub('\n\t+', '\n', script) script = re.sub('\n +', '\n', script) - + return script if __name__ == '__main__': - ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - for ds_name in [ds_list[i] for i in [0, 3]]: + ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] + for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]: job_script = get_job_script(ds_name) command = 'sbatch < 1: - fcsp = True if sys.argv[1] == 'True' else 
False + kernel_name = sys.argv[1] + ds_name = sys.argv[2] + fcsp = True if sys.argv[3] == 'True' else False else: + kernel_name = 'ShortestPath' + ds_name = 'Acyclic' fcsp = True - run_all(fcsp) + save_dir = 'outputs/' + os.makedirs(save_dir, exist_ok=True) + + run_task(kernel_name, ds_name, fcsp) \ No newline at end of file diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp_space.py b/gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp_space.py new file mode 100644 index 0000000..3beacfd --- /dev/null +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp_space.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Dec 2 17:41:54 2020 + +@author: ljia + +This script compares the results with and without FCSP. +""" +from gklearn.dataset import Dataset +from shortest_path import SPSpace +from structural_sp import SSPSpace +from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct +from gklearn.experiments import DATASET_ROOT +import functools +import os +import pickle +import sys +import logging + + +def run_task(kernel_name, ds_name, fcsp): + save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp) + file_name = os.path.join(save_dir, 'space' + save_file_suffix + '.pkl') + + # Return if the task is already completed. + if os.path.isfile(file_name): + with open(file_name, 'rb') as f: + data = pickle.load(f) + if data['completed']: + return + + print() + print((kernel_name, ds_name, str(fcsp))) + + try: + gram_matrix, run_time = compute(kernel_name, ds_name, fcsp, file_name) + + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = os.path.join(save_dir, 'error.space' + save_file_suffix + '.txt') + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception('\n--------------' + save_file_suffix + '------------------') + print(repr(exp)) + +# else: +# with open(file_name, 'wb') as f: +# pickle.dump(run_time, f) + + +def compute(kernel_name, ds_name, fcsp, file_name): + dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True) + if kernel_name == 'ShortestPath': + dataset.trim_dataset(edge_required=True) +# dataset.cut_graphs(range(0, 10)) + kernel_class = SPSpace + else: +# dataset.cut_graphs(range(0, 10)) + kernel_class = SSPSpace + + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + edge_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + + graph_kernel = kernel_class(name=kernel_name, + node_labels=dataset.node_labels, + edge_labels=dataset.edge_labels, + node_attrs=dataset.node_attrs, + edge_attrs=dataset.edge_attrs, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + fcsp=fcsp, + compute_method='naive', + node_kernels=node_kernels, + edge_kernels=edge_kernels, + file_name=file_name + ) + gram_matrix, run_time = graph_kernel.compute(dataset.graphs, + parallel=None, + normalize=False, + verbose=2 + ) + return gram_matrix, run_time + + +if __name__ == '__main__': + if len(sys.argv) > 1: + kernel_name = sys.argv[1] + ds_name = sys.argv[2] + fcsp = True if sys.argv[3] == 'True' else False + else: + kernel_name = 'StructuralSP' + ds_name = 'Fingerprint' + fcsp = True + + save_dir = 'outputs/' + os.makedirs(save_dir, exist_ok=True) + + run_task(kernel_name, ds_name, fcsp) \ No newline at end of file diff --git 
a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py index 53ae39c..7a98686 100644 --- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py @@ -10,27 +10,86 @@ import os import re -def get_job_script(param): +OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), + ('StructuralSP', 'ENZYMES', 'True'), + ('StructuralSP', 'ENZYMES', 'False'), + ('StructuralSP', 'AIDS', 'False'), + ('ShortestPath', 'NCI1', 'False'), + ('StructuralSP', 'NCI1', 'True'), + ('StructuralSP', 'NCI1', 'False'), + ('ShortestPath', 'NCI109', 'False'), + ('StructuralSP', 'NCI109', 'True'), + ('StructuralSP', 'NCI109', 'False'), + ('ShortestPath', 'DD', 'True'), + ('ShortestPath', 'DD', 'False'), + ('StructuralSP', 'BZR', 'False'), + ('ShortestPath', 'COX2', 'False'), + ('StructuralSP', 'COX2', 'False'), + ('ShortestPath', 'DHFR', 'False'), + ('StructuralSP', 'DHFR', 'False'), + ('StructuralSP', 'OHSU', 'True'), + ('StructuralSP', 'OHSU', 'False'), + ('StructuralSP', 'SYNTHETIC', 'False'), + ('StructuralSP', 'SYNTHETIC', 'True'), + ('StructuralSP', 'SYNTHETIC', 'False'), + ('ShortestPath', 'SYNTHETICnew', 'False'), + ('StructuralSP', 'SYNTHETICnew', 'True'), + ('StructuralSP', 'SYNTHETICnew', 'False'), + ('ShortestPath', 'Synthie', 'False'), + ('StructuralSP', 'Synthie', 'True'), + ('StructuralSP', 'Synthie', 'False'), + ('ShortestPath', 'COIL-DEL', 'False'), + ('StructuralSP', 'COIL-DEL', 'True'), + ('StructuralSP', 'COIL-DEL', 'False'), + ('ShortestPath', 'PROTEINS', 'False'), + ('ShortestPath', 'PROTEINS_full', 'False'), + ('StructuralSP', 'Mutagenicity', 'True'), + ('StructuralSP', 'Mutagenicity', 'False'), + ('StructuralSP', 'REDDIT-BINARY', 'True'), + ('StructuralSP', 'REDDIT-BINARY', 'False'), + }) + +OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'), + ('StructuralSP', 'DD', 'False'), + ('StructuralSP', 'PROTEINS', 'True'), + ('StructuralSP', 'PROTEINS', 'False'), + ('StructuralSP', 'PROTEINS_full', 'True'), + ('StructuralSP', 'PROTEINS_full', 'False'), + ('ShortestPath', 'REDDIT-BINARY', 'True'), + ('ShortestPath', 'TWITTER-Real-Graph-Partial', 'True'), + ('ShortestPath', 'TWITTER-Real-Graph-Partial', 'False'), + ('StructuralSP', 'TWITTER-Real-Graph-Partial', 'True'), + }) + +MISS_LABEL_LIST = set({('StructuralSP', 'GREC', 'True'), + ('StructuralSP', 'GREC', 'False'), + ('StructuralSP', 'Web', 'True'), + ('StructuralSP', 'Web', 'False'), + }) + + +def get_job_script(kernel, dataset, fcsp): script = r""" #!/bin/bash -#SBATCH --exclusive -#SBATCH --job-name="fcsp.""" + param + r"""" -#SBATCH --partition=long +##SBATCH --exclusive +#SBATCH --job-name="fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r"""" +#SBATCH --partition=tlong #SBATCH --mail-type=ALL #SBATCH --mail-user=jajupmochi@gmail.com -#SBATCH --output="outputs/output_fcsp.""" + param + r""".txt" -#SBATCH --error="errors/error_fcsp.""" + param + r""".txt" +#SBATCH --output="outputs/output_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt" +#SBATCH --error="errors/error_fcsp.""" + kernel + r"." + dataset + r"." 
+ fcsp + r""".txt" # #SBATCH --ntasks=1 #SBATCH --nodes=1 #SBATCH --cpus-per-task=1 -#SBATCH --time=100:00:00 -#SBATCH --mem-per-cpu=4000 +#SBATCH --time=300:00:00 +##SBATCH --mem-per-cpu=4000 +#SBATCH --mem=40000 srun hostname srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp -srun python3 compare_fcsp.py """ + param +srun python3 compare_fcsp.py """ + kernel + r" " + dataset + r" " + fcsp script = script.strip() script = re.sub('\n\t+', '\n', script) script = re.sub('\n +', '\n', script) @@ -38,15 +97,83 @@ srun python3 compare_fcsp.py """ + param return script +def check_task_status(save_dir, *params): + str_task_id = '.' + '.'.join(params) + + # Check if the task is in out of memeory or out of space lists or missing labels. + if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST: + return True + + # Check if the task is running or in queue of slurm. + command = 'squeue --user $USER --name "fcsp' + str_task_id + '" --format "%.2t" --noheader' + stream = os.popen(command) + output = stream.readlines() + if len(output) > 0: + return True + + # Check if there are more than 10 tlong tasks running. + command = 'squeue --user $USER --partition tlong --noheader' + stream = os.popen(command) + output = stream.readlines() + if len(output) >= 10: + return True + + + # Check if the results are already computed. + file_name = os.path.join(save_dir, 'run_time' + str_task_id + '.pkl') + if os.path.isfile(file_name): + return True + + return False + + if __name__ == '__main__': + save_dir = 'outputs/' + os.makedirs(save_dir, exist_ok=True) os.makedirs('outputs/', exist_ok=True) os.makedirs('errors/', exist_ok=True) - param_list = ['True', 'False'] - for param in param_list[:]: - job_script = get_job_script(param) - command = 'sbatch < 0: + return True + + # Check if the task is already computed. + file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl') + if os.path.isfile(file_name): + with open(file_name, 'rb') as f: + data = pickle.load(f) + if data['completed']: + return True + + return False + + +if __name__ == '__main__': + save_dir = 'outputs/' + os.makedirs(save_dir, exist_ok=True) + os.makedirs('outputs/', exist_ok=True) + os.makedirs('errors/', exist_ok=True) + + from sklearn.model_selection import ParameterGrid + + Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO', + 'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens', + 'Letter-high', 'Letter-med', 'Letter-low', + 'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD', + # new: not so large. + 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D', + 'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY', + 'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2', + 'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC', + # new: large. 
+ 'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7', + 'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H', + 'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H', + 'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H', + 'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH', + 'COLORS-3', 'DBLP_v1', 'REDDIT-MULTI-12K', + 'REDDIT-MULTI-12K', 'REDDIT-MULTI-12K', + 'REDDIT-MULTI-12K', 'MSRC_9', 'MSRC_21', 'MSRC_21C', + 'COLLAB', 'COIL-DEL', + 'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity', + 'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K', + 'REDDIT-MULTI-12K'] + + Kernel_List = ['ShortestPath', 'StructuralSP'] + + fcsp_list = ['True', 'False'] + + task_grid = ParameterGrid({'kernel': Kernel_List[:], + 'dataset': Dataset_List[:], + 'fcsp': fcsp_list[:]}) + + from tqdm import tqdm + + for task in tqdm(list(task_grid), desc='submitting tasks/jobs'): + + if False == check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']): + job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp']) + command = 'sbatch < 0: + results['vk_dict_mem'] = np.mean(results['vk_dict_mem']) + save_results(file_name, results) + + +class SPSpace(ShortestPath): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._file_name = kwargs.get('file_name') + +# @profile + def _compute_gm_series(self): + self._all_graphs_have_edges(self._graphs) + # get shortest path graph of each graph. + iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) + self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] + + + results = load_results(self._file_name, self._fcsp) + + # compute Gram matrix. + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + from itertools import combinations_with_replacement + itr = combinations_with_replacement(range(0, len(self._graphs)), 2) + len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) + iterator = get_iters(itr, desc='Computing kernels', + length=len_itr, file=sys.stdout,verbose=(self._verbose >= 2)) + + time0 = time.time() + for i, j in iterator: + if i > results['i'] or (i == results['i'] and j > results['j']): + data = self._sp_do_space(self._graphs[i], self._graphs[j]) + if self._fcsp: + results['nb_comparison'].append(data[0]) + if data[1] != {}: + results['vk_dict_mem'].append(estimate_vk_memory(data[1], + nx.number_of_nodes(self._graphs[i]), + nx.number_of_nodes(self._graphs[j]))) + else: + results['nb_comparison'].append(data) + results['i'] = i + results['j'] = j + + time1 = time.time() + if time1 - time0 > 600: + save_results(self._file_name, results) + time0 = time1 + + compute_stats(self._file_name, results) + + return gram_matrix + + + def _sp_do_space(self, g1, g2): + + if self._fcsp: # @todo: it may be put outside the _sp_do(). + return self._sp_do_fcsp(g1, g2) + else: + return self._sp_do_naive(g1, g2) + + + def _sp_do_fcsp(self, g1, g2): + + nb_comparison = 0 + + # compute shortest path matrices first, method borrowed from FCSP. + vk_dict = {} # shortest path matrices dict + if len(self._node_labels) > 0: # @todo: it may be put outside the _sp_do(). 
+ # node symb and non-synb labeled + if len(self._node_attrs) > 0: + kn = self._node_kernels['mix'] + for n1, n2 in product( + g1.nodes(data=True), g2.nodes(data=True)): + n1_labels = [n1[1][nl] for nl in self._node_labels] + n2_labels = [n2[1][nl] for nl in self._node_labels] + n1_attrs = [n1[1][na] for na in self._node_attrs] + n2_attrs = [n2[1][na] for na in self._node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) + nb_comparison += 1 + # node symb labeled + else: + kn = self._node_kernels['symb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_labels = [n1[1][nl] for nl in self._node_labels] + n2_labels = [n2[1][nl] for nl in self._node_labels] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) + nb_comparison += 1 + else: + # node non-synb labeled + if len(self._node_attrs) > 0: + kn = self._node_kernels['nsymb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_attrs = [n1[1][na] for na in self._node_attrs] + n2_attrs = [n2[1][na] for na in self._node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) + nb_comparison += 1 + # node unlabeled + else: + for e1, e2 in product( + g1.edges(data=True), g2.edges(data=True)): + pass +# if e1[2]['cost'] == e2[2]['cost']: +# kernel += 1 +# nb_comparison += 1 + + return nb_comparison, vk_dict + +# # compute graph kernels +# if self._ds_infos['directed']: +# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): +# if e1[2]['cost'] == e2[2]['cost']: +# nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])] +# kn1 = nk11 * nk22 +# kernel += kn1 +# else: +# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): +# if e1[2]['cost'] == e2[2]['cost']: +# # each edge walk is counted twice, starting from both its extreme nodes. +# nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( +# e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])] +# kn1 = nk11 * nk22 +# kn2 = nk12 * nk21 +# kernel += kn1 + kn2 + + + def _sp_do_naive(self, g1, g2): + + nb_comparison = 0 + + # Define the function to compute kernels between vertices in each condition. 
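    # As in the FCSP variant above, the node kernel is taken from
    # self._node_kernels: 'mix' when both symbolic labels and continuous
    # attributes are present, 'symb' for labels only, 'nsymb' for attributes
    # only; fully unlabeled graphs return 0 comparisons immediately. In this
    # counting variant the actual kernel values are not accumulated: only the
    # number of vertex-kernel evaluations the ShortestPath kernel would perform
    # (2 per matching edge pair for directed graphs, 4 otherwise) is recorded.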
+ if len(self._node_labels) > 0: + # node symb and non-synb labeled + if len(self._node_attrs) > 0: + def compute_vk(n1, n2): + kn = self._node_kernels['mix'] + n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels] + n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels] + n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs] + n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs] + return kn(n1_labels, n2_labels, n1_attrs, n2_attrs) + # node symb labeled + else: + def compute_vk(n1, n2): + kn = self._node_kernels['symb'] + n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels] + n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels] + return kn(n1_labels, n2_labels) + else: + # node non-synb labeled + if len(self._node_attrs) > 0: + def compute_vk(n1, n2): + kn = self._node_kernels['nsymb'] + n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs] + n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs] + return kn(n1_attrs, n2_attrs) + # node unlabeled + else: +# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): +# if e1[2]['cost'] == e2[2]['cost']: +# kernel += 1 + return 0 + + # compute graph kernels + if self._ds_infos['directed']: + for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): + if e1[2]['cost'] == e2[2]['cost']: +# nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1]) +# kn1 = nk11 * nk22 +# kernel += kn1 + nb_comparison += 2 + else: + for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): + if e1[2]['cost'] == e2[2]['cost']: + # each edge walk is counted twice, starting from both its extreme nodes. +# nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk( +# e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1]) +# kn1 = nk11 * nk22 +# kn2 = nk12 * nk21 +# kernel += kn1 + kn2 + nb_comparison += 4 + + return nb_comparison \ No newline at end of file diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/structural_sp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/structural_sp.py new file mode 100644 index 0000000..7f5b721 --- /dev/null +++ b/gklearn/experiments/thesis/graph_kernels/fcsp/structural_sp.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Mar 30 11:59:57 2020 + +@author: ljia + +@references: + + [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For + Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). 
+""" +import sys +from itertools import product +from gklearn.utils import get_iters +import numpy as np +import time +import os, errno +import pickle +from pympler import asizeof +import networkx as nx +from gklearn.utils.utils import get_shortest_paths +from gklearn.kernels import StructuralSP + + +def load_splist(file_name): + if os.path.isfile(file_name): + with open(file_name, 'rb') as f: + return pickle.load(f) + else: + results_path = {'splist': [], 'i': -1, 'completed': False} + return results_path + + +def load_results(file_name, fcsp): + if os.path.isfile(file_name): + with open(file_name, 'rb') as f: + return pickle.load(f) + else: + results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False} + if fcsp: + results['vk_dict_mem'] = [] + results['ek_dict_mem'] = [] + return results + + +def save_results(file_name, results): + with open(file_name, 'wb') as f: + pickle.dump(results, f) + + +def estimate_vk_memory(obj, nb_nodes1, nb_nodes2): +# asizeof.asized(obj, detail=1).format() +# return asizeof.asizeof(obj) + key, val = next(iter(obj.items())) +# key = dict.iterkeys().next() +# key_mem = asizeof.asizeof(key) + dict_flat = sys.getsizeof(obj) + key_mem = 64 + + if isinstance(val, float): + val_mem = 24 + mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2) + else: # value is True or False + mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2) + +# print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n') + return mem + + +def estimate_ek_memory(obj, nb_nodes1, nb_nodes2): +# asizeof.asized(obj, detail=1).format() +# return asizeof.asizeof(obj) + key, val = next(iter(obj.items())) +# key = dict.iterkeys().next() +# key_mem = asizeof.asizeof(key) + dict_flat = sys.getsizeof(obj) + key_mem = 192 + + if isinstance(val, float): + val_mem = 24 + mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2) + else: # value is True or False + mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2) + +# print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n') + return mem + + +def compute_stats(file_name, results, splist): + del results['i'] + del results['j'] + results['nb_v_comparison'] = np.mean(results['nb_v_comparison']) +# if len(results['nb_e_comparison']) > 0: + results['nb_e_comparison'] = np.mean(results['nb_e_comparison']) + results['completed'] = True + if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0: + results['vk_dict_mem'] = np.mean(results['vk_dict_mem']) + if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0: + results['ek_dict_mem'] = np.mean(results['ek_dict_mem']) + results['nb_sp_ave'] = np.mean([len(ps) for ps in splist]) + results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist]) + results['sp_mem_all'] = asizeof.asizeof(splist) + save_results(file_name, results) + + +class SSPSpace(StructuralSP): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._file_name = kwargs.get('file_name') + +# @profile + def _compute_gm_series(self): + # get shortest paths of each graph in the graphs. 
+ fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl' + results_path = load_splist(fn_paths) + + if not results_path['completed']: + + iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) + if self._compute_method == 'trie': + for g in iterator: + splist.append(self._get_sps_as_trie(g)) + else: + time0 = time.time() + for i, g in enumerate(iterator): + if i > results_path['i']: + results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed'])) + results_path['i'] = i + + time1 = time.time() + if time1 - time0 > 600: + save_results(fn_paths, results_path) + time0 = time1 + + del results_path['i'] + results_path['completed'] = True + save_results(fn_paths, results_path) + + ######### + splist = results_path['splist'] + results = load_results(self._file_name, self._fcsp) + + # compute Gram matrix. + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + from itertools import combinations_with_replacement + itr = combinations_with_replacement(range(0, len(self._graphs)), 2) + len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, + length=len_itr, verbose=(self._verbose >= 2)) + if self._compute_method == 'trie': + for i, j in iterator: + kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) + gram_matrix[i][j] = kernel + gram_matrix[j][i] = kernel + else: + time0 = time.time() + for i, j in iterator: + if i > results['i'] or (i == results['i'] and j > results['j']): + data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j]) + results['nb_v_comparison'].append(data[0]) + results['nb_e_comparison'].append(data[1]) + if self._fcsp: + if data[2] != {}: + results['vk_dict_mem'].append(estimate_vk_memory(data[2], + nx.number_of_nodes(self._graphs[i]), + nx.number_of_nodes(self._graphs[j]))) + if data[3] != {}: + results['ek_dict_mem'].append(estimate_ek_memory(data[3], + nx.number_of_nodes(self._graphs[i]), + nx.number_of_nodes(self._graphs[j]))) + results['i'] = i + results['j'] = j + + time1 = time.time() + if time1 - time0 > 600: + save_results(self._file_name, results) + time0 = time1 + + compute_stats(self._file_name, results, splist) + # @todo: may not remove the path file if the program stops exactly here. + try: + os.remove(fn_paths) + except OSError as e: + if e.errno != errno.ENOENT: + raise + + return gram_matrix + + + def _ssp_do_naive_space(self, g1, g2, spl1, spl2): + if self._fcsp: # @todo: it may be put outside the _sp_do(). + return self._sp_do_naive_fcsp(g1, g2, spl1, spl2) + else: + return self._sp_do_naive_naive(g1, g2, spl1, spl2) + + + def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2): + + # First, compute shortest path matrices, method borrowed from FCSP. + vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2) + # Then, compute kernels between all pairs of edges, which is an idea of + # extension of FCSP. It suits sparse graphs, which is the most case we + # went though. For dense graphs, this would be slow. + ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2) + + return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict + + + def _sp_do_naive_naive(self, g1, g2, spl1, spl2): + + nb_v_comparison = 0 + nb_e_comparison = 0 + + # Define the function to compute kernels between vertices in each condition. 
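    # In this space-profiling variant, compute_vk/compute_ek are still evaluated
    # along pairs of equal-length shortest paths (so early termination on a zero
    # kernel is preserved), but only the numbers of vertex and edge comparisons
    # are recorded; the kernel accumulation of StructuralSP is left commented out.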
+ if len(self._node_labels) > 0: + # node symb and non-synb labeled + if len(self._node_attrs) > 0: + def compute_vk(n1, n2): + kn = self._node_kernels['mix'] + n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels] + n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels] + n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs] + n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs] + return kn(n1_labels, n2_labels, n1_attrs, n2_attrs) + # node symb labeled + else: + def compute_vk(n1, n2): + kn = self._node_kernels['symb'] + n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels] + n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels] + return kn(n1_labels, n2_labels) + else: + # node non-synb labeled + if len(self._node_attrs) > 0: + def compute_vk(n1, n2): + kn = self._node_kernels['nsymb'] + n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs] + n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs] + return kn(n1_attrs, n2_attrs) +# # node unlabeled +# else: +# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): +# if e1[2]['cost'] == e2[2]['cost']: +# kernel += 1 +# return kernel + + # Define the function to compute kernels between edges in each condition. + if len(self._edge_labels) > 0: + # edge symb and non-synb labeled + if len(self._edge_attrs) > 0: + def compute_ek(e1, e2): + ke = self._edge_kernels['mix'] + e1_labels = [g1.edges[e1][el] for el in self._edge_labels] + e2_labels = [g2.edges[e2][el] for el in self._edge_labels] + e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs] + e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs] + return ke(e1_labels, e2_labels, e1_attrs, e2_attrs) + # edge symb labeled + else: + def compute_ek(e1, e2): + ke = self._edge_kernels['symb'] + e1_labels = [g1.edges[e1][el] for el in self._edge_labels] + e2_labels = [g2.edges[e2][el] for el in self._edge_labels] + return ke(e1_labels, e2_labels) + else: + # edge non-synb labeled + if len(self._edge_attrs) > 0: + def compute_ek(e1, e2): + ke = self._edge_kernels['nsymb'] + e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs] + e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs] + return ke(e1_attrs, e2_attrs) + + + # compute graph kernels + if len(self._node_labels) > 0 or len(self._node_attrs) > 0: + if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0: + for p1, p2 in product(spl1, spl2): + if len(p1) == len(p2): +# nb_v_comparison = len(p1) +# nb_e_comparison = len(p1) - 1 + kpath = compute_vk(p1[0], p2[0]) + nb_v_comparison += 1 + if kpath: + for idx in range(1, len(p1)): + kpath *= compute_vk(p1[idx], p2[idx]) * \ + compute_ek((p1[idx-1], p1[idx]), + (p2[idx-1], p2[idx])) + nb_v_comparison += 1 + nb_e_comparison += 1 + if not kpath: + break +# kernel += kpath # add up kernels of all paths + else: + for p1, p2 in product(spl1, spl2): + if len(p1) == len(p2): + kpath = compute_vk(p1[0], p2[0]) + nb_v_comparison += 1 + if kpath: + for idx in range(1, len(p1)): + kpath *= compute_vk(p1[idx], p2[idx]) + nb_v_comparison += 1 + if not kpath: + break +# kernel += kpath # add up kernels of all paths + else: + if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0: + for p1, p2 in product(spl1, spl2): + if len(p1) == len(p2): + if len(p1) == 0: + pass + else: + kpath = 1 + for idx in range(0, len(p1) - 1): + kpath *= compute_ek((p1[idx], p1[idx+1]), + (p2[idx], p2[idx+1])) + nb_e_comparison += 1 + if not kpath: + break + else: + pass +# for p1, p2 in product(spl1, spl2): +# if len(p1) == len(p2): +# kernel += 1 +# try: +# kernel = 
kernel / (len(spl1) * len(spl2)) # Compute mean average +# except ZeroDivisionError: +# print(spl1, spl2) +# print(g1.nodes(data=True)) +# print(g1.edges(data=True)) +# raise Exception + + return nb_v_comparison, nb_e_comparison + + + def _get_all_node_kernels(self, g1, g2): + nb_comparison = 0 + + vk_dict = {} # shortest path matrices dict + if len(self._node_labels) > 0: + # node symb and non-synb labeled + if len(self._node_attrs) > 0: + kn = self._node_kernels['mix'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_labels = [n1[1][nl] for nl in self._node_labels] + n2_labels = [n2[1][nl] for nl in self._node_labels] + n1_attrs = [n1[1][na] for na in self._node_attrs] + n2_attrs = [n2[1][na] for na in self._node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) + nb_comparison += 1 + # node symb labeled + else: + kn = self._node_kernels['symb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_labels = [n1[1][nl] for nl in self._node_labels] + n2_labels = [n2[1][nl] for nl in self._node_labels] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) + nb_comparison += 1 + else: + # node non-synb labeled + if len(self._node_attrs) > 0: + kn = self._node_kernels['nsymb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_attrs = [n1[1][na] for na in self._node_attrs] + n2_attrs = [n2[1][na] for na in self._node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) + nb_comparison += 1 + # node unlabeled + else: + pass # @todo: add edge weights. + # for e1 in g1.edges(data=True): + # for e2 in g2.edges(data=True): + # if e1[2]['cost'] == e2[2]['cost']: + # kernel += 1 + # return kernel + + return vk_dict, nb_comparison + + + def _get_all_edge_kernels(self, g1, g2): + nb_comparison = 0 + + # compute kernels between all pairs of edges, which is an idea of + # extension of FCSP. It suits sparse graphs, which is the most case we + # went though. For dense graphs, this would be slow. 
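    # Each edge-pair kernel is computed once but stored under all four orderings
    # of the two node pairs, so later lookups do not depend on the orientation in
    # which an edge is traversed; nb_comparison counts each edge pair only once.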
+ ek_dict = {} # dict of edge kernels + if len(self._edge_labels) > 0: + # edge symb and non-synb labeled + if len(self._edge_attrs) > 0: + ke = self._edge_kernels['mix'] + for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): + e1_labels = [e1[2][el] for el in self._edge_labels] + e2_labels = [e2[2][el] for el in self._edge_labels] + e1_attrs = [e1[2][ea] for ea in self._edge_attrs] + e2_attrs = [e2[2][ea] for ea in self._edge_attrs] + ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs) + ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp + nb_comparison += 1 + # edge symb labeled + else: + ke = self._edge_kernels['symb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + e1_labels = [e1[2][el] for el in self._edge_labels] + e2_labels = [e2[2][el] for el in self._edge_labels] + ek_temp = ke(e1_labels, e2_labels) + ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp + nb_comparison += 1 + else: + # edge non-synb labeled + if len(self._edge_attrs) > 0: + ke = self._edge_kernels['nsymb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + e1_attrs = [e1[2][ea] for ea in self._edge_attrs] + e2_attrs = [e2[2][ea] for ea in self._edge_attrs] + ek_temp = ke(e1_attrs, e2_attrs) + ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp + nb_comparison += 1 + # edge unlabeled + else: + pass + + return ek_dict, nb_comparison \ No newline at end of file diff --git a/gklearn/ged/util/__init__.py b/gklearn/ged/util/__init__.py index f885b18..fe85783 100644 --- a/gklearn/ged/util/__init__.py +++ b/gklearn/ged/util/__init__.py @@ -1,3 +1,3 @@ from gklearn.ged.util.lsape_solver import LSAPESolver -from gklearn.ged.util.util import compute_geds, ged_options_to_string +from gklearn.ged.util.util import pairwise_ged, compute_geds, get_nb_edit_operations, ged_options_to_string from gklearn.ged.util.util import compute_geds_cml, label_costs_to_matrix diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index 7453b65..a5a5ac5 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -11,9 +11,10 @@ import multiprocessing from multiprocessing import Pool from functools import partial import sys -from tqdm import tqdm +# from tqdm import tqdm import networkx as nx from gklearn.ged.env import GEDEnv +from gklearn.utils import get_iters def compute_ged(g1, g2, options): @@ -23,7 +24,7 @@ def compute_ged(g1, g2, options): ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) ged_env.add_nx_graph(g1, '') ged_env.add_nx_graph(g2, '') - listID = ged_env.get_all_graph_ids() + listID = ged_env.get_all_graph_ids() ged_env.init(init_type=options['init_option']) ged_env.set_method(options['method'], ged_options_to_string(options)) ged_env.init_method() @@ -33,9 +34,46 @@ def compute_ged(g1, g2, options): ged_env.run_method(g, h) pi_forward = ged_env.get_forward_map(g, h) pi_backward = ged_env.get_backward_map(g, h) - upper = ged_env.get_upper_bound(g, h) + upper = ged_env.get_upper_bound(g, h) dis = upper - + + # make the map label 
correct (label remove map as np.inf) + nodes1 = [n for n in g1.nodes()] + nodes2 = [n for n in g2.nodes()] + nb1 = nx.number_of_nodes(g1) + nb2 = nx.number_of_nodes(g2) + pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] + pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] +# print(pi_forward) + + return dis, pi_forward, pi_backward + + +def pairwise_ged(g1, g2, options={}, sort=True, repeats=1, parallel=False, verbose=True): + from gklearn.gedlib import librariesImport, gedlibpy + + ged_env = gedlibpy.GEDEnv() + ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) + ged_env.add_nx_graph(g1, '') + ged_env.add_nx_graph(g2, '') + listID = ged_env.get_all_graph_ids() + ged_env.init(init_option=(options['init_option'] if 'init_option' in options else 'EAGER_WITHOUT_SHUFFLED_COPIES')) + ged_env.set_method(options['method'], ged_options_to_string(options)) + ged_env.init_method() + + g = listID[0] + h = listID[1] + dis_min = np.inf + for i in range(0, repeats): + ged_env.run_method(g, h) + upper = ged_env.get_upper_bound(g, h) + dis = upper + if dis < dis_min: + dis_min = dis + pi_forward = ged_env.get_forward_map(g, h) + pi_backward = ged_env.get_backward_map(g, h) +# lower = ged_env.get_lower_bound(g, h) + # make the map label correct (label remove map as np.inf) nodes1 = [n for n in g1.nodes()] nodes2 = [n for n in g2.nodes()] @@ -56,7 +94,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True for g in graphs: ged_env.add_nx_graph(g, '') listID = ged_env.get_all_graph_ids() - + node_labels = ged_env.get_all_node_labels() edge_labels = ged_env.get_all_edge_labels() node_label_costs = label_costs_to_matrix(options['node_label_costs'], len(node_labels)) if 'node_label_costs' in options else None @@ -73,7 +111,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True if node_label_costs is None and edge_label_costs is None: neo_options = {'edit_cost': options['edit_cost'], 'is_cml': False, - 'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], + 'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']} else: neo_options = {'edit_cost': options['edit_cost'], @@ -98,11 +136,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True G_listID = listID_toshare do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID)) - if verbose: - iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), - desc='computing GEDs', file=sys.stdout) - else: - iterator = pool.imap_unordered(do_partial, itr, chunksize) + iterator = get_iters(pool.imap_unordered(do_partial, itr, chunksize), desc='computing GEDs', file=sys.stdout, length=len_itr, verbose=verbose) # iterator = pool.imap_unordered(do_partial, itr, chunksize) for i, j, dis, n_eo_tmp in iterator: idx_itr = int(len(graphs) * i + j - (i + 1) * (i + 2) / 2) @@ -114,14 +148,11 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True # print(i, j, idx_itr, dis) pool.close() pool.join() - + else: ged_vec = [] n_edit_operations = [] - if verbose: - iterator = tqdm(range(len(graphs)), desc='computing GEDs', file=sys.stdout) - else: - iterator = range(len(graphs)) + iterator = get_iters(range(len(graphs)), desc='computing GEDs', file=sys.stdout, 
length=len(graphs), verbose=verbose) for i in iterator: # for i in range(len(graphs)): for j in range(i + 1, len(graphs)): @@ -138,7 +169,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True return ged_vec, ged_mat, n_edit_operations -def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbose=True): +def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, n_jobs=None, verbose=True): from gklearn.gedlib import librariesImport, gedlibpy # initialize ged env. @@ -146,7 +177,7 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) for g in graphs: ged_env.add_nx_graph(g, '') - listID = ged_env.get_all_graph_ids() + listID = ged_env.get_all_graph_ids() ged_env.init() if parallel: options['threads'] = 1 @@ -155,7 +186,7 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo # compute ged. neo_options = {'edit_cost': options['edit_cost'], - 'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], + 'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']} ged_mat = np.zeros((len(graphs), len(graphs))) if parallel: @@ -163,7 +194,8 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo ged_vec = [0 for i in range(len_itr)] n_edit_operations = [0 for i in range(len_itr)] itr = combinations(range(0, len(graphs)), 2) - n_jobs = multiprocessing.cpu_count() + if n_jobs is None: + n_jobs = multiprocessing.cpu_count() if len_itr < 100 * n_jobs: chunksize = int(len_itr / n_jobs) + 1 else: @@ -175,11 +207,7 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo G_listID = listID_toshare do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort, repeats) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID)) - if verbose: - iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), - desc='computing GEDs', file=sys.stdout) - else: - iterator = pool.imap_unordered(do_partial, itr, chunksize) + iterator = get_iters(pool.imap_unordered(do_partial, itr, chunksize), desc='computing GEDs', file=sys.stdout, length=len_itr, verbose=verbose) # iterator = pool.imap_unordered(do_partial, itr, chunksize) for i, j, dis, n_eo_tmp in iterator: idx_itr = int(len(graphs) * i + j - (i + 1) * (i + 2) / 2) @@ -191,14 +219,11 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo # print(i, j, idx_itr, dis) pool.close() pool.join() - + else: ged_vec = [] n_edit_operations = [] - if verbose: - iterator = tqdm(range(len(graphs)), desc='computing GEDs', file=sys.stdout) - else: - iterator = range(len(graphs)) + iterator = get_iters(range(len(graphs)), desc='computing GEDs', file=sys.stdout, length=len(graphs), verbose=verbose) for i in iterator: # for i in range(len(graphs)): for j in range(i + 1, len(graphs)): @@ -232,14 +257,14 @@ def _compute_ged_parallel(env, gid1, gid2, g1, g2, options, sort, repeats): def _compute_ged(env, gid1, gid2, g1, g2, repeats): - dis_min = np.inf + dis_min = np.inf # @todo: maybe compare distance and then do others (faster). 
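    # Run the GED method `repeats` times and keep the smallest upper bound
    # together with its forward/backward node maps; the @todo above suggests
    # extracting the maps only when the distance actually improves.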
for i in range(0, repeats): env.run_method(gid1, gid2) pi_forward = env.get_forward_map(gid1, gid2) pi_backward = env.get_backward_map(gid1, gid2) - upper = env.get_upper_bound(gid1, gid2) + upper = env.get_upper_bound(gid1, gid2) dis = upper - + # make the map label correct (label remove map as np.inf) nodes1 = [n for n in g1.nodes()] nodes2 = [n for n in g2.nodes()] @@ -247,7 +272,7 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats): nb2 = nx.number_of_nodes(g2) pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] - + if dis < dis_min: dis_min = dis pi_forward_min = pi_forward @@ -268,7 +293,7 @@ def label_costs_to_matrix(costs, nb_labels): Returns ------- - cost_matrix : numpy.array. + cost_matrix : numpy.array. The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData. """ # Initialize label cost matrix. @@ -282,13 +307,13 @@ def label_costs_to_matrix(costs, nb_labels): for row in range(1, nb_labels + 1): cost_matrix[row, 0] = costs[i] i += 1 - # Costs of substitutions. + # Costs of substitutions. for row in range(1, nb_labels + 1): for col in range(row + 1, nb_labels + 1): cost_matrix[row, col] = costs[i] cost_matrix[col, row] = costs[i] i += 1 - + return cost_matrix @@ -299,7 +324,7 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is edge_labels = kwargs.get('edge_labels', []) return get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, node_labels=node_labels, edge_labels=edge_labels) - else: + else: raise Exception('Edit cost "', edit_cost, '" is not supported.') else: if edit_cost == 'LETTER' or edit_cost == 'LETTER2': @@ -307,21 +332,21 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is elif edit_cost == 'NON_SYMBOLIC': node_attrs = kwargs.get('node_attrs', []) edge_attrs = kwargs.get('edge_attrs', []) - return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, + return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, node_attrs=node_attrs, edge_attrs=edge_attrs) elif edit_cost == 'CONSTANT': node_labels = kwargs.get('node_labels', []) edge_labels = kwargs.get('edge_labels', []) - return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, + return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, node_labels=node_labels, edge_labels=edge_labels) - else: - return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map) - - -def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, + else: + return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map) + + +def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, node_labels=[], edge_labels=[]): """Compute times that edit operations are used in an edit path for symbolic-labeled graphs, where the costs are different for each pair of nodes. - + Returns ------- list @@ -330,7 +355,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, # Initialize. nb_ops_node = np.zeros((1 + len(node_labels), 1 + len(node_labels))) nb_ops_edge = np.zeros((1 + len(edge_labels), 1 + len(edge_labels))) - + # For nodes. 
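To make the flat cost vector consumed by `label_costs_to_matrix` concrete, a small worked example follows. The values are arbitrary, and the insertion-then-deletion-then-substitution ordering of the vector is assumed from the loops shown in this hunk; the import path is likewise an assumption.
```
# Worked example for label_costs_to_matrix with nb_labels = 2 (arbitrary values).
# Assumed flat-vector order: insertion costs, then deletion costs, then the
# upper triangle of substitution costs, matching the loops in this hunk.
import numpy as np
from gklearn.ged.util import label_costs_to_matrix  # import path assumed

costs = np.array([1., 2.,   # insert label 1 / label 2
                  3., 4.,   # delete label 1 / label 2
                  5.])      # substitute label 1 <-> label 2
cost_matrix = label_costs_to_matrix(costs, 2)
# Row/column 0 is the dummy label, so under that ordering the result is
# [[0. 1. 2.]
#  [3. 0. 5.]
#  [4. 5. 0.]]
print(cost_matrix)
```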
nodes1 = [n for n in g1.nodes()] for i, map_i in enumerate(forward_map): @@ -350,7 +375,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, label = tuple(g2.nodes[nodes2[i]].items()) idx_label = node_labels.index(label) # @todo: faster nb_ops_node[0, idx_label + 1] += 1 - + # For edges. edges1 = [e for e in g1.edges()] edges2_marked = [] @@ -371,7 +396,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, label2 = tuple(g2.edges[(nf2, nt2)].items()) if label1 != label2: idx_label2 = edge_labels.index(label2) # @todo: faster - nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1 + nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1 # Switch nf2 and nt2, for directed graphs. elif (nt2, nf2) in g2.edges(): edges2_marked.append((nt2, nf2)) @@ -389,7 +414,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, label = tuple(g2.edges[(nt, nf)].items()) idx_label = edge_labels.index(label) # @todo: faster nb_ops_edge[0, idx_label + 1] += 1 - + # Reform the numbers of edit oeprations into a vector. nb_eo_vector = [] # node insertion. @@ -412,9 +437,9 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, for i in range(1, len(nb_ops_edge)): for j in range(i + 1, len(nb_ops_edge)): nb_eo_vector.append(nb_ops_edge[i, j]) - + return nb_eo_vector - + def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, node_labels=[], edge_labels=[]): @@ -426,7 +451,7 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, n_ei = 0 n_er = 0 n_es = 0 - + nodes1 = [n for n in g1.nodes()] for i, map_i in enumerate(forward_map): if map_i == np.inf: @@ -441,9 +466,9 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, for map_i in backward_map: if map_i == np.inf: n_vi += 1 - + # idx_nodes1 = range(0, len(node1)) - + edges1 = [e for e in g1.edges()] nb_edges2_cnted = 0 for n1, n2 in edges1: @@ -475,7 +500,7 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, else: n_er += 1 n_ei = nx.number_of_edges(g2) - nb_edges2_cnted - + return n_vi, n_vr, n_vs, n_ei, n_er, n_es @@ -488,7 +513,7 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): sod_vs = 0 n_ei = 0 n_er = 0 - + nodes1 = [n for n in g1.nodes()] for i, map_i in enumerate(forward_map): if map_i == np.inf: @@ -501,9 +526,9 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): for map_i in backward_map: if map_i == np.inf: n_vi += 1 - + # idx_nodes1 = range(0, len(node1)) - + edges1 = [e for e in g1.edges()] nb_edges2_cnted = 0 for n1, n2 in edges1: @@ -520,7 +545,7 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): else: n_er += 1 n_ei = nx.number_of_edges(g2) - nb_edges2_cnted - + return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er @@ -536,7 +561,7 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, n_er = 0 n_es = 0 sod_es = 0 - + nodes1 = [n for n in g1.nodes()] for i, map_i in enumerate(forward_map): if map_i == np.inf: @@ -551,9 +576,9 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, for map_i in backward_map: if map_i == np.inf: n_vi += 1 - + # idx_nodes1 = range(0, len(node1)) - + edges1 = [e for e in g1.edges()] for n1, n2 in edges1: idx1 = nodes1.index(n1) @@ -582,7 +607,7 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, else: n_er += 1 n_ei = nx.number_of_edges(g2) - n_es - + return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es @@ -615,7 +640,7 @@ def 
ged_options_to_string(options): opt_str += '--log ' + str(val) + ' ' elif key == 'randomness': opt_str += '--randomness ' + str(val) + ' ' - + # if not isinstance(val, list): # opt_str += '--' + key.replace('_', '-') + ' ' # if val == False: diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index e9a4032..2692713 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -37,7 +37,7 @@ class GraphKernel(object): elif len(graphs[0]) == 0: raise Exception('The graph list given is empty. No computation was performed.') else: - self._graphs = [g.copy() for g in graphs[0]] + self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. self._gram_matrix = self._compute_gram_matrix() self._gram_matrix_unnorm = np.copy(self._gram_matrix) if self._normalize: diff --git a/gklearn/kernels/structural_sp.py b/gklearn/kernels/structural_sp.py index 5662d18..35ed9d1 100644 --- a/gklearn/kernels/structural_sp.py +++ b/gklearn/kernels/structural_sp.py @@ -14,7 +14,7 @@ import sys from itertools import product # from functools import partial from multiprocessing import Pool -from tqdm import tqdm +from gklearn.utils import get_iters # import networkx as nx import numpy as np from gklearn.utils.parallel import parallel_gm, parallel_me @@ -41,10 +41,7 @@ class StructuralSP(GraphKernel): def _compute_gm_series(self): # get shortest paths of each graph in the graphs. splist = [] - if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='getting sp graphs', file=sys.stdout) - else: - iterator = self._graphs + iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) if self._compute_method == 'trie': for g in iterator: splist.append(self._get_sps_as_trie(g)) @@ -57,10 +54,9 @@ class StructuralSP(GraphKernel): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) - if self._verbose >= 2: - iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) - else: - iterator = itr + len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, + length=len_itr, verbose=(self._verbose >= 2)) if self._compute_method == 'trie': for i, j in iterator: kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) @@ -91,11 +87,9 @@ class StructuralSP(GraphKernel): get_sps_fun = self._wrapper_get_sps_trie else: get_sps_fun = self._wrapper_get_sps_naive - if self.verbose >= 2: - iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize), - desc='getting shortest paths', file=sys.stdout) - else: - iterator = pool.imap_unordered(get_sps_fun, itr, chunksize) + iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), + desc='getting shortest paths', file=sys.stdout, + length=len(self._graphs), verbose=(self._verbose >= 2)) for i, sp in iterator: splist[i] = sp pool.close() @@ -122,10 +116,8 @@ class StructuralSP(GraphKernel): # get shortest paths of g1 and each graph in g_list. 
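The `get_iters` helper that replaces the repeated `tqdm`-or-plain-iterator branches can be used on its own; the sketch below is inferred from the call sites in this patch (progress bar when verbose, bare iterable otherwise).
```
# Minimal sketch of get_iters, the tqdm wrapper used throughout this patch;
# behaviour inferred from its call sites (progress bar when verbose is True,
# otherwise the bare iterable is returned).
import sys
from gklearn.utils import get_iters

graphs = list(range(1000))  # stand-in for a list of graphs
for g in get_iters(graphs, desc='getting sp graphs', file=sys.stdout,
                   length=len(graphs), verbose=True):
    pass  # per-graph work goes here
```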
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) splist = [] - if self._verbose >= 2: - iterator = tqdm(g_list, desc='getting sp graphs', file=sys.stdout) - else: - iterator = g_list + iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, + verbose=(self._verbose >= 2)) if self._compute_method == 'trie': for g in iterator: splist.append(self._get_sps_as_trie(g)) @@ -135,10 +127,8 @@ class StructuralSP(GraphKernel): # compute kernel list. kernel_list = [None] * len(g_list) - if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) - else: - iterator = range(len(g_list)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', + file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) if self._compute_method == 'trie': for i in iterator: kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i]) @@ -166,11 +156,9 @@ class StructuralSP(GraphKernel): get_sps_fun = self._wrapper_get_sps_trie else: get_sps_fun = self._wrapper_get_sps_naive - if self.verbose >= 2: - iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize), - desc='getting shortest paths', file=sys.stdout) - else: - iterator = pool.imap_unordered(get_sps_fun, itr, chunksize) + iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), + desc='getting shortest paths', file=sys.stdout, + length=len(g_list), verbose=(self._verbose >= 2)) for i, sp in iterator: splist[i] = sp pool.close() diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index 8e225d6..3d6fa2c 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -12,13 +12,13 @@ import os class Dataset(object): - - import warnings - warnings.simplefilter('always', DeprecationWarning) - warnings.warn('This class has been moved to "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning) - - + + def __init__(self, filename=None, filename_targets=None, **kwargs): + import warnings + warnings.simplefilter('always', DeprecationWarning) + warnings.warn('This class has been moved to "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning) + if filename is None: self._graphs = None self._targets = None @@ -28,7 +28,7 @@ class Dataset(object): self._edge_attrs = None else: self.load_dataset(filename, filename_targets=filename_targets, **kwargs) - + self._substructures = None self._node_label_dim = None self._edge_label_dim = None @@ -53,8 +53,8 @@ class Dataset(object): self._node_attr_dim = None self._edge_attr_dim = None self._class_number = None - - + + def load_dataset(self, filename, filename_targets=None, **kwargs): self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs) self._node_labels = label_names['node_labels'] @@ -62,15 +62,15 @@ class Dataset(object): self._edge_labels = label_names['edge_labels'] self._edge_attrs = label_names['edge_attrs'] self.clean_labels() - - + + def load_graphs(self, graphs, targets=None): # this has to be followed by set_labels(). 
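Moving the DeprecationWarning into `__init__` means it now fires when the legacy class is instantiated rather than once at import time. A small illustrative check, assuming only what the hunk above shows:
```
# Illustrative check that the DeprecationWarning is now raised at
# instantiation time instead of at import time.
import warnings
from gklearn.utils.dataset import Dataset  # legacy location touched above

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    ds = Dataset()  # no arguments: members are simply initialized to None
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```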
self._graphs = graphs self._targets = targets # self.set_labels_attrs() # @todo - - + + def load_predefined_dataset(self, ds_name): current_path = os.path.dirname(os.path.realpath(__file__)) + '/' if ds_name == 'Acyclic': @@ -130,7 +130,7 @@ class Dataset(object): self._graphs, self._targets, label_names = load_dataset(ds_file) elif ds_name == 'NCI109': ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) + self._graphs, self._targets, label_names = load_dataset(ds_file) elif ds_name == 'PAH': ds_file = current_path + '../../datasets/PAH/dataset.ds' self._graphs, self._targets, label_names = load_dataset(ds_file) @@ -143,13 +143,13 @@ class Dataset(object): pass else: raise Exception('The dataset name "', ds_name, '" is not pre-defined.') - + self._node_labels = label_names['node_labels'] self._node_attrs = label_names['node_attrs'] self._edge_labels = label_names['edge_labels'] self._edge_attrs = label_names['edge_attrs'] self.clean_labels() - + def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]): self._node_labels = node_labels @@ -157,7 +157,7 @@ class Dataset(object): self._edge_labels = edge_labels self._edge_attrs = edge_attrs - + def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None): # @todo: remove labels which have only one possible values. if node_labels is None: @@ -183,86 +183,86 @@ class Dataset(object): # if 'attributes' in e[2]: # return len(e[2]['attributes']) # return 0 - - + + def get_dataset_infos(self, keys=None, params=None): """Computes and returns the structure and property information of the graph dataset. - + Parameters ---------- keys : list, optional A list of strings which indicate which informations will be returned. The possible choices includes: - - 'substructures': sub-structures graphs contains, including 'linear', 'non + + 'substructures': sub-structures graphs contains, including 'linear', 'non linear' and 'cyclic'. - + 'node_label_dim': whether vertices have symbolic labels. - + 'edge_label_dim': whether egdes have symbolic labels. - + 'directed': whether graphs in dataset are directed. - + 'dataset_size': number of graphs in dataset. - + 'total_node_num': total number of vertices of all graphs in dataset. - + 'ave_node_num': average number of vertices of graphs in dataset. - + 'min_node_num': minimum number of vertices of graphs in dataset. - + 'max_node_num': maximum number of vertices of graphs in dataset. - + 'total_edge_num': total number of edges of all graphs in dataset. - + 'ave_edge_num': average number of edges of graphs in dataset. - + 'min_edge_num': minimum number of edges of graphs in dataset. - + 'max_edge_num': maximum number of edges of graphs in dataset. - + 'ave_node_degree': average vertex degree of graphs in dataset. - + 'min_node_degree': minimum vertex degree of graphs in dataset. - + 'max_node_degree': maximum vertex degree of graphs in dataset. - - 'ave_fill_factor': average fill factor (number_of_edges / + + 'ave_fill_factor': average fill factor (number_of_edges / (number_of_nodes ** 2)) of graphs in dataset. - + 'min_fill_factor': minimum fill factor of graphs in dataset. - + 'max_fill_factor': maximum fill factor of graphs in dataset. - + 'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset. - + 'edge_label_nums': list number of symbolic edge labels of graphs in dataset. - - 'node_attr_dim': number of dimensions of non-symbolic vertex labels. 
+ + 'node_attr_dim': number of dimensions of non-symbolic vertex labels. Extracted from the 'attributes' attribute of graph nodes. - - 'edge_attr_dim': number of dimensions of non-symbolic edge labels. + + 'edge_attr_dim': number of dimensions of non-symbolic edge labels. Extracted from the 'attributes' attribute of graph edges. - + 'class_number': number of classes. Only available for classification problems. - + 'all_degree_entropy': the entropy of degree distribution of each graph. - + 'ave_degree_entropy': the average entropy of degree distribution of all graphs. - + All informations above will be returned if `keys` is not given. - + params: dict of dict, optional - A dictinary which contains extra parameters for each possible + A dictinary which contains extra parameters for each possible element in ``keys``. - + Return ------ dict Information of the graph dataset keyed by `keys`. """ infos = {} - + if keys == None: keys = [ 'substructures', @@ -292,13 +292,13 @@ class Dataset(object): 'all_degree_entropy', 'ave_degree_entropy' ] - + # dataset size if 'dataset_size' in keys: if self._dataset_size is None: self._dataset_size = self._get_dataset_size() infos['dataset_size'] = self._dataset_size - + # graph node number if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']): all_node_nums = self._get_all_node_nums() @@ -307,22 +307,22 @@ class Dataset(object): if self._total_node_num is None: self._total_node_num = self._get_total_node_num(all_node_nums) infos['total_node_num'] = self._total_node_num - + if 'ave_node_num' in keys: if self._ave_node_num is None: self._ave_node_num = self._get_ave_node_num(all_node_nums) infos['ave_node_num'] = self._ave_node_num - + if 'min_node_num' in keys: if self._min_node_num is None: self._min_node_num = self._get_min_node_num(all_node_nums) infos['min_node_num'] = self._min_node_num - + if 'max_node_num' in keys: if self._max_node_num is None: self._max_node_num = self._get_max_node_num(all_node_nums) infos['max_node_num'] = self._max_node_num - + # graph edge number if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']): all_edge_nums = self._get_all_edge_nums() @@ -331,12 +331,12 @@ class Dataset(object): if self._total_edge_num is None: self._total_edge_num = self._get_total_edge_num(all_edge_nums) infos['total_edge_num'] = self._total_edge_num - + if 'ave_edge_num' in keys: if self._ave_edge_num is None: self._ave_edge_num = self._get_ave_edge_num(all_edge_nums) infos['ave_edge_num'] = self._ave_edge_num - + if 'max_edge_num' in keys: if self._max_edge_num is None: self._max_edge_num = self._get_max_edge_num(all_edge_nums) @@ -346,120 +346,120 @@ class Dataset(object): if self._min_edge_num is None: self._min_edge_num = self._get_min_edge_num(all_edge_nums) infos['min_edge_num'] = self._min_edge_num - + # label number if 'node_label_dim' in keys: if self._node_label_dim is None: self._node_label_dim = self._get_node_label_dim() - infos['node_label_dim'] = self._node_label_dim - + infos['node_label_dim'] = self._node_label_dim + if 'node_label_nums' in keys: if self._node_label_nums is None: self._node_label_nums = {} for node_label in self._node_labels: self._node_label_nums[node_label] = self._get_node_label_num(node_label) infos['node_label_nums'] = self._node_label_nums - + if 'edge_label_dim' in keys: if self._edge_label_dim is None: self._edge_label_dim = self._get_edge_label_dim() - infos['edge_label_dim'] = self._edge_label_dim - + infos['edge_label_dim'] = 
self._edge_label_dim + if 'edge_label_nums' in keys: if self._edge_label_nums is None: self._edge_label_nums = {} for edge_label in self._edge_labels: self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label) infos['edge_label_nums'] = self._edge_label_nums - + if 'directed' in keys or 'substructures' in keys: if self._directed is None: self._directed = self._is_directed() infos['directed'] = self._directed - + # node degree if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']): all_node_degrees = self._get_all_node_degrees() - + if 'ave_node_degree' in keys: if self._ave_node_degree is None: self._ave_node_degree = self._get_ave_node_degree(all_node_degrees) infos['ave_node_degree'] = self._ave_node_degree - + if 'max_node_degree' in keys: if self._max_node_degree is None: self._max_node_degree = self._get_max_node_degree(all_node_degrees) infos['max_node_degree'] = self._max_node_degree - + if 'min_node_degree' in keys: if self._min_node_degree is None: self._min_node_degree = self._get_min_node_degree(all_node_degrees) infos['min_node_degree'] = self._min_node_degree - + # fill factor if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']): all_fill_factors = self._get_all_fill_factors() - + if 'ave_fill_factor' in keys: if self._ave_fill_factor is None: self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors) infos['ave_fill_factor'] = self._ave_fill_factor - + if 'max_fill_factor' in keys: if self._max_fill_factor is None: self._max_fill_factor = self._get_max_fill_factor(all_fill_factors) infos['max_fill_factor'] = self._max_fill_factor - + if 'min_fill_factor' in keys: if self._min_fill_factor is None: self._min_fill_factor = self._get_min_fill_factor(all_fill_factors) infos['min_fill_factor'] = self._min_fill_factor - + if 'substructures' in keys: if self._substructures is None: self._substructures = self._get_substructures() infos['substructures'] = self._substructures - + if 'class_number' in keys: if self._class_number is None: self._class_number = self._get_class_number() infos['class_number'] = self._class_number - + if 'node_attr_dim' in keys: if self._node_attr_dim is None: self._node_attr_dim = self._get_node_attr_dim() infos['node_attr_dim'] = self._node_attr_dim - + if 'edge_attr_dim' in keys: if self._edge_attr_dim is None: self._edge_attr_dim = self._get_edge_attr_dim() infos['edge_attr_dim'] = self._edge_attr_dim - + # entropy of degree distribution. 
- + if 'all_degree_entropy' in keys: if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']): base = params['all_degree_entropy']['base'] else: base = None infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base) - + if 'ave_degree_entropy' in keys: if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']): base = params['ave_degree_entropy']['base'] else: base = None infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) - + return infos - - + + def print_graph_infos(self, infos): from collections import OrderedDict keys = list(infos.keys()) print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0])))) - - + + def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): node_labels = [item for item in node_labels if item in self._node_labels] edge_labels = [item for item in edge_labels if item in self._edge_labels] @@ -485,8 +485,8 @@ class Dataset(object): self._node_attrs = [na for na in self._node_attrs if na not in node_attrs] if len(edge_attrs) > 0: self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs] - - + + def clean_labels(self): labels = [] for name in self._node_labels: @@ -543,8 +543,8 @@ class Dataset(object): for ed in G.edges(): del G.edges[ed][name] self._edge_attrs = labels - - + + def cut_graphs(self, range_): self._graphs = [self._graphs[i] for i in range_] if self._targets is not None: @@ -561,8 +561,8 @@ class Dataset(object): self._graphs = [p[1] for p in trimed_pairs] self._targets = [self._targets[i] for i in idx] self.clean_labels() - - + + def copy(self): dataset = Dataset() graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None @@ -575,8 +575,8 @@ class Dataset(object): dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) # @todo: clean_labels and add other class members? 
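A usage sketch for `get_dataset_infos` with the keys and `params` documented above; the predefined dataset name is an illustrative assumption, and the legacy `gklearn.utils.dataset.Dataset` class is used only because it is the one shown in this hunk.
```
# Usage sketch for the keys and params documented in get_dataset_infos.
# The dataset name is an assumption; any predefined dataset would do.
from gklearn.utils.dataset import Dataset

ds = Dataset()
ds.load_predefined_dataset('Acyclic')
infos = ds.get_dataset_infos(
    keys=['dataset_size', 'ave_node_num', 'ave_edge_num', 'node_label_dim',
          'node_attr_dim', 'ave_degree_entropy'],
    params={'ave_degree_entropy': {'base': 2}})
ds.print_graph_infos(infos)
```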
return dataset - - + + def get_all_node_labels(self): node_labels = [] for g in self._graphs: @@ -585,8 +585,8 @@ class Dataset(object): if nl not in node_labels: node_labels.append(nl) return node_labels - - + + def get_all_edge_labels(self): edge_labels = [] for g in self._graphs: @@ -595,94 +595,94 @@ class Dataset(object): if el not in edge_labels: edge_labels.append(el) return edge_labels - - + + def _get_dataset_size(self): return len(self._graphs) - - + + def _get_all_node_nums(self): return [nx.number_of_nodes(G) for G in self._graphs] - - + + def _get_total_node_nums(self, all_node_nums): return np.sum(all_node_nums) - - + + def _get_ave_node_num(self, all_node_nums): return np.mean(all_node_nums) - - + + def _get_min_node_num(self, all_node_nums): return np.amin(all_node_nums) - - + + def _get_max_node_num(self, all_node_nums): return np.amax(all_node_nums) - - + + def _get_all_edge_nums(self): return [nx.number_of_edges(G) for G in self._graphs] - - + + def _get_total_edge_nums(self, all_edge_nums): return np.sum(all_edge_nums) - - + + def _get_ave_edge_num(self, all_edge_nums): return np.mean(all_edge_nums) - - + + def _get_min_edge_num(self, all_edge_nums): return np.amin(all_edge_nums) - - + + def _get_max_edge_num(self, all_edge_nums): return np.amax(all_edge_nums) - - + + def _get_node_label_dim(self): return len(self._node_labels) - - + + def _get_node_label_num(self, node_label): nl = set() for G in self._graphs: nl = nl | set(nx.get_node_attributes(G, node_label).values()) return len(nl) - - + + def _get_edge_label_dim(self): return len(self._edge_labels) - - + + def _get_edge_label_num(self, edge_label): el = set() for G in self._graphs: el = el | set(nx.get_edge_attributes(G, edge_label).values()) return len(el) - - + + def _is_directed(self): return nx.is_directed(self._graphs[0]) - - + + def _get_all_node_degrees(self): return [np.mean(list(dict(G.degree()).values())) for G in self._graphs] - - + + def _get_ave_node_degree(self, all_node_degrees): return np.mean(all_node_degrees) - - + + def _get_max_node_degree(self, all_node_degrees): return np.amax(all_node_degrees) - - + + def _get_min_node_degree(self, all_node_degrees): return np.amin(all_node_degrees) - - + + def _get_all_fill_factors(self): """Get fill factor, the number of non-zero entries in the adjacency matrix. @@ -692,20 +692,20 @@ class Dataset(object): List of fill factors for all graphs. """ return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs] - + def _get_ave_fill_factor(self, all_fill_factors): return np.mean(all_fill_factors) - - + + def _get_max_fill_factor(self, all_fill_factors): return np.amax(all_fill_factors) - - + + def _get_min_fill_factor(self, all_fill_factors): return np.amin(all_fill_factors) - - + + def _get_substructures(self): subs = set() for G in self._graphs: @@ -737,22 +737,22 @@ class Dataset(object): # if any(len(i) > 2 for i in cyc): # subs.add('cyclic') # break - + return subs - - + + def _get_class_num(self): return len(set(self._targets)) - - + + def _get_node_attr_dim(self): return len(self._node_attrs) - - + + def _get_edge_attr_dim(self): return len(self._edge_attrs) - + def _compute_all_degree_entropy(self, base=None): """Compute the entropy of degree distribution of each graph. @@ -767,15 +767,15 @@ class Dataset(object): The calculated entropy. 
""" from gklearn.utils.stats import entropy - + degree_entropy = [] for g in self._graphs: degrees = list(dict(g.degree()).values()) en = entropy(degrees, base=base) degree_entropy.append(en) return degree_entropy - - + + @property def graphs(self): return self._graphs @@ -784,8 +784,8 @@ class Dataset(object): @property def targets(self): return self._targets - - + + @property def node_labels(self): return self._node_labels @@ -794,25 +794,25 @@ class Dataset(object): @property def edge_labels(self): return self._edge_labels - - + + @property def node_attrs(self): return self._node_attrs - - + + @property def edge_attrs(self): return self._edge_attrs - - + + def split_dataset_by_target(dataset): import warnings warnings.simplefilter('always', DeprecationWarning) warnings.warn('This function has been moved to "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning) - + from gklearn.preimage.utils import get_same_item_indices - + graphs = dataset.graphs targets = dataset.targets datasets = [] diff --git a/gklearn/utils/graph_files.py b/gklearn/utils/graph_files.py index 57d0052..bcb983e 100644 --- a/gklearn/utils/graph_files.py +++ b/gklearn/utils/graph_files.py @@ -1,8 +1,8 @@ """ Utilities function to manage graph files """ -import warnings -warnings.simplefilter('always', DeprecationWarning) -warnings.warn('The functions in the module "gklearn.utils.graph_files" will be deprecated and removed since version 0.4.0. Use the corresponding functions in the module "gklearn.dataset" instead.', DeprecationWarning) +# import warnings +# warnings.simplefilter('always', DeprecationWarning) +# warnings.warn('The functions in the module "gklearn.utils.graph_files" will be deprecated and removed since version 0.4.0. Use the corresponding functions in the module "gklearn.dataset" instead.', DeprecationWarning) from os.path import dirname, splitext @@ -26,17 +26,17 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs): y : List Targets corresponding to graphs. - + Notes ----- This function supports following graph dataset formats: 'ds': load data from .ds file. See comments of function loadFromDS for a example. - 'cxl': load data from Graph eXchange Language file (.cxl file). See + 'cxl': load data from Graph eXchange Language file (.cxl file). See `here `__ for detail. - 'sdf': load data from structured data file (.sdf file). See + 'sdf': load data from structured data file (.sdf file). See `here `__ for details. @@ -77,20 +77,20 @@ def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', **kwargs): import warnings warnings.simplefilter('always', DeprecationWarning) warnings.warn('The function "gklearn.utils.save_dataset" will be deprecated and removed since version 0.4.0. 
Use the class "gklearn.dataset.DataSaver" instead.', DeprecationWarning) - + import os dirname_ds = os.path.dirname(filename) if dirname_ds != '': dirname_ds += '/' os.makedirs(dirname_ds, exist_ok=True) - + if 'graph_dir' in kwargs: graph_dir = kwargs['graph_dir'] + '/' os.makedirs(graph_dir, exist_ok=True) del kwargs['graph_dir'] else: - graph_dir = dirname_ds - + graph_dir = dirname_ds + if group == 'xml' and gformat == 'gxl': with open(filename + '.xml', 'w') as fgroup: fgroup.write("") @@ -122,7 +122,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he 1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo 2 3 1 1 - + Check `CTFile Formats file `__ for detailed format discription. """ @@ -144,7 +144,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he if count_line_tags[i] != '': # if not obsoleted g.graph[count_line_tags[i]] = tmp[i].strip() i += 1 - + # read the atom block. atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag'] for i in range(0, nb_atoms): @@ -156,7 +156,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he if atom_tags[j] != '': g.nodes[i][atom_tags[j]] = tmp[j].strip() j += 1 - + # read the bond block. bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status'] for i in range(0, nb_bonds): @@ -169,7 +169,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he if bond_tags[j] != '': g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip() j += 1 - + # get label names. label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1] @@ -188,7 +188,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he else: label_names['edge_attrs'].append(key) break - + return g, label_names @@ -215,19 +215,19 @@ def load_gxl(filename): # @todo: directed graphs. for attr in edge.iter('attr'): labels[attr.attrib['name']] = attr[0].text g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) - + # get label names. label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} for node in root.iter('node'): for attr in node.iter('attr'): - if attr[0].tag == 'int': # @todo: this maybe wrong, and slow. + if attr[0].tag == 'int': # @todo: this maybe wrong, and slow. label_names['node_labels'].append(attr.attrib['name']) else: label_names['node_attrs'].append(attr.attrib['name']) break for edge in root.iter('edge'): for attr in edge.iter('attr'): - if attr[0].tag == 'int': # @todo: this maybe wrong, and slow. + if attr[0].tag == 'int': # @todo: this maybe wrong, and slow. 
label_names['edge_labels'].append(attr.attrib['name']) else: label_names['edge_attrs'].append(attr.attrib['name']) @@ -249,20 +249,20 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[], gxl_file.write("\n") for v, attrs in graph.nodes(data=True): gxl_file.write("") - for l_name in node_labels: - gxl_file.write("" + + for l_name in node_labels: + gxl_file.write("" + str(attrs[l_name]) + "") - for a_name in node_attrs: - gxl_file.write("" + + for a_name in node_attrs: + gxl_file.write("" + str(attrs[a_name]) + "") gxl_file.write("\n") for v1, v2, attrs in graph.edges(data=True): gxl_file.write("") - for l_name in edge_labels: - gxl_file.write("" + + for l_name in edge_labels: + gxl_file.write("" + str(attrs[l_name]) + "") - for a_name in edge_attrs: - gxl_file.write("" + + for a_name in edge_attrs: + gxl_file.write("" + str(attrs[a_name]) + "") gxl_file.write("\n") gxl_file.write("\n") @@ -276,7 +276,7 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[], attr['edgeids'] = 'true' attr['edgemode'] = 'undirected' graph_node = ET.SubElement(root_node, 'graph', attrib=attr) - + for v in graph: current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) for attr in graph.nodes[v].keys(): @@ -285,7 +285,7 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[], cur_value = ET.SubElement(cur_attr, graph.nodes[v][attr].__class__.__name__) cur_value.text = graph.nodes[v][attr] - + for v1 in graph: for v2 in graph[v1]: if (v1 < v2): # Non oriented graphs @@ -302,7 +302,7 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[], cur_value = ET.SubElement( cur_attr, graph[v1][v2][attr].__class__.__name__) cur_value.text = str(graph[v1][v2][attr]) - + tree = ET.ElementTree(root_node) tree.write(filename) elif method == 'gedlib': @@ -458,11 +458,11 @@ def load_mat(filename, order): # @todo: need to be updated (auto order) or depre g.add_edge(col, row) data.append(g) # print(g.edges(data=True)) - + label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} if order[1] == 0: label_names['edge_labels'].append('label_1') - + return data, y, label_names @@ -477,12 +477,12 @@ def load_tud(filename): import networkx as nx from os import listdir from os.path import dirname, basename - - + + def get_infos_from_readme(frm): # @todo: add README (cuniform), maybe node/edge label maps. """Get information from DS_label_readme.txt file. """ - + def get_label_names_from_line(line): """Get names of labels/attributes from a line. """ @@ -490,8 +490,8 @@ def load_tud(filename): names = str_names.split(',') names = [attr.strip() for attr in names] return names - - + + def get_class_label_map(label_map_strings): label_map = {} for string in label_map_strings: @@ -500,7 +500,7 @@ def load_tud(filename): return label_map - label_names = {'node_labels': [], 'node_attrs': [], + label_names = {'node_labels': [], 'node_attrs': [], 'edge_labels': [], 'edge_attrs': []} class_label_map = None class_label_map_strings = [] @@ -528,16 +528,16 @@ def load_tud(filename): line = content_rm[i].strip() class_label_map = get_class_label_map(class_label_map_strings) i += 1 - + return label_names, class_label_map - + # get dataset name. 
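For orientation, a hedged sketch of calling `load_tud` on a dataset stored in the TU layout probed by the file-discovery loop just below; the path is taken from the commented examples further down in this file and is illustrative only.
```
# Illustrative call only; the path comes from the commented examples at the
# bottom of this file and assumes the usual TU layout (<DS>_A.txt plus the
# companion _graph_indicator / label / attribute files probed below).
from gklearn.utils.graph_files import load_tud

graphs, targets, label_names = load_tud('../../datasets/Fingerprint/Fingerprint_A.txt')
print(len(graphs), 'graphs,', len(set(targets)), 'classes')
print(label_names['node_labels'], label_names['edge_labels'])
```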
dirname_dataset = dirname(filename) filename = basename(filename) fn_split = filename.split('_A') ds_name = fn_split[0].strip() - + # load data file names for name in listdir(dirname_dataset): if ds_name + '_A' in name: @@ -561,20 +561,20 @@ def load_tud(filename): # this is supposed to be the node attrs, make sure to put this as the last 'elif' elif ds_name + '_attributes' in name: fna = dirname_dataset + '/' + name - + # get labels and attributes names. if 'frm' in locals(): label_names, class_label_map = get_infos_from_readme(frm) else: - label_names = {'node_labels': [], 'node_attrs': [], + label_names = {'node_labels': [], 'node_attrs': [], 'edge_labels': [], 'edge_attrs': []} class_label_map = None - + with open(fgi) as gi: content_gi = gi.read().splitlines() # graph indicator with open(fam) as am: content_am = am.read().splitlines() # adjacency matrix - + # load targets. if 'fgl' in locals(): with open(fgl) as gl: @@ -609,7 +609,7 @@ def load_tud(filename): else: for i, line in enumerate(content_gi): data[int(line) - 1].add_node(i) - + # add edges for line in content_am: tmp = line.split(',') @@ -670,7 +670,7 @@ def load_tud(filename): data[g].edges[n[0], n[1]][a_name] = attrs[i] return data, targets, label_names - + def load_from_ds(filename, filename_targets): """Load data from .ds file. @@ -681,9 +681,9 @@ def load_from_ds(filename, filename_targets): '.gxl': see dunction load_gxl for detail. - Note these graph formats are checked automatically by the extensions of + Note these graph formats are checked automatically by the extensions of graph files. - """ + """ dirname_dataset = dirname(filename) data = [] y = [] @@ -695,7 +695,7 @@ def load_from_ds(filename, filename_targets): load_file_fun = load_ct elif extension == 'gxl' or extension == 'sdf': # @todo: .sdf not tested yet. load_file_fun = load_gxl - + if filename_targets is None or filename_targets == '': for i in range(0, len(content)): tmp = content[i].split(' ') @@ -711,7 +711,7 @@ def load_from_ds(filename, filename_targets): g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1)) data.append(g) _append_label_names(label_names, l_names) - + with open(filename_targets) as fnt: content_y = fnt.read().splitlines() # assume entries in filename and filename_targets have the same order. @@ -719,13 +719,13 @@ def load_from_ds(filename, filename_targets): tmp = item.split(' ') # assume the 3rd entry in a line is y (for Alkane dataset) y.append(float(tmp[2])) - + return data, y, label_names # def load_from_cxl(filename): # import xml.etree.ElementTree as ET -# +# # dirname_dataset = dirname(filename) # tree = ET.parse(filename) # root = tree.getroot() @@ -736,11 +736,11 @@ def load_from_ds(filename, filename_targets): # mol_class = graph.attrib['class'] # data.append(load_gxl(dirname_dataset + '/' + mol_filename)) # y.append(mol_class) - - + + def load_from_xml(filename, dir_dataset=None): import xml.etree.ElementTree as ET - + if dir_dataset is not None: dir_dataset = dir_dataset else: @@ -757,16 +757,16 @@ def load_from_xml(filename, dir_dataset=None): data.append(g) _append_label_names(label_names, l_names) y.append(mol_class) - + return data, y, label_names def _append_label_names(label_names, new_names): for key, val in label_names.items(): label_names[key] += [name for name in new_names[key] if name not in val] - - -if __name__ == '__main__': + + +if __name__ == '__main__': # ### Load dataset from .ds file. # # .ct files. 
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', @@ -782,7 +782,7 @@ if __name__ == '__main__': # print(Gn[1].nodes(data=True)) # print(Gn[1].edges(data=True)) # print(targets[1]) - + # # .gxl file. # ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb # Gn, y, label_names = load_dataset(ds_file) @@ -803,7 +803,7 @@ if __name__ == '__main__': # ### Convert graph from one format to another. # # .gxl file. # import networkx as nx -# ds = {'name': 'monoterpenoides', +# ds = {'name': 'monoterpenoides', # 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb # Gn, y = loadDataset(ds['dataset']) # y = [int(i) for i in y] @@ -826,13 +826,13 @@ if __name__ == '__main__': # filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' # xparams = {'method': 'gedlib'} # saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) - + # save dataset. # ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb # Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) # saveDataset(Gn, y, group='xml', filename='temp/temp') - + # test - new way to add labels and attributes. # dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' # filename = '../../datasets/Fingerprint/Fingerprint_A.txt' diff --git a/gklearn/utils/graphdataset.py b/gklearn/utils/graphdataset.py index 4c64fd0..4dd7881 100644 --- a/gklearn/utils/graphdataset.py +++ b/gklearn/utils/graphdataset.py @@ -5,345 +5,345 @@ This file is for old version of graphkit-learn. def get_dataset_attributes(Gn, - target=None, - attr_names=[], - node_label=None, - edge_label=None): - """Returns the structure and property information of the graph dataset Gn. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs whose information will be returned. - - target : list - The list of classification targets corresponding to Gn. Only works for - classification problems. - - attr_names : list - List of strings which indicate which informations will be returned. The - possible choices includes: - - 'substructures': sub-structures Gn contains, including 'linear', 'non + target=None, + attr_names=[], + node_label=None, + edge_label=None): + """Returns the structure and property information of the graph dataset Gn. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs whose information will be returned. + + target : list + The list of classification targets corresponding to Gn. Only works for + classification problems. + + attr_names : list + List of strings which indicate which informations will be returned. The + possible choices includes: + + 'substructures': sub-structures Gn contains, including 'linear', 'non linear' and 'cyclic'. - 'node_labeled': whether vertices have symbolic labels. + 'node_labeled': whether vertices have symbolic labels. - 'edge_labeled': whether egdes have symbolic labels. + 'edge_labeled': whether egdes have symbolic labels. - 'is_directed': whether graphs in Gn are directed. + 'is_directed': whether graphs in Gn are directed. - 'dataset_size': number of graphs in Gn. + 'dataset_size': number of graphs in Gn. - 'ave_node_num': average number of vertices of graphs in Gn. + 'ave_node_num': average number of vertices of graphs in Gn. - 'min_node_num': minimum number of vertices of graphs in Gn. 
+ 'min_node_num': minimum number of vertices of graphs in Gn. - 'max_node_num': maximum number of vertices of graphs in Gn. + 'max_node_num': maximum number of vertices of graphs in Gn. - 'ave_edge_num': average number of edges of graphs in Gn. + 'ave_edge_num': average number of edges of graphs in Gn. - 'min_edge_num': minimum number of edges of graphs in Gn. + 'min_edge_num': minimum number of edges of graphs in Gn. - 'max_edge_num': maximum number of edges of graphs in Gn. + 'max_edge_num': maximum number of edges of graphs in Gn. - 'ave_node_degree': average vertex degree of graphs in Gn. + 'ave_node_degree': average vertex degree of graphs in Gn. - 'min_node_degree': minimum vertex degree of graphs in Gn. + 'min_node_degree': minimum vertex degree of graphs in Gn. - 'max_node_degree': maximum vertex degree of graphs in Gn. + 'max_node_degree': maximum vertex degree of graphs in Gn. - 'ave_fill_factor': average fill factor (number_of_edges / + 'ave_fill_factor': average fill factor (number_of_edges / (number_of_nodes ** 2)) of graphs in Gn. - 'min_fill_factor': minimum fill factor of graphs in Gn. + 'min_fill_factor': minimum fill factor of graphs in Gn. - 'max_fill_factor': maximum fill factor of graphs in Gn. + 'max_fill_factor': maximum fill factor of graphs in Gn. - 'node_label_num': number of symbolic vertex labels. + 'node_label_num': number of symbolic vertex labels. - 'edge_label_num': number of symbolic edge labels. + 'edge_label_num': number of symbolic edge labels. - 'node_attr_dim': number of dimensions of non-symbolic vertex labels. + 'node_attr_dim': number of dimensions of non-symbolic vertex labels. Extracted from the 'attributes' attribute of graph nodes. - 'edge_attr_dim': number of dimensions of non-symbolic edge labels. + 'edge_attr_dim': number of dimensions of non-symbolic edge labels. Extracted from the 'attributes' attribute of graph edges. - 'class_number': number of classes. Only available for classification problems. + 'class_number': number of classes. Only available for classification problems. - node_label : string - Node attribute used as label. The default node label is atom. Mandatory - when 'node_labeled' or 'node_label_num' is required. + node_label : string + Node attribute used as label. The default node label is atom. Mandatory + when 'node_labeled' or 'node_label_num' is required. - edge_label : string - Edge attribute used as label. The default edge label is bond_type. - Mandatory when 'edge_labeled' or 'edge_label_num' is required. - - Return - ------ - attrs : dict - Value for each property. 
- """ - import networkx as nx - import numpy as np - - attrs = {} - - def get_dataset_size(Gn): - return len(Gn) - - def get_all_node_num(Gn): - return [nx.number_of_nodes(G) for G in Gn] - - def get_ave_node_num(all_node_num): - return np.mean(all_node_num) - - def get_min_node_num(all_node_num): - return np.amin(all_node_num) - - def get_max_node_num(all_node_num): - return np.amax(all_node_num) - - def get_all_edge_num(Gn): - return [nx.number_of_edges(G) for G in Gn] - - def get_ave_edge_num(all_edge_num): - return np.mean(all_edge_num) - - def get_min_edge_num(all_edge_num): - return np.amin(all_edge_num) - - def get_max_edge_num(all_edge_num): - return np.amax(all_edge_num) - - def is_node_labeled(Gn): - return False if node_label is None else True - - def get_node_label_num(Gn): - nl = set() - for G in Gn: - nl = nl | set(nx.get_node_attributes(G, node_label).values()) - return len(nl) - - def is_edge_labeled(Gn): - return False if edge_label is None else True - - def get_edge_label_num(Gn): - el = set() - for G in Gn: - el = el | set(nx.get_edge_attributes(G, edge_label).values()) - return len(el) - - def is_directed(Gn): - return nx.is_directed(Gn[0]) - - def get_ave_node_degree(Gn): - return np.mean([np.mean(list(dict(G.degree()).values())) for G in Gn]) - - def get_max_node_degree(Gn): - return np.amax([np.mean(list(dict(G.degree()).values())) for G in Gn]) - - def get_min_node_degree(Gn): - return np.amin([np.mean(list(dict(G.degree()).values())) for G in Gn]) - - # get fill factor, the number of non-zero entries in the adjacency matrix. - def get_ave_fill_factor(Gn): - return np.mean([nx.number_of_edges(G) / (nx.number_of_nodes(G) - * nx.number_of_nodes(G)) for G in Gn]) - - def get_max_fill_factor(Gn): - return np.amax([nx.number_of_edges(G) / (nx.number_of_nodes(G) - * nx.number_of_nodes(G)) for G in Gn]) - - def get_min_fill_factor(Gn): - return np.amin([nx.number_of_edges(G) / (nx.number_of_nodes(G) - * nx.number_of_nodes(G)) for G in Gn]) - - def get_substructures(Gn): - subs = set() - for G in Gn: - degrees = list(dict(G.degree()).values()) - if any(i == 2 for i in degrees): - subs.add('linear') - if np.amax(degrees) >= 3: - subs.add('non linear') - if 'linear' in subs and 'non linear' in subs: - break - - if is_directed(Gn): - for G in Gn: - if len(list(nx.find_cycle(G))) > 0: - subs.add('cyclic') - break - # else: - # # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way. 
- # upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10 - # for G in Gn: - # if (nx.number_of_edges(G) < upper): - # cyc = list(nx.simple_cycles(G.to_directed())) - # if any(len(i) > 2 for i in cyc): - # subs.add('cyclic') - # break - # if 'cyclic' not in subs: - # for G in Gn: - # cyc = list(nx.simple_cycles(G.to_directed())) - # if any(len(i) > 2 for i in cyc): - # subs.add('cyclic') - # break - - return subs - - def get_class_num(target): - return len(set(target)) - - def get_node_attr_dim(Gn): - for G in Gn: - for n in G.nodes(data=True): - if 'attributes' in n[1]: - return len(n[1]['attributes']) - return 0 - - def get_edge_attr_dim(Gn): - for G in Gn: - if nx.number_of_edges(G) > 0: - for e in G.edges(data=True): - if 'attributes' in e[2]: - return len(e[2]['attributes']) - return 0 - - if attr_names == []: - attr_names = [ - 'substructures', - 'node_labeled', - 'edge_labeled', - 'is_directed', - 'dataset_size', - 'ave_node_num', - 'min_node_num', - 'max_node_num', - 'ave_edge_num', - 'min_edge_num', - 'max_edge_num', - 'ave_node_degree', - 'min_node_degree', - 'max_node_degree', - 'ave_fill_factor', - 'min_fill_factor', - 'max_fill_factor', - 'node_label_num', - 'edge_label_num', - 'node_attr_dim', - 'edge_attr_dim', - 'class_number', - ] - - # dataset size - if 'dataset_size' in attr_names: - - attrs.update({'dataset_size': get_dataset_size(Gn)}) - - # graph node number - if any(i in attr_names - for i in ['ave_node_num', 'min_node_num', 'max_node_num']): - - all_node_num = get_all_node_num(Gn) - - if 'ave_node_num' in attr_names: - - attrs.update({'ave_node_num': get_ave_node_num(all_node_num)}) - - if 'min_node_num' in attr_names: - - attrs.update({'min_node_num': get_min_node_num(all_node_num)}) - - if 'max_node_num' in attr_names: - - attrs.update({'max_node_num': get_max_node_num(all_node_num)}) - - # graph edge number - if any(i in attr_names for i in - ['ave_edge_num', 'min_edge_num', 'max_edge_num']): - - all_edge_num = get_all_edge_num(Gn) + edge_label : string + Edge attribute used as label. The default edge label is bond_type. + Mandatory when 'edge_labeled' or 'edge_label_num' is required. - if 'ave_edge_num' in attr_names: + Return + ------ + attrs : dict + Value for each property. 
+ """ + import networkx as nx + import numpy as np + + attrs = {} + + def get_dataset_size(Gn): + return len(Gn) + + def get_all_node_num(Gn): + return [nx.number_of_nodes(G) for G in Gn] + + def get_ave_node_num(all_node_num): + return np.mean(all_node_num) + + def get_min_node_num(all_node_num): + return np.amin(all_node_num) + + def get_max_node_num(all_node_num): + return np.amax(all_node_num) + + def get_all_edge_num(Gn): + return [nx.number_of_edges(G) for G in Gn] + + def get_ave_edge_num(all_edge_num): + return np.mean(all_edge_num) + + def get_min_edge_num(all_edge_num): + return np.amin(all_edge_num) + + def get_max_edge_num(all_edge_num): + return np.amax(all_edge_num) + + def is_node_labeled(Gn): + return False if node_label is None else True + + def get_node_label_num(Gn): + nl = set() + for G in Gn: + nl = nl | set(nx.get_node_attributes(G, node_label).values()) + return len(nl) + + def is_edge_labeled(Gn): + return False if edge_label is None else True + + def get_edge_label_num(Gn): + el = set() + for G in Gn: + el = el | set(nx.get_edge_attributes(G, edge_label).values()) + return len(el) + + def is_directed(Gn): + return nx.is_directed(Gn[0]) + + def get_ave_node_degree(Gn): + return np.mean([np.mean(list(dict(G.degree()).values())) for G in Gn]) + + def get_max_node_degree(Gn): + return np.amax([np.mean(list(dict(G.degree()).values())) for G in Gn]) + + def get_min_node_degree(Gn): + return np.amin([np.mean(list(dict(G.degree()).values())) for G in Gn]) + + # get fill factor, the number of non-zero entries in the adjacency matrix. + def get_ave_fill_factor(Gn): + return np.mean([nx.number_of_edges(G) / (nx.number_of_nodes(G) + * nx.number_of_nodes(G)) for G in Gn]) + + def get_max_fill_factor(Gn): + return np.amax([nx.number_of_edges(G) / (nx.number_of_nodes(G) + * nx.number_of_nodes(G)) for G in Gn]) + + def get_min_fill_factor(Gn): + return np.amin([nx.number_of_edges(G) / (nx.number_of_nodes(G) + * nx.number_of_nodes(G)) for G in Gn]) + + def get_substructures(Gn): + subs = set() + for G in Gn: + degrees = list(dict(G.degree()).values()) + if any(i == 2 for i in degrees): + subs.add('linear') + if np.amax(degrees) >= 3: + subs.add('non linear') + if 'linear' in subs and 'non linear' in subs: + break + + if is_directed(Gn): + for G in Gn: + if len(list(nx.find_cycle(G))) > 0: + subs.add('cyclic') + break +# else: +# # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way. 
+# upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10 +# for G in Gn: +# if (nx.number_of_edges(G) < upper): +# cyc = list(nx.simple_cycles(G.to_directed())) +# if any(len(i) > 2 for i in cyc): +# subs.add('cyclic') +# break +# if 'cyclic' not in subs: +# for G in Gn: +# cyc = list(nx.simple_cycles(G.to_directed())) +# if any(len(i) > 2 for i in cyc): +# subs.add('cyclic') +# break + + return subs + + def get_class_num(target): + return len(set(target)) + + def get_node_attr_dim(Gn): + for G in Gn: + for n in G.nodes(data=True): + if 'attributes' in n[1]: + return len(n[1]['attributes']) + return 0 + + def get_edge_attr_dim(Gn): + for G in Gn: + if nx.number_of_edges(G) > 0: + for e in G.edges(data=True): + if 'attributes' in e[2]: + return len(e[2]['attributes']) + return 0 + + if attr_names == []: + attr_names = [ + 'substructures', + 'node_labeled', + 'edge_labeled', + 'is_directed', + 'dataset_size', + 'ave_node_num', + 'min_node_num', + 'max_node_num', + 'ave_edge_num', + 'min_edge_num', + 'max_edge_num', + 'ave_node_degree', + 'min_node_degree', + 'max_node_degree', + 'ave_fill_factor', + 'min_fill_factor', + 'max_fill_factor', + 'node_label_num', + 'edge_label_num', + 'node_attr_dim', + 'edge_attr_dim', + 'class_number', + ] + + # dataset size + if 'dataset_size' in attr_names: - attrs.update({'ave_edge_num': get_ave_edge_num(all_edge_num)}) + attrs.update({'dataset_size': get_dataset_size(Gn)}) - if 'max_edge_num' in attr_names: + # graph node number + if any(i in attr_names + for i in ['ave_node_num', 'min_node_num', 'max_node_num']): - attrs.update({'max_edge_num': get_max_edge_num(all_edge_num)}) + all_node_num = get_all_node_num(Gn) + + if 'ave_node_num' in attr_names: - if 'min_edge_num' in attr_names: + attrs.update({'ave_node_num': get_ave_node_num(all_node_num)}) + + if 'min_node_num' in attr_names: - attrs.update({'min_edge_num': get_min_edge_num(all_edge_num)}) + attrs.update({'min_node_num': get_min_node_num(all_node_num)}) + + if 'max_node_num' in attr_names: - # label number - if any(i in attr_names for i in ['node_labeled', 'node_label_num']): - is_nl = is_node_labeled(Gn) - node_label_num = get_node_label_num(Gn) + attrs.update({'max_node_num': get_max_node_num(all_node_num)}) + + # graph edge number + if any(i in attr_names for i in + ['ave_edge_num', 'min_edge_num', 'max_edge_num']): - if 'node_labeled' in attr_names: - # graphs are considered node unlabeled if all nodes have the same label. - attrs.update({'node_labeled': is_nl if node_label_num > 1 else False}) + all_edge_num = get_all_edge_num(Gn) - if 'node_label_num' in attr_names: - attrs.update({'node_label_num': node_label_num}) + if 'ave_edge_num' in attr_names: - if any(i in attr_names for i in ['edge_labeled', 'edge_label_num']): - is_el = is_edge_labeled(Gn) - edge_label_num = get_edge_label_num(Gn) + attrs.update({'ave_edge_num': get_ave_edge_num(all_edge_num)}) - if 'edge_labeled' in attr_names: - # graphs are considered edge unlabeled if all edges have the same label. 
-        attrs.update({'edge_labeled': is_el if edge_label_num > 1 else False})
+    if 'max_edge_num' in attr_names:
-    if 'edge_label_num' in attr_names:
-        attrs.update({'edge_label_num': edge_label_num})
+        attrs.update({'max_edge_num': get_max_edge_num(all_edge_num)})
-    if 'is_directed' in attr_names:
-        attrs.update({'is_directed': is_directed(Gn)})
+    if 'min_edge_num' in attr_names:
-    if 'ave_node_degree' in attr_names:
-        attrs.update({'ave_node_degree': get_ave_node_degree(Gn)})
+        attrs.update({'min_edge_num': get_min_edge_num(all_edge_num)})
-    if 'max_node_degree' in attr_names:
-        attrs.update({'max_node_degree': get_max_node_degree(Gn)})
+    # label number
+    if any(i in attr_names for i in ['node_labeled', 'node_label_num']):
+        is_nl = is_node_labeled(Gn)
+        node_label_num = get_node_label_num(Gn)
-    if 'min_node_degree' in attr_names:
-        attrs.update({'min_node_degree': get_min_node_degree(Gn)})
-
-    if 'ave_fill_factor' in attr_names:
-        attrs.update({'ave_fill_factor': get_ave_fill_factor(Gn)})
+    if 'node_labeled' in attr_names:
+        # graphs are considered node unlabeled if all nodes have the same label.
+        attrs.update({'node_labeled': is_nl if node_label_num > 1 else False})
-    if 'max_fill_factor' in attr_names:
-        attrs.update({'max_fill_factor': get_max_fill_factor(Gn)})
+    if 'node_label_num' in attr_names:
+        attrs.update({'node_label_num': node_label_num})
-    if 'min_fill_factor' in attr_names:
-        attrs.update({'min_fill_factor': get_min_fill_factor(Gn)})
+    if any(i in attr_names for i in ['edge_labeled', 'edge_label_num']):
+        is_el = is_edge_labeled(Gn)
+        edge_label_num = get_edge_label_num(Gn)
-    if 'substructures' in attr_names:
-        attrs.update({'substructures': get_substructures(Gn)})
+    if 'edge_labeled' in attr_names:
+        # graphs are considered edge unlabeled if all edges have the same label.
+        attrs.update({'edge_labeled': is_el if edge_label_num > 1 else False})
-    if 'class_number' in attr_names:
-        attrs.update({'class_number': get_class_num(target)})
+    if 'edge_label_num' in attr_names:
+        attrs.update({'edge_label_num': edge_label_num})
-    if 'node_attr_dim' in attr_names:
-        attrs['node_attr_dim'] = get_node_attr_dim(Gn)
+    if 'is_directed' in attr_names:
+        attrs.update({'is_directed': is_directed(Gn)})
-    if 'edge_attr_dim' in attr_names:
-        attrs['edge_attr_dim'] = get_edge_attr_dim(Gn)
+    if 'ave_node_degree' in attr_names:
+        attrs.update({'ave_node_degree': get_ave_node_degree(Gn)})
-    from collections import OrderedDict
-    return OrderedDict(
-        sorted(attrs.items(), key=lambda i: attr_names.index(i[0])))
+    if 'max_node_degree' in attr_names:
+        attrs.update({'max_node_degree': get_max_node_degree(Gn)})
+
+    if 'min_node_degree' in attr_names:
+        attrs.update({'min_node_degree': get_min_node_degree(Gn)})
+
+    if 'ave_fill_factor' in attr_names:
+        attrs.update({'ave_fill_factor': get_ave_fill_factor(Gn)})
+
+    if 'max_fill_factor' in attr_names:
+        attrs.update({'max_fill_factor': get_max_fill_factor(Gn)})
+
+    if 'min_fill_factor' in attr_names:
+        attrs.update({'min_fill_factor': get_min_fill_factor(Gn)})
+
+    if 'substructures' in attr_names:
+        attrs.update({'substructures': get_substructures(Gn)})
+
+    if 'class_number' in attr_names:
+        attrs.update({'class_number': get_class_num(target)})
+
+    if 'node_attr_dim' in attr_names:
+        attrs['node_attr_dim'] = get_node_attr_dim(Gn)
+
+    if 'edge_attr_dim' in attr_names:
+        attrs['edge_attr_dim'] = get_edge_attr_dim(Gn)
+
+    from collections import OrderedDict
+    return OrderedDict(
+        sorted(attrs.items(), key=lambda i: attr_names.index(i[0])))
 def load_predefined_dataset(ds_name):
     import os
     from gklearn.utils.graphfiles import loadDataset
-
+
     current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
     if ds_name == 'Acyclic':
         ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
@@ -415,5 +415,5 @@ def load_predefined_dataset(ds_name):
         pass
     else:
         raise Exception('The dataset name "', ds_name, '" is not pre-defined.')
-
+
     return graphs, targets
\ No newline at end of file
diff --git a/gklearn/utils/kernels.py b/gklearn/utils/kernels.py
index 49ce04b..5bd7e4d 100644
--- a/gklearn/utils/kernels.py
+++ b/gklearn/utils/kernels.py
@@ -18,8 +18,8 @@ def deltakernel(x, y):
     References
     ----------
-    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
-    labeled graphs. In Proceedings of the 20th International Conference on
+    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
+    labeled graphs. In Proceedings of the 20th International Conference on
     Machine Learning, Washington, DC, United States, 2003.
     """
     return x == y #(1 if condition else 0)
@@ -68,7 +68,7 @@ def polynomialkernel(x, y, d=1, c=0):
     x, y : array
     d : integer, default 1
-
+
     c : float, default 0
     Returns
@@ -89,7 +89,7 @@ def linearkernel(x, y):
     x, y : array
     d : integer, default 1
-
+
     c : float, default 0
     Returns
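For orientation, here is a minimal usage sketch of the dataset-attribute helpers touched by the hunks above. The import path and the enclosing function name `get_dataset_attributes(Gn, target=None, attr_names=[], node_label=None, edge_label=None)` are inferred from the surrounding code and are assumptions, not guaranteed by this patch; `load_predefined_dataset('Acyclic')` is taken from the diff context.
```python
# Hypothetical usage sketch (names and import path assumed, see note above):
# summarize a predefined dataset with the nested attribute helpers.
from gklearn.utils.graphdataset import (get_dataset_attributes,
                                        load_predefined_dataset)

# load_predefined_dataset returns a list of networkx graphs and their targets.
graphs, targets = load_predefined_dataset('Acyclic')

# Compute a subset of the statistics; the result is an OrderedDict whose keys
# follow the order given in attr_names.
attrs = get_dataset_attributes(
    graphs,
    target=targets,
    attr_names=['dataset_size', 'ave_node_num', 'ave_edge_num',
                'substructures', 'class_number'],
    node_label=None,
    edge_label=None)
print(attrs)
```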