From 85c17b06c1cc82d63178ce92ab2ab123e9c1e341 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 5 Jan 2021 15:19:25 +0100 Subject: [PATCH] [Exp] GED stability: add marker file, use edge labels in datasets, test Acyclic and Alkane. --- ... edit_costs.real_data.nums_sols.ratios.IPFP.py} | 79 ++++--- gklearn/experiments/ged/stability/group_results.py | 105 ++++++--- ..._edit_costs.real_data.nums_sols.ratios.IPFP.py} | 16 +- gklearn/experiments/ged/stability/utils.py | 249 ++++++++++++++++++++- 4 files changed, 368 insertions(+), 81 deletions(-) rename gklearn/experiments/ged/stability/{edit_costs.nums_sols.ratios.IPFP.py => edit_costs.real_data.nums_sols.ratios.IPFP.py} (72%) rename gklearn/experiments/ged/stability/{run_job_edit_costs.nums_sols.ratios.IPFP.py => run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py} (62%) diff --git a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py similarity index 72% rename from gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py rename to gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py index 710213a..33c6973 100644 --- a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py @@ -4,7 +4,7 @@ Created on Wed Oct 20 11:48:02 2020 @author: ljia -""" +""" # This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. 
import os @@ -13,15 +13,15 @@ import pickle import logging from gklearn.ged.util import compute_geds import time -from utils import get_dataset +from utils import get_dataset, set_edit_cost_consts import sys -from group_results import group_trials +from group_results import group_trials, check_group_existence, update_group_marker def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) - + # Return if the file exists. if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): return None, None @@ -41,8 +41,11 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): 'threads': multiprocessing.cpu_count(), 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' } - - edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] + + edit_cost_constants = set_edit_cost_consts(ratio, + node_labeled=len(dataset.node_labels), + edge_labeled=len(dataset.edge_labels), + mode='uniform') # edit_cost_constants = [item * 0.01 for item in edit_cost_constants] # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) @@ -53,7 +56,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): options['node_attrs'] = dataset.node_attrs options['edge_attrs'] = dataset.edge_attrs parallel = True # if num_solutions == 1 else False - + """**5. Compute GED matrix.**""" ged_mat = 'error' runtime = 0 @@ -67,9 +70,9 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception(save_file_suffix) print(repr(exp)) - + """**6. 
Get results.**""" - + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: pickle.dump(ged_mat, f) with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: @@ -77,66 +80,76 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): return ged_mat, runtime - + def save_trials_as_group(dataset, ds_name, num_solutions, ratio): # Return if the group file exists. name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' - if os.path.isfile(name_group): + if check_group_existence(name_group): return - + ged_mats = [] runtimes = [] - for trial in range(1, 101): + num_trials = 100 + for trial in range(1, num_trials + 1): print() print('Trial:', trial) ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial) ged_mats.append(ged_mat) runtimes.append(runtime) - + # Group trials and Remove single files. + # @todo: if the program stops between the following lines, then there may be errors. name_prefix = 'ged_matrix' + name_middle - group_trials(save_dir, name_prefix, True, True, False) + group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) name_prefix = 'runtime' + name_middle - group_trials(save_dir, name_prefix, True, True, False) + group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) + update_group_marker(name_group) def results_for_a_dataset(ds_name): """**1. 
Get dataset.**""" dataset = get_dataset(ds_name) - - for num_solutions in num_solutions_list: + + for ratio in ratio_list: print() - print('# of solutions:', num_solutions) - for ratio in ratio_list: + print('Ratio:', ratio) + for num_solutions in num_solutions_list: print() - print('Ratio:', ratio) + print('# of solutions:', num_solutions) save_trials_as_group(dataset, ds_name, num_solutions, ratio) - - -def get_param_lists(ds_name): + + +def get_param_lists(ds_name, test=False): + if test: + num_solutions_list = [1, 10, 20, 30, 40, 50] + ratio_list = [10] + return num_solutions_list, ratio_list + if ds_name == 'AIDS_symb': num_solutions_list = [1, 20, 40, 60, 80, 100] ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] else: - num_solutions_list = [1, 20, 40, 60, 80, 100] - ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] - + num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] + return num_solutions_list, ratio_list - + if __name__ == '__main__': if len(sys.argv) > 1: ds_name_list = sys.argv[1:] else: - ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - - save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' + ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] +# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled'] +# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] + + save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir + 'groups/', exist_ok=True) - + for ds_name in ds_name_list: print() print('Dataset:', ds_name) - num_solutions_list, ratio_list = get_param_lists(ds_name) + num_solutions_list, ratio_list = get_param_lists(ds_name, test=False) results_for_a_dataset(ds_name) diff --git a/gklearn/experiments/ged/stability/group_results.py b/gklearn/experiments/ged/stability/group_results.py index 
e1f999e..564625d 100644 --- a/gklearn/experiments/ged/stability/group_results.py +++ b/gklearn/experiments/ged/stability/group_results.py @@ -5,7 +5,7 @@ Created on Thu Oct 29 17:26:43 2020 @author: ljia -This script groups results together into a single file for the sake of faster +This script groups results together into a single file for the sake of faster searching and loading. """ import os @@ -16,9 +16,55 @@ from tqdm import tqdm import sys +def check_group_existence(file_name): + path, name = os.path.split(file_name) + marker_fn = os.path.join(path, 'group_names_finished.pkl') + if os.path.isfile(marker_fn): + with open(marker_fn, 'rb') as f: + fns = pickle.load(f) + if name in fns: + return True + + if os.path.isfile(file_name): + return True + + return False + + +def update_group_marker(file_name): + path, name = os.path.split(file_name) + marker_fn = os.path.join(path, 'group_names_finished.pkl') + if os.path.isfile(marker_fn): + with open(marker_fn, 'rb') as f: + fns = pickle.load(f) + if name in fns: + return + else: + fns.add(name) + else: + fns = set({name}) + with open(marker_fn, 'wb') as f: + pickle.dump(fns, f) + + +def create_group_marker_file(dir_folder, overwrite=True): + if not overwrite: + return + + fns = set() + for file in sorted(os.listdir(dir_folder)): + if os.path.isfile(os.path.join(dir_folder, file)): + if file.endswith('.npy'): + fns.add(file) + + marker_fn = os.path.join(dir_folder, 'group_names_finished.pkl') + with open(marker_fn, 'wb') as f: + pickle.dump(fns, f) + + # This function is used by other scripts. Modify it carefully. -def group_trials(dir_folder, name_prefix, override, clear, backup): - +def group_trials(dir_folder, name_prefix, overwrite, clear, backup, num_trials=100): + # Get group name. 
label_name = name_prefix.split('.')[0] if label_name == 'ged_matrix': @@ -33,10 +79,10 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): else: name_group = dir_folder + 'groups/' + group_label + name_suffix + 'pkl' - if not override and os.path.isfile(name_group): + if not overwrite and os.path.isfile(name_group): # Check if all trial files exist. trials_complete = True - for trial in range(1, 101): + for trial in range(1, num_trials + 1): file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' if not os.path.isfile(file_name): trials_complete = False @@ -44,7 +90,7 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): else: # Get data. data_group = [] - for trial in range(1, 101): + for trial in range(1, num_trials + 1): file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' if os.path.isfile(file_name): with open(file_name, 'rb') as f: @@ -64,7 +110,7 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): else: # Not all trials are completed. return - + # Write groups. if label_name == 'ged_matrix': data_group = np.array(data_group) @@ -73,31 +119,31 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): else: with open(name_group, 'wb') as f: pickle.dump(data_group, f) - + trials_complete = True if trials_complete: # Backup. if backup: - for trial in range(1, 101): + for trial in range(1, num_trials + 1): src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' dst = dir_folder + 'backups/' + name_prefix + 'trial_' + str(trial) + '.pkl' copyfile(src, dst) - + # Clear. if clear: - for trial in range(1, 101): + for trial in range(1, num_trials + 1): src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' os.remove(src) -def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): - +def group_all_in_folder(dir_folder, overwrite=False, clear=True, backup=True): + # Create folders. 
os.makedirs(dir_folder + 'groups/', exist_ok=True) if backup: os.makedirs(dir_folder + 'backups', exist_ok=True) - + # Iterate all files. cur_file_prefix = '' for file in tqdm(sorted(os.listdir(dir_folder)), desc='Grouping', file=sys.stdout): @@ -106,20 +152,23 @@ def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): # print(name) # print(name_prefix) if name_prefix != cur_file_prefix: - group_trials(dir_folder, name_prefix, override, clear, backup) + group_trials(dir_folder, name_prefix, overwrite, clear, backup) cur_file_prefix = name_prefix - - + + if __name__ == '__main__': - dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/' - group_all_in_folder(dir_folder) - - dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' - group_all_in_folder(dir_folder) - - dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' - group_all_in_folder(dir_folder) - - dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' - group_all_in_folder(dir_folder) \ No newline at end of file + # dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/' + # group_all_in_folder(dir_folder) + + # dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' + # group_all_in_folder(dir_folder) + + # dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' + # group_all_in_folder(dir_folder) + + # dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' + # group_all_in_folder(dir_folder) + + dir_folder = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/groups/' + create_group_marker_file(dir_folder) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/run_job_edit_costs.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py similarity index 62% rename from gklearn/experiments/ged/stability/run_job_edit_costs.nums_sols.ratios.IPFP.py rename to 
gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py index 6939a06..7ab72b2 100644 --- a/gklearn/experiments/ged/stability/run_job_edit_costs.nums_sols.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py @@ -15,30 +15,30 @@ def get_job_script(arg): #SBATCH --exclusive #SBATCH --job-name="st.""" + arg + r""".IPFP" -#SBATCH --partition=tlong +#SBATCH --partition=court #SBATCH --mail-type=ALL #SBATCH --mail-user=jajupmochi@gmail.com -#SBATCH --output="outputs/output_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" -#SBATCH --error="errors/error_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" +#SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.IPFP.""" + arg + """.txt" +#SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.IPFP.""" + arg + """.txt" # #SBATCH --ntasks=1 #SBATCH --nodes=1 #SBATCH --cpus-per-task=1 -#SBATCH --time=300:00:00 +#SBATCH --time=48:00:00 #SBATCH --mem-per-cpu=4000 srun hostname srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability -srun python3 edit_costs.nums_sols.ratios.IPFP.py """ + arg +srun python3 edit_costs.real_data.nums_sols.ratios.IPFP.py """ + arg script = script.strip() script = re.sub('\n\t+', '\n', script) script = re.sub('\n +', '\n', script) - + return script if __name__ == '__main__': - ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] - for ds_name in [ds_list[i] for i in [0, 3]]: + ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] + for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]: job_script = get_job_script(ds_name) command = 'sbatch <