From ffb26889b9874f09fc642f1aa82d3a0f02a30c43 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 20 Oct 2020 16:03:00 +0200 Subject: [PATCH] Add exps for the GED stability. --- .../stability/edit_costs.nums_sols.ratios.IPFP.py | 107 +++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py diff --git a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py new file mode 100644 index 0000000..ed7eb2d --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Oct 20 11:48:02 2020 + +@author: ljia +""" +# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. + +import os +import multiprocessing +import pickle +import logging +from gklearn.utils import Dataset +from gklearn.ged.util import compute_geds + + +def get_dataset(ds_name): + # The node/edge labels that will not be used in the computation. + if ds_name == 'MAO': + irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} + elif ds_name == 'Monoterpenoides': + irrelevant_labels = {'edge_labels': ['valence']} + elif ds_name == 'MUTAG': + irrelevant_labels = {'edge_labels': ['label_0']} + elif ds_name == 'AIDS_symb': + irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']} + + # Initialize a Dataset. + dataset = Dataset() + # Load predefined dataset. + dataset.load_predefined_dataset(ds_name) + # Remove irrelevant labels. + dataset.remove_labels(**irrelevant_labels) + print('dataset size:', len(dataset.graphs)) + return dataset + + +def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial): + + save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + """**1. Get dataset.**""" + dataset = get_dataset(ds_name) + + """**2. Set parameters.**""" + + # Parameters for GED computation. + ged_options = {'method': 'IPFP', # use IPFP huristic. + 'initialization_method': 'RANDOM', # or 'NODE', etc. + # when bigger than 1, then the method is considered mIPFP. + 'initial_solutions': int(num_solutions * 4), + 'edit_cost': 'CONSTANT', # use CONSTANT cost. + # the distance between non-symbolic node/edge labels is computed by euclidean distance. + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 0.25, + # parallel threads. Do not work if mpg_options['parallel'] = False. + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' + } + + edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) + + options = ged_options.copy() + options['edit_cost_constants'] = edit_cost_constants + options['node_labels'] = dataset.node_labels + options['edge_labels'] = dataset.edge_labels + options['node_attrs'] = dataset.node_attrs + options['edge_attrs'] = dataset.edge_attrs + parallel = True # if num_solutions == 1 else False + + """**5. Compute GED matrix.**""" + ged_mat = 'error' + try: + ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True) + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception('save_file_suffix') + print(repr(exp)) + + """**6. Get results.**""" + + pickle.dump(ged_mat, open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb')) + + +if __name__ == '__main__': + for ds_name in ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']: + print() + print('Dataset:', ds_name) + for num_solutions in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]: + print() + print('# of solutions:', num_solutions) + for ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: + print() + print('Ratio:', ratio) + for trial in range(1, 101): + print() + print('Trial:', trial) + xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial) \ No newline at end of file