From d24cdec251d81dc6de0ea745c7b22a878a8e2b76 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 2 Nov 2020 16:58:27 +0100 Subject: [PATCH] Update exps: ged stability. --- ...is_stability.ratios.real_data.relative_error.py | 312 +++++++++++++++++++++ .../edit_costs.max_num_sols.ratios.bipartite.py | 130 +++++++++ .../stability/edit_costs.nums_sols.ratios.IPFP.py | 101 ++++--- .../stability/edit_costs.repeats.ratios.IPFP.py | 125 +++++++++ .../edit_costs.repeats.ratios.bipartite.py | 130 +++++++++ gklearn/experiments/ged/stability/group_results.py | 108 +++++++ gklearn/experiments/ged/stability/utils.py | 30 ++ 7 files changed, 893 insertions(+), 43 deletions(-) create mode 100644 gklearn/experiments/ged/stability/Analysis_stability.ratios.real_data.relative_error.py create mode 100644 gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py create mode 100644 gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py create mode 100644 gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py create mode 100644 gklearn/experiments/ged/stability/group_results.py create mode 100644 gklearn/experiments/ged/stability/utils.py diff --git a/gklearn/experiments/ged/stability/Analysis_stability.ratios.real_data.relative_error.py b/gklearn/experiments/ged/stability/Analysis_stability.ratios.real_data.relative_error.py new file mode 100644 index 0000000..a618626 --- /dev/null +++ b/gklearn/experiments/ged/stability/Analysis_stability.ratios.real_data.relative_error.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Nov 6 15:35:32 2018 + +@author: ljia +""" + +#import numpy as np +import matplotlib.pyplot as plt +import numpy as np +import matplotlib.gridspec as gridspec +# import pickle +import os +import sys +from tqdm import tqdm +# from mpl_toolkits.mplot3d import Axes3D + + +root_dir = 
'/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/' + +root_dir_criann = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/CRIANN/' + +Dataset_List = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] + +Legend_Labels = ['common walk', 'marginalized', 'Sylvester equation', 'conjugate gradient', 'fixed-point iterations', 'Spectral decomposition', 'shortest path', 'structural sp', 'path up to length $h$', 'treelet', 'WL subtree'] + +# Colors = ['#084594', '#2171b5', '#4292c6', '#6baed6', '#9ecae1', '#c6dbef', +# '#54278f', '#756bb1', '#9e9ac8', '#de2d26', '#fc9272'] +Colors=[ + '#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a', + '#d62728', '#ff9896', '#9467bd', '#c5b0d5', '#8c564b', '#c49c94', + '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7', '#bcbd22', '#dbdb8d', + '#17becf', '#9edae5'] + +SMALL_SIZE = 8 +MEDIUM_SIZE = 10 +BIGGER_SIZE = 12 + + +def read_trials_group(save_dir, ds_name, num_sols, ratio, label): + file_name = save_dir + 'groups/ged_mats.' + ds_name + '.' + label + '_' + str(num_sols) + '.ratio_' + "{:.2f}".format(ratio) + '.npy' + if os.path.isfile(file_name): + with open(file_name, 'rb') as f: + ged_mats = np.load(f) + return ged_mats + else: + return [] + +# ged_mats = [] +# for trial in range(1, 101): +# file_name = file_prefix + '.trial_' + str(trial) + '.pkl' +# if os.path.isfile(file_name): +# ged_matrix = pickle.load(open(file_name, 'rb')) +# ged_mats.append(ged_matrix) +# else: +# # print(trial) +# pass + + +# Check average relative error along elements in two ged matrices. +def matrices_ave_relative_error(m1, m2): + error = 0 + base = 0 + for i in range(m1.shape[0]): + for j in range(m1.shape[1]): + error += np.abs(m1[i, j] - m2[i, j]) + base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2 + + return error / base + + +def compute_relative_error(ged_mats): + + if len(ged_mats) != 0: + # get the smallest "correct" GED matrix. 
+ ged_mat_s = np.ones(ged_mats[0].shape) * np.inf + for i in range(ged_mats[0].shape[0]): + for j in range(ged_mats[0].shape[1]): + ged_mat_s[i, j] = np.min([mat[i, j] for mat in ged_mats]) + + # compute average error. + errors = [] + for i, mat in enumerate(ged_mats): + err = matrices_ave_relative_error(mat, ged_mat_s) + # if not per_correct: + # print('matrix # ', str(i)) + # pass + errors.append(err) + else: + errors = [0] + + return np.mean(errors) + + + + +#plt.rc('font', size=SMALL_SIZE) # controls default text sizes +plt.rc('axes', titlesize=15) # fontsize of the axes title +plt.rc('axes', labelsize=15) # fontsize of the x and y labels +plt.rc('xtick', labelsize=15) # fontsize of the tick labels +plt.rc('ytick', labelsize=15) # fontsize of the tick labels +plt.rc('legend', fontsize=15) # legend fontsize +plt.rc('figure', titlesize=15) # fontsize of the figure title + +#fig, _ = plt.subplots(2, 2, figsize=(13, 12)) +#ax1 = plt.subplot(221) +#ax2 = plt.subplot(222) +#ax3 = plt.subplot(223) +#ax4 = plt.subplot(224) +gs = gridspec.GridSpec(2, 2) +gs.update(hspace=0.3) +fig = plt.figure(figsize=(11, 12)) +ax = fig.add_subplot(111) # The big subplot for common labels +ax1 = fig.add_subplot(gs[0, 0], projection='3d') +ax2 = fig.add_subplot(gs[0, 1], projection='3d') +ax3 = fig.add_subplot(gs[1, 0], projection='3d') +ax4 = fig.add_subplot(gs[1, 1], projection='3d') +# ax5 = fig.add_subplot(gs[2, 0]) +# ax6 = fig.add_subplot(gs[2, 1]) + +# Turn off axis lines and ticks of the big subplot +ax.spines['top'].set_color('none') +ax.spines['bottom'].set_color('none') +ax.spines['left'].set_color('none') +ax.spines['right'].set_color('none') +ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off') +ax.xaxis.set_ticks_position('none') +ax.yaxis.set_ticks_position('none') +# Set common labels +#ax.set_xlabel('accuracy(%)') +ax.yaxis.set_label_coords(-0.105, 0.5) +# ax.set_ylabel('runtime($s$)') + + +# -------------- num_sols, IPFP -------------- +def 
get_num_sol_results(): + save_dir = root_dir_criann + 'edit_costs.num_sols.ratios.IPFP/' + errors = {} + print('-------- num_sols, IPFP --------') + for ds_name in Dataset_List: + print(ds_name) + errors[ds_name] = [] + for num_sols in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]: + errors[ds_name].append([]) + for ratio in tqdm([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], desc='num_sols = ' + str(num_sols), file=sys.stdout): + ged_mats = read_trials_group(save_dir, ds_name, num_sols, ratio, 'num_sols') + error = compute_relative_error(ged_mats) + errors[ds_name][-1].append(error) + + return errors + +x_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] +y_values = range(0, 19) +X, Y = np.meshgrid(x_values, y_values) +errors = get_num_sol_results() +for i, ds_name in enumerate(Dataset_List): + if ds_name in errors: + z_values = np.array(errors[ds_name]) + ax1.plot_wireframe(X, Y, z_values.T, label=Dataset_List[i], color=Colors[i]) #, '.-', label=Legend_Labels[i], color=Colors[i]) + +# ax1.set_yscale('squareroot') +# ax1.grid(axis='y') +ax1.set_xlabel('# of solutions') +ax1.set_ylabel('ratios') +ax1.set_zlabel('average relative errors (%)') +ax1.set_title('(a) num_sols, IPFP') +ax1.set_yticks(range(0, 19, 2)) +ax1.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10]) +# ax1.set_axisbelow(True) +# ax1.spines['top'].set_visible(False) +# ax1.spines['bottom'].set_visible(False) +# ax1.spines['right'].set_visible(False) +# ax1.spines['left'].set_visible(False) +# ax1.xaxis.set_ticks_position('none') +# ax1.yaxis.set_ticks_position('none') +# ax1.set_ylim(bottom=-1000) +handles, labels = ax1.get_legend_handles_labels() + + + +# # -------------- repeats, IPFP -------------- +def get_repeats_results(): + save_dir = root_dir_criann + 'edit_costs.repeats.ratios.IPFP/' + errors = {} + print('-------- repeats, IPFP --------') + for ds_name in Dataset_List: + print(ds_name) + errors[ds_name] = [] + for num_sols in [1, 10, 20, 30, 40, 50, 
60, 70, 80, 90, 100]: + errors[ds_name].append([]) + for ratio in tqdm([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], desc='num_sols = ' + str(num_sols), file=sys.stdout): + ged_mats = read_trials_group(save_dir, ds_name, num_sols, ratio, 'repeats') + error = compute_relative_error(ged_mats) + errors[ds_name][-1].append(error) + + return errors + +x_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] +y_values = range(0, 19) +X, Y = np.meshgrid(x_values, y_values) +errors = get_repeats_results() +for i, ds_name in enumerate(Dataset_List): + if ds_name in errors: + z_values = np.array(errors[ds_name]) + ax2.plot_wireframe(X, Y, z_values.T, label=Dataset_List[i], color=Colors[i]) #, '.-', label=Legend_Labels[i], color=Colors[i]) + +# ax2.set_yscale('squareroot') +# ax2.grid(axis='y') +ax2.set_xlabel('# of solutions') +ax2.set_ylabel('ratios') +ax2.set_zlabel('average relative errors (%)') +ax2.set_title('(b) repeats, IPFP') +ax2.set_yticks(range(0, 19, 2)) +ax2.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10]) +# ax2.set_axisbelow(True) +# ax2.spines['top'].set_visible(False) +# ax2.spines['bottom'].set_visible(False) +# ax2.spines['right'].set_visible(False) +# ax2.spines['left'].set_visible(False) +# ax2.xaxis.set_ticks_position('none') +# ax2.yaxis.set_ticks_position('none') +# ax2.set_ylim(bottom=-1000) +handles, labels = ax2.get_legend_handles_labels() + + +# # -------------- degrees -------------- +# def get_degree_results(): +# save_dir = root_dir_criann + '28 cores/synthesized_graphs_degrees/' +# run_times = {} +# for kernel_name in Graph_Kernel_List: +# run_times[kernel_name] = [] +# for num in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: +# file_name = save_dir + 'run_time.' + kernel_name + '.' 
+ str(num) + '.pkl' +# if os.path.isfile(file_name): +# run_time = pickle.load(open(file_name, 'rb')) +# else: +# run_time = 0 +# run_times[kernel_name].append(run_time) +# return run_times + +# x_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] +# run_times = get_degree_results() +# for i, kernel_name in enumerate(Graph_Kernel_List): +# if kernel_name in run_times: +# ax3.plot(x_labels, run_times[kernel_name], '.-', label=Legend_Labels[i], color=Colors[i]) + +# ax3.set_yscale('log', nonposy='clip') +# ax3.grid(axis='y') +# ax3.set_xlabel('degrees') +# ax3.set_ylabel('runtime($s$)') +# #ax3.set_ylabel('runtime($s$) per pair of graphs') +# ax3.set_title('(c) degrees') +# ax3.set_axisbelow(True) +# ax3.spines['top'].set_visible(False) +# ax3.spines['bottom'].set_visible(False) +# ax3.spines['right'].set_visible(False) +# ax3.spines['left'].set_visible(False) +# ax3.xaxis.set_ticks_position('none') +# ax3.yaxis.set_ticks_position('none') + + +# # -------------- Node labels -------------- +# def get_node_label_results(): +# save_dir = root_dir_criann + '28 cores/synthesized_graphs_num_node_label_alphabet/' +# run_times = {} +# for kernel_name in Graph_Kernel_List_VSym: +# run_times[kernel_name] = [] +# for num in [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]: +# file_name = save_dir + 'run_time.' + kernel_name + '.' 
+ str(num) + '.pkl' +# if os.path.isfile(file_name): +# run_time = pickle.load(open(file_name, 'rb')) +# else: +# run_time = 0 +# run_times[kernel_name].append(run_time) +# return run_times + +# # save_dir = root_dir_criann + 'synthesized_graphs_num_node_label_alphabet/' +# # run_times = pickle.load(open(save_dir + 'run_times.pkl', 'rb')) +# # return run_times + +# x_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20] +# run_times = get_node_label_results() +# for i, kernel_name in enumerate(Graph_Kernel_List): +# if kernel_name in run_times: +# ax4.plot(x_labels[1:], run_times[kernel_name][1:], '.-', label=Legend_Labels[i], color=Colors[i]) + +# ax4.set_yscale('log', nonposy='clip') +# ax4.grid(axis='y') +# ax4.set_xlabel('# of alphabets') +# ax4.set_ylabel('runtime($s$)') +# #ax4.set_ylabel('runtime($s$) per pair of graphs') +# ax4.set_title('(d) alphabet size of vertex labels') +# ax4.set_axisbelow(True) +# ax4.spines['top'].set_visible(False) +# ax4.spines['bottom'].set_visible(False) +# ax4.spines['right'].set_visible(False) +# ax4.spines['left'].set_visible(False) +# ax4.xaxis.set_ticks_position('none') +# ax4.yaxis.set_ticks_position('none') + + +from matplotlib.lines import Line2D +custom_lines = [] +for color in Colors: + custom_lines.append(Line2D([0], [0], color=color, lw=4)) + +fig.subplots_adjust(bottom=0.135) +fig.legend(custom_lines, labels, loc='lower center', ncol=4, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6) +plt.savefig('stability.real_data.relative_error.eps', format='eps', dpi=300, transparent=True, + bbox_inches='tight') +plt.show() \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py new file mode 100644 index 0000000..d05558a --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py @@ -0,0 +1,130 @@ +#!/usr/bin/env 
python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Nov 2 16:17:01 2020 + +@author: ljia +""" +# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from given numbers of repeats are computed. + +import os +import multiprocessing +import pickle +import logging +from gklearn.ged.util import compute_geds +import numpy as np +import time +from utils import get_dataset +import sys + + +def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): + + save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + """**1. Get dataset.**""" + dataset = get_dataset(ds_name) + + """**2. Set parameters.**""" + + # Parameters for GED computation. + ged_options = {'method': 'BIPARTITE', # use BIPARTITE huristic. + # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) + 'lsape_model': 'ECBP', # + # ??when bigger than 1, then the method is considered mIPFP. + # the actual number of computed solutions might be smaller than the specified value + 'max_num_solutions': max_num_solutions, + 'edit_cost': 'CONSTANT', # use CONSTANT cost. + 'greedy_method': 'BASIC', # + # the distance between non-symbolic node/edge labels is computed by euclidean distance. + 'attr_distance': 'euclidean', + 'optimal': True, # if TRUE, the option --greedy-method has no effect + # parallel threads. Do not work if mpg_options['parallel'] = False. 
+ 'threads': multiprocessing.cpu_count(), + 'centrality_method': 'NONE', + 'centrality_weight': 0.7, + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' + } + + edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) + + options = ged_options.copy() + options['edit_cost_constants'] = edit_cost_constants + options['node_labels'] = dataset.node_labels + options['edge_labels'] = dataset.edge_labels + options['node_attrs'] = dataset.node_attrs + options['edge_attrs'] = dataset.edge_attrs + parallel = True # if num_solutions == 1 else False + + """**5. Compute GED matrix.**""" + ged_mat = 'error' + runtime = 0 + try: + time0 = time.time() + ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) + runtime = time.time() - time0 + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception(save_file_suffix) + print(repr(exp)) + + """**6. Get results.**""" + + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(ged_mat, f) + with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + return ged_mat, runtime + + +def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): + ged_mats = [] + runtimes = [] + for trial in range(1, 101): + print() + print('Trial:', trial) + ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial) + ged_mats.append(ged_mat) + runtimes.append(runtime) + + save_file_suffix = '.' 
+ ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: + np.save(f, np.array(ged_mats)) + with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + +def results_for_a_dataset(ds_name): + """**1. Get dataset.**""" + dataset = get_dataset(ds_name) + + for max_num_solutions in [1, 20, 40, 60, 80, 100]: + print() + print('Max # of solutions:', max_num_solutions) + for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: + print() + print('Ratio:', ratio) + save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) + + +if __name__ == '__main__': + if len(sys.argv) > 1: + ds_name_list = sys.argv[1:] + else: + ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] + + save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + if not os.path.exists(save_dir + 'groups/'): + os.makedirs(save_dir + 'groups/') + + for ds_name in ds_name_list: + print() + print('Dataset:', ds_name) + results_for_a_dataset(ds_name) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py index ed7eb2d..4a3c0da 100644 --- a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py @@ -11,41 +11,16 @@ import os import multiprocessing import pickle import logging -from gklearn.utils import Dataset from gklearn.ged.util import compute_geds +import numpy as np +import time +from utils import get_dataset +import sys -def get_dataset(ds_name): - # The node/edge labels that will not be used in the computation. 
- if ds_name == 'MAO': - irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} - elif ds_name == 'Monoterpenoides': - irrelevant_labels = {'edge_labels': ['valence']} - elif ds_name == 'MUTAG': - irrelevant_labels = {'edge_labels': ['label_0']} - elif ds_name == 'AIDS_symb': - irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']} +def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): - # Initialize a Dataset. - dataset = Dataset() - # Load predefined dataset. - dataset.load_predefined_dataset(ds_name) - # Remove irrelevant labels. - dataset.remove_labels(**irrelevant_labels) - print('dataset size:', len(dataset.graphs)) - return dataset - - -def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial): - - save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' - if not os.path.exists(save_dir): - os.makedirs(save_dir) - save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) - - """**1. Get dataset.**""" - dataset = get_dataset(ds_name) """**2. Set parameters.**""" @@ -77,31 +52,71 @@ def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial): """**5. Compute GED matrix.**""" ged_mat = 'error' + runtime = 0 try: + time0 = time.time() ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True) + runtime = time.time() - time0 except Exception as exp: print('An exception occured when running this experiment:') LOG_FILENAME = save_dir + 'error.txt' logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) - logging.exception('save_file_suffix') + logging.exception(save_file_suffix) print(repr(exp)) """**6. 
Get results.**""" - pickle.dump(ged_mat, open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb')) + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(ged_mat, f) + with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + return ged_mat, runtime + + +def save_trials_as_group(dataset, ds_name, num_solutions, ratio): + ged_mats = [] + runtimes = [] + for trial in range(1, 101): + print() + print('Trial:', trial) + ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial) + ged_mats.append(ged_mat) + runtimes.append(runtime) + save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: + np.save(f, np.array(ged_mats)) + with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + +def results_for_a_dataset(ds_name): + """**1. 
Get dataset.**""" + dataset = get_dataset(ds_name) + + for num_solutions in [1, 20, 40, 60, 80, 100]: + print() + print('# of solutions:', num_solutions) + for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: + print() + print('Ratio:', ratio) + save_trials_as_group(dataset, ds_name, num_solutions, ratio) + if __name__ == '__main__': - for ds_name in ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']: + if len(sys.argv) > 1: + ds_name_list = sys.argv[1:] + else: + ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] + + save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + if not os.path.exists(save_dir + 'groups/'): + os.makedirs(save_dir + 'groups/') + + for ds_name in ds_name_list: print() print('Dataset:', ds_name) - for num_solutions in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]: - print() - print('# of solutions:', num_solutions) - for ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: - print() - print('Ratio:', ratio) - for trial in range(1, 101): - print() - print('Trial:', trial) - xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial) \ No newline at end of file + results_for_a_dataset(ds_name) diff --git a/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py new file mode 100644 index 0000000..5b4576b --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Oct 20 17:48:02 2020 + +@author: ljia +""" +# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from given numbers of repeats are computed. 
+ +import os +import multiprocessing +import pickle +import logging +from gklearn.ged.util import compute_geds +import numpy as np +import time +from utils import get_dataset +import sys + + +def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): + + save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + """**1. Get dataset.**""" + dataset = get_dataset(ds_name) + + """**2. Set parameters.**""" + + # Parameters for GED computation. + ged_options = {'method': 'IPFP', # use IPFP huristic. + 'initialization_method': 'RANDOM', # or 'NODE', etc. + # when bigger than 1, then the method is considered mIPFP. + 'initial_solutions': 1, + 'edit_cost': 'CONSTANT', # use CONSTANT cost. + # the distance between non-symbolic node/edge labels is computed by euclidean distance. + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + # parallel threads. Do not work if mpg_options['parallel'] = False. + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' + } + + edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) + + options = ged_options.copy() + options['edit_cost_constants'] = edit_cost_constants + options['node_labels'] = dataset.node_labels + options['edge_labels'] = dataset.edge_labels + options['node_attrs'] = dataset.node_attrs + options['edge_attrs'] = dataset.edge_attrs + parallel = True # if num_solutions == 1 else False + + """**5. 
Compute GED matrix.**""" + ged_mat = 'error' + runtime = 0 + try: + time0 = time.time() + ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) + runtime = time.time() - time0 + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception(save_file_suffix) + print(repr(exp)) + + """**6. Get results.**""" + + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(ged_mat, f) + with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + return ged_mat, runtime + + +def save_trials_as_group(dataset, ds_name, repeats, ratio): + ged_mats = [] + runtimes = [] + for trial in range(1, 101): + print() + print('Trial:', trial) + ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial) + ged_mats.append(ged_mat) + runtimes.append(runtime) + + save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: + np.save(f, np.array(ged_mats)) + with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + +def results_for_a_dataset(ds_name): + """**1. 
Get dataset.**""" + dataset = get_dataset(ds_name) + + for repeats in [1, 20, 40, 60, 80, 100]: + print() + print('Repeats:', repeats) + for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: + print() + print('Ratio:', ratio) + save_trials_as_group(dataset, ds_name, repeats, ratio) + + +if __name__ == '__main__': + if len(sys.argv) > 1: + ds_name_list = sys.argv[1:] + else: + ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] + + save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + if not os.path.exists(save_dir + 'groups/'): + os.makedirs(save_dir + 'groups/') + + for ds_name in ds_name_list: + print() + print('Dataset:', ds_name) + results_for_a_dataset(ds_name) diff --git a/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py new file mode 100644 index 0000000..f6ecd99 --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Oct 20 17:48:02 2020 + +@author: ljia +""" +# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from given numbers of repeats are computed. + +import os +import multiprocessing +import pickle +import logging +from gklearn.ged.util import compute_geds +import numpy as np +import time +from utils import get_dataset +import sys + + +def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): + + save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + """**1. Get dataset.**""" + dataset = get_dataset(ds_name) + + """**2. Set parameters.**""" + + # Parameters for GED computation. 
+ ged_options = {'method': 'BIPARTITE', # use BIPARTITE huristic. + # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) + 'lsape_model': 'ECBP', # + # ??when bigger than 1, then the method is considered mIPFP. + # the actual number of computed solutions might be smaller than the specified value + 'max_num_solutions': 1, + 'edit_cost': 'CONSTANT', # use CONSTANT cost. + 'greedy_method': 'BASIC', # + # the distance between non-symbolic node/edge labels is computed by euclidean distance. + 'attr_distance': 'euclidean', + 'optimal': True, # if TRUE, the option --greedy-method has no effect + # parallel threads. Do not work if mpg_options['parallel'] = False. + 'threads': multiprocessing.cpu_count(), + 'centrality_method': 'NONE', + 'centrality_weight': 0.7, + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' + } + + edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) + + options = ged_options.copy() + options['edit_cost_constants'] = edit_cost_constants + options['node_labels'] = dataset.node_labels + options['edge_labels'] = dataset.edge_labels + options['node_attrs'] = dataset.node_attrs + options['edge_attrs'] = dataset.edge_attrs + parallel = True # if num_solutions == 1 else False + + """**5. Compute GED matrix.**""" + ged_mat = 'error' + runtime = 0 + try: + time0 = time.time() + ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) + runtime = time.time() - time0 + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception(save_file_suffix) + print(repr(exp)) + + """**6. 
Get results.**""" + + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(ged_mat, f) + with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + return ged_mat, runtime + + +def save_trials_as_group(dataset, ds_name, repeats, ratio): + ged_mats = [] + runtimes = [] + for trial in range(1, 101): + print() + print('Trial:', trial) + ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial) + ged_mats.append(ged_mat) + runtimes.append(runtime) + + save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: + np.save(f, np.array(ged_mats)) + with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + +def results_for_a_dataset(ds_name): + """**1. Get dataset.**""" + dataset = get_dataset(ds_name) + + for repeats in [1, 20, 40, 60, 80, 100]: + print() + print('Repeats:', repeats) + for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: + print() + print('Ratio:', ratio) + save_trials_as_group(dataset, ds_name, repeats, ratio) + + +if __name__ == '__main__': + if len(sys.argv) > 1: + ds_name_list = sys.argv[1:] + else: + ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] + + save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + if not os.path.exists(save_dir + 'groups/'): + os.makedirs(save_dir + 'groups/') + + for ds_name in ds_name_list: + print() + print('Dataset:', ds_name) + results_for_a_dataset(ds_name) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/group_results.py b/gklearn/experiments/ged/stability/group_results.py new file mode 100644 index 0000000..48ea68d --- /dev/null +++ b/gklearn/experiments/ged/stability/group_results.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# -*- coding: 
def _trial_path(dir_folder, name_prefix, trial, sub_folder=''):
    """Return the path of one trial's pickle file, optionally under *sub_folder*."""
    return dir_folder + sub_folder + name_prefix + 'trial_' + str(trial) + '.pkl'


def group_trials(dir_folder, name_prefix, override, clear, backup, num_trials=100):
    """Merge the per-trial files sharing *name_prefix* into one group file.

    The trial files are ``<dir_folder><name_prefix>trial_<k>.pkl`` for
    ``k = 1..num_trials``. Nothing is written, copied or removed unless all
    of them exist.

    The group file is stored in ``<dir_folder>groups/``. Its name is
    *name_prefix* with the leading label replaced (``ged_matrix`` ->
    ``ged_mats``, ``runtime`` -> ``runtimes``, anything else unchanged) and
    ``npy`` or ``pkl`` appended, so *name_prefix* is expected to end with a
    dot. ``ged_matrix`` groups are stacked into a numpy array and saved with
    ``np.save``; every other group is pickled as a list.

    Parameters
    ----------
    dir_folder : str
        Folder containing the trial files.
    name_prefix : str
        Part of the trial file names that precedes ``trial_``.
    override : bool
        Rewrite the group file even if it already exists.
    clear : bool
        Remove the trial files once the group file is in place.
    backup : bool
        Copy the trial files to ``<dir_folder>backups/`` before clearing.
    num_trials : int, optional
        Number of trials that make up a complete group.
    """
    # Make sure plain string concatenation below yields valid paths.
    dir_folder = os.path.join(dir_folder, '')

    # Get group name.
    label_name = name_prefix.split('.')[0]
    group_label = {'ged_matrix': 'ged_mats', 'runtime': 'runtimes'}.get(label_name, label_name)
    name_suffix = name_prefix[len(label_name):]
    is_matrix = label_name == 'ged_matrix'
    name_group = dir_folder + 'groups/' + group_label + name_suffix + ('npy' if is_matrix else 'pkl')

    # An incomplete set of trials is left untouched.
    trial_files = [_trial_path(dir_folder, name_prefix, trial)
                   for trial in range(1, num_trials + 1)]
    if not all(os.path.isfile(path) for path in trial_files):
        return

    # Write the group, unless a previous run already did and may be kept.
    if override or not os.path.isfile(name_group):
        data_group = []
        for path in trial_files:
            with open(path, 'rb') as f:
                data_group.append(pickle.load(f))

        os.makedirs(dir_folder + 'groups/', exist_ok=True)
        if is_matrix:
            with open(name_group, 'wb') as f:
                np.save(f, np.array(data_group))
        else:
            with open(name_group, 'wb') as f:
                pickle.dump(data_group, f)

    # Backup.
    if backup:
        os.makedirs(dir_folder + 'backups/', exist_ok=True)
        for trial, src in enumerate(trial_files, start=1):
            copyfile(src, _trial_path(dir_folder, name_prefix, trial, 'backups/'))

    # Clear.
    if clear:
        for src in trial_files:
            os.remove(src)


def group_all_in_folder(dir_folder, override=False, clear=True, backup=True, num_trials=100):
    """Group every set of per-trial result files found directly in *dir_folder*.

    The files are visited in sorted order and ``group_trials`` is called once
    per distinct prefix, i.e. the part of a file name preceding ``trial_``.
    Sub-folders and files whose name does not contain ``trial_`` are ignored.
    The remaining arguments are forwarded to ``group_trials``.
    """
    dir_folder = os.path.join(dir_folder, '')

    # Create folders.
    os.makedirs(dir_folder + 'groups/', exist_ok=True)
    if backup:
        os.makedirs(dir_folder + 'backups', exist_ok=True)

    # Iterate all files.
    cur_file_prefix = ''
    for file in tqdm(sorted(os.listdir(dir_folder)), desc='Grouping', file=sys.stdout):
        if not os.path.isfile(os.path.join(dir_folder, file)):
            continue
        if 'trial_' not in file:
            continue  # Not a per-trial result file.
        name_prefix = file.split('trial_')[0]
        if name_prefix != cur_file_prefix:
            group_trials(dir_folder, name_prefix, override, clear, backup,
                         num_trials=num_trials)
            cur_file_prefix = name_prefix


if __name__ == '__main__':
    dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/'
    group_all_in_folder(dir_folder)

    dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/'
    group_all_in_folder(dir_folder)
def get_dataset(ds_name):
    """Load a predefined dataset and drop the labels unused by the experiments.

    Parameters
    ----------
    ds_name : str
        One of ``'MAO'``, ``'Monoterpenoides'``, ``'MUTAG'`` or
        ``'AIDS_symb'``. ``'AIDS_symb'`` loads the predefined ``'AIDS'``
        dataset.

    Returns
    -------
    The ``Dataset`` object after ``load_predefined_dataset`` and
    ``remove_labels`` have been applied.

    Raises
    ------
    ValueError
        If *ds_name* is not one of the supported names.
    """
    # For each supported name: the predefined dataset to load and the
    # node/edge labels that will not be used in the computation.
    settings = {
        'MAO': ('MAO', {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}),
        'Monoterpenoides': ('Monoterpenoides', {'edge_labels': ['valence']}),
        'MUTAG': ('MUTAG', {'edge_labels': ['label_0']}),
        'AIDS_symb': ('AIDS', {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']}),
    }
    if ds_name not in settings:
        # Fail early with a clear message instead of an UnboundLocalError
        # after the dataset has already been loaded.
        raise ValueError('Unknown dataset name: ' + repr(ds_name)
                         + '. Supported names: ' + ', '.join(settings) + '.')
    predefined_name, irrelevant_labels = settings[ds_name]

    # Initialize a Dataset.
    dataset = Dataset()
    # Load predefined dataset.
    dataset.load_predefined_dataset(predefined_name)
    # Remove irrelevant labels.
    dataset.remove_labels(**irrelevant_labels)
    print('dataset size:', len(dataset.graphs))
    return dataset