@@ -0,0 +1,312 @@
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Nov 6 15:35:32 2018 | |||
@author: ljia | |||
""" | |||
#import numpy as np | |||
import matplotlib.pyplot as plt | |||
import numpy as np | |||
import matplotlib.gridspec as gridspec | |||
# import pickle | |||
import os | |||
import sys | |||
from tqdm import tqdm | |||
# from mpl_toolkits.mplot3d import Axes3D | |||
root_dir = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/' | |||
root_dir_criann = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/CRIANN/' | |||
Dataset_List = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
Legend_Labels = ['common walk', 'marginalized', 'Sylvester equation', 'conjugate gradient', 'fixed-point iterations', 'spectral decomposition', 'shortest path', 'structural sp', 'path up to length $h$', 'treelet', 'WL subtree']  # leftover from the graph-kernel runtime plots; only referenced in commented-out code below.
# Colors = ['#084594', '#2171b5', '#4292c6', '#6baed6', '#9ecae1', '#c6dbef', | |||
# '#54278f', '#756bb1', '#9e9ac8', '#de2d26', '#fc9272'] | |||
Colors=[ | |||
'#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a', | |||
'#d62728', '#ff9896', '#9467bd', '#c5b0d5', '#8c564b', '#c49c94', | |||
'#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7', '#bcbd22', '#dbdb8d', | |||
'#17becf', '#9edae5'] | |||
SMALL_SIZE = 8 | |||
MEDIUM_SIZE = 10 | |||
BIGGER_SIZE = 12 | |||
def read_trials_group(save_dir, ds_name, num_sols, ratio, label): | |||
file_name = save_dir + 'groups/ged_mats.' + ds_name + '.' + label + '_' + str(num_sols) + '.ratio_' + "{:.2f}".format(ratio) + '.npy' | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
ged_mats = np.load(f) | |||
return ged_mats | |||
else: | |||
return [] | |||
# ged_mats = [] | |||
# for trial in range(1, 101): | |||
# file_name = file_prefix + '.trial_' + str(trial) + '.pkl' | |||
# if os.path.isfile(file_name): | |||
# ged_matrix = pickle.load(open(file_name, 'rb')) | |||
# ged_mats.append(ged_matrix) | |||
# else: | |||
# # print(trial) | |||
# pass | |||
# Average relative error between the elements of two GED matrices: the sum of
# element-wise absolute differences divided by the sum of the element-wise
# mean magnitudes.
def matrices_ave_relative_error(m1, m2):
	error = 0
	base = 0
	for i in range(m1.shape[0]):
		for j in range(m1.shape[1]):
			error += np.abs(m1[i, j] - m2[i, j])
			base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2
	return error / base
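# A minimal vectorized equivalent (an editor's sketch, not part of the
# original pipeline); it computes exactly the same quantity as the loops
# above, just with NumPy array operations.
def matrices_ave_relative_error_vectorized(m1, m2):
	error = np.abs(m1 - m2).sum()
	base = ((np.abs(m1) + np.abs(m2)) / 2).sum()
	return error / base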
def compute_relative_error(ged_mats):
	if len(ged_mats) != 0:
		# Use the element-wise minimum over all trials as the reference
		# ("correct") GED matrix, since these heuristics compute upper bounds
		# on the exact GED.
		ged_mat_s = np.ones(ged_mats[0].shape) * np.inf
		for i in range(ged_mats[0].shape[0]):
			for j in range(ged_mats[0].shape[1]):
				ged_mat_s[i, j] = np.min([mat[i, j] for mat in ged_mats])
		# Compute the average relative error of each trial against the reference.
		errors = []
		for i, mat in enumerate(ged_mats):
			err = matrices_ave_relative_error(mat, ged_mat_s)
#			if not per_correct:
#				print('matrix # ', str(i))
#				pass
			errors.append(err)
	else:
		errors = [0]
	return np.mean(errors)
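# Toy example (hypothetical numbers): for trials [[0., 2.]] and [[0., 3.]],
# the reference matrix is [[0., 2.]]; the first trial has error 0, the second
# has |3 - 2| / ((0 + 0) / 2 + (3 + 2) / 2) = 0.4, so the mean is 0.2.
# >>> compute_relative_error([np.array([[0., 2.]]), np.array([[0., 3.]])])
# 0.2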
#plt.rc('font', size=SMALL_SIZE) # controls default text sizes | |||
plt.rc('axes', titlesize=15) # fontsize of the axes title | |||
plt.rc('axes', labelsize=15) # fontsize of the x and y labels | |||
plt.rc('xtick', labelsize=15) # fontsize of the tick labels | |||
plt.rc('ytick', labelsize=15) # fontsize of the tick labels | |||
plt.rc('legend', fontsize=15) # legend fontsize | |||
plt.rc('figure', titlesize=15) # fontsize of the figure title | |||
#fig, _ = plt.subplots(2, 2, figsize=(13, 12)) | |||
#ax1 = plt.subplot(221) | |||
#ax2 = plt.subplot(222) | |||
#ax3 = plt.subplot(223) | |||
#ax4 = plt.subplot(224) | |||
gs = gridspec.GridSpec(2, 2) | |||
gs.update(hspace=0.3) | |||
fig = plt.figure(figsize=(11, 12)) | |||
ax = fig.add_subplot(111) # The big subplot for common labels | |||
ax1 = fig.add_subplot(gs[0, 0], projection='3d') | |||
ax2 = fig.add_subplot(gs[0, 1], projection='3d') | |||
ax3 = fig.add_subplot(gs[1, 0], projection='3d') | |||
ax4 = fig.add_subplot(gs[1, 1], projection='3d') | |||
# ax5 = fig.add_subplot(gs[2, 0]) | |||
# ax6 = fig.add_subplot(gs[2, 1]) | |||
# Turn off axis lines and ticks of the big subplot | |||
ax.spines['top'].set_color('none') | |||
ax.spines['bottom'].set_color('none') | |||
ax.spines['left'].set_color('none') | |||
ax.spines['right'].set_color('none') | |||
ax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)
ax.xaxis.set_ticks_position('none') | |||
ax.yaxis.set_ticks_position('none') | |||
# Set common labels | |||
#ax.set_xlabel('accuracy(%)') | |||
ax.yaxis.set_label_coords(-0.105, 0.5) | |||
# ax.set_ylabel('runtime($s$)') | |||
# -------------- num_sols, IPFP -------------- | |||
def get_num_sol_results(): | |||
save_dir = root_dir_criann + 'edit_costs.num_sols.ratios.IPFP/' | |||
errors = {} | |||
print('-------- num_sols, IPFP --------') | |||
for ds_name in Dataset_List: | |||
print(ds_name) | |||
errors[ds_name] = [] | |||
for num_sols in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]: | |||
errors[ds_name].append([]) | |||
for ratio in tqdm([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], desc='num_sols = ' + str(num_sols), file=sys.stdout): | |||
ged_mats = read_trials_group(save_dir, ds_name, num_sols, ratio, 'num_sols') | |||
error = compute_relative_error(ged_mats) | |||
errors[ds_name][-1].append(error) | |||
return errors | |||
x_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] | |||
y_values = range(0, 19) | |||
X, Y = np.meshgrid(x_values, y_values) | |||
errors = get_num_sol_results() | |||
for i, ds_name in enumerate(Dataset_List): | |||
if ds_name in errors: | |||
z_values = np.array(errors[ds_name]) | |||
ax1.plot_wireframe(X, Y, z_values.T, label=Dataset_List[i], color=Colors[i]) #, '.-', label=Legend_Labels[i], color=Colors[i]) | |||
# ax1.set_yscale('squareroot') | |||
# ax1.grid(axis='y') | |||
ax1.set_xlabel('# of solutions')
ax1.set_ylabel('ratios')
ax1.set_zlabel('average relative error')
ax1.set_title('(a) num_sols, IPFP')
ax1.set_yticks(range(0, 19, 2)) | |||
ax1.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10]) | |||
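# The y axis indexes the position in the ratio list
# [0.1, 0.2, ..., 0.9, 1, 2, ..., 10]; the tick labels above map every other
# index back to the corresponding ratio value.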
# ax1.set_axisbelow(True) | |||
# ax1.spines['top'].set_visible(False) | |||
# ax1.spines['bottom'].set_visible(False) | |||
# ax1.spines['right'].set_visible(False) | |||
# ax1.spines['left'].set_visible(False) | |||
# ax1.xaxis.set_ticks_position('none') | |||
# ax1.yaxis.set_ticks_position('none') | |||
# ax1.set_ylim(bottom=-1000) | |||
handles, labels = ax1.get_legend_handles_labels() | |||
# # -------------- repeats, IPFP -------------- | |||
def get_repeats_results(): | |||
save_dir = root_dir_criann + 'edit_costs.repeats.ratios.IPFP/' | |||
errors = {} | |||
print('-------- repeats, IPFP --------') | |||
for ds_name in Dataset_List: | |||
print(ds_name) | |||
errors[ds_name] = [] | |||
		for repeats in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
			errors[ds_name].append([])
			for ratio in tqdm([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], desc='repeats = ' + str(repeats), file=sys.stdout):
				ged_mats = read_trials_group(save_dir, ds_name, repeats, ratio, 'repeats')
				error = compute_relative_error(ged_mats)
				errors[ds_name][-1].append(error)
return errors | |||
x_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] | |||
y_values = range(0, 19) | |||
X, Y = np.meshgrid(x_values, y_values) | |||
errors = get_repeats_results() | |||
for i, ds_name in enumerate(Dataset_List): | |||
if ds_name in errors: | |||
z_values = np.array(errors[ds_name]) | |||
ax2.plot_wireframe(X, Y, z_values.T, label=Dataset_List[i], color=Colors[i]) #, '.-', label=Legend_Labels[i], color=Colors[i]) | |||
# ax2.set_yscale('squareroot') | |||
# ax2.grid(axis='y') | |||
ax2.set_xlabel('# of repeats')
ax2.set_ylabel('ratios')
ax2.set_zlabel('average relative error')
ax2.set_title('(b) repeats, IPFP')
ax2.set_yticks(range(0, 19, 2)) | |||
ax2.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10]) | |||
# ax2.set_axisbelow(True) | |||
# ax2.spines['top'].set_visible(False) | |||
# ax2.spines['bottom'].set_visible(False) | |||
# ax2.spines['right'].set_visible(False) | |||
# ax2.spines['left'].set_visible(False) | |||
# ax2.xaxis.set_ticks_position('none') | |||
# ax2.yaxis.set_ticks_position('none') | |||
# ax2.set_ylim(bottom=-1000) | |||
handles, labels = ax2.get_legend_handles_labels() | |||
# # -------------- degrees -------------- | |||
# def get_degree_results(): | |||
# save_dir = root_dir_criann + '28 cores/synthesized_graphs_degrees/' | |||
# run_times = {} | |||
# for kernel_name in Graph_Kernel_List: | |||
# run_times[kernel_name] = [] | |||
# for num in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: | |||
# file_name = save_dir + 'run_time.' + kernel_name + '.' + str(num) + '.pkl' | |||
# if os.path.isfile(file_name): | |||
# run_time = pickle.load(open(file_name, 'rb')) | |||
# else: | |||
# run_time = 0 | |||
# run_times[kernel_name].append(run_time) | |||
# return run_times | |||
# x_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] | |||
# run_times = get_degree_results() | |||
# for i, kernel_name in enumerate(Graph_Kernel_List): | |||
# if kernel_name in run_times: | |||
# ax3.plot(x_labels, run_times[kernel_name], '.-', label=Legend_Labels[i], color=Colors[i]) | |||
# ax3.set_yscale('log', nonposy='clip') | |||
# ax3.grid(axis='y') | |||
# ax3.set_xlabel('degrees') | |||
# ax3.set_ylabel('runtime($s$)') | |||
# #ax3.set_ylabel('runtime($s$) per pair of graphs') | |||
# ax3.set_title('(c) degrees') | |||
# ax3.set_axisbelow(True) | |||
# ax3.spines['top'].set_visible(False) | |||
# ax3.spines['bottom'].set_visible(False) | |||
# ax3.spines['right'].set_visible(False) | |||
# ax3.spines['left'].set_visible(False) | |||
# ax3.xaxis.set_ticks_position('none') | |||
# ax3.yaxis.set_ticks_position('none') | |||
# # -------------- Node labels -------------- | |||
# def get_node_label_results(): | |||
# save_dir = root_dir_criann + '28 cores/synthesized_graphs_num_node_label_alphabet/' | |||
# run_times = {} | |||
# for kernel_name in Graph_Kernel_List_VSym: | |||
# run_times[kernel_name] = [] | |||
# for num in [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]: | |||
# file_name = save_dir + 'run_time.' + kernel_name + '.' + str(num) + '.pkl' | |||
# if os.path.isfile(file_name): | |||
# run_time = pickle.load(open(file_name, 'rb')) | |||
# else: | |||
# run_time = 0 | |||
# run_times[kernel_name].append(run_time) | |||
# return run_times | |||
# # save_dir = root_dir_criann + 'synthesized_graphs_num_node_label_alphabet/' | |||
# # run_times = pickle.load(open(save_dir + 'run_times.pkl', 'rb')) | |||
# # return run_times | |||
# x_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20] | |||
# run_times = get_node_label_results() | |||
# for i, kernel_name in enumerate(Graph_Kernel_List): | |||
# if kernel_name in run_times: | |||
# ax4.plot(x_labels[1:], run_times[kernel_name][1:], '.-', label=Legend_Labels[i], color=Colors[i]) | |||
# ax4.set_yscale('log', nonposy='clip') | |||
# ax4.grid(axis='y') | |||
# ax4.set_xlabel('# of alphabets') | |||
# ax4.set_ylabel('runtime($s$)') | |||
# #ax4.set_ylabel('runtime($s$) per pair of graphs') | |||
# ax4.set_title('(d) alphabet size of vertex labels') | |||
# ax4.set_axisbelow(True) | |||
# ax4.spines['top'].set_visible(False) | |||
# ax4.spines['bottom'].set_visible(False) | |||
# ax4.spines['right'].set_visible(False) | |||
# ax4.spines['left'].set_visible(False) | |||
# ax4.xaxis.set_ticks_position('none') | |||
# ax4.yaxis.set_ticks_position('none') | |||
from matplotlib.lines import Line2D
# Build one proxy line per plotted dataset so the figure legend matches the
# wireframe colors.
custom_lines = []
for color in Colors[:len(labels)]:
	custom_lines.append(Line2D([0], [0], color=color, lw=4))
fig.subplots_adjust(bottom=0.135)
fig.legend(custom_lines, labels, loc='lower center', ncol=4, frameon=False)  # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6)
plt.savefig('stability.real_data.relative_error.eps', format='eps', dpi=300, transparent=True, | |||
bbox_inches='tight') | |||
plt.show() |
@@ -0,0 +1,130 @@
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Mon Nov 2 16:17:01 2020 | |||
@author: ljia | |||
""" | |||
# This script tests the influence of the ratio between node and edge edit costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. For each pair, the minimum solution over the given maximal numbers of solutions is kept.
import os | |||
import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import numpy as np | |||
import time | |||
from utils import get_dataset | |||
import sys | |||
def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): | |||
save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
"""**2. Set parameters.**""" | |||
# Parameters for GED computation. | |||
	ged_options = {'method': 'BIPARTITE',  # use the BIPARTITE heuristic.
#				   'initialization_method': 'RANDOM',  # or 'NODE', etc. (for GEDEnv)
				   'lsape_model': 'ECBP',
				   # The actual number of computed solutions may be smaller
				   # than the value specified here.
				   'max_num_solutions': max_num_solutions,
'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||
'greedy_method': 'BASIC', # | |||
# the distance between non-symbolic node/edge labels is computed by euclidean distance. | |||
'attr_distance': 'euclidean', | |||
'optimal': True, # if TRUE, the option --greedy-method has no effect | |||
# parallel threads. Do not work if mpg_options['parallel'] = False. | |||
'threads': multiprocessing.cpu_count(), | |||
'centrality_method': 'NONE', | |||
'centrality_weight': 0.7, | |||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
} | |||
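	# Presumably the six CONSTANT edit costs are ordered as
	# [node_ins, node_del, node_sub, edge_ins, edge_del, edge_sub], so the
	# line below scales only the node costs by the tested ratio (an assumption
	# based on this script's stated goal of varying the node/edge cost ratio).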
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
options = ged_options.copy() | |||
options['edit_cost_constants'] = edit_cost_constants | |||
options['node_labels'] = dataset.node_labels | |||
options['edge_labels'] = dataset.edge_labels | |||
options['node_attrs'] = dataset.node_attrs | |||
options['edge_attrs'] = dataset.edge_attrs | |||
parallel = True # if num_solutions == 1 else False | |||
"""**5. Compute GED matrix.**""" | |||
ged_mat = 'error' | |||
runtime = 0 | |||
try: | |||
time0 = time.time() | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||
runtime = time.time() - time0 | |||
except Exception as exp: | |||
		print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
"""**6. Get results.**""" | |||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
return ged_mat, runtime | |||
def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
print() | |||
print('Trial:', trial) | |||
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial) | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) | |||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
np.save(f, np.array(ged_mats)) | |||
	with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
		pickle.dump(runtimes, f)  # dump the whole list, not just the last trial's runtime.
def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for max_num_solutions in [1, 20, 40, 60, 80, 100]: | |||
print() | |||
print('Max # of solutions:', max_num_solutions) | |||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
print() | |||
print('Ratio:', ratio) | |||
save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
ds_name_list = sys.argv[1:] | |||
else: | |||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
if not os.path.exists(save_dir + 'groups/'): | |||
os.makedirs(save_dir + 'groups/') | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
results_for_a_dataset(ds_name) |
@@ -11,41 +11,16 @@ import os
import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import numpy as np | |||
import time | |||
from utils import get_dataset | |||
import sys | |||
def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
"""**2. Set parameters.**""" | |||
@@ -77,31 +52,71 @@ def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial):
"""**5. Compute GED matrix.**""" | |||
ged_mat = 'error' | |||
runtime = 0 | |||
try: | |||
time0 = time.time() | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True) | |||
runtime = time.time() - time0 | |||
except Exception as exp: | |||
		print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
"""**6. Get results.**""" | |||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
return ged_mat, runtime | |||
def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
print() | |||
print('Trial:', trial) | |||
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial) | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) | |||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
np.save(f, np.array(ged_mats)) | |||
	with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
		pickle.dump(runtimes, f)  # dump the whole list, not just the last trial's runtime.
def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for num_solutions in [1, 20, 40, 60, 80, 100]: | |||
print() | |||
print('# of solutions:', num_solutions) | |||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
print() | |||
print('Ratio:', ratio) | |||
save_trials_as_group(dataset, ds_name, num_solutions, ratio) | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
ds_name_list = sys.argv[1:] | |||
else: | |||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
if not os.path.exists(save_dir + 'groups/'): | |||
os.makedirs(save_dir + 'groups/') | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
results_for_a_dataset(ds_name) |
@@ -0,0 +1,125 @@
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Wed Oct 20 17:48:02 2020 | |||
@author: ljia | |||
""" | |||
# This script tests the influence of the ratio between node and edge edit costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. For each pair, the minimum solution over the given numbers of repeats is kept.
import os | |||
import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import numpy as np | |||
import time | |||
from utils import get_dataset | |||
import sys | |||
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
"""**2. Set parameters.**""" | |||
# Parameters for GED computation. | |||
	ged_options = {'method': 'IPFP',  # use the IPFP heuristic.
				   'initialization_method': 'RANDOM',  # or 'NODE', etc.
				   # When bigger than 1, the method is considered mIPFP.
				   'initial_solutions': 1,
'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||
# the distance between non-symbolic node/edge labels is computed by euclidean distance. | |||
'attr_distance': 'euclidean', | |||
'ratio_runs_from_initial_solutions': 1, | |||
# parallel threads. Do not work if mpg_options['parallel'] = False. | |||
'threads': multiprocessing.cpu_count(), | |||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
} | |||
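	# Note: 'initial_solutions' stays at 1 here; stability over repeated runs
	# is controlled by the `repeats` argument passed to compute_geds() below,
	# and the minimum solution over the repeats is kept.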
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
options = ged_options.copy() | |||
options['edit_cost_constants'] = edit_cost_constants | |||
options['node_labels'] = dataset.node_labels | |||
options['edge_labels'] = dataset.edge_labels | |||
options['node_attrs'] = dataset.node_attrs | |||
options['edge_attrs'] = dataset.edge_attrs | |||
parallel = True # if num_solutions == 1 else False | |||
"""**5. Compute GED matrix.**""" | |||
ged_mat = 'error' | |||
runtime = 0 | |||
try: | |||
time0 = time.time() | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||
runtime = time.time() - time0 | |||
except Exception as exp: | |||
		print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
"""**6. Get results.**""" | |||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
return ged_mat, runtime | |||
def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
print() | |||
print('Trial:', trial) | |||
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial) | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) | |||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
np.save(f, np.array(ged_mats)) | |||
	with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
		pickle.dump(runtimes, f)  # dump the whole list, not just the last trial's runtime.
def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for repeats in [1, 20, 40, 60, 80, 100]: | |||
print() | |||
print('Repeats:', repeats) | |||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
print() | |||
print('Ratio:', ratio) | |||
save_trials_as_group(dataset, ds_name, repeats, ratio) | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
ds_name_list = sys.argv[1:] | |||
else: | |||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
if not os.path.exists(save_dir + 'groups/'): | |||
os.makedirs(save_dir + 'groups/') | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
results_for_a_dataset(ds_name) |
@@ -0,0 +1,130 @@
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Wed Oct 20 17:48:02 2020 | |||
@author: ljia | |||
""" | |||
# This script tests the influence of the ratio between node and edge edit costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. For each pair, the minimum solution over the given numbers of repeats is kept.
import os | |||
import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import numpy as np | |||
import time | |||
from utils import get_dataset | |||
import sys | |||
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
"""**2. Set parameters.**""" | |||
# Parameters for GED computation. | |||
	ged_options = {'method': 'BIPARTITE',  # use the BIPARTITE heuristic.
#				   'initialization_method': 'RANDOM',  # or 'NODE', etc. (for GEDEnv)
				   'lsape_model': 'ECBP',
				   # The actual number of computed solutions may be smaller
				   # than the value specified here.
				   'max_num_solutions': 1,
'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||
'greedy_method': 'BASIC', # | |||
# the distance between non-symbolic node/edge labels is computed by euclidean distance. | |||
'attr_distance': 'euclidean', | |||
'optimal': True, # if TRUE, the option --greedy-method has no effect | |||
# parallel threads. Do not work if mpg_options['parallel'] = False. | |||
'threads': multiprocessing.cpu_count(), | |||
'centrality_method': 'NONE', | |||
'centrality_weight': 0.7, | |||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
} | |||
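	# Note: 'max_num_solutions' stays at 1 here; stability over repeated runs
	# is controlled by the `repeats` argument passed to compute_geds() below.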
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
options = ged_options.copy() | |||
options['edit_cost_constants'] = edit_cost_constants | |||
options['node_labels'] = dataset.node_labels | |||
options['edge_labels'] = dataset.edge_labels | |||
options['node_attrs'] = dataset.node_attrs | |||
options['edge_attrs'] = dataset.edge_attrs | |||
parallel = True # if num_solutions == 1 else False | |||
"""**5. Compute GED matrix.**""" | |||
ged_mat = 'error' | |||
runtime = 0 | |||
try: | |||
time0 = time.time() | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||
runtime = time.time() - time0 | |||
except Exception as exp: | |||
		print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
"""**6. Get results.**""" | |||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
return ged_mat, runtime | |||
def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
print() | |||
print('Trial:', trial) | |||
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial) | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) | |||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
np.save(f, np.array(ged_mats)) | |||
	with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
		pickle.dump(runtimes, f)  # dump the whole list, not just the last trial's runtime.
def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for repeats in [1, 20, 40, 60, 80, 100]: | |||
print() | |||
print('Repeats:', repeats) | |||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
print() | |||
print('Ratio:', ratio) | |||
save_trials_as_group(dataset, ds_name, repeats, ratio) | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
ds_name_list = sys.argv[1:] | |||
else: | |||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
if not os.path.exists(save_dir + 'groups/'): | |||
os.makedirs(save_dir + 'groups/') | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
results_for_a_dataset(ds_name) |
@@ -0,0 +1,108 @@
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Thu Oct 29 17:26:43 2020 | |||
@author: ljia | |||
This script groups per-trial result files into a single file for faster
searching and loading.
""" | |||
import os | |||
import pickle | |||
import numpy as np | |||
from shutil import copyfile | |||
from tqdm import tqdm | |||
import sys | |||
def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
# Get group name. | |||
label_name = name_prefix.split('.')[0] | |||
if label_name == 'ged_matrix': | |||
group_label = 'ged_mats' | |||
elif label_name == 'runtime': | |||
group_label = 'runtimes' | |||
else: | |||
group_label = label_name | |||
name_suffix = name_prefix[len(label_name):] | |||
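	# name_prefix ends with a trailing '.', so name_suffix already provides
	# the dot before the 'npy'/'pkl' extension appended below.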
if label_name == 'ged_matrix': | |||
name_group = dir_folder + 'groups/' + group_label + name_suffix + 'npy' | |||
else: | |||
name_group = dir_folder + 'groups/' + group_label + name_suffix + 'pkl' | |||
if not override and os.path.isfile(name_group): | |||
# Check if all trial files exist. | |||
trials_complete = True | |||
for trial in range(1, 101): | |||
file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
if not os.path.isfile(file_name): | |||
trials_complete = False | |||
break | |||
else: | |||
# Get data. | |||
data_group = [] | |||
for trial in range(1, 101): | |||
file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
data = pickle.load(f) | |||
data_group.append(data) | |||
else: # Not all trials are completed. | |||
return | |||
# Write groups. | |||
if label_name == 'ged_matrix': | |||
data_group = np.array(data_group) | |||
with open(name_group, 'wb') as f: | |||
np.save(f, data_group) | |||
else: | |||
with open(name_group, 'wb') as f: | |||
pickle.dump(data_group, f) | |||
trials_complete = True | |||
if trials_complete: | |||
# Backup. | |||
if backup: | |||
for trial in range(1, 101): | |||
src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
dst = dir_folder + 'backups/' + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
copyfile(src, dst) | |||
# Clear. | |||
if clear: | |||
for trial in range(1, 101): | |||
src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
os.remove(src) | |||
def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): | |||
# Create folders. | |||
if not os.path.exists(dir_folder + 'groups/'): | |||
os.makedirs(dir_folder + 'groups/') | |||
if backup: | |||
if not os.path.exists(dir_folder + 'backups'): | |||
os.makedirs(dir_folder + 'backups') | |||
# Iterate all files. | |||
cur_file_prefix = '' | |||
for file in tqdm(sorted(os.listdir(dir_folder)), desc='Grouping', file=sys.stdout): | |||
if os.path.isfile(os.path.join(dir_folder, file)): | |||
name_prefix = file.split('trial_')[0] | |||
# print(name) | |||
# print(name_prefix) | |||
if name_prefix != cur_file_prefix: | |||
group_trials(dir_folder, name_prefix, override, clear, backup) | |||
cur_file_prefix = name_prefix | |||
if __name__ == '__main__': | |||
dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/' | |||
group_all_in_folder(dir_folder) | |||
dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' | |||
group_all_in_folder(dir_folder) |
@@ -0,0 +1,30 @@
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Thu Oct 29 19:17:36 2020 | |||
@author: ljia | |||
""" | |||
from gklearn.utils import Dataset | |||
def get_dataset(ds_name): | |||
# The node/edge labels that will not be used in the computation. | |||
if ds_name == 'MAO': | |||
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||
elif ds_name == 'Monoterpenoides': | |||
irrelevant_labels = {'edge_labels': ['valence']} | |||
elif ds_name == 'MUTAG': | |||
irrelevant_labels = {'edge_labels': ['label_0']} | |||
	elif ds_name == 'AIDS_symb':
		irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']}
		ds_name = 'AIDS'  # 'AIDS_symb' is loaded from the predefined 'AIDS' dataset.
	else:
		raise ValueError('Unknown dataset name: ' + ds_name)  # avoids an undefined irrelevant_labels below.
# Initialize a Dataset. | |||
dataset = Dataset() | |||
# Load predefined dataset. | |||
dataset.load_predefined_dataset(ds_name) | |||
# Remove irrelevant labels. | |||
dataset.remove_labels(**irrelevant_labels) | |||
print('dataset size:', len(dataset.graphs)) | |||
return dataset |
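# Example usage (a hypothetical session; assumes gklearn's predefined
# datasets are available locally):
# >>> from utils import get_dataset
# >>> dataset = get_dataset('MUTAG')
# >>> print(len(dataset.graphs))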