@@ -0,0 +1,312 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 6 15:35:32 2018

@author: ljia
"""
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.gridspec as gridspec
# import pickle
import os
import sys
from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- needed for the 3D projection on older matplotlib versions.

root_dir = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/'
root_dir_criann = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/CRIANN/'

Dataset_List = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
Legend_Labels = ['common walk', 'marginalized', 'Sylvester equation', 'conjugate gradient', 'fixed-point iterations', 'spectral decomposition', 'shortest path', 'structural sp', 'path up to length $h$', 'treelet', 'WL subtree']
# Colors = ['#084594', '#2171b5', '#4292c6', '#6baed6', '#9ecae1', '#c6dbef',
#           '#54278f', '#756bb1', '#9e9ac8', '#de2d26', '#fc9272']
Colors = [
    '#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a',
    '#d62728', '#ff9896', '#9467bd', '#c5b0d5', '#8c564b', '#c49c94',
    '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7', '#bcbd22', '#dbdb8d',
    '#17becf', '#9edae5']

SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12
def read_trials_group(save_dir, ds_name, num_sols, ratio, label):
    file_name = save_dir + 'groups/ged_mats.' + ds_name + '.' + label + '_' + str(num_sols) + '.ratio_' + "{:.2f}".format(ratio) + '.npy'
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            ged_mats = np.load(f)
        return ged_mats
    else:
        return []
#     ged_mats = []
#     for trial in range(1, 101):
#         file_name = file_prefix + '.trial_' + str(trial) + '.pkl'
#         if os.path.isfile(file_name):
#             ged_matrix = pickle.load(open(file_name, 'rb'))
#             ged_mats.append(ged_matrix)
#         else:
#             # print(trial)
#             pass
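
# Example call (illustrative; the concrete path and parameters are only one
# of the combinations used below): load the grouped array of 100 trial GED
# matrices written by the experiment scripts, or get [] if the group file is
# missing:
# mats = read_trials_group(root_dir_criann + 'edit_costs.num_sols.ratios.IPFP/',
#                          'MUTAG', 10, 0.5, 'num_sols')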
# Compute the average relative error over the elements of two GED matrices.
def matrices_ave_relative_error(m1, m2):
    error = 0
    base = 0
    for i in range(m1.shape[0]):
        for j in range(m1.shape[1]):
            error += np.abs(m1[i, j] - m2[i, j])
            base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2
    return error / base
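
# Quick sanity check (illustrative): the returned value is
# sum(|m1 - m2|) / sum((|m1| + |m2|) / 2), i.e. the total absolute deviation
# normalized by the mean magnitude of the two matrices.
_m1 = np.array([[1., 2.], [3., 4.]])
_m2 = np.array([[1., 2.], [3., 5.]])
assert np.isclose(matrices_ave_relative_error(_m1, _m2), 1.0 / 10.5)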
def compute_relative_error(ged_mats):
    if len(ged_mats) != 0:
        # Get the element-wise smallest "correct" GED matrix over all trials.
        ged_mat_s = np.ones(ged_mats[0].shape) * np.inf
        for i in range(ged_mats[0].shape[0]):
            for j in range(ged_mats[0].shape[1]):
                ged_mat_s[i, j] = np.min([mat[i, j] for mat in ged_mats])

        # Compute the average error of each trial matrix against the reference.
        errors = []
        for i, mat in enumerate(ged_mats):
            err = matrices_ave_relative_error(mat, ged_mat_s)
            # if not per_correct:
            #     print('matrix # ', str(i))
            #     pass
            errors.append(err)
    else:
        errors = [0]
    return np.mean(errors)
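
# Illustrative example (assumes each entry of ged_mats is a 2D ndarray of the
# same shape): the reference matrix is the element-wise minimum over all trial
# matrices, i.e. the best GED found per graph pair; the result is the mean
# relative deviation of the trials from that reference.
_mats = [np.array([[0., 2.], [2., 0.]]), np.array([[0., 3.], [2., 0.]])]
# reference = [[0, 2], [2, 0]]; per-trial errors = [0, 1/4.5]
assert np.isclose(compute_relative_error(_mats), np.mean([0.0, 1.0 / 4.5]))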
#plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
plt.rc('axes', titlesize=15)      # fontsize of the axes title
plt.rc('axes', labelsize=15)      # fontsize of the x and y labels
plt.rc('xtick', labelsize=15)     # fontsize of the x tick labels
plt.rc('ytick', labelsize=15)     # fontsize of the y tick labels
plt.rc('legend', fontsize=15)     # legend fontsize
plt.rc('figure', titlesize=15)    # fontsize of the figure title

#fig, _ = plt.subplots(2, 2, figsize=(13, 12))
#ax1 = plt.subplot(221)
#ax2 = plt.subplot(222)
#ax3 = plt.subplot(223)
#ax4 = plt.subplot(224)
gs = gridspec.GridSpec(2, 2)
gs.update(hspace=0.3)
fig = plt.figure(figsize=(11, 12))
ax = fig.add_subplot(111)  # The big subplot for common labels.
ax1 = fig.add_subplot(gs[0, 0], projection='3d')
ax2 = fig.add_subplot(gs[0, 1], projection='3d')
ax3 = fig.add_subplot(gs[1, 0], projection='3d')
ax4 = fig.add_subplot(gs[1, 1], projection='3d')
# ax5 = fig.add_subplot(gs[2, 0])
# ax6 = fig.add_subplot(gs[2, 1])

# Turn off axis lines and ticks of the big subplot.
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['left'].set_color('none')
ax.spines['right'].set_color('none')
ax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

# Set common labels.
#ax.set_xlabel('accuracy(%)')
ax.yaxis.set_label_coords(-0.105, 0.5)
# ax.set_ylabel('runtime($s$)')
# -------------- num_sols, IPFP --------------

def get_num_sol_results():
    save_dir = root_dir_criann + 'edit_costs.num_sols.ratios.IPFP/'
    errors = {}
    print('-------- num_sols, IPFP --------')
    for ds_name in Dataset_List:
        print(ds_name)
        errors[ds_name] = []
        for num_sols in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
            errors[ds_name].append([])
            for ratio in tqdm([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], desc='num_sols = ' + str(num_sols), file=sys.stdout):
                ged_mats = read_trials_group(save_dir, ds_name, num_sols, ratio, 'num_sols')
                error = compute_relative_error(ged_mats)
                errors[ds_name][-1].append(error)
    return errors
x_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
y_values = range(0, 19)
X, Y = np.meshgrid(x_values, y_values)

errors = get_num_sol_results()
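# Note on shapes (as used below and again for panel (b)): errors[ds_name] has
# shape (11, 19) (parameter values x ratios), while np.meshgrid gives X and Y
# of shape (19, 11) (ratios x parameter values); hence z_values is transposed
# before plotting.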
for i, ds_name in enumerate(Dataset_List):
    if ds_name in errors:
        z_values = np.array(errors[ds_name])
        ax1.plot_wireframe(X, Y, z_values.T, label=Dataset_List[i], color=Colors[i])  #, '.-', label=Legend_Labels[i], color=Colors[i])
# ax1.set_yscale('squareroot')
# ax1.grid(axis='y')
ax1.set_xlabel('# of solutions')
ax1.set_ylabel('ratios')
ax1.set_zlabel('average relative errors (%)')
ax1.set_title('(a) num_sols, IPFP')
ax1.set_yticks(range(0, 19, 2))
ax1.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10])
# ax1.set_axisbelow(True)
# ax1.spines['top'].set_visible(False)
# ax1.spines['bottom'].set_visible(False)
# ax1.spines['right'].set_visible(False)
# ax1.spines['left'].set_visible(False)
# ax1.xaxis.set_ticks_position('none')
# ax1.yaxis.set_ticks_position('none')
# ax1.set_ylim(bottom=-1000)
handles, labels = ax1.get_legend_handles_labels()
# -------------- repeats, IPFP --------------

def get_repeats_results():
    save_dir = root_dir_criann + 'edit_costs.repeats.ratios.IPFP/'
    errors = {}
    print('-------- repeats, IPFP --------')
    for ds_name in Dataset_List:
        print(ds_name)
        errors[ds_name] = []
        for repeats in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
            errors[ds_name].append([])
            for ratio in tqdm([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], desc='repeats = ' + str(repeats), file=sys.stdout):
                ged_mats = read_trials_group(save_dir, ds_name, repeats, ratio, 'repeats')
                error = compute_relative_error(ged_mats)
                errors[ds_name][-1].append(error)
    return errors
x_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
y_values = range(0, 19)
X, Y = np.meshgrid(x_values, y_values)

errors = get_repeats_results()
for i, ds_name in enumerate(Dataset_List):
    if ds_name in errors:
        z_values = np.array(errors[ds_name])
        ax2.plot_wireframe(X, Y, z_values.T, label=Dataset_List[i], color=Colors[i])  #, '.-', label=Legend_Labels[i], color=Colors[i])
# ax2.set_yscale('squareroot')
# ax2.grid(axis='y')
ax2.set_xlabel('# of repeats')
ax2.set_ylabel('ratios')
ax2.set_zlabel('average relative errors (%)')
ax2.set_title('(b) repeats, IPFP')
ax2.set_yticks(range(0, 19, 2))
ax2.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10])
# ax2.set_axisbelow(True)
# ax2.spines['top'].set_visible(False)
# ax2.spines['bottom'].set_visible(False)
# ax2.spines['right'].set_visible(False)
# ax2.spines['left'].set_visible(False)
# ax2.xaxis.set_ticks_position('none')
# ax2.yaxis.set_ticks_position('none')
# ax2.set_ylim(bottom=-1000)
handles, labels = ax2.get_legend_handles_labels()
# # -------------- degrees --------------
# def get_degree_results():
#     save_dir = root_dir_criann + '28 cores/synthesized_graphs_degrees/'
#     run_times = {}
#     for kernel_name in Graph_Kernel_List:
#         run_times[kernel_name] = []
#         for num in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
#             file_name = save_dir + 'run_time.' + kernel_name + '.' + str(num) + '.pkl'
#             if os.path.isfile(file_name):
#                 run_time = pickle.load(open(file_name, 'rb'))
#             else:
#                 run_time = 0
#             run_times[kernel_name].append(run_time)
#     return run_times

# x_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# run_times = get_degree_results()
# for i, kernel_name in enumerate(Graph_Kernel_List):
#     if kernel_name in run_times:
#         ax3.plot(x_labels, run_times[kernel_name], '.-', label=Legend_Labels[i], color=Colors[i])
# ax3.set_yscale('log', nonposy='clip')
# ax3.grid(axis='y')
# ax3.set_xlabel('degrees')
# ax3.set_ylabel('runtime($s$)')
# #ax3.set_ylabel('runtime($s$) per pair of graphs')
# ax3.set_title('(c) degrees')
# ax3.set_axisbelow(True)
# ax3.spines['top'].set_visible(False)
# ax3.spines['bottom'].set_visible(False)
# ax3.spines['right'].set_visible(False)
# ax3.spines['left'].set_visible(False)
# ax3.xaxis.set_ticks_position('none')
# ax3.yaxis.set_ticks_position('none')

# # -------------- Node labels --------------
# def get_node_label_results():
#     save_dir = root_dir_criann + '28 cores/synthesized_graphs_num_node_label_alphabet/'
#     run_times = {}
#     for kernel_name in Graph_Kernel_List_VSym:
#         run_times[kernel_name] = []
#         for num in [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]:
#             file_name = save_dir + 'run_time.' + kernel_name + '.' + str(num) + '.pkl'
#             if os.path.isfile(file_name):
#                 run_time = pickle.load(open(file_name, 'rb'))
#             else:
#                 run_time = 0
#             run_times[kernel_name].append(run_time)
#     return run_times
# # save_dir = root_dir_criann + 'synthesized_graphs_num_node_label_alphabet/'
# # run_times = pickle.load(open(save_dir + 'run_times.pkl', 'rb'))
# # return run_times

# x_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
# run_times = get_node_label_results()
# for i, kernel_name in enumerate(Graph_Kernel_List):
#     if kernel_name in run_times:
#         ax4.plot(x_labels[1:], run_times[kernel_name][1:], '.-', label=Legend_Labels[i], color=Colors[i])
# ax4.set_yscale('log', nonposy='clip')
# ax4.grid(axis='y')
# ax4.set_xlabel('# of alphabets')
# ax4.set_ylabel('runtime($s$)')
# #ax4.set_ylabel('runtime($s$) per pair of graphs')
# ax4.set_title('(d) alphabet size of vertex labels')
# ax4.set_axisbelow(True)
# ax4.spines['top'].set_visible(False)
# ax4.spines['bottom'].set_visible(False)
# ax4.spines['right'].set_visible(False)
# ax4.spines['left'].set_visible(False)
# ax4.xaxis.set_ticks_position('none')
# ax4.yaxis.set_ticks_position('none')
from matplotlib.lines import Line2D
custom_lines = []
for color in Colors:
    custom_lines.append(Line2D([0], [0], color=color, lw=4))

fig.subplots_adjust(bottom=0.135)
# Only as many proxy lines as there are labels (here the dataset names).
fig.legend(custom_lines[:len(labels)], labels, loc='lower center', ncol=4, frameon=False)  # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6)
plt.savefig('stability.real_data.relative_error.eps', format='eps', dpi=300, transparent=True, bbox_inches='tight')
plt.show()
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 2 16:17:01 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge
# costs on the stability of the GED computation, where the base edit costs are
# [1, 1, 1, 1, 1, 1]. The minimum solution over a given maximum number of
# solutions (max_num_solutions) is computed.
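# For example (an illustrative reading of edit_cost_constants defined below):
# the first three base costs, presumably the node edit costs, are scaled by
# the ratio, while the last three (the edge costs) stay fixed, so ratio = 3
# gives edit_cost_constants == [3, 3, 3, 1, 1, 1].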

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys
def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial):

    save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

    """**1. Get dataset.**"""
    dataset = get_dataset(ds_name)

    """**2. Set parameters.**"""

    # Parameters for GED computation.
    ged_options = {'method': 'BIPARTITE',  # use the BIPARTITE heuristic.
                   # 'initialization_method': 'RANDOM',  # or 'NODE', etc. (for GEDEnv)
                   'lsape_model': 'ECBP',
                   # The actual number of computed solutions might be smaller
                   # than the specified value.
                   'max_num_solutions': max_num_solutions,
                   'edit_cost': 'CONSTANT',  # use CONSTANT cost.
                   'greedy_method': 'BASIC',
                   # The distance between non-symbolic node/edge labels is
                   # computed by euclidean distance.
                   'attr_distance': 'euclidean',
                   'optimal': True,  # if True, the option --greedy-method has no effect.
                   # Parallel threads. Does not work if mpg_options['parallel'] = False.
                   'threads': multiprocessing.cpu_count(),
                   'centrality_method': 'NONE',
                   'centrality_weight': 0.7,
                   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
                   }

    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
    # edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
    # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))

    options = ged_options.copy()
    options['edit_cost_constants'] = edit_cost_constants
    options['node_labels'] = dataset.node_labels
    options['edge_labels'] = dataset.edge_labels
    options['node_attrs'] = dataset.node_attrs
    options['edge_attrs'] = dataset.edge_attrs
    parallel = True  # if num_solutions == 1 else False
"""**5. Compute GED matrix.**""" | |||||
ged_mat = 'error' | |||||
runtime = 0 | |||||
try: | |||||
time0 = time.time() | |||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||||
runtime = time.time() - time0 | |||||
except Exception as exp: | |||||
print('An exception occured when running this experiment:') | |||||
LOG_FILENAME = save_dir + 'error.txt' | |||||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||||
logging.exception(save_file_suffix) | |||||
print(repr(exp)) | |||||
"""**6. Get results.**""" | |||||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(ged_mat, f) | |||||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
return ged_mat, runtime | |||||
def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio):
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtimes, f)  # save the whole list, not only the last runtime.
def results_for_a_dataset(ds_name):
    """**1. Get dataset.**"""
    dataset = get_dataset(ds_name)

    for max_num_solutions in [1, 20, 40, 60, 80, 100]:
        print()
        print('Max # of solutions:', max_num_solutions)
        for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, max_num_solutions, ratio)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        ds_name_list = sys.argv[1:]
    else:
        ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

    save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(save_dir + 'groups/'):
        os.makedirs(save_dir + 'groups/')

    for ds_name in ds_name_list:
        print()
        print('Dataset:', ds_name)
        results_for_a_dataset(ds_name)
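
# Example invocation (illustrative; the script file name is an assumption
# inferred from save_dir):
#     python3 edit_costs.max_num_sols.ratios.bipartite.py MUTAG AIDS_symb
# With no command-line arguments, all four predefined datasets are processed.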
@@ -11,41 +11,16 @@ import os
 import multiprocessing
 import pickle
 import logging
-from gklearn.utils import Dataset
 from gklearn.ged.util import compute_geds
+import numpy as np
+import time
+from utils import get_dataset
+import sys
-def get_dataset(ds_name):
-    # The node/edge labels that will not be used in the computation.
-    if ds_name == 'MAO':
-        irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
-    elif ds_name == 'Monoterpenoides':
-        irrelevant_labels = {'edge_labels': ['valence']}
-    elif ds_name == 'MUTAG':
-        irrelevant_labels = {'edge_labels': ['label_0']}
-    elif ds_name == 'AIDS_symb':
-        irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']}
-    # Initialize a Dataset.
-    dataset = Dataset()
-    # Load predefined dataset.
-    dataset.load_predefined_dataset(ds_name)
-    # Remove irrelevant labels.
-    dataset.remove_labels(**irrelevant_labels)
-    print('dataset size:', len(dataset.graphs))
-    return dataset
-def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial):
-    save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
-    if not os.path.exists(save_dir):
-        os.makedirs(save_dir)
+def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
     save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
+
+    """**1. Get dataset.**"""
+    dataset = get_dataset(ds_name)
 
     """**2. Set parameters.**"""
@@ -77,31 +52,71 @@ def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial):
     """**5. Compute GED matrix.**"""
     ged_mat = 'error'
+    runtime = 0
     try:
+        time0 = time.time()
         ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True)
+        runtime = time.time() - time0
     except Exception as exp:
         print('An exception occurred when running this experiment:')
         LOG_FILENAME = save_dir + 'error.txt'
         logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
-        logging.exception('save_file_suffix')
+        logging.exception(save_file_suffix)
         print(repr(exp))
 
     """**6. Get results.**"""
-    pickle.dump(ged_mat, open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb'))
+    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
+        pickle.dump(ged_mat, f)
+    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
+        pickle.dump(runtime, f)
+
+    return ged_mat, runtime
+def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
+    ged_mats = []
+    runtimes = []
+    for trial in range(1, 101):
+        print()
+        print('Trial:', trial)
+        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
+        ged_mats.append(ged_mat)
+        runtimes.append(runtime)
+
+    save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio)
+    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
+        np.save(f, np.array(ged_mats))
+    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
+        pickle.dump(runtimes, f)  # save the whole list, not only the last runtime.
+
+def results_for_a_dataset(ds_name):
+    """**1. Get dataset.**"""
+    dataset = get_dataset(ds_name)
+
+    for num_solutions in [1, 20, 40, 60, 80, 100]:
+        print()
+        print('# of solutions:', num_solutions)
+        for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]:
+            print()
+            print('Ratio:', ratio)
+            save_trials_as_group(dataset, ds_name, num_solutions, ratio)
 if __name__ == '__main__':
-    for ds_name in ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']:
+    if len(sys.argv) > 1:
+        ds_name_list = sys.argv[1:]
+    else:
+        ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
+
+    save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    if not os.path.exists(save_dir + 'groups/'):
+        os.makedirs(save_dir + 'groups/')
+
+    for ds_name in ds_name_list:
         print()
         print('Dataset:', ds_name)
-        for num_solutions in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
-            print()
-            print('# of solutions:', num_solutions)
-            for ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
-                print()
-                print('Ratio:', ratio)
-                for trial in range(1, 101):
-                    print()
-                    print('Trial:', trial)
-                    xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial)
+        results_for_a_dataset(ds_name)
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 20 17:48:02 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge
# costs on the stability of the GED computation, where the base edit costs are
# [1, 1, 1, 1, 1, 1]. The minimum solution over a given number of repeats is
# computed.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial):

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

    """**1. Get dataset.**"""
    dataset = get_dataset(ds_name)

    """**2. Set parameters.**"""

    # Parameters for GED computation.
    ged_options = {'method': 'IPFP',  # use the IPFP heuristic.
                   'initialization_method': 'RANDOM',  # or 'NODE', etc.
                   # When bigger than 1, the method is considered mIPFP.
                   'initial_solutions': 1,
                   'edit_cost': 'CONSTANT',  # use CONSTANT cost.
                   # The distance between non-symbolic node/edge labels is
                   # computed by euclidean distance.
                   'attr_distance': 'euclidean',
                   'ratio_runs_from_initial_solutions': 1,
                   # Parallel threads. Does not work if mpg_options['parallel'] = False.
                   'threads': multiprocessing.cpu_count(),
                   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
                   }
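
    # Note (per this script's description above): compute_geds is called below
    # with the given number of `repeats`, and the minimum solution over those
    # repeated runs is kept, so larger values should yield more stable GEDs.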
    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
    # edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
    # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))

    options = ged_options.copy()
    options['edit_cost_constants'] = edit_cost_constants
    options['node_labels'] = dataset.node_labels
    options['edge_labels'] = dataset.edge_labels
    options['node_attrs'] = dataset.node_attrs
    options['edge_attrs'] = dataset.edge_attrs
    parallel = True  # if num_solutions == 1 else False
"""**5. Compute GED matrix.**""" | |||||
ged_mat = 'error' | |||||
runtime = 0 | |||||
try: | |||||
time0 = time.time() | |||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||||
runtime = time.time() - time0 | |||||
except Exception as exp: | |||||
print('An exception occured when running this experiment:') | |||||
LOG_FILENAME = save_dir + 'error.txt' | |||||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||||
logging.exception(save_file_suffix) | |||||
print(repr(exp)) | |||||
"""**6. Get results.**""" | |||||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(ged_mat, f) | |||||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
return ged_mat, runtime | |||||
def save_trials_as_group(dataset, ds_name, repeats, ratio):
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtimes, f)  # save the whole list, not only the last runtime.
def results_for_a_dataset(ds_name):
    """**1. Get dataset.**"""
    dataset = get_dataset(ds_name)

    for repeats in [1, 20, 40, 60, 80, 100]:
        print()
        print('Repeats:', repeats)
        for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, repeats, ratio)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        ds_name_list = sys.argv[1:]
    else:
        ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

    save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(save_dir + 'groups/'):
        os.makedirs(save_dir + 'groups/')

    for ds_name in ds_name_list:
        print()
        print('Dataset:', ds_name)
        results_for_a_dataset(ds_name)
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 20 17:48:02 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge
# costs on the stability of the GED computation, where the base edit costs are
# [1, 1, 1, 1, 1, 1]. The minimum solution over a given number of repeats is
# computed.
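# Unlike the IPFP variant of this experiment, this script uses the BIPARTITE
# heuristic with max_num_solutions fixed to 1 while varying `repeats`.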

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial):

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

    """**1. Get dataset.**"""
    dataset = get_dataset(ds_name)

    """**2. Set parameters.**"""

    # Parameters for GED computation.
    ged_options = {'method': 'BIPARTITE',  # use the BIPARTITE heuristic.
                   # 'initialization_method': 'RANDOM',  # or 'NODE', etc. (for GEDEnv)
                   'lsape_model': 'ECBP',
                   # The actual number of computed solutions might be smaller
                   # than the specified value.
                   'max_num_solutions': 1,
                   'edit_cost': 'CONSTANT',  # use CONSTANT cost.
                   'greedy_method': 'BASIC',
                   # The distance between non-symbolic node/edge labels is
                   # computed by euclidean distance.
                   'attr_distance': 'euclidean',
                   'optimal': True,  # if True, the option --greedy-method has no effect.
                   # Parallel threads. Does not work if mpg_options['parallel'] = False.
                   'threads': multiprocessing.cpu_count(),
                   'centrality_method': 'NONE',
                   'centrality_weight': 0.7,
                   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
                   }

    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
    # edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
    # pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))

    options = ged_options.copy()
    options['edit_cost_constants'] = edit_cost_constants
    options['node_labels'] = dataset.node_labels
    options['edge_labels'] = dataset.edge_labels
    options['node_attrs'] = dataset.node_attrs
    options['edge_attrs'] = dataset.edge_attrs
    parallel = True  # if num_solutions == 1 else False
"""**5. Compute GED matrix.**""" | |||||
ged_mat = 'error' | |||||
runtime = 0 | |||||
try: | |||||
time0 = time.time() | |||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||||
runtime = time.time() - time0 | |||||
except Exception as exp: | |||||
print('An exception occured when running this experiment:') | |||||
LOG_FILENAME = save_dir + 'error.txt' | |||||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||||
logging.exception(save_file_suffix) | |||||
print(repr(exp)) | |||||
"""**6. Get results.**""" | |||||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(ged_mat, f) | |||||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
return ged_mat, runtime | |||||
def save_trials_as_group(dataset, ds_name, repeats, ratio):
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtimes, f)  # save the whole list, not only the last runtime.
def results_for_a_dataset(ds_name):
    """**1. Get dataset.**"""
    dataset = get_dataset(ds_name)

    for repeats in [1, 20, 40, 60, 80, 100]:
        print()
        print('Repeats:', repeats)
        for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, repeats, ratio)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        ds_name_list = sys.argv[1:]
    else:
        ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

    save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(save_dir + 'groups/'):
        os.makedirs(save_dir + 'groups/')

    for ds_name in ds_name_list:
        print()
        print('Dataset:', ds_name)
        results_for_a_dataset(ds_name)
@@ -0,0 +1,108 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 17:26:43 2020

@author: ljia

This script groups results together into a single file for the sake of faster
searching and loading.
"""
import os
import pickle
import numpy as np
from shutil import copyfile
from tqdm import tqdm
import sys
def group_trials(dir_folder, name_prefix, override, clear, backup):

    # Get group name.
    label_name = name_prefix.split('.')[0]
    if label_name == 'ged_matrix':
        group_label = 'ged_mats'
    elif label_name == 'runtime':
        group_label = 'runtimes'
    else:
        group_label = label_name
    name_suffix = name_prefix[len(label_name):]
    # name_suffix keeps the trailing '.' of name_prefix, so only the bare
    # extension needs to be appended here.
    if label_name == 'ged_matrix':
        name_group = dir_folder + 'groups/' + group_label + name_suffix + 'npy'
    else:
        name_group = dir_folder + 'groups/' + group_label + name_suffix + 'pkl'
    if not override and os.path.isfile(name_group):
        # The group file already exists; only check if all trial files exist.
        trials_complete = True
        for trial in range(1, 101):
            file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
            if not os.path.isfile(file_name):
                trials_complete = False
                break
    else:
        # Get data.
        data_group = []
        for trial in range(1, 101):
            file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
            if os.path.isfile(file_name):
                with open(file_name, 'rb') as f:
                    data = pickle.load(f)
                data_group.append(data)
            else:  # Not all trials are completed.
                return

        # Write groups.
        if label_name == 'ged_matrix':
            data_group = np.array(data_group)
            with open(name_group, 'wb') as f:
                np.save(f, data_group)
        else:
            with open(name_group, 'wb') as f:
                pickle.dump(data_group, f)

        trials_complete = True

    if trials_complete:
        # Backup.
        if backup:
            for trial in range(1, 101):
                src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
                dst = dir_folder + 'backups/' + name_prefix + 'trial_' + str(trial) + '.pkl'
                copyfile(src, dst)

        # Clear.
        if clear:
            for trial in range(1, 101):
                src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
                os.remove(src)
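
# Example (hypothetical file names): for the per-trial files
#     ged_matrix.MAO.num_sols_1.ratio_0.10.trial_1.pkl ... trial_100.pkl
# the call
#     group_trials(dir_folder, 'ged_matrix.MAO.num_sols_1.ratio_0.10.', False, False, False)
# would stack the 100 matrices into
#     groups/ged_mats.MAO.num_sols_1.ratio_0.10.npy,
# which is exactly the file the plotting script loads via read_trials_group.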
def group_all_in_folder(dir_folder, override=False, clear=True, backup=True):

    # Create folders.
    if not os.path.exists(dir_folder + 'groups/'):
        os.makedirs(dir_folder + 'groups/')
    if backup:
        if not os.path.exists(dir_folder + 'backups'):
            os.makedirs(dir_folder + 'backups')

    # Iterate all files.
    cur_file_prefix = ''
    for file in tqdm(sorted(os.listdir(dir_folder)), desc='Grouping', file=sys.stdout):
        if os.path.isfile(os.path.join(dir_folder, file)):
            name_prefix = file.split('trial_')[0]
            # print(name)
            # print(name_prefix)
            if name_prefix != cur_file_prefix:
                group_trials(dir_folder, name_prefix, override, clear, backup)
                cur_file_prefix = name_prefix

if __name__ == '__main__':
    dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/'
    group_all_in_folder(dir_folder)

    dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/'
    group_all_in_folder(dir_folder)
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 19:17:36 2020

@author: ljia
"""
from gklearn.utils import Dataset

def get_dataset(ds_name):
    # The node/edge labels that will not be used in the computation.
    if ds_name == 'MAO':
        irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
    elif ds_name == 'Monoterpenoides':
        irrelevant_labels = {'edge_labels': ['valence']}
    elif ds_name == 'MUTAG':
        irrelevant_labels = {'edge_labels': ['label_0']}
    elif ds_name == 'AIDS_symb':
        irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']}
        ds_name = 'AIDS'
    else:
        raise ValueError('Unknown dataset name: ' + ds_name)

    # Initialize a Dataset.
    dataset = Dataset()
    # Load predefined dataset.
    dataset.load_predefined_dataset(ds_name)
    # Remove irrelevant labels.
    dataset.remove_labels(**irrelevant_labels)
    print('dataset size:', len(dataset.graphs))
    return dataset
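
# Minimal usage sketch (illustrative): load one predefined dataset and report
# what is left after removing the irrelevant labels.
if __name__ == '__main__':
    ds = get_dataset('MUTAG')
    print('node labels:', ds.node_labels)
    print('edge labels:', ds.edge_labels)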