Browse Source

Update exps: ged stability.

v0.2.x
jajupmochi 4 years ago
parent
commit
d24cdec251
7 changed files with 893 additions and 43 deletions
  1. +312
    -0
      gklearn/experiments/ged/stability/Analysis_stability.ratios.real_data.relative_error.py
  2. +130
    -0
      gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py
  3. +58
    -43
      gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py
  4. +125
    -0
      gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py
  5. +130
    -0
      gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py
  6. +108
    -0
      gklearn/experiments/ged/stability/group_results.py
  7. +30
    -0
      gklearn/experiments/ged/stability/utils.py

+ 312
- 0
gklearn/experiments/ged/stability/Analysis_stability.ratios.real_data.relative_error.py View File

@@ -0,0 +1,312 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 6 15:35:32 2018

@author: ljia
"""

#import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.gridspec as gridspec
# import pickle
import os
import sys
from tqdm import tqdm
# from mpl_toolkits.mplot3d import Axes3D


# Absolute path of the experiment output folder on the author's machine.
# NOTE(review): not referenced by the active code visible in this script.
root_dir = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/'

# Same output folder, 'CRIANN/' sub-directory; the result readers below build
# their paths from this one.
root_dir_criann = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/ged/stability/outputs/CRIANN/'

# Datasets whose results are read and plotted (one wireframe per dataset).
Dataset_List = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

# NOTE(review): only referenced from commented-out code in the visible part of
# this script.
Legend_Labels = ['common walk', 'marginalized', 'Sylvester equation', 'conjugate gradient', 'fixed-point iterations', 'Spectral decomposition', 'shortest path', 'structural sp', 'path up to length $h$', 'treelet', 'WL subtree']

# Colors = ['#084594', '#2171b5', '#4292c6', '#6baed6', '#9ecae1', '#c6dbef',
# '#54278f', '#756bb1', '#9e9ac8', '#de2d26', '#fc9272']
# Color palette; Colors[i] is used for the i-th entry of Dataset_List.
Colors=[
'#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a',
'#d62728', '#ff9896', '#9467bd', '#c5b0d5', '#8c564b', '#c49c94',
'#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7', '#bcbd22', '#dbdb8d',
'#17becf', '#9edae5']

# Font sizes; only referenced from a commented-out plt.rc call below.
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12


def read_trials_group(save_dir, ds_name, num_sols, ratio, label):
    """Load the grouped GED matrices of one experimental setting.

    The file read is
    ``<save_dir>groups/ged_mats.<ds_name>.<label>_<num_sols>.ratio_<ratio>.npy``
    where the ratio is formatted with two decimals.

    Parameters
    ----------
    save_dir : str
        Output folder of the experiment; it is concatenated as is, so it must
        end with a path separator.
    ds_name : str
        Name of the dataset.
    num_sols : int
        Value of the varied parameter (appended after ``label``).
    ratio : float
        Ratio used in the file name.
    label : str
        Name of the varied parameter as used in the file name.

    Returns
    -------
    The array stored in the file, or an empty list if the file does not exist.
    """
    ratio_str = "{:.2f}".format(ratio)
    file_name = (save_dir + 'groups/ged_mats.' + ds_name + '.' + label + '_'
                 + str(num_sols) + '.ratio_' + ratio_str + '.npy')
    # Missing results are reported as an empty list rather than an error.
    if not os.path.isfile(file_name):
        return []
    with open(file_name, 'rb') as f:
        return np.load(f)
# ged_mats = []
# for trial in range(1, 101):
# file_name = file_prefix + '.trial_' + str(trial) + '.pkl'
# if os.path.isfile(file_name):
# ged_matrix = pickle.load(open(file_name, 'rb'))
# ged_mats.append(ged_matrix)
# else:
# # print(trial)
# pass
# Check average relative error along elements in two ged matrices.
def matrices_ave_relative_error(m1, m2):
    """Return the average relative error between two GED matrices.

    The result is the sum of the element-wise absolute differences divided by
    the sum of the element-wise mean magnitudes ``(|m1| + |m2|) / 2``.

    Parameters
    ----------
    m1, m2 : array-like
        Matrices of identical shape.

    Returns
    -------
    float
        The relative error. It is 0.0 when the denominator is zero, i.e. when
        both matrices contain only zeros (instead of the NaN a 0/0 division
        would give, which would contaminate any mean computed afterwards).

    Raises
    ------
    ValueError
        If the two matrices do not have the same shape.
    """
    m1 = np.asarray(m1, dtype=float)
    m2 = np.asarray(m2, dtype=float)
    if m1.shape != m2.shape:
        # Refuse silent broadcasting / partial comparison.
        raise ValueError('The two matrices must have the same shape, got '
                         + str(m1.shape) + ' and ' + str(m2.shape) + '.')
    error = np.abs(m1 - m2).sum()
    base = (np.abs(m1) + np.abs(m2)).sum() / 2
    if base == 0:
        return 0.0
    return error / base


def compute_relative_error(ged_mats):
    """Return the mean relative error of a group of GED matrices.

    The reference matrix is the element-wise minimum over all matrices of the
    group (the smallest, hence "correct", GED found for every pair). Each
    matrix is compared with this reference by `matrices_ave_relative_error`
    and the mean of these errors is returned.

    Parameters
    ----------
    ged_mats : sequence of 2-D arrays, or a 3-D array
        GED matrices of all trials; all of the same shape.

    Returns
    -------
    float
        Mean relative error; 0 if `ged_mats` is empty.
    """
    if len(ged_mats) == 0:
        # No result for this setting: report a zero error.
        return np.mean([0])
    # Smallest "correct" GED matrix: element-wise minimum over the trials.
    ged_mat_s = np.min(np.asarray(ged_mats), axis=0)
    # Average error of every trial with respect to the reference.
    errors = [matrices_ave_relative_error(mat, ged_mat_s) for mat in ged_mats]
    return np.mean(errors)


# Global font sizes of the figure.
plt.rc('axes', titlesize=15)  # fontsize of the axes title
plt.rc('axes', labelsize=15)  # fontsize of the x and y labels
plt.rc('xtick', labelsize=15)  # fontsize of the tick labels
plt.rc('ytick', labelsize=15)  # fontsize of the tick labels
plt.rc('legend', fontsize=15)  # legend fontsize
plt.rc('figure', titlesize=15)  # fontsize of the figure title

# 2 x 2 grid of 3D panels on top of one invisible 2D subplot.
gs = gridspec.GridSpec(2, 2)
gs.update(hspace=0.3)
fig = plt.figure(figsize=(11, 12))
ax = fig.add_subplot(111)  # The big subplot for common labels
ax1 = fig.add_subplot(gs[0, 0], projection='3d')
ax2 = fig.add_subplot(gs[0, 1], projection='3d')
ax3 = fig.add_subplot(gs[1, 0], projection='3d')
ax4 = fig.add_subplot(gs[1, 1], projection='3d')

# Turn off axis lines and ticks of the big subplot
for _side in ('top', 'bottom', 'left', 'right'):
    ax.spines[_side].set_color('none')
# tick_params expects booleans here; the string 'off' is truthy and therefore
# does not switch the ticks off.
ax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
# Set common labels
ax.yaxis.set_label_coords(-0.105, 0.5)


# -------------- num_sols, IPFP --------------
def get_num_sol_results():
    """Collect the mean relative errors of the "num_sols, IPFP" experiment.

    Returns
    -------
    dict
        Maps every dataset name of `Dataset_List` to a list with one row per
        number of solutions, each row holding one error per ratio.
    """
    save_dir = root_dir_criann + 'edit_costs.num_sols.ratios.IPFP/'
    nums_of_solutions = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    ratio_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    errors = {}
    print('-------- num_sols, IPFP --------')
    for ds_name in Dataset_List:
        print(ds_name)
        rows = []
        errors[ds_name] = rows
        for num_sols in nums_of_solutions:
            row = []
            rows.append(row)
            progress = tqdm(ratio_list, desc='num_sols = ' + str(num_sols), file=sys.stdout)
            for ratio in progress:
                ged_mats = read_trials_group(save_dir, ds_name, num_sols, ratio, 'num_sols')
                row.append(compute_relative_error(ged_mats))

    return errors
# Panel (a): one wireframe per dataset; x is the number of solutions and y is
# the index of the ratio in the ratio list (relabelled below).
x_values = [1] + list(range(10, 101, 10))
y_values = range(19)
X, Y = np.meshgrid(x_values, y_values)
errors = get_num_sol_results()
for i, ds_name in enumerate(Dataset_List):
    if ds_name not in errors:
        continue
    z_values = np.array(errors[ds_name])
    # Rows of z_values follow x_values, hence the transpose.
    ax1.plot_wireframe(X, Y, z_values.T, label=ds_name, color=Colors[i])

ax1.set_xlabel('# of solutions')
ax1.set_ylabel('ratios')
ax1.set_zlabel('average relative errors (%)')
ax1.set_title('(a) num_sols, IPFP')
# Show every second ratio on the y axis.
ax1.set_yticks(range(0, 19, 2))
ax1.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10])
handles, labels = ax1.get_legend_handles_labels()



# # -------------- repeats, IPFP --------------
def get_repeats_results():
    """Collect the mean relative errors of the "repeats, IPFP" experiment.

    Returns
    -------
    dict
        Maps every dataset name of `Dataset_List` to a list with one row per
        number of repeats, each row holding one error per ratio.
    """
    save_dir = root_dir_criann + 'edit_costs.repeats.ratios.IPFP/'
    nums_of_repeats = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    ratio_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    errors = {}
    print('-------- repeats, IPFP --------')
    for ds_name in Dataset_List:
        print(ds_name)
        rows = []
        errors[ds_name] = rows
        for num_sols in nums_of_repeats:
            row = []
            rows.append(row)
            # The progress bar text is kept as in the num_sols experiment.
            progress = tqdm(ratio_list, desc='num_sols = ' + str(num_sols), file=sys.stdout)
            for ratio in progress:
                ged_mats = read_trials_group(save_dir, ds_name, num_sols, ratio, 'repeats')
                row.append(compute_relative_error(ged_mats))

    return errors
# Panel (b): one wireframe per dataset; x is the number of repeats and y is
# the index of the ratio in the ratio list (relabelled below).
x_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
y_values = range(0, 19)
X, Y = np.meshgrid(x_values, y_values)
errors = get_repeats_results()
for i, ds_name in enumerate(Dataset_List):
    if ds_name in errors:
        z_values = np.array(errors[ds_name])
        # Rows of z_values follow x_values, hence the transpose.
        ax2.plot_wireframe(X, Y, z_values.T, label=Dataset_List[i], color=Colors[i])

# The x axis of this panel is the number of repeats, not of solutions.
ax2.set_xlabel('# of repeats')
ax2.set_ylabel('ratios')
# NOTE(review): the plotted values are the raw ratios returned by
# compute_relative_error; confirm whether the '(%)' unit is intended.
ax2.set_zlabel('average relative errors (%)')
ax2.set_title('(b) repeats, IPFP')
# Show every second ratio on the y axis.
ax2.set_yticks(range(0, 19, 2))
ax2.set_yticklabels([0.1, 0.3, 0.5, 0.7, 0.9, 2, 4, 6, 8, 10])
handles, labels = ax2.get_legend_handles_labels()


# # -------------- degrees --------------
# def get_degree_results():
# save_dir = root_dir_criann + '28 cores/synthesized_graphs_degrees/'
# run_times = {}
# for kernel_name in Graph_Kernel_List:
# run_times[kernel_name] = []
# for num in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
# file_name = save_dir + 'run_time.' + kernel_name + '.' + str(num) + '.pkl'
# if os.path.isfile(file_name):
# run_time = pickle.load(open(file_name, 'rb'))
# else:
# run_time = 0
# run_times[kernel_name].append(run_time)
# return run_times

# x_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# run_times = get_degree_results()
# for i, kernel_name in enumerate(Graph_Kernel_List):
# if kernel_name in run_times:
# ax3.plot(x_labels, run_times[kernel_name], '.-', label=Legend_Labels[i], color=Colors[i])

# ax3.set_yscale('log', nonposy='clip')
# ax3.grid(axis='y')
# ax3.set_xlabel('degrees')
# ax3.set_ylabel('runtime($s$)')
# #ax3.set_ylabel('runtime($s$) per pair of graphs')
# ax3.set_title('(c) degrees')
# ax3.set_axisbelow(True)
# ax3.spines['top'].set_visible(False)
# ax3.spines['bottom'].set_visible(False)
# ax3.spines['right'].set_visible(False)
# ax3.spines['left'].set_visible(False)
# ax3.xaxis.set_ticks_position('none')
# ax3.yaxis.set_ticks_position('none')


# # -------------- Node labels --------------
# def get_node_label_results():
# save_dir = root_dir_criann + '28 cores/synthesized_graphs_num_node_label_alphabet/'
# run_times = {}
# for kernel_name in Graph_Kernel_List_VSym:
# run_times[kernel_name] = []
# for num in [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]:
# file_name = save_dir + 'run_time.' + kernel_name + '.' + str(num) + '.pkl'
# if os.path.isfile(file_name):
# run_time = pickle.load(open(file_name, 'rb'))
# else:
# run_time = 0
# run_times[kernel_name].append(run_time)
# return run_times

# # save_dir = root_dir_criann + 'synthesized_graphs_num_node_label_alphabet/'
# # run_times = pickle.load(open(save_dir + 'run_times.pkl', 'rb'))
# # return run_times

# x_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
# run_times = get_node_label_results()
# for i, kernel_name in enumerate(Graph_Kernel_List):
# if kernel_name in run_times:
# ax4.plot(x_labels[1:], run_times[kernel_name][1:], '.-', label=Legend_Labels[i], color=Colors[i])

# ax4.set_yscale('log', nonposy='clip')
# ax4.grid(axis='y')
# ax4.set_xlabel('# of alphabets')
# ax4.set_ylabel('runtime($s$)')
# #ax4.set_ylabel('runtime($s$) per pair of graphs')
# ax4.set_title('(d) alphabet size of vertex labels')
# ax4.set_axisbelow(True)
# ax4.spines['top'].set_visible(False)
# ax4.spines['bottom'].set_visible(False)
# ax4.spines['right'].set_visible(False)
# ax4.spines['left'].set_visible(False)
# ax4.xaxis.set_ticks_position('none')
# ax4.yaxis.set_ticks_position('none')


from matplotlib.lines import Line2D

# One thick proxy line per palette color, used as legend handles.
custom_lines = [Line2D([0], [0], color=color, lw=4) for color in Colors]

# Leave room under the panels for the shared legend.
fig.subplots_adjust(bottom=0.135)
fig.legend(custom_lines, labels, loc='lower center', ncol=4, frameon=False)
plt.savefig(
    'stability.real_data.relative_error.eps',
    format='eps',
    dpi=300,
    transparent=True,
    bbox_inches='tight',
)
plt.show()

+ 130
- 0
gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 2 16:17:01 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. For each ratio, the GEDs are computed with the BIPARTITE heuristic for several values of `max_num_solutions`, using a single repeat per trial.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys


def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial):
    """Compute and save the GED matrix of one trial (BIPARTITE heuristic).

    Parameters
    ----------
    dataset
        Dataset already loaded by the caller. It is used as is; it is only
        loaded here (with `get_dataset`) when None is given, so the dataset is
        no longer reloaded for every single trial.
    ds_name : str
        Name of the dataset, used in the output file names.
    max_num_solutions : int
        Value of the 'max_num_solutions' GED option.
    ratio : float
        Factor applied to the first three edit cost constants (the node costs,
        according to the header of this script).
    trial : int
        Index of the trial, used in the output file names.

    Returns
    -------
    tuple
        ``(ged_mat, runtime)``. If the computation raises, the exception is
        logged to ``error.txt`` and ``('error', 0)`` is saved and returned.

    Notes
    -----
    Reads the module-level `save_dir`, which is set in the main section.
    """
    save_file_suffix = ('.' + ds_name + '.mnum_sols_' + str(max_num_solutions)
                        + '.ratio_' + "{:.2f}".format(ratio)
                        + '.trial_' + str(trial))

    # 1. Get dataset: reuse the one given by the caller.
    if dataset is None:
        dataset = get_dataset(ds_name)

    # 2. Set parameters for GED computation.
    ged_options = {
        'method': 'BIPARTITE',  # use BIPARTITE heuristic.
        # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
        'lsape_model': 'ECBP',
        # the actual number of computed solutions might be smaller than the
        # specified value
        'max_num_solutions': max_num_solutions,
        'edit_cost': 'CONSTANT',  # use CONSTANT cost.
        'greedy_method': 'BASIC',
        # the distance between non-symbolic node/edge labels is computed by
        # euclidean distance.
        'attr_distance': 'euclidean',
        'optimal': True,  # if TRUE, the option --greedy-method has no effect
        # parallel threads. Do not work if mpg_options['parallel'] = False.
        'threads': multiprocessing.cpu_count(),
        'centrality_method': 'NONE',
        'centrality_weight': 0.7,
        'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
    }
    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]

    options = ged_options.copy()
    options['edit_cost_constants'] = edit_cost_constants
    options['node_labels'] = dataset.node_labels
    options['edge_labels'] = dataset.edge_labels
    options['node_attrs'] = dataset.node_attrs
    options['edge_attrs'] = dataset.edge_attrs
    parallel = True

    # 5. Compute GED matrix. The defaults below are what gets saved on failure.
    ged_mat = 'error'
    runtime = 0
    try:
        time0 = time.time()
        _, ged_mat, _ = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True)
        runtime = time.time() - time0
    except Exception as exp:
        # Best effort: log the failure and go on with the next trial.
        print('An exception occured when running this experiment:')
        LOG_FILENAME = save_dir + 'error.txt'
        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
        logging.exception(save_file_suffix)
        print(repr(exp))

    # 6. Save results.
    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(ged_mat, f)
    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtime, f)

    return ged_mat, runtime

def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio):
    """Run the 100 trials of one setting and save their results as a group.

    The GED matrices of all trials are saved together in one ``.npy`` file and
    the runtimes of all trials in one ``.pkl`` file, both in the ``groups/``
    sub-folder of the module-level `save_dir`.

    Parameters
    ----------
    dataset
        Loaded dataset, passed on to `xp_compute_ged_matrix`.
    ds_name : str
        Name of the dataset.
    max_num_solutions : int
        Maximum number of solutions of this setting.
    ratio : float
        Ratio of this setting.
    """
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        # Save the runtimes of all trials, not only the last one.
        pickle.dump(runtimes, f)
def results_for_a_dataset(ds_name):
    """Run every (max # of solutions, ratio) setting on one dataset.

    Parameters
    ----------
    ds_name : str
        Name of the dataset, as accepted by `get_dataset`.
    """
    # 1. Get dataset (loaded once for all the settings).
    dataset = get_dataset(ds_name)
    ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
    max_num_solutions_list = [1, 20, 40, 60, 80, 100]
    for max_num_solutions in max_num_solutions_list:
        print()
        print('Max # of solutions:', max_num_solutions)
        for ratio in ratio_list:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, max_num_solutions, ratio)

if __name__ == '__main__':
    # Dataset names come from the command line; otherwise use the default list.
    ds_name_list = sys.argv[1:] or ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
    # Module-level output folder, read by the functions above.
    save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/'
    for folder in (save_dir, save_dir + 'groups/'):
        if not os.path.exists(folder):
            os.makedirs(folder)
    for ds_name in ds_name_list:
        print()
        print('Dataset:', ds_name)
        results_for_a_dataset(ds_name)

+ 58
- 43
gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py View File

@@ -11,41 +11,16 @@ import os
import multiprocessing
import pickle
import logging
from gklearn.utils import Dataset
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys


def get_dataset(ds_name):
# The node/edge labels that will not be used in the computation.
if ds_name == 'MAO':
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
elif ds_name == 'Monoterpenoides':
irrelevant_labels = {'edge_labels': ['valence']}
elif ds_name == 'MUTAG':
irrelevant_labels = {'edge_labels': ['label_0']}
elif ds_name == 'AIDS_symb':
irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']}
def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):

# Initialize a Dataset.
dataset = Dataset()
# Load predefined dataset.
dataset.load_predefined_dataset(ds_name)
# Remove irrelevant labels.
dataset.remove_labels(**irrelevant_labels)
print('dataset size:', len(dataset.graphs))
return dataset


def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial):

save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
"""**1. Get dataset.**"""
dataset = get_dataset(ds_name)

"""**2. Set parameters.**"""

@@ -77,31 +52,71 @@ def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial):
"""**5. Compute GED matrix.**"""
ged_mat = 'error'
runtime = 0
try:
time0 = time.time()
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True)
runtime = time.time() - time0
except Exception as exp:
print('An exception occured when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception('save_file_suffix')
logging.exception(save_file_suffix)
print(repr(exp))
"""**6. Get results.**"""
pickle.dump(ged_mat, open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb'))
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(ged_mat, f)
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(runtime, f)
return ged_mat, runtime
def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
    """Run the 100 trials of one setting and save their results as a group.

    The GED matrices of all trials are saved together in one ``.npy`` file and
    the runtimes of all trials in one ``.pkl`` file, both in the ``groups/``
    sub-folder of the module-level `save_dir`.

    Parameters
    ----------
    dataset
        Loaded dataset, passed on to `xp_compute_ged_matrix`.
    ds_name : str
        Name of the dataset.
    num_solutions : int
        Number of solutions of this setting.
    ratio : float
        Ratio of this setting.
    """
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        # Save the runtimes of all trials, not only the last one.
        pickle.dump(runtimes, f)
def results_for_a_dataset(ds_name):
    """Run every (# of solutions, ratio) setting on one dataset.

    Parameters
    ----------
    ds_name : str
        Name of the dataset, as accepted by `get_dataset`.
    """
    # 1. Get dataset (loaded once for all the settings).
    dataset = get_dataset(ds_name)
    ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
    num_solutions_list = [1, 20, 40, 60, 80, 100]
    for num_solutions in num_solutions_list:
        print()
        print('# of solutions:', num_solutions)
        for ratio in ratio_list:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, num_solutions, ratio)

if __name__ == '__main__':
for ds_name in ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']:
if len(sys.argv) > 1:
ds_name_list = sys.argv[1:]
else:
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
if not os.path.exists(save_dir + 'groups/'):
os.makedirs(save_dir + 'groups/')
for ds_name in ds_name_list:
print()
print('Dataset:', ds_name)
for num_solutions in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
print()
print('# of solutions:', num_solutions)
for ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
print()
print('Ratio:', ratio)
for trial in range(1, 101):
print()
print('Trial:', trial)
xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial)
results_for_a_dataset(ds_name)

+ 125
- 0
gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py View File

@@ -0,0 +1,125 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 20 17:48:02 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. For each setting, the minimum solution over the given number of repeats is computed.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys


def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial):
    """Compute and save the GED matrix of one trial (IPFP heuristic).

    Parameters
    ----------
    dataset
        Dataset already loaded by the caller. It is used as is; it is only
        loaded here (with `get_dataset`) when None is given, so the dataset is
        no longer reloaded for every single trial.
    ds_name : str
        Name of the dataset, used in the output file names.
    repeats : int
        Value passed as `repeats` to `compute_geds`.
    ratio : float
        Factor applied to the first three edit cost constants (the node costs,
        according to the header of this script).
    trial : int
        Index of the trial, used in the output file names.

    Returns
    -------
    tuple
        ``(ged_mat, runtime)``. If the computation raises, the exception is
        logged to ``error.txt`` and ``('error', 0)`` is saved and returned.

    Notes
    -----
    Reads the module-level `save_dir`, which is set in the main section.
    """
    save_file_suffix = ('.' + ds_name + '.repeats_' + str(repeats)
                        + '.ratio_' + "{:.2f}".format(ratio)
                        + '.trial_' + str(trial))

    # 1. Get dataset: reuse the one given by the caller.
    if dataset is None:
        dataset = get_dataset(ds_name)

    # 2. Set parameters for GED computation.
    ged_options = {
        'method': 'IPFP',  # use IPFP heuristic.
        'initialization_method': 'RANDOM',  # or 'NODE', etc.
        # when bigger than 1, then the method is considered mIPFP.
        'initial_solutions': 1,
        'edit_cost': 'CONSTANT',  # use CONSTANT cost.
        # the distance between non-symbolic node/edge labels is computed by
        # euclidean distance.
        'attr_distance': 'euclidean',
        'ratio_runs_from_initial_solutions': 1,
        # parallel threads. Do not work if mpg_options['parallel'] = False.
        'threads': multiprocessing.cpu_count(),
        'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
    }
    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]

    options = ged_options.copy()
    options['edit_cost_constants'] = edit_cost_constants
    options['node_labels'] = dataset.node_labels
    options['edge_labels'] = dataset.edge_labels
    options['node_attrs'] = dataset.node_attrs
    options['edge_attrs'] = dataset.edge_attrs
    parallel = True

    # 5. Compute GED matrix. The defaults below are what gets saved on failure.
    ged_mat = 'error'
    runtime = 0
    try:
        time0 = time.time()
        _, ged_mat, _ = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True)
        runtime = time.time() - time0
    except Exception as exp:
        # Best effort: log the failure and go on with the next trial.
        print('An exception occured when running this experiment:')
        LOG_FILENAME = save_dir + 'error.txt'
        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
        logging.exception(save_file_suffix)
        print(repr(exp))

    # 6. Save results.
    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(ged_mat, f)
    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtime, f)

    return ged_mat, runtime

def save_trials_as_group(dataset, ds_name, repeats, ratio):
    """Run the 100 trials of one setting and save their results as a group.

    The GED matrices of all trials are saved together in one ``.npy`` file and
    the runtimes of all trials in one ``.pkl`` file, both in the ``groups/``
    sub-folder of the module-level `save_dir`.

    Parameters
    ----------
    dataset
        Loaded dataset, passed on to `xp_compute_ged_matrix`.
    ds_name : str
        Name of the dataset.
    repeats : int
        Number of repeats of this setting.
    ratio : float
        Ratio of this setting.
    """
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        # Save the runtimes of all trials, not only the last one.
        pickle.dump(runtimes, f)
def results_for_a_dataset(ds_name):
    """Run every (repeats, ratio) setting on one dataset.

    Parameters
    ----------
    ds_name : str
        Name of the dataset, as accepted by `get_dataset`.
    """
    # 1. Get dataset (loaded once for all the settings).
    dataset = get_dataset(ds_name)
    ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
    repeats_list = [1, 20, 40, 60, 80, 100]
    for repeats in repeats_list:
        print()
        print('Repeats:', repeats)
        for ratio in ratio_list:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, repeats, ratio)

if __name__ == '__main__':
    # Dataset names come from the command line; otherwise use the default list.
    ds_name_list = sys.argv[1:] or ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
    # Module-level output folder, read by the functions above.
    save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/'
    for folder in (save_dir, save_dir + 'groups/'):
        if not os.path.exists(folder):
            os.makedirs(folder)
    for ds_name in ds_name_list:
        print()
        print('Dataset:', ds_name)
        results_for_a_dataset(ds_name)

+ 130
- 0
gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 20 17:48:02 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. For each setting, the minimum solution over the given number of repeats is computed.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import numpy as np
import time
from utils import get_dataset
import sys


def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial):
    """Compute and save the GED matrix of one trial (BIPARTITE heuristic).

    Parameters
    ----------
    dataset
        Dataset already loaded by the caller. It is used as is; it is only
        loaded here (with `get_dataset`) when None is given, so the dataset is
        no longer reloaded for every single trial.
    ds_name : str
        Name of the dataset, used in the output file names.
    repeats : int
        Value passed as `repeats` to `compute_geds`.
    ratio : float
        Factor applied to the first three edit cost constants (the node costs,
        according to the header of this script).
    trial : int
        Index of the trial, used in the output file names.

    Returns
    -------
    tuple
        ``(ged_mat, runtime)``. If the computation raises, the exception is
        logged to ``error.txt`` and ``('error', 0)`` is saved and returned.

    Notes
    -----
    Reads the module-level `save_dir`, which is set in the main section.
    """
    save_file_suffix = ('.' + ds_name + '.repeats_' + str(repeats)
                        + '.ratio_' + "{:.2f}".format(ratio)
                        + '.trial_' + str(trial))

    # 1. Get dataset: reuse the one given by the caller.
    if dataset is None:
        dataset = get_dataset(ds_name)

    # 2. Set parameters for GED computation.
    ged_options = {
        'method': 'BIPARTITE',  # use BIPARTITE heuristic.
        # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
        'lsape_model': 'ECBP',
        # the actual number of computed solutions might be smaller than the
        # specified value
        'max_num_solutions': 1,
        'edit_cost': 'CONSTANT',  # use CONSTANT cost.
        'greedy_method': 'BASIC',
        # the distance between non-symbolic node/edge labels is computed by
        # euclidean distance.
        'attr_distance': 'euclidean',
        'optimal': True,  # if TRUE, the option --greedy-method has no effect
        # parallel threads. Do not work if mpg_options['parallel'] = False.
        'threads': multiprocessing.cpu_count(),
        'centrality_method': 'NONE',
        'centrality_weight': 0.7,
        'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
    }
    edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]

    options = ged_options.copy()
    options['edit_cost_constants'] = edit_cost_constants
    options['node_labels'] = dataset.node_labels
    options['edge_labels'] = dataset.edge_labels
    options['node_attrs'] = dataset.node_attrs
    options['edge_attrs'] = dataset.edge_attrs
    parallel = True

    # 5. Compute GED matrix. The defaults below are what gets saved on failure.
    ged_mat = 'error'
    runtime = 0
    try:
        time0 = time.time()
        _, ged_mat, _ = compute_geds(dataset.graphs, options=options, repeats=repeats, parallel=parallel, verbose=True)
        runtime = time.time() - time0
    except Exception as exp:
        # Best effort: log the failure and go on with the next trial.
        print('An exception occured when running this experiment:')
        LOG_FILENAME = save_dir + 'error.txt'
        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
        logging.exception(save_file_suffix)
        print(repr(exp))

    # 6. Save results.
    with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(ged_mat, f)
    with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
        pickle.dump(runtime, f)

    return ged_mat, runtime

def save_trials_as_group(dataset, ds_name, repeats, ratio):
    """Run the 100 trials of one setting and save their results as a group.

    The GED matrices of all trials are saved together in one ``.npy`` file and
    the runtimes of all trials in one ``.pkl`` file, both in the ``groups/``
    sub-folder of the module-level `save_dir`.

    Parameters
    ----------
    dataset
        Loaded dataset, passed on to `xp_compute_ged_matrix`.
    ds_name : str
        Name of the dataset.
    repeats : int
        Number of repeats of this setting.
    ratio : float
        Ratio of this setting.
    """
    ged_mats = []
    runtimes = []
    for trial in range(1, 101):
        print()
        print('Trial:', trial)
        ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial)
        ged_mats.append(ged_mat)
        runtimes.append(runtime)

    save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio)
    with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f:
        np.save(f, np.array(ged_mats))
    with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f:
        # Save the runtimes of all trials, not only the last one.
        pickle.dump(runtimes, f)
def results_for_a_dataset(ds_name):
    """Run every (repeats, ratio) setting on one dataset.

    Parameters
    ----------
    ds_name : str
        Name of the dataset, as accepted by `get_dataset`.
    """
    # 1. Get dataset (loaded once for all the settings).
    dataset = get_dataset(ds_name)
    ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
    repeats_list = [1, 20, 40, 60, 80, 100]
    for repeats in repeats_list:
        print()
        print('Repeats:', repeats)
        for ratio in ratio_list:
            print()
            print('Ratio:', ratio)
            save_trials_as_group(dataset, ds_name, repeats, ratio)

if __name__ == '__main__':
    # Dataset names come from the command line; otherwise use the default list.
    ds_name_list = sys.argv[1:] or ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
    # Module-level output folder, read by the functions above.
    save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/'
    for folder in (save_dir, save_dir + 'groups/'):
        if not os.path.exists(folder):
            os.makedirs(folder)
    for ds_name in ds_name_list:
        print()
        print('Dataset:', ds_name)
        results_for_a_dataset(ds_name)

+ 108
- 0
gklearn/experiments/ged/stability/group_results.py View File

@@ -0,0 +1,108 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 17:26:43 2020

@author: ljia

This script groups results together into a single file for the sake of faster
searching and loading.
"""
import os
import pickle
import numpy as np
from shutil import copyfile
from tqdm import tqdm
import sys


def group_trials(dir_folder, name_prefix, override, clear, backup):
    """Group the 100 trial files sharing `name_prefix` into a single file.

    Trial files are ``<dir_folder><name_prefix>trial_<k>.pkl`` for k = 1..100.
    The group file is written to ``<dir_folder>groups/``: as ``.npy`` (NumPy
    array) when the prefix starts with 'ged_matrix', as ``.pkl`` (pickled
    list) otherwise. Nothing is written, backed up or removed unless all the
    100 trial files exist.

    Parameters
    ----------
    dir_folder : str
        Folder containing the trial files; concatenated as is, so it must end
        with a path separator.
    name_prefix : str
        File name up to (and excluding) 'trial_'.
    override : bool
        Rewrite the group file even if it already exists.
    clear : bool
        Remove the trial files once they are grouped.
    backup : bool
        Copy the trial files to ``<dir_folder>backups/`` once they are grouped.
    """
    def trial_name(trial):
        # File name (without folder) of one trial.
        return name_prefix + 'trial_' + str(trial) + '.pkl'

    trials = range(1, 101)

    # Get group name.
    label_name = name_prefix.split('.')[0]
    is_ged_matrix = (label_name == 'ged_matrix')
    group_label = {'ged_matrix': 'ged_mats', 'runtime': 'runtimes'}.get(label_name, label_name)
    name_suffix = name_prefix[len(label_name):]
    extension = 'npy' if is_ged_matrix else 'pkl'
    name_group = dir_folder + 'groups/' + group_label + name_suffix + extension

    if not override and os.path.isfile(name_group):
        # The group already exists: only check if all trial files exist.
        trials_complete = all(os.path.isfile(dir_folder + trial_name(t)) for t in trials)
    else:
        # Get data.
        data_group = []
        for t in trials:
            file_name = dir_folder + trial_name(t)
            if not os.path.isfile(file_name):
                # Not all trials are completed.
                return
            with open(file_name, 'rb') as f:
                data_group.append(pickle.load(f))
        # Write groups.
        with open(name_group, 'wb') as f:
            if is_ged_matrix:
                np.save(f, np.array(data_group))
            else:
                pickle.dump(data_group, f)
        trials_complete = True

    if not trials_complete:
        return
    # Backup.
    if backup:
        for t in trials:
            copyfile(dir_folder + trial_name(t), dir_folder + 'backups/' + trial_name(t))
    # Clear.
    if clear:
        for t in trials:
            os.remove(dir_folder + trial_name(t))


def group_all_in_folder(dir_folder, override=False, clear=True, backup=True):
    """Group the trial files of every setting found in a folder.

    The files of the folder are visited in sorted order and `group_trials` is
    called once each time the file name prefix (the part before 'trial_')
    changes.

    Parameters
    ----------
    dir_folder : str
        Folder to process; concatenated as is, so it must end with a path
        separator.
    override, clear, backup : bool
        Passed on to `group_trials`.
    """
    # Create folders.
    groups_dir = dir_folder + 'groups/'
    if not os.path.exists(groups_dir):
        os.makedirs(groups_dir)
    backups_dir = dir_folder + 'backups'
    if backup and not os.path.exists(backups_dir):
        os.makedirs(backups_dir)

    # Iterate all files.
    last_prefix = ''
    file_names = sorted(os.listdir(dir_folder))
    for file in tqdm(file_names, desc='Grouping', file=sys.stdout):
        if not os.path.isfile(os.path.join(dir_folder, file)):
            continue
        name_prefix = file.split('trial_')[0]
        if name_prefix == last_prefix:
            # This setting has already been handled.
            continue
        group_trials(dir_folder, name_prefix, override, clear, backup)
        last_prefix = name_prefix

if __name__ == '__main__':
    # Group the results of both IPFP experiments.
    for dir_folder in ('outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/',
                       'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/'):
        group_all_in_folder(dir_folder)

+ 30
- 0
gklearn/experiments/ged/stability/utils.py View File

@@ -0,0 +1,30 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 19:17:36 2020

@author: ljia
"""
from gklearn.utils import Dataset


def get_dataset(ds_name):
    """Load a predefined dataset and remove the labels unused by the experiments.

    Parameters
    ----------
    ds_name : str
        One of 'MAO', 'Monoterpenoides', 'MUTAG' or 'AIDS_symb'.

    Returns
    -------
    Dataset
        The loaded dataset, without its irrelevant labels.

    Raises
    ------
    ValueError
        If `ds_name` is not one of the supported names (previously this ended
        in an UnboundLocalError after the dataset had been loaded).
    """
    # The node/edge labels that will not be used in the computation.
    irrelevant_labels_of = {
        'MAO': {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']},
        'Monoterpenoides': {'edge_labels': ['valence']},
        'MUTAG': {'edge_labels': ['label_0']},
        'AIDS_symb': {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']},
    }
    if ds_name not in irrelevant_labels_of:
        raise ValueError('Unknown dataset name: ' + repr(ds_name)
                         + '. Supported names: '
                         + ', '.join(irrelevant_labels_of) + '.')
    irrelevant_labels = irrelevant_labels_of[ds_name]
    # 'AIDS_symb' is loaded under the predefined name 'AIDS'.
    if ds_name == 'AIDS_symb':
        ds_name = 'AIDS'

    # Initialize a Dataset.
    dataset = Dataset()
    # Load predefined dataset.
    dataset.load_predefined_dataset(ds_name)
    # Remove irrelevant labels.
    dataset.remove_labels(**irrelevant_labels)
    print('dataset size:', len(dataset.graphs))
    return dataset

Loading…
Cancel
Save