
1. Update the termination criterion of the iteration in class MedianPreimageGenerator.

2. Add the helper function generate_median_preimage_by_class in gklearn.preimage.utils (a usage sketch follows the file list below).
v0.2.x
jajupmochi 5 years ago
commit 8804465536
7 changed files with 1297 additions and 917 deletions:

1. README.md (+1 -1)
2. gklearn/preimage/median_preimage_generator.py (+131 -42)
3. gklearn/preimage/test_median_preimage_generator.py (+4 -1)
4. gklearn/preimage/utils.py (+381 -119)
5. gklearn/utils/__init__.py (+1 -1)
6. gklearn/utils/dataset.py (+44 -16)
7. gklearn/utils/graphfiles.py (+735 -737)
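Item 2 of the commit message adds a one-call driver for per-class preimage generation. A minimal usage sketch, with option values borrowed from the test diff below; the ged_options values are assumptions, not taken from this commit:

from gklearn.preimage.utils import generate_median_preimage_by_class

mpg_options = {'fit_method': 'k-graphs', 'init_ecc': [3, 3, 1, 3, 3],
               'ds_name': 'Letter-high', 'parallel': True,
               'time_limit_in_sec': 0, 'max_itrs': 100,
               'max_itrs_without_update': 3, 'epsilon_ratio': 0.01,
               'verbose': 2}
kernel_options = {'name': 'structuralspkernel'}  # plus any kernel-specific settings.
ged_options = {'edit_cost': 'LETTER', 'method': 'IPFP',  # assumed values.
               'attr_distance': 'euclidean'}
mge_options = {}  # options forwarded to the median graph estimator.

generate_median_preimage_by_class('Letter-high', mpg_options, kernel_options,
                                  ged_options, mge_options, save_results=True,
                                  save_medians=True, plot_medians=True,
                                  dir_save='results/')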

README.md (+1 -1)

@@ -105,7 +105,7 @@ A comparison of performances of graph kernels on benchmark datasets can be found

## Authors

- * [Linlin Jia](https://github.com/jajupmochi), LITIS, INSA Rouen Normandie
+ * [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie
* [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie
* [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie



gklearn/preimage/median_preimage_generator.py (+131 -42)

@@ -17,6 +17,7 @@ from gklearn.ged.util import compute_geds, ged_options_to_string
from gklearn.ged.median import MedianGraphEstimator
from gklearn.ged.median import constant_node_costs, mge_options_to_string
from gklearn.gedlib import librariesImport, gedlibpy
from gklearn.utils import Timer
# from gklearn.utils.dataset import Dataset

class MedianPreimageGenerator(PreimageGenerator):
@@ -29,10 +30,13 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__mge_options = {}
self.__fit_method = 'k-graphs'
self.__init_ecc = None
- self.__max_itrs = 100
self.__parallel = True
self.__n_jobs = multiprocessing.cpu_count()
self.__ds_name = None
self.__time_limit_in_sec = 0
self.__max_itrs = 100
self.__max_itrs_without_update = 3
self.__epsilon_ratio = 0.01
# values to compute.
self.__edit_cost_constants = []
self.__runtime_precompute_gm = None
@@ -41,11 +45,15 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__runtime_total = None
self.__set_median = None
self.__gen_median = None
self.__best_from_dataset = None
self.__sod_set_median = None
self.__sod_gen_median = None
self.__k_dis_set_median = None
self.__k_dis_gen_median = None
self.__k_dis_dataset = None
self.__itrs = 0
self.__converged = False
self.__num_updates_ecc = 0
def set_options(self, **kwargs):
@@ -57,10 +65,13 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__fit_method = kwargs.get('fit_method', 'k-graphs')
self.__init_ecc = kwargs.get('init_ecc', None)
self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
- self.__max_itrs = kwargs.get('max_itrs', 100)
self.__parallel = kwargs.get('parallel', True)
self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
self.__ds_name = kwargs.get('ds_name', None)
self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
self.__max_itrs = kwargs.get('max_itrs', 100)
self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
self.__epsilon_ratio = kwargs.get('epsilon_ratio', 0.01)
def run(self):
@@ -75,7 +86,6 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__runtime_precompute_gm = end_precompute_gm - start
# 2. optimize edit cost constants.
- # self.__optimize_edit_cost_constants(dataset=dataset, Gn=Gn, Kmatrix_median=Kmatrix_median)
self.__optimize_edit_cost_constants()
end_optimize_ec = time.time()
self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm
@@ -108,28 +118,47 @@ class MedianPreimageGenerator(PreimageGenerator):
if self._verbose:
print()
print('================================================================================')
- print('The optimized edit cost constants: ', self.__edit_cost_constants)
- print('SOD of the set median: ', self.__sod_set_median)
- print('SOD of the generalized median: ', self.__sod_gen_median)
+ print('Finished generalization of preimages.')
+ print('--------------------------------------------------------------------------------')
+ print('The optimized edit cost constants:', self.__edit_cost_constants)
+ print('SOD of the set median:', self.__sod_set_median)
+ print('SOD of the generalized median:', self.__sod_gen_median)
+ print('Distance in kernel space for set median:', self.__k_dis_set_median)
+ print('Distance in kernel space for generalized median:', self.__k_dis_gen_median)
+ print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
- print('Time to pre-compute Gram matrix: ', self.__runtime_precompute_gm)
- print('Time to optimize edit costs: ', self.__runtime_optimize_ec)
- print('Time to generate pre-images: ', self.__runtime_generate_preimage)
- print('Total time: ', self.__runtime_total)
+ print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
+ print('Time to optimize edit costs:', self.__runtime_optimize_ec)
+ print('Time to generate pre-images:', self.__runtime_generate_preimage)
+ print('Total time:', self.__runtime_total)
+ print('Total number of iterations for optimizing:', self.__itrs)
+ print('Total number of edit cost updates:', self.__num_updates_ecc)
+ print('Has the optimization of edit costs converged:', self.__converged)
print('================================================================================')

# collect return values.
# return (sod_sm, sod_gm), \
# (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \
# (time_fitting, time_generating)


def get_results(self):
results = {}
results['edit_cost_constants'] = self.__edit_cost_constants
results['runtime_precompute_gm'] = self.__runtime_precompute_gm
results['runtime_optimize_ec'] = self.__runtime_optimize_ec
results['runtime_generate_preimage'] = self.__runtime_generate_preimage
results['runtime_total'] = self.__runtime_total
results['sod_set_median'] = self.__sod_set_median
results['sod_gen_median'] = self.__sod_gen_median
results['k_dis_set_median'] = self.__k_dis_set_median
results['k_dis_gen_median'] = self.__k_dis_gen_median
results['k_dis_dataset'] = self.__k_dis_dataset
results['itrs'] = self.__itrs
results['converged'] = self.__converged
results['num_updates_ecc'] = self.__num_updates_ecc
return results

# def __optimize_edit_cost_constants(self, dataset=None, Gn=None, Kmatrix_median=None):
def __optimize_edit_cost_constants(self):
"""fit edit cost constants.
"""
@@ -177,8 +206,6 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__init_ecc = [3, 3, 1, 3, 3, 1]
# optimize on the k-graph subset.
self.__optimize_ecc_by_kernel_distances()
- # fit_GED_to_kernel_distance(Gn_median,
- # dataset=dataset, Kmatrix=Kmatrix_median)
elif self.__fit_method == 'whole-dataset':
if self.__init_ecc is None:
if self.__ged_options['edit_cost'] == 'LETTER':
@@ -189,15 +216,11 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__init_ecc = [3, 3, 1, 3, 3, 1]
# optimize on the whole set.
self.__optimize_ecc_by_kernel_distances()
- # fit_GED_to_kernel_distance(Gn, dataset=dataset)
elif self.__fit_method == 'precomputed':
pass
- def __optimize_ecc_by_kernel_distances(self):
- # def fit_GED_to_kernel_distance(Gn, Kmatrix=None,
- # parallel=True):
def __optimize_ecc_by_kernel_distances(self):
# compute distances in feature space.
dis_k_mat, _, _, _ = self.__graph_kernel.compute_distance_matrix()
dis_k_vec = []
@@ -222,20 +245,25 @@ class MedianPreimageGenerator(PreimageGenerator):
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list = [nb_cost_mat]
if self._verbose >= 2:
- print('edit_cost_constants:', self.__edit_cost_constants)
- print('residual_list:', residual_list)
- for itr in range(self.__max_itrs):
+ print('Current edit cost constants:', self.__edit_cost_constants)
+ print('Residual list:', residual_list)
+ # run iteration from initial edit costs.
+ self.__converged = False
+ itrs_without_update = 0
+ self.__itrs = 0
+ self.__num_updates_ecc = 0
+ timer = Timer(self.__time_limit_in_sec)
+ while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
if self._verbose >= 2:
- print('\niteration', itr)
+ print('\niteration', self.__itrs)
time0 = time.time()
# "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method.
np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
ged_mat=ged_mat)
self.__edit_cost_constants, residual = self.__update_ecc(nb_cost_mat, dis_k_vec)
# "fit" geds to distances in feature space by tuning edit costs using theLeast Squares Method.
# np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
# nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
# n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
# ged_mat=ged_mat)
self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec)
for i in range(len(self.__edit_cost_constants)):
if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
self.__edit_cost_constants[i] = 0
@@ -254,12 +282,59 @@ class MedianPreimageGenerator(PreimageGenerator):
edit_cost_list.append(self.__edit_cost_constants)
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list.append(nb_cost_mat)
# check convergence.
ec_changed = False
for i, cost in enumerate(self.__edit_cost_constants):
# if cost == 0:
# if edit_cost_list[-2][i] > self.__epsilon_ratio:
# ec_changed = True
# break
# elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ratio:
# ec_changed = True
# break
if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ratio:
ec_changed = True
break
residual_changed = False
if residual_list[-1] == 0:
if residual_list[-2] > self.__epsilon_ratio:
residual_changed = True
elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_ratio:
residual_changed = True
self.__converged = not (ec_changed or residual_changed)
if self.__converged:
itrs_without_update += 1
else:
itrs_without_update = 0
self.__num_updates_ecc += 1
# print current states.
if self._verbose >= 2:
- print('edit_cost_constants:', self.__edit_cost_constants)
- print('residual_list:', residual_list)
- # return residual_list, edit_cost_list, dis_k_mat, ged_mat, \
- # time_list, nb_cost_mat_list
print()
print('-------------------------------------------------------------------------')
print('States of iteration', str(self.__itrs))
print('-------------------------------------------------------------------------')
# print('Time spent:', self.__runtime_optimize_ec)
print('Total number of iterations for optimizing:', self.__itrs)
print('Total number of edit cost updates:', self.__num_updates_ecc)
print('Has the optimization of edit costs converged:', self.__converged)
print('Did the edit costs change:', ec_changed)
print('Did the residual change:', residual_changed)
print('Iterations without update:', itrs_without_update)
print('Current edit cost constants:', self.__edit_cost_constants)
print('Residual list:', residual_list)
print('-------------------------------------------------------------------------')
self.__itrs += 1


def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
# if self.__state == AlgorithmState.TERMINATED:
# self.__state = AlgorithmState.INITIALIZED
return True
return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
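Read on its own, the new stopping rule combines four conditions. A standalone sketch of the same logic (hypothetical names; a negative cap disables the corresponding check, as in the code above):

def termination_criterion_met(expired, itr, converged, itrs_without_update,
                              max_itrs=100, max_itrs_without_update=3):
    # Hard stops: the time limit expired, or the iteration cap was reached.
    if expired or (max_itrs >= 0 and itr >= max_itrs):
        return True
    # Soft stops: the edit costs converged, or too many consecutive
    # iterations passed without an update of the edit costs.
    return converged or (max_itrs_without_update >= 0
                         and itrs_without_update > max_itrs_without_update)

With the defaults from set_options, optimization therefore ends after at most 100 iterations, as soon as the costs stop changing, or once more than 3 consecutive iterations bring no update.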


def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'):
@@ -591,6 +666,7 @@ class MedianPreimageGenerator(PreimageGenerator):
gram_with_gm, withterm3=False))
idx_k_dis_median_set_min = np.argmin(k_dis_median_set)
self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min]
self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy()
if self._verbose >= 2:
print()
@@ -599,8 +675,6 @@ class MedianPreimageGenerator(PreimageGenerator):
print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
print('distance in kernel space for each graph in median set:', k_dis_median_set)
- # return dis_k_sm, dis_k_gm, k_dis_median_set, dis_k_gi_min, idx_dis_k_gi_min

def __set_graph_kernel_by_name(self):
if self.kernel_options['name'] == 'structuralspkernel':
@@ -670,5 +744,20 @@ class MedianPreimageGenerator(PreimageGenerator):
return self.__init_ecc

@init_ecc.setter
- def fit_method(self, value):
- self.__init_ecc = value
+ def init_ecc(self, value):
+ self.__init_ecc = value
@property
def set_median(self):
return self.__set_median


@property
def gen_median(self):
return self.__gen_median
@property
def best_from_dataset(self):
return self.__best_from_dataset

gklearn/preimage/test_median_preimage_generator.py (+4 -1)

@@ -20,9 +20,12 @@ def test_median_preimage_generator():
mpg = MedianPreimageGenerator()
mpg_options = {'fit_method': 'k-graphs',
'init_ecc': [3, 3, 1, 3, 3],
- 'max_itrs': 6,
'ds_name': 'Letter-high',
'parallel': True,
'time_limit_in_sec': 0,
'max_itrs': 100,
'max_itrs_without_update': 3,
'epsilon_ratio': 0.01,
'verbose': 2}
mpg.set_options(**mpg_options)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)


gklearn/preimage/utils.py (+381 -119)

@@ -19,146 +19,408 @@ from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, po
from gklearn.kernels.structuralspKernel import structuralspkernel
from gklearn.kernels.treeletKernel import treeletkernel
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
from gklearn.utils import Dataset
import csv
import matplotlib.pyplot as plt
import networkx as nx


def generate_median_preimage_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, dir_save='', ):
from gklearn.preimage import MedianPreimageGenerator
from gklearn.utils import split_dataset_by_target
from gklearn.utils.graphfiles import saveGXL
# 1. get dataset.
print('getting dataset...')
dataset_all = Dataset()
dataset_all.load_predefined_dataset(ds_name)
datasets = split_dataset_by_target(dataset_all)
# dataset.cut_graphs(range(0, 10))

if save_results:
# create result files.
print('creating output files...')
fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
time_precompute_gm_list = []
time_optimize_ec_list = []
time_generate_list = []
time_total_list = []
itrs_list = []
converged_list = []
num_updates_ecc_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
nb_dis_k_gi2gm = [0, 0, 0]
# repeats_better_sod_sm2gm = []
# repeats_better_dis_k_sm2gm = []
# repeats_better_dis_k_gi2sm = []
# repeats_better_dis_k_gi2gm = []
print('start generating preimage for each class of target...')
for dataset in datasets:
print('\ntarget =', dataset.targets[0], '\n')
num_graphs = len(dataset.graphs)
if num_graphs < 2:
print('\nnumber of graphs = ', num_graphs, ', skip.\n')
continue
# 2. set parameters.
print('1. initializing mpg and setting parameters...')
mpg = MedianPreimageGenerator()
mpg.dataset = dataset
mpg.set_options(**mpg_options.copy())
mpg.kernel_options = kernel_options.copy()
mpg.ged_options = ged_options.copy()
mpg.mge_options = mge_options.copy()

# 3. compute median preimage.
print('2. computing median preimage...')
mpg.run()
results = mpg.get_results()
# write result detail.
if save_results:
print('writing results to files...')
sod_sm2gm = get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median']))
dis_k_sm2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_set_median']))
dis_k_gi2sm = get_relations(np.sign(results['k_dis_set_median'] - results['k_dis_dataset']))
dis_k_gi2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_dataset']))

f_detail = open(dir_save + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, dataset.targets[0], 1,
results['sod_set_median'], results['sod_gen_median'],
results['k_dis_set_median'], results['k_dis_gen_median'],
results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'],
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc']])
f_detail.close()
# compute result summary.
sod_sm_list.append(results['sod_set_median'])
sod_gm_list.append(results['sod_gen_median'])
dis_k_sm_list.append(results['k_dis_set_median'])
dis_k_gm_list.append(results['k_dis_gen_median'])
dis_k_gi_min_list.append(results['k_dis_dataset'])
time_precompute_gm_list.append(results['runtime_precompute_gm'])
time_optimize_ec_list.append(results['runtime_optimize_ec'])
time_generate_list.append(results['runtime_generate_preimage'])
time_total_list.append(results['runtime_total'])
itrs_list.append(results['itrs'])
converged_list.append(results['converged'])
num_updates_ecc_list.append(results['num_updates_ecc'])
# # SOD SM -> GM
if results['sod_set_median'] > results['sod_gen_median']:
nb_sod_sm2gm[0] += 1
# repeats_better_sod_sm2gm.append(1)
elif results['sod_set_median'] == results['sod_gen_median']:
nb_sod_sm2gm[1] += 1
elif results['sod_set_median'] < results['sod_gen_median']:
nb_sod_sm2gm[2] += 1
# # dis_k SM -> GM
if results['k_dis_set_median'] > results['k_dis_gen_median']:
nb_dis_k_sm2gm[0] += 1
# repeats_better_dis_k_sm2gm.append(1)
elif results['k_dis_set_median'] == results['k_dis_gen_median']:
nb_dis_k_sm2gm[1] += 1
elif results['k_dis_set_median'] < results['k_dis_gen_median']:
nb_dis_k_sm2gm[2] += 1
# # dis_k gi -> SM
if results['k_dis_dataset'] > results['k_dis_set_median']:
nb_dis_k_gi2sm[0] += 1
# repeats_better_dis_k_gi2sm.append(1)
elif results['k_dis_dataset'] == results['k_dis_set_median']:
nb_dis_k_gi2sm[1] += 1
elif results['k_dis_dataset'] < results['k_dis_set_median']:
nb_dis_k_gi2sm[2] += 1
# # dis_k gi -> GM
if results['k_dis_dataset'] > results['k_dis_gen_median']:
nb_dis_k_gi2gm[0] += 1
# repeats_better_dis_k_gi2gm.append(1)
elif results['k_dis_dataset'] == results['k_dis_gen_median']:
nb_dis_k_gi2gm[1] += 1
elif results['k_dis_dataset'] < results['k_dis_gen_median']:
nb_dis_k_gi2gm[2] += 1

# write result summary for each letter.
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, dataset.targets[0],
results['sod_set_median'], results['sod_gen_median'],
results['k_dis_set_median'], results['k_dis_gen_median'],
results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
dis_k_gi2sm, dis_k_gi2gm,
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc'], nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
f_summary.close()
# save median graphs.
if save_medians:
fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
# plot median graphs.
if plot_medians and save_medians:
if ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low':
draw_Letter_graph(mpg.set_median, fn_pre_sm)
draw_Letter_graph(mpg.gen_median, fn_pre_gm)
draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)

# write result summary for each letter.
if save_results:
sod_sm_mean = np.mean(sod_sm_list)
sod_gm_mean = np.mean(sod_gm_list)
dis_k_sm_mean = np.mean(dis_k_sm_list)
dis_k_gm_mean = np.mean(dis_k_gm_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
time_precompute_gm_mean = np.mean(time_precompute_gm_list)
time_optimize_ec_mean = np.mean(time_optimize_ec_list)
time_generate_mean = np.mean(time_generate_list)
time_total_mean = np.mean(time_total_list)
itrs_mean = np.mean(itrs_list)
num_converged = np.sum(converged_list)
num_updates_ecc_mean = np.mean(num_updates_ecc_list)
sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
dis_k_gi2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, 'all',
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean,
time_precompute_gm_mean, time_optimize_ec_mean,
time_generate_mean, time_total_mean, itrs_mean,
num_converged, num_updates_ecc_mean])
f_summary.close()
print('\ncomplete.')

def __init_output_file(ds_name, gkernel, fit_method, dir_output):
# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'k',
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'edit cost constants', 'time precompute gm',
'time optimize ec', 'time generate preimage', 'time total',
'itrs', 'converged', 'num updates ecc'])
f_detail.close()
# fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'k',
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
'time generate preimage', 'time total', 'itrs', 'num converged',
'num updates ecc', '# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM'])
# 'repeats better SOD SM -> GM',
# 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
# 'repeats better dis_k gi -> GM'])
f_summary.close()
return fn_output_detail, fn_output_summary


def get_relations(sign):
if sign == -1:
return 'better'
elif sign == 0:
return 'same'
elif sign == 1:
return 'worse'
# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
nx.draw_networkx(graph, pos)
plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
# plt.show()
plt.clf()
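For what it's worth, graph.node[n] above uses the old NetworkX attribute API (deprecated in NetworkX 2.0 and removed in 2.4); on current NetworkX releases the same access is written graph.nodes[n].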


def remove_edges(Gn):
for G in Gn:
for _, _, attrs in G.edges(data=True):
attrs.clear()
def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g]
term2 = 0
for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2
if withterm3 == False:
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3)
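For reference, dis_gstar computes the kernel-space distance between graph g and the weighted combination of the graphs indexed by idx_gi:

    d(g, \bar{g}) = \sqrt{ k(g, g) - 2 \sum_i \alpha_i k(g, g_i) + \sum_{i,j} \alpha_i \alpha_j k(g_i, g_j) }

When withterm3 is False the double sum is evaluated inline; otherwise the caller supplies it precomputed via term3.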


def compute_k_dis(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g]
term2 = 0
for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2
if withterm3 == False:
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3)


def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
p_quit=0.03, n_iteration=10, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
depth=7, k_func='MinMax', compute_method='trie',
parallel=parallel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix = np.empty((len(Gn), len(Gn)))
# Kmatrix[:] = np.nan
Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# for i, row in enumerate(idx):
# for j, col in enumerate(idx):
# Kmatrix[row, col] = Kmatrix_tmp[i, j]
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
Kmatrix, _ = structuralspkernel(Gn, node_label=node_label,
edge_label=edge_label, node_kernels=sub_kernels,
edge_kernels=sub_kernels,
parallel=parallel, n_jobs=multiprocessing.cpu_count(),
verbose=verbose)
elif graph_kernel == 'treeletkernel':
pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
# pkernel = functools.partial(gaussiankernel, gamma=1e-6)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
sub_kernel=pkernel, parallel=parallel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'weisfeilerlehmankernel':
Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
height=4, base_kernel='subtree', parallel=None,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix
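The final loop applies cosine normalization to the Gram matrix, giving every graph unit self-similarity:

    \hat{K}_{ij} = K_{ij} / \sqrt{ K_{ii} K_{jj} }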

def gram2distances(Kmatrix):
dmatrix = np.zeros((len(Kmatrix), len(Kmatrix)))
for i1 in range(len(Kmatrix)):
for i2 in range(len(Kmatrix)):
dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
dmatrix = np.sqrt(dmatrix)
return dmatrix
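Each entry of the returned matrix is the usual kernel-induced distance (with the square root taken element-wise at the end):

    d_{ij} = \sqrt{ K_{ii} + K_{jj} - 2 K_{ij} }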


def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None,
gkernel=None, verbose=True):
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
if dis < 0:
if dis > -1e-10:
dis = 0
else:
raise ValueError('The distance is negative.')
dis_mat[i, j] = np.sqrt(dis)
dis_mat[j, i] = dis_mat[i, j]
dis_max = np.max(np.max(dis_mat))
dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
dis_mean = np.mean(np.mean(dis_mat))
return dis_mat, dis_max, dis_min, dis_mean


def get_same_item_indices(ls):
"""Get the indices of the same items in a list. Return a dict keyed by items.
"""
idx_dict = {}
for idx, item in enumerate(ls):
if item in idx_dict:
idx_dict[item].append(idx)
else:
idx_dict[item] = [idx]
return idx_dict
"""Get the indices of the same items in a list. Return a dict keyed by items.
"""
idx_dict = {}
for idx, item in enumerate(ls):
if item in idx_dict:
idx_dict[item].append(idx)
else:
idx_dict[item] = [idx]
return idx_dict


def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
node_label=None, edge_label=None):
dis_k_all = [] # distance between g_star and each graph.
alpha = [1 / len(Gn)] * len(Gn)
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
idx_gi = list(range(len(Gn))) # indices of the graphs weighted by alpha.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_k_all.append(dtemp)


def normalize_distance_matrix(D):
max_value = np.amax(D)
min_value = np.amin(D)
return (D - min_value) / (max_value - min_value)

gklearn/utils/__init__.py (+1 -1)

@@ -15,5 +15,5 @@ __date__ = "November 2017"

# from utils import graphfiles
# from utils import utils
- from gklearn.utils.dataset import Dataset
+ from gklearn.utils.dataset import Dataset, split_dataset_by_target
from gklearn.utils.timer import Timer

gklearn/utils/dataset.py (+44 -16)

@@ -8,6 +8,7 @@ Created on Thu Mar 26 18:48:27 2020
import numpy as np
import networkx as nx
from gklearn.utils.graphfiles import loadDataset
import os


class Dataset(object):
@@ -15,7 +16,7 @@ class Dataset(object):
def __init__(self, filename=None, filename_y=None, extra_params=None):
if filename is None:
self.__graphs = None
- self.__target = None
+ self.__targets = None
self.__node_labels = None
self.__edge_labels = None
self.__node_attrs = None
@@ -50,33 +51,40 @@ class Dataset(object):
def load_dataset(self, filename, filename_y=None, extra_params=None):
- self.__graphs, self.__target = loadDataset(filename, filename_y=filename_y, extra_params=extra_params)
+ self.__graphs, self.__targets = loadDataset(filename, filename_y=filename_y, extra_params=extra_params)
self.set_labels_attrs()
def load_graphs(self, graphs, targets=None):
self.__graphs = graphs
self.__targets = targets
self.set_labels_attrs()
def load_predefined_dataset(self, ds_name):
current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
if ds_name == 'Letter-high': # node non-symb
- ds_file = '../../datasets/Letter-high/Letter-high_A.txt'
- self.__graphs, self.__target = loadDataset(ds_file)
+ ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
+ self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Letter-med': # node non-symb
- ds_file = '../../datasets/Letter-high/Letter-med_A.txt'
- self.__graphs, self.__target = loadDataset(ds_file)
+ ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
+ self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Letter-low': # node non-symb
- ds_file = '../../datasets/Letter-high/Letter-low_A.txt'
- self.__graphs, self.__target = loadDataset(ds_file)
+ ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt'
+ self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Fingerprint':
- ds_file = '../../datasets/Fingerprint/Fingerprint_A.txt'
- self.__graphs, self.__target = loadDataset(ds_file)
+ ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
+ self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'SYNTHETIC':
pass
elif ds_name == 'SYNTHETICnew':
- ds_file = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
- self.__graphs, self.__target = loadDataset(ds_file)
+ ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
+ self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Synthie':
pass
elif ds_name == 'COIL-DEL':
- ds_file = '../../datasets/COIL-DEL/COIL-DEL_A.txt'
- self.__graphs, self.__target = loadDataset(ds_file)
+ ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
+ self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'COIL-RAG':
pass
elif ds_name == 'COLORS-3':
@@ -514,7 +522,7 @@ class Dataset(object):
def __get_class_num(self):
- return len(set(self.__target))
+ return len(set(self.__targets))
def __get_node_attr_dim(self):
@@ -529,6 +537,11 @@ class Dataset(object):
def graphs(self):
return self.__graphs


@property
def targets(self):
return self.__targets
@property
def node_labels(self):
@@ -547,4 +560,19 @@ class Dataset(object):
@property
def edge_attrs(self):
return self.__edge_attrs
def split_dataset_by_target(dataset):
from gklearn.preimage.utils import get_same_item_indices
graphs = dataset.graphs
targets = dataset.targets
datasets = []
idx_targets = get_same_item_indices(targets)
for key, val in idx_targets.items():
sub_graphs = [graphs[i] for i in val]
sub_dataset = Dataset()
sub_dataset.load_graphs(sub_graphs, [key] * len(val))
datasets.append(sub_dataset)
return datasets
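A minimal sketch of the new helper in use, assuming the predefined dataset files are in place:

from gklearn.utils import Dataset, split_dataset_by_target

dataset = Dataset()
dataset.load_predefined_dataset('Letter-high')
for sub_dataset in split_dataset_by_target(dataset):
    # one sub-dataset per distinct target value.
    print(sub_dataset.targets[0], len(sub_dataset.graphs))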

gklearn/utils/graphfiles.py (+735 -737): file diff suppressed because it is too large.

