
Merge pull request #5 from jajupmochi/v0.2

V0.2
Tag: v0.2.0
linlin committed 5 years ago (parent commit 1a652e5bd6)
14 changed files with 1565 additions and 957 deletions
  1. README.md (+2 -2)
  2. gklearn/ged/median/median_graph_estimator.py (+4 -4)
  3. gklearn/ged/util/util.py (+1 -1)
  4. gklearn/kernels/graph_kernel.py (+15 -3)
  5. gklearn/preimage/median_preimage_generator.py (+178 -64)
  6. gklearn/preimage/preimage_generator.py (+7 -1)
  7. gklearn/preimage/test_median_preimage_generator.py (+4 -1)
  8. gklearn/preimage/utils.py (+438 -119)
  9. gklearn/utils/__init__.py (+3 -1)
  10. gklearn/utils/dataset.py (+79 -17)
  11. gklearn/utils/graphfiles.py (+735 -737)
  12. gklearn/utils/utils.py (+56 -0)
  13. notebooks/tests/test_tqdm.py (+33 -0)
  14. requirements.txt (+10 -7)

README.md (+2 -2)

@@ -4,7 +4,7 @@
[![Documentation Status](https://readthedocs.org/projects/graphkit-learn/badge/?version=master)](https://graphkit-learn.readthedocs.io/en/master/?badge=master)
[![PyPI version](https://badge.fury.io/py/graphkit-learn.svg)](https://badge.fury.io/py/graphkit-learn)

A python package for graph kernels.
A python package for graph kernels, graph edit distances and graph pre-image problem.

## Requirements

@@ -105,7 +105,7 @@ A comparison of performances of graph kernels on benchmark datasets can be found

## Authors

* [Linlin Jia](https://github.com/jajupmochi), LITIS, INSA Rouen Normandie
* [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie
* [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie
* [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie



gklearn/ged/median/median_graph_estimator.py (+4 -4)

@@ -348,7 +348,7 @@ class MedianGraphEstimator(object):
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout)
progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout)
# Compute node maps and sum of distances for initial median.
self.__sum_of_distances = 0
@@ -457,7 +457,7 @@ class MedianGraphEstimator(object):
self.__itrs[median_pos] += 1
# Update the best median.
if self.__sum_of_distances < self.__best_init_sum_of_distances:
if self.__sum_of_distances < best_sum_of_distances:
best_sum_of_distances = self.__sum_of_distances
node_maps_from_best_median = self.__node_maps_from_median
best_median = median
@@ -588,7 +588,7 @@ class MedianGraphEstimator(object):
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout)
progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout)
# Compute the medoid.
medoid_id = graph_ids[0]
@@ -718,7 +718,7 @@ class MedianGraphEstimator(object):
def __update_node_maps(self):
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
# Update the node maps.
node_maps_were_modified = False


gklearn/ged/util/util.py (+1 -1)

@@ -307,7 +307,7 @@ def ged_options_to_string(options):
opt_str = ' '
for key, val in options.items():
if key == 'initialization_method':
opt_str += '--initial_solutions ' + str(val) + ' '
opt_str += '--initialization-method ' + str(val) + ' '
elif key == 'initialization_options':
opt_str += '--initialization-options ' + str(val) + ' '
elif key == 'lower_bound_method':


gklearn/kernels/graph_kernel.py (+15 -3)

@@ -76,11 +76,11 @@ class GraphKernel(object):
def compute_distance_matrix(self):
dis_mat = np.empty((len(self._graphs), len(self._graphs)))
if self._gram_matrix is None:
raise Exception('Please compute the Gram matrix before computing distance matrix.')
for i in range(len(self._graphs)):
for j in range(i, len(self._graphs)):
dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix)))
for i in range(len(self._gram_matrix)):
for j in range(i, len(self._gram_matrix)):
dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j]
if dis < 0:
if dis > -1e-10:
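
The distance matrix is now derived from the Gram matrix alone via the kernel-induced metric d(i, j) = sqrt(K[i, i] + K[j, j] - 2 * K[i, j]), with tiny negative values (floating-point noise) clamped to zero. A minimal standalone sketch of that computation (illustrative, not the gklearn API):

import numpy as np

def distances_from_gram(gram):
    # Kernel-induced pairwise distances; mirrors the clamping logic above.
    n = len(gram)
    dis_mat = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            dis = gram[i, i] + gram[j, j] - 2 * gram[i, j]
            if dis < 0:
                if dis > -1e-10:  # numerical noise: clamp to zero
                    dis = 0
                else:
                    raise ValueError('The distance is negative.')
            dis_mat[i, j] = dis_mat[j, i] = np.sqrt(dis)
    return dis_mat
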
@@ -184,18 +184,22 @@ class GraphKernel(object):
def parallel(self):
return self._parallel
@property
def n_jobs(self):
return self._n_jobs


@property
def verbose(self):
return self._verbose
@property
def normalize(self):
return self._normalize
@property
def run_time(self):
return self._run_time
@@ -205,7 +209,15 @@ class GraphKernel(object):
def gram_matrix(self):
return self._gram_matrix
@gram_matrix.setter
def gram_matrix(self, value):
self._gram_matrix = value
@property
def gram_matrix_unnorm(self):
return self._gram_matrix_unnorm

@gram_matrix_unnorm.setter
def gram_matrix_unnorm(self, value):
self._gram_matrix_unnorm = value

gklearn/preimage/median_preimage_generator.py (+178 -64)

@@ -17,6 +17,7 @@ from gklearn.ged.util import compute_geds, ged_options_to_string
from gklearn.ged.median import MedianGraphEstimator
from gklearn.ged.median import constant_node_costs,mge_options_to_string
from gklearn.gedlib import librariesImport, gedlibpy
from gklearn.utils import Timer
# from gklearn.utils.dataset import Dataset

class MedianPreimageGenerator(PreimageGenerator):
@@ -29,24 +30,34 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__mge_options = {}
self.__fit_method = 'k-graphs'
self.__init_ecc = None
self.__max_itrs = 100
self.__parallel = True
self.__n_jobs = multiprocessing.cpu_count()
self.__ds_name = None
self.__time_limit_in_sec = 0
self.__max_itrs = 100
self.__max_itrs_without_update = 3
self.__epsilon_residual = 0.01
self.__epsilon_ec = 0.1
# values to compute.
self.__edit_cost_constants = []
self.__runtime_precompute_gm = None
self.__runtime_optimize_ec = None
self.__runtime_generate_preimage = None
self.__runtime_total = None
self.__set_median = None
self.__gen_median = None
self.__best_from_dataset = None
self.__sod_set_median = None
self.__sod_gen_median = None
self.__k_dis_set_median = None
self.__k_dis_gen_median = None
self.__k_dis_dataset = None
self.__itrs = 0
self.__converged = False
self.__num_updates_ecc = 0
# values that can either be set or computed.
self.__edit_cost_constants = []
self.__gram_matrix_unnorm = None
self.__runtime_precompute_gm = None

def set_options(self, **kwargs):
self._kernel_options = kwargs.get('kernel_options', {})
@@ -57,10 +68,16 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__fit_method = kwargs.get('fit_method', 'k-graphs')
self.__init_ecc = kwargs.get('init_ecc', None)
self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
self.__max_itrs = kwargs.get('max_itrs', 100)
self.__parallel = kwargs.get('parallel', True)
self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
self.__ds_name = kwargs.get('ds_name', None)
self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
self.__max_itrs = kwargs.get('max_itrs', 100)
self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
self.__epsilon_residual = kwargs.get('epsilon_residual', 0.01)
self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1)
self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
def run(self):
@@ -70,12 +87,20 @@ class MedianPreimageGenerator(PreimageGenerator):
start = time.time()
# 1. precompute gram matrix.
gram_matrix, run_time = self.__graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
end_precompute_gm = time.time()
self.__runtime_precompute_gm = end_precompute_gm - start
if self.__gram_matrix_unnorm is None:
gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm
end_precompute_gm = time.time()
self.__runtime_precompute_gm = end_precompute_gm - start
else:
if self.__runtime_precompute_gm is None:
raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.')
self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm
self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
end_precompute_gm = time.time()
start -= self.__runtime_precompute_gm
# 2. optimize edit cost constants.
# self.__optimize_edit_cost_constants(dataset=dataset, Gn=Gn, Kmatrix_median=Kmatrix_median)
self.__optimize_edit_cost_constants()
end_optimize_ec = time.time()
self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm
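
With this change, run() skips the kernel computation entirely when an unnormalized Gram matrix is supplied, and rebuilds the normalized matrix from it. normalize_gm is presumably the standard cosine normalization; a sketch under that assumption (not the confirmed gklearn implementation):

import numpy as np

def normalize_gm(gram):
    # Assumed behavior: K'[i, j] = K[i, j] / sqrt(K[i, i] * K[j, j]).
    diag = np.sqrt(np.diag(gram))
    return gram / np.outer(diag, diag)
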
@@ -108,28 +133,48 @@ class MedianPreimageGenerator(PreimageGenerator):
if self._verbose:
print()
print('================================================================================')
print('The optimized edit cost constants: ', self.__edit_cost_constants)
print('SOD of the set median: ', self.__sod_set_median)
print('SOD of the generalized median: ', self.__sod_gen_median)
print('Finished generalization of preimages.')
print('--------------------------------------------------------------------------------')
print('The optimized edit cost constants:', self.__edit_cost_constants)
print('SOD of the set median:', self.__sod_set_median)
print('SOD of the generalized median:', self.__sod_gen_median)
print('Distance in kernel space for set median:', self.__k_dis_set_median)
print('Distance in kernel space for generalized median:', self.__k_dis_gen_median)
print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
print('Time to pre-compute Gram matrix: ', self.__runtime_precompute_gm)
print('Time to optimize edit costs: ', self.__runtime_optimize_ec)
print('Time to generate pre-images: ', self.__runtime_generate_preimage)
print('Total time: ', self.__runtime_total)
print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
print('Time to optimize edit costs:', self.__runtime_optimize_ec)
print('Time to generate pre-images:', self.__runtime_generate_preimage)
print('Total time:', self.__runtime_total)
print('Total number of iterations for optimizing:', self.__itrs)
print('Total number of updating edit costs:', self.__num_updates_ecc)
print('Has the optimization of edit costs converged:', self.__converged)
print('================================================================================')
print()

# collect return values.
# return (sod_sm, sod_gm), \
# (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \
# (time_fitting, time_generating)


def get_results(self):
results = {}
results['edit_cost_constants'] = self.__edit_cost_constants
results['runtime_precompute_gm'] = self.__runtime_precompute_gm
results['runtime_optimize_ec'] = self.__runtime_optimize_ec
results['runtime_generate_preimage'] = self.__runtime_generate_preimage
results['runtime_total'] = self.__runtime_total
results['sod_set_median'] = self.__sod_set_median
results['sod_gen_median'] = self.__sod_gen_median
results['k_dis_set_median'] = self.__k_dis_set_median
results['k_dis_gen_median'] = self.__k_dis_gen_median
results['k_dis_dataset'] = self.__k_dis_dataset
results['itrs'] = self.__itrs
results['converged'] = self.__converged
results['num_updates_ecc'] = self.__num_updates_ecc
return results
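
get_results() packs the run statistics into a plain dict, so callers can log or serialize them directly. A hypothetical usage (options abbreviated, dataset assumed prepared):

mpg = MedianPreimageGenerator()
mpg.dataset = dataset  # a gklearn Dataset, prepared beforehand
mpg.set_options(fit_method='k-graphs', max_itrs=100)
mpg.run()
results = mpg.get_results()
print('converged:', results['converged'],
      '| iterations:', results['itrs'],
      '| cost updates:', results['num_updates_ecc'])
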

# def __optimize_edit_cost_constants(self, dataset=None, Gn=None, Kmatrix_median=None):
def __optimize_edit_cost_constants(self):
"""fit edit cost constants.
"""
@@ -177,8 +222,6 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__init_ecc = [3, 3, 1, 3, 3, 1]
# optimize on the k-graph subset.
self.__optimize_ecc_by_kernel_distances()
# fit_GED_to_kernel_distance(Gn_median,
# dataset=dataset, Kmatrix=Kmatrix_median)
elif self.__fit_method == 'whole-dataset':
if self.__init_ecc is None:
if self.__ged_options['edit_cost'] == 'LETTER':
@@ -189,17 +232,13 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__init_ecc = [3, 3, 1, 3, 3, 1]
# optimize on the whole set.
self.__optimize_ecc_by_kernel_distances()
# fit_GED_to_kernel_distance(Gn, dataset=dataset)
elif self.__fit_method == 'precomputed':
pass
def __optimize_ecc_by_kernel_distances(self):
# def fit_GED_to_kernel_distance(Gn, Kmatrix=None,
# parallel=True):
def __optimize_ecc_by_kernel_distances(self):
# compute distances in feature space.
dis_k_mat, _, _, _ = self.__graph_kernel.compute_distance_matrix()
dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix()
dis_k_vec = []
for i in range(len(dis_k_mat)):
# for j in range(i, len(dis_k_mat)):
@@ -222,20 +261,25 @@ class MedianPreimageGenerator(PreimageGenerator):
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list = [nb_cost_mat]
if self._verbose >= 2:
print('edit_cost_constants:', self.__edit_cost_constants)
print('residual_list:', residual_list)
for itr in range(self.__max_itrs):
print('Current edit cost constants:', self.__edit_cost_constants)
print('Residual list:', residual_list)
# run iteration from initial edit costs.
self.__converged = False
itrs_without_update = 0
self.__itrs = 0
self.__num_updates_ecc = 0
timer = Timer(self.__time_limit_in_sec)
while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
if self._verbose >= 2:
print('\niteration', itr)
print('\niteration', self.__itrs + 1)
time0 = time.time()
# "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method.
np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
ged_mat=ged_mat)
self.__edit_cost_constants, residual = self.__update_ecc(nb_cost_mat, dis_k_vec)
# "fit" geds to distances in feature space by tuning edit costs using theLeast Squares Method.
# np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
# nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
# n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
# ged_mat=ged_mat)
self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec)
for i in range(len(self.__edit_cost_constants)):
if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
self.__edit_cost_constants[i] = 0
@@ -254,12 +298,59 @@ class MedianPreimageGenerator(PreimageGenerator):
edit_cost_list.append(self.__edit_cost_constants)
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list.append(nb_cost_mat)
# check convergence.
ec_changed = False
for i, cost in enumerate(self.__edit_cost_constants):
if cost == 0:
if edit_cost_list[-2][i] > self.__epsilon_ec:
ec_changed = True
break
elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ec:
ec_changed = True
break
# if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec:
# ec_changed = True
# break
residual_changed = False
if residual_list[-1] == 0:
if residual_list[-2] > self.__epsilon_residual:
residual_changed = True
elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_residual:
residual_changed = True
self.__converged = not (ec_changed or residual_changed)
if self.__converged:
itrs_without_update += 1
else:
itrs_without_update = 0
self.__num_updates_ecc += 1
# print current states.
if self._verbose >= 2:
print('edit_cost_constants:', self.__edit_cost_constants)
print('residual_list:', residual_list)
# return residual_list, edit_cost_list, dis_k_mat, ged_mat, \
# time_list, nb_cost_mat_list
print()
print('-------------------------------------------------------------------------')
print('States of iteration', self.__itrs + 1)
print('-------------------------------------------------------------------------')
# print('Time spend:', self.__runtime_optimize_ec)
print('Total number of iterations for optimizing:', self.__itrs + 1)
print('Total number of updating edit costs:', self.__num_updates_ecc)
print('Has optimization of edit costs converged:', self.__converged)
print('Did edit costs change:', ec_changed)
print('Did residual change:', residual_changed)
print('Iterations without update:', itrs_without_update)
print('Current edit cost constants:', self.__edit_cost_constants)
print('Residual list:', residual_list)
print('-------------------------------------------------------------------------')
self.__itrs += 1


def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
# if self.__state == AlgorithmState.TERMINATED:
# self.__state = AlgorithmState.INITIALIZED
return True
return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
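
The optimization loop now stops on any of four conditions: the wall-clock limit expires, the iteration cap is hit, the costs converge, or too many iterations pass without a cost update (a negative cap disables the corresponding check). The same predicate as a standalone sketch, assuming a Timer exposing expired() as used above:

def termination_criterion_met(converged, timer, itrs, itrs_without_update,
                              max_itrs=100, max_itrs_without_update=3):
    # Hard stops: time limit or iteration cap (negative cap disables it).
    if timer.expired() or (max_itrs >= 0 and itrs >= max_itrs):
        return True
    # Soft stops: convergence, or stagnation of the edit cost updates.
    return converged or (max_itrs_without_update >= 0
                         and itrs_without_update > max_itrs_without_update)
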


def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'):
@@ -559,11 +650,11 @@ class MedianPreimageGenerator(PreimageGenerator):
def __compute_distances_to_true_median(self):
# compute distance in kernel space for set median.
kernels_to_sm, _ = self.__graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
kernel_sm, _ = self.__graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
# @todo: not correct kernel value
gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0)
gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
gram_with_sm = np.concatenate((np.array([[1] + kernels_to_sm]).T, gram_with_sm), axis=1)
self.__k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
[1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
@@ -574,10 +665,10 @@ class MedianPreimageGenerator(PreimageGenerator):
# print(set_median.edges(data=True))
# compute distance in kernel space for generalized median.
kernels_to_gm, _ = self.__graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
kernel_gm, _ = self.__graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0)
kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
gram_with_gm = np.concatenate((np.array([[1] + kernels_to_gm]).T, gram_with_gm), axis=1)
self.__k_dis_gen_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
[1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
@@ -591,6 +682,7 @@ class MedianPreimageGenerator(PreimageGenerator):
gram_with_gm, withterm3=False))
idx_k_dis_median_set_min = np.argmin(k_dis_median_set)
self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min]
self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy()
if self._verbose >= 2:
print()
@@ -599,18 +691,16 @@ class MedianPreimageGenerator(PreimageGenerator):
print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
print('distance in kernel space for each graph in median set:', k_dis_median_set)
# return dis_k_sm, dis_k_gm, k_dis_median_set, dis_k_gi_min, idx_dis_k_gi_min

def __set_graph_kernel_by_name(self):
if self.kernel_options['name'] == 'structuralspkernel':
from gklearn.kernels import StructuralSP
self.__graph_kernel = StructuralSP(node_labels=self.dataset.node_labels,
edge_labels=self.dataset.edge_labels,
node_attrs=self.dataset.node_attrs,
edge_attrs=self.dataset.edge_attrs,
ds_infos=self.dataset.get_dataset_infos(keys=['directed']),
**self.kernel_options)
self._graph_kernel = StructuralSP(node_labels=self._dataset.node_labels,
edge_labels=self._dataset.edge_labels,
node_attrs=self._dataset.node_attrs,
edge_attrs=self._dataset.edge_attrs,
ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
**self._kernel_options)
# def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
@@ -618,7 +708,7 @@ class MedianPreimageGenerator(PreimageGenerator):
"""
Cleans node and edge labels and attributes of the given graph.
"""
G_new = nx.Graph()
G_new = nx.Graph(**G.graph)
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd)) # @todo: should we keep this as str()?
for l_name in self._dataset.node_labels:
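
Passing **G.graph to the constructor makes the cleaned copy keep the graph-level attributes of the original graph, which the bare nx.Graph() call silently dropped. A small illustration:

import networkx as nx

G = nx.Graph(name='sample')   # 'name' is stored in G.graph
G.add_node(0, label='C')

G_new = nx.Graph(**G.graph)   # graph-level attributes carried over
print(G_new.graph)            # {'name': 'sample'}
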
@@ -670,5 +760,29 @@ class MedianPreimageGenerator(PreimageGenerator):
return self.__init_ecc

@init_ecc.setter
def fit_method(self, value):
self.__init_ecc = value
def init_ecc(self, value):
self.__init_ecc = value
@property
def set_median(self):
return self.__set_median


@property
def gen_median(self):
return self.__gen_median
@property
def best_from_dataset(self):
return self.__best_from_dataset
@property
def gram_matrix_unnorm(self):
return self.__gram_matrix_unnorm
@gram_matrix_unnorm.setter
def gram_matrix_unnorm(self, value):
self.__gram_matrix_unnorm = value

gklearn/preimage/preimage_generator.py (+7 -1)

@@ -5,7 +5,7 @@ Created on Thu Mar 26 18:26:36 2020

@author: ljia
"""
from gklearn.utils import Dataset
# from gklearn.utils import Dataset

class PreimageGenerator(object):
@@ -32,6 +32,11 @@ class PreimageGenerator(object):
@kernel_options.setter
def kernel_options(self, value):
self._kernel_options = value


@property
def graph_kernel(self):
return self._graph_kernel
@property
@@ -41,3 +46,4 @@ class PreimageGenerator(object):
@verbose.setter
def verbose(self, value):
self._verbose = value


gklearn/preimage/test_median_preimage_generator.py (+4 -1)

@@ -20,9 +20,12 @@ def test_median_preimage_generator():
mpg = MedianPreimageGenerator()
mpg_options = {'fit_method': 'k-graphs',
'init_ecc': [3, 3, 1, 3, 3],
'max_itrs': 6,
'ds_name': 'Letter-high',
'parallel': True,
'time_limit_in_sec': 0,
'max_itrs': 100,
'max_itrs_without_update': 3,
'epsilon_ratio': 0.01,
'verbose': 2}
mpg.set_options(**mpg_options)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)


gklearn/preimage/utils.py (+438 -119)

@@ -19,146 +19,465 @@ from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, po
from gklearn.kernels.structuralspKernel import structuralspkernel
from gklearn.kernels.treeletKernel import treeletkernel
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
from gklearn.utils import Dataset
import csv
import networkx as nx


def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, load_gm='auto', dir_save='', irrelevant_labels=None):
import os.path
from gklearn.preimage import MedianPreimageGenerator
from gklearn.utils import split_dataset_by_target
from gklearn.utils.graphfiles import saveGXL
# 1. get dataset.
print('1. getting dataset...')
dataset_all = Dataset()
dataset_all.load_predefined_dataset(ds_name)
if not irrelevant_labels is None:
dataset_all.remove_labels(**irrelevant_labels)
# dataset_all.cut_graphs(range(0, 100))
datasets = split_dataset_by_target(dataset_all)

if save_results:
# create result files.
print('creating output files...')
fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
time_optimize_ec_list = []
time_generate_list = []
time_total_list = []
itrs_list = []
converged_list = []
num_updates_ecc_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
nb_dis_k_gi2gm = [0, 0, 0]
dis_k_max_list = []
dis_k_min_list = []
dis_k_mean_list = []
if load_gm == 'auto':
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
if gmfile_exist:
gmfile = np.load(gm_fname)
gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
time_precompute_gm_list = gmfile['run_time_list'].tolist()
else:
gram_matrix_unnorm_list = []
time_precompute_gm_list = []
elif not load_gm:
gram_matrix_unnorm_list = []
time_precompute_gm_list = []
else:
gmfile = np.load(load_gm)
gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
time_precompute_gm_list = gmfile['run_time_list']
# repeats_better_sod_sm2gm = []
# repeats_better_dis_k_sm2gm = []
# repeats_better_dis_k_gi2sm = []
# repeats_better_dis_k_gi2gm = []
print('start generating preimage for each class of target...')
for idx, dataset in enumerate(datasets):
target = dataset.targets[0]
print('\ntarget =', target, '\n')
# if target != 1:
# continue
num_graphs = len(dataset.graphs)
if num_graphs < 2:
print('\nnumber of graphs = ', num_graphs, ', skip.\n')
continue
# 2. set parameters.
print('2. initializing mpg and setting parameters...')
if load_gm:
if gmfile_exist:
mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx]
mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx]
mpg = MedianPreimageGenerator()
mpg.dataset = dataset
mpg.set_options(**mpg_options.copy())
mpg.kernel_options = kernel_options.copy()
mpg.ged_options = ged_options.copy()
mpg.mge_options = mge_options.copy()

# 3. compute median preimage.
print('3. computing median preimage...')
mpg.run()
results = mpg.get_results()
# 4. compute pairwise kernel distances.
print('4. computing pairwise kernel distances...')
_, dis_k_max, dis_k_min, dis_k_mean = mpg.graph_kernel.compute_distance_matrix()
dis_k_max_list.append(dis_k_max)
dis_k_min_list.append(dis_k_min)
dis_k_mean_list.append(dis_k_mean)
# 5. save results (and median graphs).
print('5. saving results (and median graphs)...')
# write result detail.
if save_results:
print('writing results to files...')
sod_sm2gm = get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median']))
dis_k_sm2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_set_median']))
dis_k_gi2sm = get_relations(np.sign(results['k_dis_set_median'] - results['k_dis_dataset']))
dis_k_gi2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_dataset']))

f_detail = open(dir_save + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, target, 1,
results['sod_set_median'], results['sod_gen_median'],
results['k_dis_set_median'], results['k_dis_gen_median'],
results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'],
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc']])
f_detail.close()
# compute result summary.
sod_sm_list.append(results['sod_set_median'])
sod_gm_list.append(results['sod_gen_median'])
dis_k_sm_list.append(results['k_dis_set_median'])
dis_k_gm_list.append(results['k_dis_gen_median'])
dis_k_gi_min_list.append(results['k_dis_dataset'])
time_precompute_gm_list.append(results['runtime_precompute_gm'])
time_optimize_ec_list.append(results['runtime_optimize_ec'])
time_generate_list.append(results['runtime_generate_preimage'])
time_total_list.append(results['runtime_total'])
itrs_list.append(results['itrs'])
converged_list.append(results['converged'])
num_updates_ecc_list.append(results['num_updates_ecc'])
# # SOD SM -> GM
if results['sod_set_median'] > results['sod_gen_median']:
nb_sod_sm2gm[0] += 1
# repeats_better_sod_sm2gm.append(1)
elif results['sod_set_median'] == results['sod_gen_median']:
nb_sod_sm2gm[1] += 1
elif results['sod_set_median'] < results['sod_gen_median']:
nb_sod_sm2gm[2] += 1
# # dis_k SM -> GM
if results['k_dis_set_median'] > results['k_dis_gen_median']:
nb_dis_k_sm2gm[0] += 1
# repeats_better_dis_k_sm2gm.append(1)
elif results['k_dis_set_median'] == results['k_dis_gen_median']:
nb_dis_k_sm2gm[1] += 1
elif results['k_dis_set_median'] < results['k_dis_gen_median']:
nb_dis_k_sm2gm[2] += 1
# # dis_k gi -> SM
if results['k_dis_dataset'] > results['k_dis_set_median']:
nb_dis_k_gi2sm[0] += 1
# repeats_better_dis_k_gi2sm.append(1)
elif results['k_dis_dataset'] == results['k_dis_set_median']:
nb_dis_k_gi2sm[1] += 1
elif results['k_dis_dataset'] < results['k_dis_set_median']:
nb_dis_k_gi2sm[2] += 1
# # dis_k gi -> GM
if results['k_dis_dataset'] > results['k_dis_gen_median']:
nb_dis_k_gi2gm[0] += 1
# repeats_better_dis_k_gi2gm.append(1)
elif results['k_dis_dataset'] == results['k_dis_gen_median']:
nb_dis_k_gi2gm[1] += 1
elif results['k_dis_dataset'] < results['k_dis_gen_median']:
nb_dis_k_gi2gm[2] += 1

# write result summary for each letter.
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, target,
results['sod_set_median'], results['sod_gen_median'],
results['k_dis_set_median'], results['k_dis_gen_median'],
results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
dis_k_gi2sm, dis_k_gi2gm,
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc'], nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
f_summary.close()
# save median graphs.
if save_medians:
print('Saving median graphs to files...')
fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
# plot median graphs.
if plot_medians and save_medians:
if ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low':
draw_Letter_graph(mpg.set_median, fn_pre_sm)
draw_Letter_graph(mpg.gen_median, fn_pre_gm)
draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)
if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
gram_matrix_unnorm_list.append(mpg.gram_matrix_unnorm)

# write result summary for each letter.
if save_results:
sod_sm_mean = np.mean(sod_sm_list)
sod_gm_mean = np.mean(sod_gm_list)
dis_k_sm_mean = np.mean(dis_k_sm_list)
dis_k_gm_mean = np.mean(dis_k_gm_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
time_precompute_gm_mean = np.mean(time_precompute_gm_list)
time_optimize_ec_mean = np.mean(time_optimize_ec_list)
time_generate_mean = np.mean(time_generate_list)
time_total_mean = np.mean(time_total_list)
itrs_mean = np.mean(itrs_list)
num_converged = np.sum(converged_list)
num_updates_ecc_mean = np.mean(num_updates_ecc_list)
sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
dis_k_gi2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, 'all',
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean,
time_precompute_gm_mean, time_optimize_ec_mean,
time_generate_mean, time_total_mean, itrs_mean,
num_converged, num_updates_ecc_mean])
f_summary.close()
# save total pairwise kernel distances.
dis_k_max = np.max(dis_k_max_list)
dis_k_min = np.min(dis_k_min_list)
dis_k_mean = np.mean(dis_k_mean_list)
print('The maximum pairwise distance in kernel space:', dis_k_max)
print('The minimum pairwise distance in kernel space:', dis_k_min)
print('The average pairwise distance in kernel space:', dis_k_mean)
# write Gram matrices to file.
if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=time_precompute_gm_list)

print('\ncomplete.')

def __init_output_file(ds_name, gkernel, fit_method, dir_output):
# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'num graphs',
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'edit cost constants', 'time precompute gm',
'time optimize ec', 'time generate preimage', 'time total',
'itrs', 'converged', 'num updates ecc'])
f_detail.close()
# fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'num graphs',
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
'time generate preimage', 'time total', 'itrs', 'num converged',
'num updates ecc', '# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM'])
# 'repeats better SOD SM -> GM',
# 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
# 'repeats better dis_k gi -> GM'])
f_summary.close()
return fn_output_detail, fn_output_summary


def get_relations(sign):
if sign == -1:
return 'better'
elif sign == 0:
return 'same'
elif sign == 1:
return 'worse'
# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.nodes[n]['x']),float(graph.nodes[n]['y'])])
nx.draw_networkx(graph, pos)
plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
# plt.show()
plt.clf()
plt.close()


def remove_edges(Gn):
for G in Gn:
for _, _, attrs in G.edges(data=True):
attrs.clear()
def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g]
term2 = 0
for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2
if withterm3 == False:
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3)


def compute_k_dis(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g]
term2 = 0
for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2
if withterm3 == False:
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3)
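
Both helpers (the new compute_k_dis is a renamed duplicate of dis_gstar) compute the kernel-space distance between graph idx_g and the weighted mean of the graphs indexed by idx_gi: sqrt(k(g, g) - 2 * sum_i alpha_i * k(g, g_i) + sum_{i,j} alpha_i * alpha_j * k(g_i, g_j)). The third term can be precomputed and passed in when the same mean is compared against many graphs. A vectorized equivalent, as a sketch (the withterm3 flag is simplified to a term3=None default):

import numpy as np

def compute_k_dis_vec(idx_g, idx_gi, alpha, Kmatrix, term3=None):
    # Distance in kernel space to the alpha-weighted mean of graphs idx_gi.
    a = np.asarray(alpha)
    gi = np.asarray(idx_gi)
    if term3 is None:
        term3 = a @ Kmatrix[np.ix_(gi, gi)] @ a
    term2 = 2 * a @ Kmatrix[idx_g, gi]
    return np.sqrt(Kmatrix[idx_g, idx_g] - term2 + term3)
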


def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
p_quit=0.03, n_iteration=10, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
depth=7, k_func='MinMax', compute_method='trie',
parallel=parallel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix = np.empty((len(Gn), len(Gn)))
# Kmatrix[:] = np.nan
Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# for i, row in enumerate(idx):
# for j, col in enumerate(idx):
# Kmatrix[row, col] = Kmatrix_tmp[i, j]
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
Kmatrix, _ = structuralspkernel(Gn, node_label=node_label,
edge_label=edge_label, node_kernels=sub_kernels,
edge_kernels=sub_kernels,
parallel=parallel, n_jobs=multiprocessing.cpu_count(),
verbose=verbose)
elif graph_kernel == 'treeletkernel':
pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
# pkernel = functools.partial(gaussiankernel, gamma=1e-6)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
sub_kernel=pkernel, parallel=parallel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'weisfeilerlehmankernel':
Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
height=4, base_kernel='subtree', parallel=None,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix
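
compute_kernel dispatches on the kernel name and then cosine-normalizes the resulting Gram matrix in place (K[i][j] /= sqrt(K[i][i] * K[j][j])). A hypothetical call for the structural shortest-path kernel (path and labels are illustrative):

from gklearn.utils.graphfiles import loadDataset

Gn, y = loadDataset('../../datasets/Letter-high/Letter-high_A.txt')
Kmatrix = compute_kernel(Gn, 'structuralspkernel',
                         node_label=None, edge_label=None, verbose=True)
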

def gram2distances(Kmatrix):
dmatrix = np.zeros((len(Kmatrix), len(Kmatrix)))
for i1 in range(len(Kmatrix)):
for i2 in range(len(Kmatrix)):
dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
dmatrix = np.sqrt(dmatrix)
return dmatrix


def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None,
gkernel=None, verbose=True):
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
if dis < 0:
if dis > -1e-10:
dis = 0
else:
raise ValueError('The distance is negative.')
dis_mat[i, j] = np.sqrt(dis)
dis_mat[j, i] = dis_mat[i, j]
dis_max = np.max(np.max(dis_mat))
dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
dis_mean = np.mean(np.mean(dis_mat))
return dis_mat, dis_max, dis_min, dis_mean


def get_same_item_indices(ls):
"""Get the indices of the same items in a list. Return a dict keyed by items.
"""
idx_dict = {}
for idx, item in enumerate(ls):
if item in idx_dict:
idx_dict[item].append(idx)
else:
idx_dict[item] = [idx]
return idx_dict
"""Get the indices of the same items in a list. Return a dict keyed by items.
"""
idx_dict = {}
for idx, item in enumerate(ls):
if item in idx_dict:
idx_dict[item].append(idx)
else:
idx_dict[item] = [idx]
return idx_dict
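
This is the grouping primitive behind split_dataset_by_target (added to dataset.py below). For example:

>>> get_same_item_indices(['A', 'B', 'A', 'C', 'B'])
{'A': [0, 2], 'B': [1, 4], 'C': [3]}
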


def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
node_label=None, edge_label=None):
dis_k_all = [] # distance between g_star and each graph.
alpha = [1 / len(Gn)] * len(Gn)
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[i1, i2]
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, range(len(Gn)), alpha, Kmatrix, term3=term3)
dis_k_all.append(dtemp)


def normalize_distance_matrix(D):
max_value = np.amax(D)
min_value = np.amin(D)
return (D - min_value) / (max_value - min_value)

gklearn/utils/__init__.py (+3 -1)

@@ -15,5 +15,7 @@ __date__ = "November 2017"

# from utils import graphfiles
# from utils import utils
from gklearn.utils.dataset import Dataset
from gklearn.utils.dataset import Dataset, split_dataset_by_target
from gklearn.utils.timer import Timer
from gklearn.utils.utils import get_graph_kernel_by_name
from gklearn.utils.utils import compute_gram_matrices_by_class

gklearn/utils/dataset.py (+79 -17)

@@ -8,6 +8,7 @@ Created on Thu Mar 26 18:48:27 2020
import numpy as np
import networkx as nx
from gklearn.utils.graphfiles import loadDataset
import os


class Dataset(object):
@@ -15,7 +16,7 @@ class Dataset(object):
def __init__(self, filename=None, filename_y=None, extra_params=None):
if filename is None:
self.__graphs = None
self.__target = None
self.__targets = None
self.__node_labels = None
self.__edge_labels = None
self.__node_attrs = None
@@ -50,33 +51,41 @@ class Dataset(object):
def load_dataset(self, filename, filename_y=None, extra_params=None):
self.__graphs, self.__target = loadDataset(filename, filename_y=filename_y, extra_params=extra_params)
self.__graphs, self.__targets = loadDataset(filename, filename_y=filename_y, extra_params=extra_params)
self.set_labels_attrs()
def load_graphs(self, graphs, targets=None):
# this has to be followed by set_labels().
self.__graphs = graphs
self.__targets = targets
# self.set_labels_attrs()
def load_predefined_dataset(self, ds_name):
current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
if ds_name == 'Letter-high': # node non-symb
ds_file = '../../datasets/Letter-high/Letter-high_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Letter-med': # node non-symb
ds_file = '../../datasets/Letter-high/Letter-med_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Letter-low': # node non-symb
ds_file = '../../datasets/Letter-high/Letter-low_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Fingerprint':
ds_file = '../../datasets/Fingerprint/Fingerprint_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'SYNTHETIC':
pass
elif ds_name == 'SYNTHETICnew':
ds_file = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Synthie':
pass
elif ds_name == 'COIL-DEL':
ds_file = '../../datasets/COIL-DEL/COIL-DEL_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'COIL-RAG':
pass
elif ds_name == 'COLORS-3':
@@ -86,6 +95,13 @@ class Dataset(object):
self.set_labels_attrs()

def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
self.__node_labels = node_labels
self.__node_attrs = node_attrs
self.__edge_labels = edge_labels
self.__edge_attrs = edge_attrs

def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
# @todo: remove labels which have only one possible values.
@@ -363,9 +379,34 @@ class Dataset(object):
print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))
def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
for g in self.__graphs:
for nd in g.nodes():
for nl in node_labels:
del g.nodes[nd][nl]
for na in node_attrs:
del g.nodes[nd][na]
for ed in g.edges():
for el in edge_labels:
del g.edges[ed][el]
for ea in edge_attrs:
del g.edges[ed][ea]
if len(node_labels) > 0:
self.__node_labels = [nl for nl in self.__node_labels if nl not in node_labels]
if len(edge_labels) > 0:
self.__edge_labels = [el for el in self.__edge_labels if el not in edge_labels]
if len(node_attrs) > 0:
self.__node_attrs = [na for na in self.__node_attrs if na not in node_attrs]
if len(edge_attrs) > 0:
self.__edge_attrs = [ea for ea in self.__edge_attrs if ea not in edge_attrs]
def cut_graphs(self, range_):
self.__graphs = [self.__graphs[i] for i in range_]
self.set_labels_attrs()
if self.__targets is not None:
self.__targets = [self.__targets[i] for i in range_]
# @todo
# self.set_labels_attrs()
def __get_dataset_size(self):
@@ -514,7 +555,7 @@ class Dataset(object):
def __get_class_num(self):
return len(set(self.__target))
return len(set(self.__targets))
def __get_node_attr_dim(self):
@@ -529,6 +570,11 @@ class Dataset(object):
def graphs(self):
return self.__graphs


@property
def targets(self):
return self.__targets
@property
def node_labels(self):
@@ -547,4 +593,20 @@ class Dataset(object):
@property
def edge_attrs(self):
return self.__edge_attrs
return self.__edge_attrs
def split_dataset_by_target(dataset):
from gklearn.preimage.utils import get_same_item_indices
graphs = dataset.graphs
targets = dataset.targets
datasets = []
idx_targets = get_same_item_indices(targets)
for key, val in idx_targets.items():
sub_graphs = [graphs[i] for i in val]
sub_dataset = Dataset()
sub_dataset.load_graphs(sub_graphs, [key] * len(val))
sub_dataset.set_labels(node_labels=dataset.node_labels, node_attrs=dataset.node_attrs, edge_labels=dataset.edge_labels, edge_attrs=dataset.edge_attrs)
datasets.append(sub_dataset)
return datasets
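
A hypothetical usage of the new helper, in the shape generate_median_preimages_by_class and compute_gram_matrices_by_class consume it:

from gklearn.utils import Dataset, split_dataset_by_target

dataset_all = Dataset()
dataset_all.load_predefined_dataset('Letter-high')
for dataset in split_dataset_by_target(dataset_all):
    print('target:', dataset.targets[0], '| graphs:', len(dataset.graphs))
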

gklearn/utils/graphfiles.py (+735 -737): file diff suppressed because it is too large


gklearn/utils/utils.py (+56 -0)

@@ -296,3 +296,59 @@ def get_edge_labels(Gn, edge_label):
for G in Gn:
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return el


def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}):
if name == 'structuralspkernel':
from gklearn.kernels import StructuralSP
graph_kernel = StructuralSP(node_labels=node_labels, edge_labels=edge_labels,
node_attrs=node_attrs, edge_attrs=edge_attrs,
ds_infos=ds_infos, **kernel_options)
return graph_kernel


def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save='', irrelevant_labels=None):
from gklearn.utils import Dataset, split_dataset_by_target
# 1. get dataset.
print('1. getting dataset...')
dataset_all = Dataset()
dataset_all.load_predefined_dataset(ds_name)
if not irrelevant_labels is None:
dataset_all.remove_labels(**irrelevant_labels)
# dataset_all.cut_graphs(range(0, 10))
datasets = split_dataset_by_target(dataset_all)
gram_matrix_unnorm_list = []
run_time_list = []
print('start computing gram matrices for each class of target...')
for idx, dataset in enumerate(datasets):
target = dataset.targets[0]
print('\ntarget =', target, '\n')
# 2. initialize graph kernel.
print('2. initializing graph kernel and setting parameters...')
graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
node_labels=dataset.node_labels,
edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs,
edge_attrs=dataset.edge_attrs,
ds_infos=dataset.get_dataset_infos(keys=['directed']),
kernel_options=kernel_options)

# 3. compute gram matrix.
print('3. computing gram matrix...')
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm
gram_matrix_unnorm_list.append(gram_matrix_unnorm)
run_time_list.append(run_time)
# 4. save results.
print()
print('4. saving results...')
if save_results:
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list)

print('\ncomplete.')
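
A hypothetical call to the new helper; the kernel option keys follow the StructuralSP usage elsewhere in this PR but are assumptions, not a confirmed API:

import functools
from gklearn.utils import compute_gram_matrices_by_class
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct

mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
kernel_options = {'name': 'structuralspkernel',
                  'node_kernels': sub_kernels,   # option keys are assumed
                  'edge_kernels': sub_kernels,
                  'verbose': 2}
compute_gram_matrices_by_class('Letter-high', kernel_options,
                               save_results=True, dir_save='outputs/')
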

notebooks/tests/test_tqdm.py (+33 -0)

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 3 10:38:59 2020

@author: ljia
"""

from tqdm import tqdm
import sys

print('start')

for i in tqdm(range(10000000), file=sys.stdout):
x = i
# print(x)
# =============================================================================
# summary
# terminal, IPython 7.0.1 (Spyder 4): Works.
# write to file: does not work. The progress bar splits as it advances.
# Jupyter:
# =============================================================================

# for i in tqdm(range(10000000)):
# x = i
# print(x)
# =============================================================================
# summary
# terminal, IPython 7.0.1 (Spyder 4): does not work. When combined with other
# prints, the progress bar splits.
# write to file: does not work. Cannot write progress bar to file.
# Jupyter:
# =============================================================================

requirements.txt (+10 -7)

@@ -1,7 +1,10 @@
numpy==1.15.2
scipy==1.1.0
matplotlib==3.0.0
networkx==2.2
scikit-learn==0.20.0
tabulate==0.8.2
tqdm==4.26.0
numpy>=1.15.2
scipy>=1.1.0
matplotlib>=3.0.0
networkx>=2.2
scikit-learn>=0.20.0
tabulate>=0.8.2
tqdm>=4.26.0
# cvxpy # for preimage.
# cvxopt # for preimage.
# mosek # for preimage.
