@@ -4,7 +4,7 @@
[](https://graphkit-learn.readthedocs.io/en/master/?badge=master)
[](https://badge.fury.io/py/graphkit-learn)
A python package for graph kernels.
A Python package for graph kernels, graph edit distances and the graph pre-image problem.
## Requirements
@@ -105,7 +105,7 @@ A comparison of performances of graph kernels on benchmark datasets can be found
## Authors
* [Linlin Jia](https://github.com/jajupmochi), LITIS, INSA Rouen Normandie
* [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie
* [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie
* [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie
@@ -348,7 +348,7 @@ class MedianGraphEstimator(object):
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout)
progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout)
# Compute node maps and sum of distances for initial median.
self.__sum_of_distances = 0
@@ -457,7 +457,7 @@ class MedianGraphEstimator(object):
self.__itrs[median_pos] += 1
# Update the best median.
if self.__sum_of_distances < self.__best_init_sum_of_distances:
if self.__sum_of_distances < best_sum_of_distances:
best_sum_of_distances = self.__sum_of_distances
node_maps_from_best_median = self.__node_maps_from_median
best_median = median
@@ -588,7 +588,7 @@ class MedianGraphEstimator(object):
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout)
progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout)
# Compute the medoid.
medoid_id = graph_ids[0]
@@ -718,7 +718,7 @@ class MedianGraphEstimator(object):
def __update_node_maps(self):
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
# Update the node maps.
node_maps_were_modified = False
@@ -307,7 +307,7 @@ def ged_options_to_string(options):
opt_str = ' '
for key, val in options.items():
if key == 'initialization_method':
opt_str += '--initial_solutions ' + str(val) + ' '
opt_str += '--initialization-method ' + str(val) + ' '
elif key == 'initialization_options':
opt_str += '--initialization-options ' + str(val) + ' '
elif key == 'lower_bound_method':
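# A minimal usage sketch (the option values are hypothetical; only the two
# mappings shown above are assumed):
#
#   options = {'initialization_method': 'RANDOM', 'initialization_options': 'MAX-ITRS=10'}
#   ged_options_to_string(options)
#   # -> ' --initialization-method RANDOM --initialization-options MAX-ITRS=10 '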
@@ -76,11 +76,11 @@ class GraphKernel(object):
def compute_distance_matrix(self):
dis_mat = np.empty((len(self._graphs), len(self._graphs)))
if self._gram_matrix is None:
raise Exception('Please compute the Gram matrix before computing the distance matrix.')
for i in range(len(self._graphs)):
for j in range(i, len(self._graphs)):
dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix)))
for i in range(len(self._gram_matrix)):
for j in range(i, len(self._gram_matrix)):
dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j]
if dis < 0:
if dis > -1e-10:
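# The conversion above uses the kernel-induced metric
# d(g_i, g_j) = sqrt(k(g_i, g_i) + k(g_j, g_j) - 2 * k(g_i, g_j)),
# rounding tiny negative squared distances (floating-point error) to zero.
# A vectorized sketch of the same computation, for illustration only:
#
#   import numpy as np
#   def gram_to_distance_matrix(K):
#       diag = np.diag(K)
#       d2 = diag[None, :] + diag[:, None] - 2 * K  # squared distances
#       return np.sqrt(np.clip(d2, 0, None))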
@@ -184,18 +184,22 @@ class GraphKernel(object):
def parallel(self):
return self._parallel
@property
def n_jobs(self):
return self._n_jobs
@property
def verbose(self):
return self._verbose
@property
def normalize(self):
return self._normalize
@property
def run_time(self):
return self._run_time
@@ -205,7 +209,15 @@ class GraphKernel(object):
def gram_matrix(self):
return self._gram_matrix
@gram_matrix.setter
def gram_matrix(self, value):
self._gram_matrix = value
@property
def gram_matrix_unnorm(self):
return self._gram_matrix_unnorm
@gram_matrix_unnorm.setter
def gram_matrix_unnorm(self, value):
self._gram_matrix_unnorm = value
@@ -17,6 +17,7 @@ from gklearn.ged.util import compute_geds, ged_options_to_string
from gklearn.ged.median import MedianGraphEstimator
from gklearn.ged.median import constant_node_costs, mge_options_to_string
from gklearn.gedlib import librariesImport, gedlibpy
from gklearn.utils import Timer
# from gklearn.utils.dataset import Dataset
class MedianPreimageGenerator(PreimageGenerator):
@@ -29,24 +30,34 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__mge_options = {}
self.__fit_method = 'k-graphs'
self.__init_ecc = None
self.__max_itrs = 100
self.__parallel = True
self.__n_jobs = multiprocessing.cpu_count()
self.__ds_name = None
self.__time_limit_in_sec = 0
self.__max_itrs = 100
self.__max_itrs_without_update = 3
self.__epsilon_residual = 0.01
self.__epsilon_ec = 0.1
# values to compute.
self.__edit_cost_constants = []
self.__runtime_precompute_gm = None
self.__runtime_optimize_ec = None
self.__runtime_generate_preimage = None
self.__runtime_total = None
self.__set_median = None
self.__gen_median = None
self.__best_from_dataset = None
self.__sod_set_median = None
self.__sod_gen_median = None
self.__k_dis_set_median = None
self.__k_dis_gen_median = None
self.__k_dis_dataset = None
self.__itrs = 0
self.__converged = False
self.__num_updates_ecc = 0
# values that can be set or computed.
self.__edit_cost_constants = []
self.__gram_matrix_unnorm = None
self.__runtime_precompute_gm = None
def set_options(self, **kwargs):
self._kernel_options = kwargs.get('kernel_options', {})
@@ -57,10 +68,16 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__fit_method = kwargs.get('fit_method', 'k-graphs')
self.__init_ecc = kwargs.get('init_ecc', None)
self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
self.__max_itrs = kwargs.get('max_itrs', 100)
self.__parallel = kwargs.get('parallel', True)
self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
self.__ds_name = kwargs.get('ds_name', None)
self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
self.__max_itrs = kwargs.get('max_itrs', 100)
self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
self.__epsilon_residual = kwargs.get('epsilon_residual', 0.01)
self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1)
self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
def run(self):
@@ -70,12 +87,20 @@ class MedianPreimageGenerator(PreimageGenerator):
start = time.time()
# 1. precompute gram matrix.
gram_matrix, run_time = self.__graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
end_precompute_gm = time.time()
self.__runtime_precompute_gm = end_precompute_gm - start
if self.__gram_matrix_unnorm is None:
gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm
end_precompute_gm = time.time()
self.__runtime_precompute_gm = end_precompute_gm - start
else:
if self.__runtime_precompute_gm is None:
raise Exception('Parameter "runtime_precompute_gm" must be given when using a pre-computed Gram matrix.')
self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm
self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
end_precompute_gm = time.time()
start -= self.__runtime_precompute_gm
# 2. optimize edit cost constants.
# self.__optimize_edit_cost_constants(dataset=dataset, Gn=Gn, Kmatrix_median=Kmatrix_median)
self.__optimize_edit_cost_constants()
end_optimize_ec = time.time()
self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm
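# A sketch of reusing a cached Gram matrix with the branch above (the file
# name follows the pattern used elsewhere in this PR; the variable names are
# hypothetical):
#
#   import numpy as np
#   gmfile = np.load('gram_matrix_unnorm.Letter-high.structuralspkernel.gm.npz')
#   mpg.set_options(gram_matrix_unnorm=gmfile['gram_matrix_unnorm_list'][0],
#                   runtime_precompute_gm=gmfile['run_time_list'][0],
#                   **other_mpg_options)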
@@ -108,28 +133,48 @@ class MedianPreimageGenerator(PreimageGenerator):
if self._verbose:
print()
print('================================================================================')
print('The optimized edit cost constants: ', self.__edit_cost_constants)
print('SOD of the set median: ', self.__sod_set_median)
print('SOD of the generalized median: ', self.__sod_gen_median)
print('Finished generalization of preimages.')
print('--------------------------------------------------------------------------------')
print('The optimized edit cost constants:', self.__edit_cost_constants)
print('SOD of the set median:', self.__sod_set_median)
print('SOD of the generalized median:', self.__sod_gen_median)
print('Distance in kernel space for set median:', self.__k_dis_set_median)
print('Distance in kernel space for generalized median:', self.__k_dis_gen_median)
print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
print('Time to pre-compute Gram matrix: ', self.__runtime_precompute_gm)
print('Time to optimize edit costs: ', self.__runtime_optimize_ec)
print('Time to generate pre-images: ', self.__runtime_generate_preimage)
print('Total time: ', self.__runtime_total)
print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
print('Time to optimize edit costs:', self.__runtime_optimize_ec)
print('Time to generate pre-images:', self.__runtime_generate_preimage)
print('Total time:', self.__runtime_total)
print('Total number of iterations for optimizing:', self.__itrs)
print('Total number of edit cost updates:', self.__num_updates_ecc)
print('Did the optimization of edit costs converge:', self.__converged)
print('================================================================================')
print()
# collect return values.
# return (sod_sm, sod_gm), \
# (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \
# (time_fitting, time_generating)
def get_results(self):
results = {}
results['edit_cost_constants'] = self.__edit_cost_constants
results['runtime_precompute_gm'] = self.__runtime_precompute_gm
results['runtime_optimize_ec'] = self.__runtime_optimize_ec
results['runtime_generate_preimage'] = self.__runtime_generate_preimage
results['runtime_total'] = self.__runtime_total
results['sod_set_median'] = self.__sod_set_median
results['sod_gen_median'] = self.__sod_gen_median
results['k_dis_set_median'] = self.__k_dis_set_median
results['k_dis_gen_median'] = self.__k_dis_gen_median
results['k_dis_dataset'] = self.__k_dis_dataset
results['itrs'] = self.__itrs
results['converged'] = self.__converged
results['num_updates_ecc'] = self.__num_updates_ecc
return results
# def __optimize_edit_cost_constants(self, dataset=None, Gn=None, Kmatrix_median=None):
def __optimize_edit_cost_constants(self):
"""Fit edit cost constants.
"""
@@ -177,8 +222,6 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__init_ecc = [3, 3, 1, 3, 3, 1]
# optimize on the k-graph subset.
self.__optimize_ecc_by_kernel_distances()
# fit_GED_to_kernel_distance(Gn_median,
# dataset=dataset, Kmatrix=Kmatrix_median)
elif self.__fit_method == 'whole-dataset':
if self.__init_ecc is None:
if self.__ged_options['edit_cost'] == 'LETTER':
@@ -189,17 +232,13 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__init_ecc = [3, 3, 1, 3, 3, 1]
# optimize on the whole set.
self.__optimize_ecc_by_kernel_distances()
# fit_GED_to_kernel_distance(Gn, dataset=dataset)
elif self.__fit_method == 'precomputed':
pass
def __optimize_ecc_by_kernel_distances(self):
# def fit_GED_to_kernel_distance(Gn, Kmatrix=None,
# parallel=True):
def __optimize_ecc_by_kernel_distances(self):
# compute distances in feature space.
dis_k_mat, _, _, _ = self.__graph_kernel.compute_distance_matrix()
dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix()
dis_k_vec = []
for i in range(len(dis_k_mat)):
# for j in range(i, len(dis_k_mat)):
@@ -222,20 +261,25 @@ class MedianPreimageGenerator(PreimageGenerator):
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list = [nb_cost_mat]
if self._verbose >= 2:
print('edit_cost_constants:', self.__edit_cost_constants)
print('residual_list:', residual_list)
for itr in range(self.__max_itrs):
print('Current edit cost constants:', self.__edit_cost_constants)
print('Residual list:', residual_list)
# run iteration from initial edit costs.
self.__converged = False
itrs_without_update = 0
self.__itrs = 0
self.__num_updates_ecc = 0
timer = Timer(self.__time_limit_in_sec)
while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
if self._verbose >= 2:
print('\niteration', itr)
print('\niteration', self.__itrs + 1)
time0 = time.time()
# "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method.
np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
ged_mat=ged_mat)
self.__edit_cost_constants, residual = self.__update_ecc(nb_cost_mat, dis_k_vec)
# "fit" geds to distances in feature space by tuning edit costs using the Least Squares Method.
# np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
# nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
# n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
# ged_mat=ged_mat)
self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec)
for i in range(len(self.__edit_cost_constants)):
if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
self.__edit_cost_constants[i] = 0
@@ -254,12 +298,59 @@ class MedianPreimageGenerator(PreimageGenerator):
edit_cost_list.append(self.__edit_cost_constants)
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list.append(nb_cost_mat)
# check convergence.
ec_changed = False
for i, cost in enumerate(self.__edit_cost_constants):
if cost == 0:
if edit_cost_list[-2][i] > self.__epsilon_ec:
ec_changed = True
break
elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ec:
ec_changed = True
break
# if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec:
# ec_changed = True
# break
residual_changed = False
if residual_list[-1] == 0:
if residual_list[-2] > self.__epsilon_residual:
residual_changed = True
elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_residual:
residual_changed = True
self.__converged = not (ec_changed or residual_changed)
if self.__converged:
itrs_without_update += 1
else:
itrs_without_update = 0
self.__num_updates_ecc += 1
# print current states.
if self._verbose >= 2:
print('edit_cost_constants:', self.__edit_cost_constants)
print('residual_list:', residual_list)
# return residual_list, edit_cost_list, dis_k_mat, ged_mat, \
# time_list, nb_cost_mat_list
print()
print('-------------------------------------------------------------------------')
print('States of iteration', self.__itrs + 1)
print('-------------------------------------------------------------------------')
# print('Time spent:', self.__runtime_optimize_ec)
print('Total number of iterations for optimizing:', self.__itrs + 1)
print('Total number of edit cost updates:', self.__num_updates_ecc)
print('Did the optimization of edit costs converge:', self.__converged)
print('Did edit costs change:', ec_changed)
print('Did residual change:', residual_changed)
print('Iterations without update:', itrs_without_update)
print('Current edit cost constants:', self.__edit_cost_constants)
print('Residual list:', residual_list)
print('-------------------------------------------------------------------------')
self.__itrs += 1
def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
# if self.__state == AlgorithmState.TERMINATED:
# self.__state = AlgorithmState.INITIALIZED
return True
return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
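# A worked micro-example of the relative-change test above (numbers are
# illustrative): with epsilon_ec = 0.1, an edit cost moving from 2.0 to 2.1
# gives |2.1 - 2.0| / 2.1 ~= 0.048 <= 0.1, so it counts as unchanged, while
# a move from 2.0 to 2.5 gives 0.5 / 2.5 = 0.2 > 0.1 and sets ec_changed to
# True. __converged becomes True only when neither the costs nor the
# residual changed; the criterion also stops on the time limit, on
# max_itrs, or after too many iterations without an update.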
def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'):
@@ -559,11 +650,11 @@ class MedianPreimageGenerator(PreimageGenerator):
def __compute_distances_to_true_median(self):
# compute distance in kernel space for set median.
kernels_to_sm, _ = self.__graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
kernel_sm, _ = self.__graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
# @todo: not correct kernel value
gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0)
gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
gram_with_sm = np.concatenate((np.array([[1] + kernels_to_sm]).T, gram_with_sm), axis=1)
self.__k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
[1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
@@ -574,10 +665,10 @@ class MedianPreimageGenerator(PreimageGenerator):
# print(set_median.edges(data=True))
# compute distance in kernel space for generalized median.
kernels_to_gm, _ = self.__graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
kernel_gm, _ = self.__graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0)
kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
gram_with_gm = np.concatenate((np.array([[1] + kernels_to_gm]).T, gram_with_gm), axis=1)
self.__k_dis_gen_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
[1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
@@ -591,6 +682,7 @@ class MedianPreimageGenerator(PreimageGenerator):
gram_with_gm, withterm3=False))
idx_k_dis_median_set_min = np.argmin(k_dis_median_set)
self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min]
self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy()
if self._verbose >= 2:
print()
@@ -599,18 +691,16 @@ class MedianPreimageGenerator(PreimageGenerator):
print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
print('distance in kernel space for each graph in median set:', k_dis_median_set)
# return dis_k_sm, dis_k_gm, k_dis_median_set, dis_k_gi_min, idx_dis_k_gi_min
def __set_graph_kernel_by_name(self):
if self.kernel_options['name'] == 'structuralspkernel':
from gklearn.kernels import StructuralSP
self.__graph_kernel = StructuralSP(node_labels=self.dataset.node_labels,
edge_labels=self.dataset.edge_labels,
node_attrs=self.dataset.node_attrs,
edge_attrs=self.dataset.edge_attrs,
ds_infos=self.dataset.get_dataset_infos(keys=['directed']),
**self.kernel_options)
self._graph_kernel = StructuralSP(node_labels=self._dataset.node_labels,
edge_labels=self._dataset.edge_labels,
node_attrs=self._dataset.node_attrs,
edge_attrs=self._dataset.edge_attrs,
ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
**self._kernel_options)
# def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
@@ -618,7 +708,7 @@ class MedianPreimageGenerator(PreimageGenerator):
"""
Cleans node and edge labels and attributes of the given graph.
"""
G_new = nx.Graph()
G_new = nx.Graph(**G.graph)
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd)) # @todo: should we keep this as str()?
for l_name in self._dataset.node_labels:
@@ -670,5 +760,29 @@ class MedianPreimageGenerator(PreimageGenerator):
return self.__init_ecc
@init_ecc.setter
def fit_method(self, value):
self.__init_ecc = value
def init_ecc(self, value):
self.__init_ecc = value
@property
def set_median(self):
return self.__set_median
@property
def gen_median(self):
return self.__gen_median
@property
def best_from_dataset(self):
return self.__best_from_dataset
@property
def gram_matrix_unnorm(self):
return self.__gram_matrix_unnorm
@gram_matrix_unnorm.setter
def gram_matrix_unnorm(self, value):
self.__gram_matrix_unnorm = value
@@ -5,7 +5,7 @@ Created on Thu Mar 26 18:26:36 2020
@author: ljia
"""
from gklearn.utils import Dataset
# from gklearn.utils import Dataset
class PreimageGenerator(object):
@@ -32,6 +32,11 @@ class PreimageGenerator(object):
@kernel_options.setter
def kernel_options(self, value):
self._kernel_options = value
@property
def graph_kernel(self):
return self._graph_kernel
@property
@@ -41,3 +46,4 @@ class PreimageGenerator(object):
@verbose.setter
def verbose(self, value):
self._verbose = value
@@ -20,9 +20,12 @@ def test_median_preimage_generator():
mpg = MedianPreimageGenerator()
mpg_options = {'fit_method': 'k-graphs',
'init_ecc': [3, 3, 1, 3, 3],
'max_itrs': 6,
'ds_name': 'Letter-high',
'parallel': True,
'time_limit_in_sec': 0,
'max_itrs': 100,
'max_itrs_without_update': 3,
'epsilon_residual': 0.01,
'verbose': 2}
mpg.set_options(**mpg_options)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
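# A sketch of how the kernel options may be filled in for this test (the
# values below are assumptions for illustration, not taken from the
# original file):
#
#   sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
#   kernel_options = {'name': 'structuralspkernel',
#                     'node_kernels': sub_kernels,
#                     'edge_kernels': sub_kernels,
#                     'normalize': True,
#                     'verbose': 2}
#   mpg.kernel_options = kernel_options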
@@ -19,146 +19,465 @@ from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, po
from gklearn.kernels.structuralspKernel import structuralspkernel
from gklearn.kernels.treeletKernel import treeletkernel
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
from gklearn.utils import Dataset
import csv
import networkx as nx
def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, load_gm='auto', dir_save='', irrelevant_labels=None):
import os.path
from gklearn.preimage import MedianPreimageGenerator
from gklearn.utils import split_dataset_by_target
from gklearn.utils.graphfiles import saveGXL
# 1. get dataset.
print('1. getting dataset...')
dataset_all = Dataset()
dataset_all.load_predefined_dataset(ds_name)
if irrelevant_labels is not None:
dataset_all.remove_labels(**irrelevant_labels)
# dataset_all.cut_graphs(range(0, 100))
datasets = split_dataset_by_target(dataset_all)
if save_results:
# create result files.
print('creating output files...')
fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
time_optimize_ec_list = []
time_generate_list = []
time_total_list = []
itrs_list = []
converged_list = []
num_updates_ecc_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
nb_dis_k_gi2gm = [0, 0, 0]
dis_k_max_list = []
dis_k_min_list = []
dis_k_mean_list = []
if load_gm == 'auto':
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
if gmfile_exist:
gmfile = np.load(gm_fname)
gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
time_precompute_gm_list = gmfile['run_time_list'].tolist()
else:
gram_matrix_unnorm_list = []
time_precompute_gm_list = []
elif not load_gm:
gram_matrix_unnorm_list = []
time_precompute_gm_list = []
else:
gmfile = np.load(load_gm)
gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
time_precompute_gm_list = gmfile['run_time_list']
# repeats_better_sod_sm2gm = []
# repeats_better_dis_k_sm2gm = []
# repeats_better_dis_k_gi2sm = []
# repeats_better_dis_k_gi2gm = []
print('start generating preimage for each class of target...')
for idx, dataset in enumerate(datasets):
target = dataset.targets[0]
print('\ntarget =', target, '\n')
# if target != 1:
# continue
num_graphs = len(dataset.graphs)
if num_graphs < 2:
print('\nnumber of graphs =', num_graphs, ', skip.\n')
continue
# 2. set parameters.
print('2. initializing mpg and setting parameters...')
if load_gm:
if gmfile_exist:
mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx]
mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx]
mpg = MedianPreimageGenerator()
mpg.dataset = dataset
mpg.set_options(**mpg_options.copy())
mpg.kernel_options = kernel_options.copy()
mpg.ged_options = ged_options.copy()
mpg.mge_options = mge_options.copy()
# 3. compute median preimage.
print('3. computing median preimage...')
mpg.run()
results = mpg.get_results()
# 4. compute pairwise kernel distances.
print('4. computing pairwise kernel distances...')
_, dis_k_max, dis_k_min, dis_k_mean = mpg.graph_kernel.compute_distance_matrix()
dis_k_max_list.append(dis_k_max)
dis_k_min_list.append(dis_k_min)
dis_k_mean_list.append(dis_k_mean)
# 5. save results (and median graphs).
print('5. saving results (and median graphs)...')
# write result detail.
if save_results:
print('writing results to files...')
sod_sm2gm = get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median']))
dis_k_sm2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_set_median']))
dis_k_gi2sm = get_relations(np.sign(results['k_dis_set_median'] - results['k_dis_dataset']))
dis_k_gi2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_dataset']))
f_detail = open(dir_save + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, target, 1,
results['sod_set_median'], results['sod_gen_median'],
results['k_dis_set_median'], results['k_dis_gen_median'],
results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'],
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc']])
f_detail.close()
# compute result summary.
sod_sm_list.append(results['sod_set_median'])
sod_gm_list.append(results['sod_gen_median'])
dis_k_sm_list.append(results['k_dis_set_median'])
dis_k_gm_list.append(results['k_dis_gen_median'])
dis_k_gi_min_list.append(results['k_dis_dataset'])
time_precompute_gm_list.append(results['runtime_precompute_gm'])
time_optimize_ec_list.append(results['runtime_optimize_ec'])
time_generate_list.append(results['runtime_generate_preimage'])
time_total_list.append(results['runtime_total'])
itrs_list.append(results['itrs'])
converged_list.append(results['converged'])
num_updates_ecc_list.append(results['num_updates_ecc'])
# # SOD SM -> GM
if results['sod_set_median'] > results['sod_gen_median']:
nb_sod_sm2gm[0] += 1
# repeats_better_sod_sm2gm.append(1)
elif results['sod_set_median'] == results['sod_gen_median']:
nb_sod_sm2gm[1] += 1
elif results['sod_set_median'] < results['sod_gen_median']:
nb_sod_sm2gm[2] += 1
# # dis_k SM -> GM
if results['k_dis_set_median'] > results['k_dis_gen_median']:
nb_dis_k_sm2gm[0] += 1
# repeats_better_dis_k_sm2gm.append(1)
elif results['k_dis_set_median'] == results['k_dis_gen_median']:
nb_dis_k_sm2gm[1] += 1
elif results['k_dis_set_median'] < results['k_dis_gen_median']:
nb_dis_k_sm2gm[2] += 1
# # dis_k gi -> SM
if results['k_dis_dataset'] > results['k_dis_set_median']:
nb_dis_k_gi2sm[0] += 1
# repeats_better_dis_k_gi2sm.append(1)
elif results['k_dis_dataset'] == results['k_dis_set_median']:
nb_dis_k_gi2sm[1] += 1
elif results['k_dis_dataset'] < results['k_dis_set_median']:
nb_dis_k_gi2sm[2] += 1
# # dis_k gi -> GM
if results['k_dis_dataset'] > results['k_dis_gen_median']:
nb_dis_k_gi2gm[0] += 1
# repeats_better_dis_k_gi2gm.append(1)
elif results['k_dis_dataset'] == results['k_dis_gen_median']:
nb_dis_k_gi2gm[1] += 1
elif results['k_dis_dataset'] < results['k_dis_gen_median']:
nb_dis_k_gi2gm[2] += 1
# write result summary for each letter.
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, target,
results['sod_set_median'], results['sod_gen_median'],
results['k_dis_set_median'], results['k_dis_gen_median'],
results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
dis_k_gi2sm, dis_k_gi2gm,
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc'], nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
f_summary.close()
# save median graphs.
if save_medians:
print('Saving median graphs to files...')
fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
# plot median graphs.
if plot_medians and save_medians:
if ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low':
draw_Letter_graph(mpg.set_median, fn_pre_sm)
draw_Letter_graph(mpg.gen_median, fn_pre_gm)
draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)
if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
gram_matrix_unnorm_list.append(mpg.gram_matrix_unnorm)
# write result summary over all classes.
if save_results:
sod_sm_mean = np.mean(sod_sm_list)
sod_gm_mean = np.mean(sod_gm_list)
dis_k_sm_mean = np.mean(dis_k_sm_list)
dis_k_gm_mean = np.mean(dis_k_gm_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
time_precompute_gm_mean = np.mean(time_precompute_gm_list)
time_optimize_ec_mean = np.mean(time_optimize_ec_list)
time_generate_mean = np.mean(time_generate_list)
time_total_mean = np.mean(time_total_list)
itrs_mean = np.mean(itrs_list)
num_converged = np.sum(converged_list)
num_updates_ecc_mean = np.mean(num_updates_ecc_list)
sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
dis_k_gi2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, 'all',
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean,
time_precompute_gm_mean, time_optimize_ec_mean,
time_generate_mean, time_total_mean, itrs_mean,
num_converged, num_updates_ecc_mean])
f_summary.close()
# save total pairwise kernel distances.
dis_k_max = np.max(dis_k_max_list)
dis_k_min = np.min(dis_k_min_list)
dis_k_mean = np.mean(dis_k_mean_list)
print('The maximum pairwise distance in kernel space:', dis_k_max)
print('The minimum pairwise distance in kernel space:', dis_k_min)
print('The average pairwise distance in kernel space:', dis_k_mean)
# write Gram matrices to file.
if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=time_precompute_gm_list)
print('\ncomplete.')
def __init_output_file(ds_name, gkernel, fit_method, dir_output):
# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'num graphs',
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'edit cost constants', 'time precompute gm',
'time optimize ec', 'time generate preimage', 'time total',
'itrs', 'converged', 'num updates ecc'])
f_detail.close()
# fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'num graphs',
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
'time generate preimage', 'time total', 'itrs', 'num converged',
'num updates ecc', '# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM'])
# 'repeats better SOD SM -> GM',
# 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
# 'repeats better dis_k gi -> GM'])
f_summary.close()
return fn_output_detail, fn_output_summary
def get_relations(sign):
if sign == -1:
return 'better'
elif sign == 0:
return 'same'
elif sign == 1:
return 'worse'
# Draw the current median graph (for the Letter datasets).
def draw_Letter_graph(graph, file_prefix):
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
nx.draw_networkx(graph, pos)
plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
# plt.show()
plt.clf()
plt.close()
def remove_edges(Gn):
for G in Gn:
for _, _, attrs in G.edges(data=True):
attrs.clear()
def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g]
term2 = 0
for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2
if withterm3 == False:
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3)
def compute_k_dis(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g]
term2 = 0
for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2
if withterm3 == False:
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3)
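# Both helpers above evaluate the distance from graph g to the weighted mean
# of the graphs indexed by idx_gi in the kernel's feature space:
# d = sqrt(k(g, g) - 2 * sum_i alpha_i * k(g, g_i)
#          + sum_i sum_j alpha_i * alpha_j * k(g_i, g_j)).
# A minimal usage sketch with a hypothetical 2x2 Gram matrix:
#
#   K = np.array([[1.0, 0.5],
#                 [0.5, 1.0]])
#   # distance from graph 0 to the unweighted mean of graphs 0 and 1:
#   compute_k_dis(0, [0, 1], [0.5, 0.5], K, withterm3=False)
#   # term1 = 1.0, term2 = 2 * (0.5 * 1.0 + 0.5 * 0.5) = 1.5, term3 = 0.75
#   # -> sqrt(1.0 - 1.5 + 0.75) = 0.5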
def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
p_quit=0.03, n_iteration=10, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
depth=7, k_func='MinMax', compute_method='trie',
parallel=parallel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix = np.empty((len(Gn), len(Gn)))
# Kmatrix[:] = np.nan
Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# for i, row in enumerate(idx):
# for j, col in enumerate(idx):
# Kmatrix[row, col] = Kmatrix_tmp[i, j]
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
Kmatrix, _ = structuralspkernel(Gn, node_label=node_label,
edge_label=edge_label, node_kernels=sub_kernels,
edge_kernels=sub_kernels,
parallel=parallel, n_jobs=multiprocessing.cpu_count(),
verbose=verbose)
elif graph_kernel == 'treeletkernel':
pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
# pkernel = functools.partial(gaussiankernel, gamma=1e-6)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
sub_kernel=pkernel, parallel=parallel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'weisfeilerlehmankernel':
Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
height=4, base_kernel='subtree', parallel=None,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix
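# The normalization loop above implements the cosine normalization
# K'[i, j] = K[i, j] / sqrt(K[i, i] * K[j, j]). An equivalent vectorized
# sketch (illustration only, not part of the module):
#
#   def normalize_gram_matrix(Kmatrix):
#       diag = Kmatrix.diagonal().copy()
#       return Kmatrix / np.sqrt(np.outer(diag, diag))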
def gram2distances(Kmatrix):
dmatrix = np.zeros((len(Kmatrix), len(Kmatrix)))
for i1 in range(len(Kmatrix)):
for i2 in range(len(Kmatrix)):
dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
dmatrix = np.sqrt(dmatrix)
return dmatrix
def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None,
gkernel=None, verbose=True):
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
if dis < 0:
if dis > -1e-10:
dis = 0
else:
raise ValueError('The distance is negative.')
dis_mat[i, j] = np.sqrt(dis)
dis_mat[j, i] = dis_mat[i, j]
dis_max = np.max(np.max(dis_mat))
dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
dis_mean = np.mean(np.mean(dis_mat))
return dis_mat, dis_max, dis_min, dis_mean
def get_same_item_indices(ls):
"""Get the indices of the same items in a list. Return a dict keyed by items.
"""
idx_dict = {}
for idx, item in enumerate(ls):
if item in idx_dict:
idx_dict[item].append(idx)
else:
idx_dict[item] = [idx]
return idx_dict
"""Get the indices of the same items in a list. Return a dict keyed by items. | |||
""" | |||
idx_dict = {} | |||
for idx, item in enumerate(ls): | |||
if item in idx_dict: | |||
idx_dict[item].append(idx) | |||
else: | |||
idx_dict[item] = [idx] | |||
return idx_dict | |||
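# For example (hypothetical input):
#
#   get_same_item_indices([1, 0, 1, 1, 0])
#   # -> {1: [0, 2, 3], 0: [1, 4]}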
def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
node_label=None, edge_label=None):
dis_k_all = [] # distance between g_star and each graph.
alpha = [1 / len(Gn)] * len(Gn)
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
idx_gi = range(len(Gn)) # indices of the graphs whose weighted mean is g_star.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_k_all.append(dtemp)
def normalize_distance_matrix(D):
max_value = np.amax(D)
min_value = np.amin(D)
return (D - min_value) / (max_value - min_value)
@@ -15,5 +15,7 @@ __date__ = "November 2017"
# from utils import graphfiles
# from utils import utils
from gklearn.utils.dataset import Dataset
from gklearn.utils.dataset import Dataset, split_dataset_by_target
from gklearn.utils.timer import Timer
from gklearn.utils.utils import get_graph_kernel_by_name
from gklearn.utils.utils import compute_gram_matrices_by_class
@@ -8,6 +8,7 @@ Created on Thu Mar 26 18:48:27 2020
import numpy as np
import networkx as nx
from gklearn.utils.graphfiles import loadDataset
import os
class Dataset(object):
@@ -15,7 +16,7 @@ class Dataset(object):
def __init__(self, filename=None, filename_y=None, extra_params=None):
if filename is None:
self.__graphs = None
self.__target = None
self.__targets = None
self.__node_labels = None
self.__edge_labels = None
self.__node_attrs = None
@@ -50,33 +51,41 @@ class Dataset(object):
def load_dataset(self, filename, filename_y=None, extra_params=None):
self.__graphs, self.__target = loadDataset(filename, filename_y=filename_y, extra_params=extra_params)
self.__graphs, self.__targets = loadDataset(filename, filename_y=filename_y, extra_params=extra_params)
self.set_labels_attrs()
def load_graphs(self, graphs, targets=None):
# this has to be followed by set_labels().
self.__graphs = graphs
self.__targets = targets
# self.set_labels_attrs()
def load_predefined_dataset(self, ds_name):
current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
if ds_name == 'Letter-high': # node non-symb
ds_file = '../../datasets/Letter-high/Letter-high_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Letter-med': # node non-symb
ds_file = '../../datasets/Letter-high/Letter-med_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Letter-low': # node non-symb
ds_file = '../../datasets/Letter-high/Letter-low_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Fingerprint':
ds_file = '../../datasets/Fingerprint/Fingerprint_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'SYNTHETIC':
pass
elif ds_name == 'SYNTHETICnew':
ds_file = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'Synthie':
pass
elif ds_name == 'COIL-DEL':
ds_file = '../../datasets/COIL-DEL/COIL-DEL_A.txt'
self.__graphs, self.__target = loadDataset(ds_file)
ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
self.__graphs, self.__targets = loadDataset(ds_file)
elif ds_name == 'COIL-RAG':
pass
elif ds_name == 'COLORS-3':
@@ -86,6 +95,13 @@ class Dataset(object):
self.set_labels_attrs()
def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
self.__node_labels = node_labels
self.__node_attrs = node_attrs
self.__edge_labels = edge_labels
self.__edge_attrs = edge_attrs
def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
# @todo: remove labels which have only one possible value.
@@ -363,9 +379,34 @@ class Dataset(object):
print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))
def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
for g in self.__graphs:
for nd in g.nodes():
for nl in node_labels:
del g.nodes[nd][nl]
for na in node_attrs:
del g.nodes[nd][na]
for ed in g.edges():
for el in edge_labels:
del g.edges[ed][el]
for ea in edge_attrs:
del g.edges[ed][ea]
if len(node_labels) > 0:
self.__node_labels = [nl for nl in self.__node_labels if nl not in node_labels]
if len(edge_labels) > 0:
self.__edge_labels = [el for el in self.__edge_labels if el not in edge_labels]
if len(node_attrs) > 0:
self.__node_attrs = [na for na in self.__node_attrs if na not in node_attrs]
if len(edge_attrs) > 0:
self.__edge_attrs = [ea for ea in self.__edge_attrs if ea not in edge_attrs]
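# A usage sketch (the attribute names are hypothetical and depend on the
# dataset at hand):
#
#   dataset = Dataset()
#   dataset.load_predefined_dataset('Letter-high')
#   dataset.remove_labels(node_attrs=['x', 'y'])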
def cut_graphs(self, range_):
self.__graphs = [self.__graphs[i] for i in range_]
self.set_labels_attrs()
if self.__targets is not None:
self.__targets = [self.__targets[i] for i in range_]
# @todo
# self.set_labels_attrs()
def __get_dataset_size(self):
@@ -514,7 +555,7 @@ class Dataset(object):
def __get_class_num(self):
return len(set(self.__target))
return len(set(self.__targets))
def __get_node_attr_dim(self):
@@ -529,6 +570,11 @@ class Dataset(object):
def graphs(self):
return self.__graphs
@property
def targets(self):
return self.__targets
@property
def node_labels(self):
@@ -547,4 +593,20 @@ class Dataset(object):
@property
def edge_attrs(self):
return self.__edge_attrs
def split_dataset_by_target(dataset):
from gklearn.preimage.utils import get_same_item_indices
graphs = dataset.graphs
targets = dataset.targets
datasets = []
idx_targets = get_same_item_indices(targets)
for key, val in idx_targets.items():
sub_graphs = [graphs[i] for i in val]
sub_dataset = Dataset()
sub_dataset.load_graphs(sub_graphs, [key] * len(val))
sub_dataset.set_labels(node_labels=dataset.node_labels, node_attrs=dataset.node_attrs, edge_labels=dataset.edge_labels, edge_attrs=dataset.edge_attrs)
datasets.append(sub_dataset)
return datasets
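# A usage sketch, splitting a predefined dataset into one sub-dataset per
# class (using only the loaders and properties defined above):
#
#   dataset_all = Dataset()
#   dataset_all.load_predefined_dataset('Letter-high')
#   for sub_dataset in split_dataset_by_target(dataset_all):
#       print(sub_dataset.targets[0], len(sub_dataset.graphs))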
@@ -296,3 +296,59 @@ def get_edge_labels(Gn, edge_label):
for G in Gn:
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return el
def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}):
if name == 'structuralspkernel':
from gklearn.kernels import StructuralSP
graph_kernel = StructuralSP(node_labels=node_labels, edge_labels=edge_labels,
node_attrs=node_attrs, edge_attrs=edge_attrs,
ds_infos=ds_infos, **kernel_options)
return graph_kernel
def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save='', irrelevant_labels=None):
from gklearn.utils import Dataset, split_dataset_by_target
# 1. get dataset.
print('1. getting dataset...')
dataset_all = Dataset()
dataset_all.load_predefined_dataset(ds_name)
if irrelevant_labels is not None:
dataset_all.remove_labels(**irrelevant_labels)
# dataset_all.cut_graphs(range(0, 10))
datasets = split_dataset_by_target(dataset_all)
gram_matrix_unnorm_list = []
run_time_list = []
print('start computing gram matrix for each class of target...')
for idx, dataset in enumerate(datasets):
target = dataset.targets[0]
print('\ntarget =', target, '\n')
# 2. initialize graph kernel.
print('2. initializing graph kernel and setting parameters...')
graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
node_labels=dataset.node_labels,
edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs,
edge_attrs=dataset.edge_attrs,
ds_infos=dataset.get_dataset_infos(keys=['directed']),
kernel_options=kernel_options)
# 3. compute gram matrix.
print('3. computing gram matrix...')
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm
gram_matrix_unnorm_list.append(gram_matrix_unnorm)
run_time_list.append(run_time)
# 4. save results.
print()
print('4. saving results...')
if save_results:
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list)
print('\ncomplete.')
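# A usage sketch (the kernel option values are assumptions for illustration;
# structuralspkernel normally also expects node/edge sub-kernels):
#
#   kernel_options = {'name': 'structuralspkernel', 'verbose': 2}
#   compute_gram_matrices_by_class('Letter-high', kernel_options,
#                                  save_results=True, dir_save='outputs/',
#                                  irrelevant_labels=None)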
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 3 10:38:59 2020
@author: ljia
"""
from tqdm import tqdm
import sys
print('start')
for i in tqdm(range(10000000), file=sys.stdout):
x = i
# print(x)
# =============================================================================
# summary
# terminal, IPython 7.0.1 (Spyder 4): works.
# write to file: does not work; the progress bar splits as it advances.
# Jupyter:
# =============================================================================
# for i in tqdm(range(10000000)):
# x = i
# print(x)
# =============================================================================
# summary
# terminal, IPython 7.0.1 (Spyder 4): does not work; when combined with other
# prints, the progress bar splits.
# write to file: does not work; the progress bar cannot be written to a file.
# Jupyter:
# =============================================================================
@@ -1,7 +1,10 @@
numpy==1.15.2
scipy==1.1.0
matplotlib==3.0.0
networkx==2.2
scikit-learn==0.20.0
tabulate==0.8.2
tqdm==4.26.0
numpy>=1.15.2
scipy>=1.1.0
matplotlib>=3.0.0
networkx>=2.2
scikit-learn>=0.20.0
tabulate>=0.8.2
tqdm>=4.26.0
# cvxpy # for preimage.
# cvxopt # for preimage.
# mosek # for preimage.