diff --git a/README.md b/README.md
index 5f490ac..491a1ed 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![Documentation Status](https://readthedocs.org/projects/graphkit-learn/badge/?version=master)](https://graphkit-learn.readthedocs.io/en/master/?badge=master)
 [![PyPI version](https://badge.fury.io/py/graphkit-learn.svg)](https://badge.fury.io/py/graphkit-learn)
 
-A python package for graph kernels.
+A Python package for graph kernels, graph edit distances, and the graph pre-image problem.
 
 ## Requirements
 
diff --git a/gklearn/ged/median/median_graph_estimator.py b/gklearn/ged/median/median_graph_estimator.py
index 70651a9..84cd64d 100644
--- a/gklearn/ged/median/median_graph_estimator.py
+++ b/gklearn/ged/median/median_graph_estimator.py
@@ -348,7 +348,7 @@ class MedianGraphEstimator(object):
 
         # Print information about current iteration.
         if self.__print_to_stdout == 2:
-            progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout)
+            progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout)
 
         # Compute node maps and sum of distances for initial median.
         self.__sum_of_distances = 0
@@ -457,7 +457,7 @@ class MedianGraphEstimator(object):
                 self.__itrs[median_pos] += 1
 
             # Update the best median.
-            if self.__sum_of_distances < self.__best_init_sum_of_distances:
+            if self.__sum_of_distances < best_sum_of_distances:
                 best_sum_of_distances = self.__sum_of_distances
                 node_maps_from_best_median = self.__node_maps_from_median
                 best_median = median
@@ -588,7 +588,7 @@ class MedianGraphEstimator(object):
 
         # Print information about current iteration.
         if self.__print_to_stdout == 2:
-            progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout)
+            progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout)
 
         # Compute the medoid.
         medoid_id = graph_ids[0]
@@ -718,7 +718,7 @@ class MedianGraphEstimator(object):
     def __update_node_maps(self):
         # Print information about current iteration.
         if self.__print_to_stdout == 2:
-            progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
+            progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
 
         # Update the node maps.
         node_maps_were_modified = False
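A note on the `best_sum_of_distances` hunk: comparing each candidate against `self.__best_init_sum_of_distances` (a value frozen at initialization) could let a worse median overwrite a better one found earlier. A minimal, self-contained sketch of the intended running-minimum bookkeeping (the candidate list is illustrative, not the estimator's API):

```python
import math

# Illustrative (median, sum_of_distances) pairs, standing in for the medians
# obtained from different initializations.
candidates = [('median_0', 42.0), ('median_1', 37.5), ('median_2', 40.1)]

best_sum_of_distances = math.inf
best_median = None
for median, sum_of_distances in candidates:
    # Compare against the best value seen so far, not a value recorded
    # once at initialization.
    if sum_of_distances < best_sum_of_distances:
        best_sum_of_distances = sum_of_distances
        best_median = median

print(best_median, best_sum_of_distances)  # median_1 37.5
```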
diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py
index b58e945..2ff0103 100644
--- a/gklearn/ged/util/util.py
+++ b/gklearn/ged/util/util.py
@@ -307,7 +307,7 @@ def ged_options_to_string(options):
     opt_str = ' '
     for key, val in options.items():
         if key == 'initialization_method':
-            opt_str += '--initial_solutions ' + str(val) + ' '
+            opt_str += '--initialization-method ' + str(val) + ' '
         elif key == 'initialization_options':
             opt_str += '--initialization-options ' + str(val) + ' '
         elif key == 'lower_bound_method':
diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py
index e71abb3..e703981 100644
--- a/gklearn/kernels/graph_kernel.py
+++ b/gklearn/kernels/graph_kernel.py
@@ -76,11 +76,11 @@ class GraphKernel(object):
 
 
     def compute_distance_matrix(self):
-        dis_mat = np.empty((len(self._graphs), len(self._graphs)))
         if self._gram_matrix is None:
             raise Exception('Please compute the Gram matrix before computing distance matrix.')
-        for i in range(len(self._graphs)):
-            for j in range(i, len(self._graphs)):
+        dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix)))
+        for i in range(len(self._gram_matrix)):
+            for j in range(i, len(self._gram_matrix)):
                 dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j]
                 if dis < 0:
                     if dis > -1e-10:
@@ -184,18 +184,22 @@ class GraphKernel(object):
     def parallel(self):
         return self._parallel
 
+    @property
     def n_jobs(self):
         return self._n_jobs
 
+    @property
     def verbose(self):
         return self._verbose
 
+    @property
     def normalize(self):
         return self._normalize
 
+    @property
     def run_time(self):
         return self._run_time
 
@@ -205,7 +209,15 @@ class GraphKernel(object):
     def gram_matrix(self):
         return self._gram_matrix
 
+    @gram_matrix.setter
+    def gram_matrix(self, value):
+        self._gram_matrix = value
+
     @property
     def gram_matrix_unnorm(self):
         return self._gram_matrix_unnorm
+
+    @gram_matrix_unnorm.setter
+    def gram_matrix_unnorm(self, value):
+        self._gram_matrix_unnorm = value
\ No newline at end of file
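With this change, `compute_distance_matrix` depends only on the Gram matrix, so it also works when the matrix was loaded from a cache and no graphs are attached. It implements the kernel-induced metric d(g_i, g_j) = sqrt(k(g_i, g_i) + k(g_j, g_j) - 2 k(g_i, g_j)). A vectorized sketch of the same computation (illustrative, not the class method):

```python
import numpy as np

def kernel_distance_matrix(gram):
    """Pairwise distances induced by a Gram matrix K:
    d(i, j) = sqrt(K[i, i] + K[j, j] - 2 * K[i, j])."""
    diag = np.diag(gram)
    dis2 = diag[:, None] + diag[None, :] - 2 * gram
    # Clamp tiny negative values caused by floating-point round-off.
    dis2[(dis2 < 0) & (dis2 > -1e-10)] = 0
    return np.sqrt(dis2)

# Usage on a toy 2x2 Gram matrix.
K = np.array([[1.0, 0.5],
              [0.5, 1.0]])
print(kernel_distance_matrix(K))  # off-diagonal: sqrt(1 + 1 - 2*0.5) = 1.0
```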
diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py
index 6b93dae..b10a598 100644
--- a/gklearn/preimage/median_preimage_generator.py
+++ b/gklearn/preimage/median_preimage_generator.py
@@ -36,10 +36,9 @@ class MedianPreimageGenerator(PreimageGenerator):
         self.__time_limit_in_sec = 0
         self.__max_itrs = 100
         self.__max_itrs_without_update = 3
-        self.__epsilon_ratio = 0.01
+        self.__epsilon_residual = 0.01
+        self.__epsilon_ec = 0.1
         # values to compute.
-        self.__edit_cost_constants = []
-        self.__runtime_precompute_gm = None
         self.__runtime_optimize_ec = None
         self.__runtime_generate_preimage = None
         self.__runtime_total = None
@@ -54,7 +53,11 @@ class MedianPreimageGenerator(PreimageGenerator):
         self.__itrs = 0
         self.__converged = False
         self.__num_updates_ecc = 0
-
+        # values that can be set or to be computed.
+        self.__edit_cost_constants = []
+        self.__gram_matrix_unnorm = None
+        self.__runtime_precompute_gm = None
+
 
     def set_options(self, **kwargs):
         self._kernel_options = kwargs.get('kernel_options', {})
@@ -71,7 +74,10 @@ class MedianPreimageGenerator(PreimageGenerator):
         self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
         self.__max_itrs = kwargs.get('max_itrs', 100)
         self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
-        self.__epsilon_ratio = kwargs.get('epsilon_ratio', 0.01)
+        self.__epsilon_residual = kwargs.get('epsilon_residual', 0.01)
+        self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1)
+        self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
+        self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
 
 
     def run(self):
@@ -81,9 +87,18 @@ class MedianPreimageGenerator(PreimageGenerator):
         start = time.time()
 
         # 1. precompute gram matrix.
-        gram_matrix, run_time = self.__graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
-        end_precompute_gm = time.time()
-        self.__runtime_precompute_gm = end_precompute_gm - start
+        if self.__gram_matrix_unnorm is None:
+            gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
+            self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm
+            end_precompute_gm = time.time()
+            self.__runtime_precompute_gm = end_precompute_gm - start
+        else:
+            if self.__runtime_precompute_gm is None:
+                raise Exception('Parameter "runtime_precompute_gm" must be given when using a pre-computed Gram matrix.')
+            self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm
+            self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
+            end_precompute_gm = time.time()
+            start -= self.__runtime_precompute_gm
 
         # 2. optimize edit cost constants.
         self.__optimize_edit_cost_constants()
@@ -134,6 +149,7 @@ class MedianPreimageGenerator(PreimageGenerator):
             print('Total number of updating edit costs:', self.__num_updates_ecc)
             print('Is optimization of edit costs converged:', self.__converged)
             print('================================================================================')
+            print()
 
         # collect return values.
#        return (sod_sm, sod_gm), \
@@ -222,7 +238,7 @@ class MedianPreimageGenerator(PreimageGenerator):
 
     def __optimize_ecc_by_kernel_distances(self):
         # compute distances in feature space.
-        dis_k_mat, _, _, _ = self.__graph_kernel.compute_distance_matrix()
+        dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix()
         dis_k_vec = []
         for i in range(len(dis_k_mat)):
#            for j in range(i, len(dis_k_mat)):
@@ -256,7 +272,7 @@ class MedianPreimageGenerator(PreimageGenerator):
         timer = Timer(self.__time_limit_in_sec)
         while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
             if self._verbose >= 2:
-                print('\niteration', self.__itrs)
+                print('\niteration', self.__itrs + 1)
             time0 = time.time()
             # "fit" geds to distances in feature space by tuning edit costs using theLeast Squares Method.
#            np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
@@ -286,21 +302,21 @@ class MedianPreimageGenerator(PreimageGenerator):
             # check convergency.
             ec_changed = False
             for i, cost in enumerate(self.__edit_cost_constants):
-#                if cost == 0:
-#                    if edit_cost_list[-2][i] > self.__epsilon_ratio:
-#                        ec_changed = True
-#                        break
-#                elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ratio:
-#                    ec_changed = True
-#                    break
-                if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ratio:
+                if cost == 0:
+                    if edit_cost_list[-2][i] > self.__epsilon_ec:
+                        ec_changed = True
+                        break
+                elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ec:
                     ec_changed = True
                     break
+#                if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec:
+#                    ec_changed = True
+#                    break
             residual_changed = False
             if residual_list[-1] == 0:
-                if residual_list[-2] > self.__epsilon_ratio:
+                if residual_list[-2] > self.__epsilon_residual:
                     residual_changed = True
-            elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_ratio:
+            elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_residual:
                 residual_changed = True
             self.__converged = not (ec_changed or residual_changed)
             if self.__converged:
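The rewritten check replaces the single `epsilon_ratio` with two thresholds: `epsilon_ec` bounds the relative change of each edit-cost constant, and `epsilon_residual` bounds the relative change of the fitting residual; the optimization is considered converged once neither changes. A standalone sketch of the same criterion (function and argument names are illustrative):

```python
def has_converged(costs, prev_costs, residual, prev_residual,
                  epsilon_ec=0.1, epsilon_residual=0.01):
    """Converged when neither the edit costs nor the residual changed
    by more than their relative thresholds."""
    ec_changed = False
    for cost, prev in zip(costs, prev_costs):
        if cost == 0:
            if prev > epsilon_ec:  # relative change undefined; compare directly
                ec_changed = True
                break
        elif abs(cost - prev) / cost > epsilon_ec:
            ec_changed = True
            break
    if residual == 0:
        residual_changed = prev_residual > epsilon_residual
    else:
        residual_changed = abs(residual - prev_residual) / residual > epsilon_residual
    return not (ec_changed or residual_changed)

# e.g. has_converged([1.0, 2.0], [1.05, 1.9], 0.50, 0.502) -> True
```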
@@ -313,14 +329,14 @@ class MedianPreimageGenerator(PreimageGenerator):
             if self._verbose >= 2:
                 print()
                 print('-------------------------------------------------------------------------')
-                print('States of iteration', str(self.__itrs))
+                print('States of iteration', self.__itrs + 1)
                 print('-------------------------------------------------------------------------')
#                print('Time spend:', self.__runtime_optimize_ec)
-                print('Total number of iterations for optimizing:', self.__itrs)
+                print('Total number of iterations for optimizing:', self.__itrs + 1)
                 print('Total number of updating edit costs:', self.__num_updates_ecc)
-                print('Is optimization of edit costs converged:', self.__converged)
-                print('Does edit cost changed:', ec_changed)
-                print('Does residual changed:', residual_changed)
+                print('Did the optimization of edit costs converge:', self.__converged)
+                print('Did the edit costs change:', ec_changed)
+                print('Did the residual change:', residual_changed)
                 print('Iterations without update:', itrs_without_update)
                 print('Current edit cost constants:', self.__edit_cost_constants)
                 print('Residual list:', residual_list)
@@ -634,11 +650,11 @@ class MedianPreimageGenerator(PreimageGenerator):
 
     def __compute_distances_to_true_median(self):
         # compute distance in kernel space for set median.
-        kernels_to_sm, _ = self.__graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
-        kernel_sm, _ = self.__graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
-        kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
+        kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
+        kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
+        kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
         # @todo: not correct kernel value
-        gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0)
+        gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
         gram_with_sm = np.concatenate((np.array([[1] + kernels_to_sm]).T, gram_with_sm), axis=1)
         self.__k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
                                                 [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
@@ -649,10 +665,10 @@ class MedianPreimageGenerator(PreimageGenerator):
#        print(set_median.edges(data=True))
 
         # compute distance in kernel space for generalized median.
-        kernels_to_gm, _ = self.__graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
-        kernel_gm, _ = self.__graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
-        kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
-        gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0)
+        kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
+        kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
+        kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
+        gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
         gram_with_gm = np.concatenate((np.array([[1] + kernels_to_gm]).T, gram_with_gm), axis=1)
         self.__k_dis_gen_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
                                                 [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
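Both normalization lines above apply cosine normalization against the unnormalized Gram matrix: k_hat(g, g_i) = k(g, g_i) / sqrt(k(g, g) * k(g_i, g_i)), so the median's kernel values are comparable with the normalized Gram matrix they are concatenated to. A compact standalone sketch of that step:

```python
import numpy as np

def normalize_kernels_to_graph(kernels_to_g, kernel_gg, gram_unnorm_diag):
    """Cosine-normalize kernel values between one graph g and a dataset:
    k_hat(g, g_i) = k(g, g_i) / sqrt(k(g, g) * k(g_i, g_i))."""
    return [k / np.sqrt(kernel_gg * kii)
            for k, kii in zip(kernels_to_g, gram_unnorm_diag)]

# Toy usage: k(g, g) = 4, dataset self-kernels 1 and 9.
print(normalize_kernels_to_graph([2.0, 3.0], 4.0, [1.0, 9.0]))  # [1.0, 0.5]
```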
@@ -679,12 +695,12 @@ class MedianPreimageGenerator(PreimageGenerator):
     def __set_graph_kernel_by_name(self):
         if self.kernel_options['name'] == 'structuralspkernel':
             from gklearn.kernels import StructuralSP
-            self.__graph_kernel = StructuralSP(node_labels=self.dataset.node_labels,
-                                  edge_labels=self.dataset.edge_labels,
-                                  node_attrs=self.dataset.node_attrs,
-                                  edge_attrs=self.dataset.edge_attrs,
-                                  ds_infos=self.dataset.get_dataset_infos(keys=['directed']),
-                                  **self.kernel_options)
+            self._graph_kernel = StructuralSP(node_labels=self._dataset.node_labels,
+                                  edge_labels=self._dataset.edge_labels,
+                                  node_attrs=self._dataset.node_attrs,
+                                  edge_attrs=self._dataset.edge_attrs,
+                                  ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
+                                  **self._kernel_options)
 
 
#    def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
@@ -692,7 +708,7 @@ class MedianPreimageGenerator(PreimageGenerator):
        """
        Cleans node and edge labels and attributes of the given graph.
        """
-        G_new = nx.Graph()
+        G_new = nx.Graph(**G.graph)
         for nd, attrs in G.nodes(data=True):
             G_new.add_node(str(nd)) # @todo: should we keep this as str()?
             for l_name in self._dataset.node_labels:
@@ -760,4 +776,13 @@ class MedianPreimageGenerator(PreimageGenerator):
 
     @property
     def best_from_dataset(self):
-        return self.__best_from_dataset
\ No newline at end of file
+        return self.__best_from_dataset
+
+
+    @property
+    def gram_matrix_unnorm(self):
+        return self.__gram_matrix_unnorm
+
+    @gram_matrix_unnorm.setter
+    def gram_matrix_unnorm(self, value):
+        self.__gram_matrix_unnorm = value
\ No newline at end of file
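With the new `gram_matrix_unnorm` / `runtime_precompute_gm` options and properties, a caller can feed a cached Gram matrix into `run()` instead of recomputing it. A hypothetical usage sketch (the archive name and dataset are placeholders; `dataset` is assumed to be a `gklearn` `Dataset` prepared elsewhere):

```python
import numpy as np
from gklearn.preimage import MedianPreimageGenerator

# Hypothetical cache produced by an earlier run (placeholder file name).
gmfile = np.load('gram_matrix_unnorm.Letter-high.structuralspkernel.gm.npz',
                 allow_pickle=True)
gram_matrix_unnorm = gmfile['gram_matrix_unnorm_list'][0]
runtime_precompute_gm = gmfile['run_time_list'].tolist()[0]

mpg = MedianPreimageGenerator()
mpg.dataset = dataset  # assumed: a Dataset prepared elsewhere
mpg.set_options(gram_matrix_unnorm=gram_matrix_unnorm,
                # run() raises if this is omitted while a Gram matrix is given:
                runtime_precompute_gm=runtime_precompute_gm)
mpg.run()
```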
diff --git a/gklearn/preimage/preimage_generator.py b/gklearn/preimage/preimage_generator.py
index d74f41c..306eb12 100644
--- a/gklearn/preimage/preimage_generator.py
+++ b/gklearn/preimage/preimage_generator.py
@@ -5,7 +5,7 @@ Created on Thu Mar 26 18:26:36 2020
 
 @author: ljia
 """
-from gklearn.utils import Dataset
+# from gklearn.utils import Dataset
 
 
 class PreimageGenerator(object):
@@ -32,6 +32,11 @@ class PreimageGenerator(object):
     @kernel_options.setter
     def kernel_options(self, value):
         self._kernel_options = value
+
+
+    @property
+    def graph_kernel(self):
+        return self._graph_kernel
 
 
     @property
@@ -41,3 +46,4 @@ class PreimageGenerator(object):
     @verbose.setter
     def verbose(self, value):
         self._verbose = value
+
diff --git a/gklearn/preimage/utils.py b/gklearn/preimage/utils.py
index bd4de0b..cbe00a1 100644
--- a/gklearn/preimage/utils.py
+++ b/gklearn/preimage/utils.py
@@ -21,21 +21,23 @@ from gklearn.kernels.treeletKernel import treeletkernel
 from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
 from gklearn.utils import Dataset
 import csv
-import matplotlib.pyplot as plt
 import networkx as nx
 
 
-def generate_median_preimage_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, dir_save='', ):
+def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, load_gm='auto', dir_save='', irrelevant_labels=None):
+    import os.path
     from gklearn.preimage import MedianPreimageGenerator
     from gklearn.utils import split_dataset_by_target
     from gklearn.utils.graphfiles import saveGXL
 
     # 1. get dataset.
-    print('getting dataset...')
+    print('1. getting dataset...')
     dataset_all = Dataset()
     dataset_all.load_predefined_dataset(ds_name)
+    if irrelevant_labels is not None:
+        dataset_all.remove_labels(**irrelevant_labels)
+#    dataset_all.cut_graphs(range(0, 100))
     datasets = split_dataset_by_target(dataset_all)
-#    dataset.cut_graphs(range(0, 10))
 
     if save_results:
         # create result files.
@@ -47,7 +49,6 @@ def generate_median_preimage_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, dir_save='', ):
     dis_k_sm_list = []
     dis_k_gm_list = []
     dis_k_gi_min_list = []
-    time_precompute_gm_list = []
     time_optimize_ec_list = []
     time_generate_list = []
     time_total_list = []
@@ -58,6 +59,26 @@
     nb_dis_k_sm2gm = [0, 0, 0]
     nb_dis_k_gi2sm = [0, 0, 0]
     nb_dis_k_gi2gm = [0, 0, 0]
+    dis_k_max_list = []
+    dis_k_min_list = []
+    dis_k_mean_list = []
+    if load_gm == 'auto':
+        gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
+        gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
+        if gmfile_exist:
+            gmfile = np.load(gm_fname)
+            gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
+            time_precompute_gm_list = gmfile['run_time_list'].tolist()
+        else:
+            gram_matrix_unnorm_list = []
+            time_precompute_gm_list = []
+    elif not load_gm:
+        gram_matrix_unnorm_list = []
+        time_precompute_gm_list = []
+    else:
+        gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
+        gmfile_exist = True
+        gmfile = np.load(gm_fname)
+        gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
+        time_precompute_gm_list = gmfile['run_time_list'].tolist()
#    repeats_better_sod_sm2gm = []
#    repeats_better_dis_k_sm2gm = []
#    repeats_better_dis_k_gi2sm = []
@@ -65,16 +86,23 @@
     print('start generating preimage for each class of target...')
-    for dataset in datasets:
-        print('\ntarget =', dataset.targets[0], '\n')
-        num_graphs = len(dataset.graphs)
+    for idx, dataset in enumerate(datasets):
+        target = dataset.targets[0]
+        print('\ntarget =', target, '\n')
+#        if target != 1:
+#            continue
+        num_graphs = len(dataset.graphs)
         if num_graphs < 2:
             print('\nnumber of graphs = ', num_graphs, ', skip.\n')
             continue
         # 2. set parameters.
-        print('1. initializing mpg and setting parameters...')
+        print('2. initializing mpg and setting parameters...')
+        if load_gm:
+            if gmfile_exist:
+                mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx]
+                mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx]
         mpg = MedianPreimageGenerator()
         mpg.dataset = dataset
         mpg.set_options(**mpg_options.copy())
@@ -83,10 +111,19 @@
         mpg.mge_options = mge_options.copy()
 
         # 3. compute median preimage.
-        print('2. computing median preimage...')
+        print('3. computing median preimage...')
         mpg.run()
         results = mpg.get_results()
 
+        # 4. compute pairwise kernel distances.
+        print('4. computing pairwise kernel distances...')
+        _, dis_k_max, dis_k_min, dis_k_mean = mpg.graph_kernel.compute_distance_matrix()
+        dis_k_max_list.append(dis_k_max)
+        dis_k_min_list.append(dis_k_min)
+        dis_k_mean_list.append(dis_k_mean)
+
+        # 5. save results (and median graphs).
+        print('5. saving results (and median graphs)...')
         # write result detail.
         if save_results:
             print('writing results to files...')
@@ -99,7 +136,7 @@
             csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
                 ged_options['edit_cost'], ged_options['method'],
                 ged_options['attr_distance'], mpg_options['fit_method'],
-                num_graphs, dataset.targets[0], 1,
+                num_graphs, target, 1,
                 results['sod_set_median'], results['sod_gen_median'],
                 results['k_dis_set_median'], results['k_dis_gen_median'],
                 results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
@@ -161,7 +198,7 @@
             csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
                 ged_options['edit_cost'], ged_options['method'],
                 ged_options['attr_distance'], mpg_options['fit_method'],
-                num_graphs, dataset.targets[0],
+                num_graphs, target,
                 results['sod_set_median'], results['sod_gen_median'],
                 results['k_dis_set_median'], results['k_dis_gen_median'],
                 results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
@@ -175,17 +212,18 @@
         # save median graphs.
         if save_medians:
-            fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
+            print('Saving median graphs to files...')
+            fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
             saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
-                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
                     node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
-            fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
+            fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
             saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
-                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
                     node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
-            fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
+            fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
             saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default',
-                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
                     node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
 
         # plot median graphs.
@@ -194,7 +232,9 @@
             draw_Letter_graph(mpg.set_median, fn_pre_sm)
             draw_Letter_graph(mpg.gen_median, fn_pre_gm)
             draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)
-
+
+        if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
+            gram_matrix_unnorm_list.append(mpg.gram_matrix_unnorm)
 
     # write result summary for each letter.
     if save_results:
@@ -227,6 +267,18 @@
             num_converged, num_updates_ecc_mean])
         f_summary.close()
 
+    # save total pairwise kernel distances.
+    dis_k_max = np.max(dis_k_max_list)
+    dis_k_min = np.min(dis_k_min_list)
+    dis_k_mean = np.mean(dis_k_mean_list)
+    print('The maximum pairwise distance in kernel space:', dis_k_max)
+    print('The minimum pairwise distance in kernel space:', dis_k_min)
+    print('The average pairwise distance in kernel space:', dis_k_mean)
+
+    # write Gram matrices to file.
+    if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
+        np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=time_precompute_gm_list)
+
     print('\ncomplete.')
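The block above caches one unnormalized Gram matrix per target class, plus the precompute times, in a single `.npz` archive so that later runs can pass them back through `load_gm`. A minimal round-trip sketch (toy matrices, hypothetical dataset name):

```python
import numpy as np

# Write: one unnormalized Gram matrix per class, plus compute times.
gram_matrix_unnorm_list = [np.eye(3), np.eye(4)]  # toy matrices
run_time_list = [1.2, 3.4]
np.savez('gram_matrix_unnorm.MUTAG.structuralspkernel.gm',  # .npz is appended
         gram_matrix_unnorm_list=gram_matrix_unnorm_list,
         run_time_list=run_time_list)

# Read back, as generate_median_preimages_by_class does with load_gm='auto'.
gmfile = np.load('gram_matrix_unnorm.MUTAG.structuralspkernel.gm.npz',
                 allow_pickle=True)  # needed for object arrays of matrices
grams = gmfile['gram_matrix_unnorm_list']
times = gmfile['run_time_list'].tolist()
```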
@@ -235,7 +287,7 @@ def __init_output_file(ds_name, gkernel, fit_method, dir_output):
     fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
     f_detail = open(dir_output + fn_output_detail, 'a')
     csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost',
-        'GED method', 'attr distance', 'fit method', 'k',
+        'GED method', 'attr distance', 'fit method', 'num graphs',
         'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
         'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
         'dis_k gi -> GM', 'edit cost constants', 'time precompute gm',
@@ -247,7 +299,7 @@ def __init_output_file(ds_name, gkernel, fit_method, dir_output):
     fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
     f_summary = open(dir_output + fn_output_summary, 'a')
     csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
-        'GED method', 'attr distance', 'fit method', 'k',
+        'GED method', 'attr distance', 'fit method', 'num graphs',
         'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
         'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
         'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
@@ -263,24 +315,28 @@ def __init_output_file(ds_name, gkernel, fit_method, dir_output):
 
 
 def get_relations(sign):
-    if sign == -1:
-        return 'better'
-    elif sign == 0:
-        return 'same'
-    elif sign == 1:
-        return 'worse'
+    if sign == -1:
+        return 'better'
+    elif sign == 0:
+        return 'same'
+    elif sign == 1:
+        return 'worse'
 
 
# Draw the current median.
 def draw_Letter_graph(graph, file_prefix):
-    plt.figure()
-    pos = {}
-    for n in graph.nodes:
-        pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
-    nx.draw_networkx(graph, pos)
-    plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
-#    plt.show()
-    plt.clf()
+    import matplotlib
+    matplotlib.use('agg')
+    import matplotlib.pyplot as plt
+    plt.figure()
+    pos = {}
+    for n in graph.nodes:
+        pos[n] = np.array([float(graph.nodes[n]['x']),float(graph.nodes[n]['y'])])
+    nx.draw_networkx(graph, pos)
+    plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
+#    plt.show()
+    plt.clf()
+    plt.close()
 
 
 def remove_edges(Gn):
@@ -288,6 +344,7 @@
     for _, _, attrs in G.edges(data=True):
         attrs.clear()
 
+
 def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
     term1 = Kmatrix[idx_g, idx_g]
     term2 = 0
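The rewritten `draw_Letter_graph` imports matplotlib lazily, forces the non-interactive `agg` backend so saving figures works on headless machines, and closes the figure to avoid accumulating state across calls. The general pattern, as a sketch independent of gklearn:

```python
def save_plot_headless(xs, ys, out_path):
    # Select the non-interactive backend before pyplot is imported;
    # this lets the function run on machines without a display.
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt

    plt.figure()
    plt.plot(xs, ys)
    plt.savefig(out_path, format='eps', dpi=300)
    plt.clf()
    plt.close()  # release the figure so repeated calls don't accumulate

save_plot_headless([0, 1, 2], [0, 1, 4], 'example.eps')
```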
diff --git a/gklearn/utils/__init__.py b/gklearn/utils/__init__.py
index 84d54f3..d5301c6 100644
--- a/gklearn/utils/__init__.py
+++ b/gklearn/utils/__init__.py
@@ -17,3 +17,5 @@ __date__ = "November 2017"
 
# from utils import utils
 from gklearn.utils.dataset import Dataset, split_dataset_by_target
 from gklearn.utils.timer import Timer
+from gklearn.utils.utils import get_graph_kernel_by_name
+from gklearn.utils.utils import compute_gram_matrices_by_class
diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py
index 08e2718..b36205d 100644
--- a/gklearn/utils/dataset.py
+++ b/gklearn/utils/dataset.py
@@ -56,9 +56,10 @@ class Dataset(object):
 
 
     def load_graphs(self, graphs, targets=None):
+        # this has to be followed by set_labels().
         self.__graphs = graphs
         self.__targets = targets
-        self.set_labels_attrs()
+#        self.set_labels_attrs()
 
 
     def load_predefined_dataset(self, ds_name):
@@ -94,6 +95,13 @@ class Dataset(object):
         self.set_labels_attrs()
 
 
+    def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
+        self.__node_labels = node_labels
+        self.__node_attrs = node_attrs
+        self.__edge_labels = edge_labels
+        self.__edge_attrs = edge_attrs
+
 
     def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
         # @todo: remove labels which have only one possible values.
@@ -371,9 +379,34 @@ class Dataset(object):
         print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))
 
 
+    def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
+        for g in self.__graphs:
+            for nd in g.nodes():
+                for nl in node_labels:
+                    del g.nodes[nd][nl]
+                for na in node_attrs:
+                    del g.nodes[nd][na]
+            for ed in g.edges():
+                for el in edge_labels:
+                    del g.edges[ed][el]
+                for ea in edge_attrs:
+                    del g.edges[ed][ea]
+        if len(node_labels) > 0:
+            self.__node_labels = [nl for nl in self.__node_labels if nl not in node_labels]
+        if len(edge_labels) > 0:
+            self.__edge_labels = [el for el in self.__edge_labels if el not in edge_labels]
+        if len(node_attrs) > 0:
+            self.__node_attrs = [na for na in self.__node_attrs if na not in node_attrs]
+        if len(edge_attrs) > 0:
+            self.__edge_attrs = [ea for ea in self.__edge_attrs if ea not in edge_attrs]
+
+
     def cut_graphs(self, range_):
         self.__graphs = [self.__graphs[i] for i in range_]
-        self.set_labels_attrs()
+        if self.__targets is not None:
+            self.__targets = [self.__targets[i] for i in range_]
+        # @todo
+#        self.set_labels_attrs()
 
 
     def __get_dataset_size(self):
@@ -574,5 +607,6 @@ def split_dataset_by_target(dataset):
         sub_graphs = [graphs[i] for i in val]
         sub_dataset = Dataset()
         sub_dataset.load_graphs(sub_graphs, [key] * len(val))
+        sub_dataset.set_labels(node_labels=dataset.node_labels, node_attrs=dataset.node_attrs, edge_labels=dataset.edge_labels, edge_attrs=dataset.edge_attrs)
         datasets.append(sub_dataset)
     return datasets
\ No newline at end of file
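Because `load_graphs` no longer infers label names, it must be followed by `set_labels`, as the added comment notes and as `split_dataset_by_target` now does. A hypothetical usage sketch with toy labels:

```python
import networkx as nx
from gklearn.utils import Dataset

g = nx.Graph()
g.add_node(0, atom='C')
g.add_node(1, atom='O')
g.add_edge(0, 1, bond_type='1')

ds = Dataset()
ds.load_graphs([g], targets=[1])
# load_graphs no longer calls set_labels_attrs(), so declare the
# label names explicitly (here: one node label, one edge label).
ds.set_labels(node_labels=['atom'], edge_labels=['bond_type'])
```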
diff --git a/gklearn/utils/utils.py b/gklearn/utils/utils.py
index 9a1e56c..2a95ea2 100644
--- a/gklearn/utils/utils.py
+++ b/gklearn/utils/utils.py
@@ -296,3 +296,59 @@ def get_edge_labels(Gn, edge_label):
     for G in Gn:
         el = el | set(nx.get_edge_attributes(G, edge_label).values())
     return el
+
+
+def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}):
+    if name == 'structuralspkernel':
+        from gklearn.kernels import StructuralSP
+        graph_kernel = StructuralSP(node_labels=node_labels, edge_labels=edge_labels,
+                             node_attrs=node_attrs, edge_attrs=edge_attrs,
+                             ds_infos=ds_infos, **kernel_options)
+    return graph_kernel
+
+
+def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save='', irrelevant_labels=None):
+    from gklearn.utils import Dataset, split_dataset_by_target
+
+    # 1. get dataset.
+    print('1. getting dataset...')
+    dataset_all = Dataset()
+    dataset_all.load_predefined_dataset(ds_name)
+    if irrelevant_labels is not None:
+        dataset_all.remove_labels(**irrelevant_labels)
+#    dataset_all.cut_graphs(range(0, 10))
+    datasets = split_dataset_by_target(dataset_all)
+
+    gram_matrix_unnorm_list = []
+    run_time_list = []
+
+    print('start computing Gram matrices for each class of target...')
+    for idx, dataset in enumerate(datasets):
+        target = dataset.targets[0]
+        print('\ntarget =', target, '\n')
+
+        # 2. initialize graph kernel.
+        print('2. initializing graph kernel and setting parameters...')
+        graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
+                                node_labels=dataset.node_labels,
+                                edge_labels=dataset.edge_labels,
+                                node_attrs=dataset.node_attrs,
+                                edge_attrs=dataset.edge_attrs,
+                                ds_infos=dataset.get_dataset_infos(keys=['directed']),
+                                kernel_options=kernel_options)
+
+        # 3. compute gram matrix.
+        print('3. computing gram matrix...')
+        gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
+        gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm
+
+        gram_matrix_unnorm_list.append(gram_matrix_unnorm)
+        run_time_list.append(run_time)
+
+    # 4. save results.
+    print()
+    print('4. saving results...')
+    if save_results:
+        np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list)
+
+    print('\ncomplete.')
\ No newline at end of file
diff --git a/notebooks/tests/test_tqdm.py b/notebooks/tests/test_tqdm.py
new file mode 100644
index 0000000..e408760
--- /dev/null
+++ b/notebooks/tests/test_tqdm.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 3 10:38:59 2020
+
+@author: ljia
+"""
+
+from tqdm import tqdm
+import sys
+
+print('start')
+
+for i in tqdm(range(10000000), file=sys.stdout):
+    x = i
+#    print(x)
+# =============================================================================
+# summary
+# terminal, IPython 7.0.1 (Spyder 4): works.
+# write to file: does not work. The progress bar splits as the run progresses.
+# Jupyter:
+# =============================================================================
+
+# for i in tqdm(range(10000000)):
+#     x = i
+#     print(x)
+# =============================================================================
+# summary
+# terminal, IPython 7.0.1 (Spyder 4): does not work. When combined with other
+# prints, the progress bar splits.
+# write to file: does not work. The progress bar cannot be written to a file.
+# Jupyter:
+# =============================================================================
diff --git a/requirements.txt b/requirements.txt
index a48620b..85aabf8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,10 @@
-numpy==1.15.2
-scipy==1.1.0
-matplotlib==3.0.0
-networkx==2.2
-scikit-learn==0.20.0
-tabulate==0.8.2
-tqdm==4.26.0
+numpy>=1.15.2
+scipy>=1.1.0
+matplotlib>=3.0.0
+networkx>=2.2
+scikit-learn>=0.20.0
+tabulate>=0.8.2
+tqdm>=4.26.0
+# cvxpy # for preimage.
+# cvxopt # for preimage.
+# mosek # for preimage.