diff --git a/README.md b/README.md index e31046a..491a1ed 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![Documentation Status](https://readthedocs.org/projects/graphkit-learn/badge/?version=master)](https://graphkit-learn.readthedocs.io/en/master/?badge=master)
 [![PyPI version](https://badge.fury.io/py/graphkit-learn.svg)](https://badge.fury.io/py/graphkit-learn)
-A python package for graph kernels.
+A Python package for graph kernels, graph edit distances, and the graph pre-image problem.
 ## Requirements
@@ -105,7 +105,7 @@ A comparison of performances of graph kernels on benchmark datasets can be found
 ## Authors
-* [Linlin Jia](https://github.com/jajupmochi), LITIS, INSA Rouen Normandie
+* [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie
 * [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie
 * [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie
diff --git a/gklearn/ged/median/median_graph_estimator.py b/gklearn/ged/median/median_graph_estimator.py index 70651a9..84cd64d 100644
--- a/gklearn/ged/median/median_graph_estimator.py
+++ b/gklearn/ged/median/median_graph_estimator.py
@@ -348,7 +348,7 @@ class MedianGraphEstimator(object):
 # Print information about current iteration.
 if self.__print_to_stdout == 2:
- progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout)
+ progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout)
 # Compute node maps and sum of distances for initial median.
 self.__sum_of_distances = 0
@@ -457,7 +457,7 @@ class MedianGraphEstimator(object):
 self.__itrs[median_pos] += 1
 # Update the best median.
- if self.__sum_of_distances < self.__best_init_sum_of_distances:
+ if self.__sum_of_distances < best_sum_of_distances:
 best_sum_of_distances = self.__sum_of_distances
 node_maps_from_best_median = self.__node_maps_from_median
 best_median = median
@@ -588,7 +588,7 @@ class MedianGraphEstimator(object):
 # Print information about current iteration.
 if self.__print_to_stdout == 2:
- progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout)
+ progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout)
 # Compute the medoid.
 medoid_id = graph_ids[0]
@@ -718,7 +718,7 @@ class MedianGraphEstimator(object):
 def __update_node_maps(self):
 # Print information about current iteration.
 if self.__print_to_stdout == 2:
- progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
+ progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
 # Update the node maps.
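A note on the three tqdm changes above: tqdm manages carriage returns itself, so a '\r' embedded in desc is printed verbatim and garbles the progress line. A minimal sketch of the corrected usage (the loop body is a placeholder):

```python
import sys
from tqdm import tqdm

# desc should be plain text; tqdm handles cursor movement on its own.
progress = tqdm(desc='Computing initial node maps', total=10, file=sys.stdout)
for _ in range(10):
    progress.update(1)  # stands in for computing one node map
progress.close()
```

The same file also gets a bookkeeping fix in the hunk at line 457: the comparison now uses the running best_sum_of_distances rather than the initial one, so later improvements are no longer discarded. The estimator hunk resumes below, before the diff moves on to gklearn/ged/util/util.py.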
node_maps_were_modified = False diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index b58e945..2ff0103 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -307,7 +307,7 @@ def ged_options_to_string(options): opt_str = ' ' for key, val in options.items(): if key == 'initialization_method': - opt_str += '--initial_solutions ' + str(val) + ' ' + opt_str += '--initialization-method ' + str(val) + ' ' elif key == 'initialization_options': opt_str += '--initialization-options ' + str(val) + ' ' elif key == 'lower_bound_method': diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index e71abb3..e703981 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -76,11 +76,11 @@ class GraphKernel(object): def compute_distance_matrix(self): - dis_mat = np.empty((len(self._graphs), len(self._graphs))) if self._gram_matrix is None: raise Exception('Please compute the Gram matrix before computing distance matrix.') - for i in range(len(self._graphs)): - for j in range(i, len(self._graphs)): + dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix))) + for i in range(len(self._gram_matrix)): + for j in range(i, len(self._gram_matrix)): dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j] if dis < 0: if dis > -1e-10: @@ -184,18 +184,22 @@ class GraphKernel(object): def parallel(self): return self._parallel + @property def n_jobs(self): return self._n_jobs + @property def verbose(self): return self._verbose + @property def normalize(self): return self._normalize + @property def run_time(self): return self._run_time @@ -205,7 +209,15 @@ class GraphKernel(object): def gram_matrix(self): return self._gram_matrix + @gram_matrix.setter + def gram_matrix(self, value): + self._gram_matrix = value + @property def gram_matrix_unnorm(self): return self._gram_matrix_unnorm + + @gram_matrix_unnorm.setter + def gram_matrix_unnorm(self, value): + self._gram_matrix_unnorm = value \ No newline at end of file diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py index 89ef7b0..b10a598 100644 --- a/gklearn/preimage/median_preimage_generator.py +++ b/gklearn/preimage/median_preimage_generator.py @@ -17,6 +17,7 @@ from gklearn.ged.util import compute_geds, ged_options_to_string from gklearn.ged.median import MedianGraphEstimator from gklearn.ged.median import constant_node_costs,mge_options_to_string from gklearn.gedlib import librariesImport, gedlibpy +from gklearn.utils import Timer # from gklearn.utils.dataset import Dataset class MedianPreimageGenerator(PreimageGenerator): @@ -29,24 +30,34 @@ class MedianPreimageGenerator(PreimageGenerator): self.__mge_options = {} self.__fit_method = 'k-graphs' self.__init_ecc = None - self.__max_itrs = 100 self.__parallel = True self.__n_jobs = multiprocessing.cpu_count() self.__ds_name = None + self.__time_limit_in_sec = 0 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__epsilon_residual = 0.01 + self.__epsilon_ec = 0.1 # values to compute. 
- self.__edit_cost_constants = [] - self.__runtime_precompute_gm = None self.__runtime_optimize_ec = None self.__runtime_generate_preimage = None self.__runtime_total = None self.__set_median = None self.__gen_median = None + self.__best_from_dataset = None self.__sod_set_median = None self.__sod_gen_median = None self.__k_dis_set_median = None self.__k_dis_gen_median = None self.__k_dis_dataset = None - + self.__itrs = 0 + self.__converged = False + self.__num_updates_ecc = 0 + # values that can be set or to be computed. + self.__edit_cost_constants = [] + self.__gram_matrix_unnorm = None + self.__runtime_precompute_gm = None + def set_options(self, **kwargs): self._kernel_options = kwargs.get('kernel_options', {}) @@ -57,10 +68,16 @@ class MedianPreimageGenerator(PreimageGenerator): self.__fit_method = kwargs.get('fit_method', 'k-graphs') self.__init_ecc = kwargs.get('init_ecc', None) self.__edit_cost_constants = kwargs.get('edit_cost_constants', []) - self.__max_itrs = kwargs.get('max_itrs', 100) self.__parallel = kwargs.get('parallel', True) self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) self.__ds_name = kwargs.get('ds_name', None) + self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0) + self.__max_itrs = kwargs.get('max_itrs', 100) + self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3) + self.__epsilon_residual = kwargs.get('epsilon_residual', 0.01) + self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1) + self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None) + self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None) def run(self): @@ -70,12 +87,20 @@ class MedianPreimageGenerator(PreimageGenerator): start = time.time() # 1. precompute gram matrix. - gram_matrix, run_time = self.__graph_kernel.compute(self._dataset.graphs, **self._kernel_options) - end_precompute_gm = time.time() - self.__runtime_precompute_gm = end_precompute_gm - start + if self.__gram_matrix_unnorm is None: + gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options) + self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm + end_precompute_gm = time.time() + self.__runtime_precompute_gm = end_precompute_gm - start + else: + if self.__runtime_precompute_gm is None: + raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.') + self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm + self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm)) + end_precompute_gm = time.time() + start -= self.__runtime_precompute_gm # 2. optimize edit cost constants. 
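Step 1 either computes the Gram matrix from scratch or restores a pre-computed one: in the cached branch the unnormalized matrix is set on the kernel object, a normalized copy is rebuilt from it, and start is shifted back by runtime_precompute_gm, presumably so that the total runtime reported later still accounts for the cached pre-computation. normalize_gm itself is not part of this diff; a minimal sketch of the cosine normalization it presumably performs, consistent with the normalization loop in gklearn/preimage/utils.py further down:

```python
import numpy as np

def normalize_gm(gram_matrix):
    # Cosine normalization: K[i, j] <- K[i, j] / sqrt(K[i, i] * K[j, j]),
    # which maps every diagonal entry (a graph's self-similarity) to 1.
    diag = gram_matrix.diagonal().copy()
    for i in range(len(gram_matrix)):
        for j in range(i, len(gram_matrix)):
            gram_matrix[i, j] /= np.sqrt(diag[i] * diag[j])
            gram_matrix[j, i] = gram_matrix[i, j]
    return gram_matrix
```

The hunk continues with step 2, the edit cost optimization: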
-# self.__optimize_edit_cost_constants(dataset=dataset, Gn=Gn, Kmatrix_median=Kmatrix_median)
 self.__optimize_edit_cost_constants()
 end_optimize_ec = time.time()
 self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm
@@ -108,28 +133,48 @@ class MedianPreimageGenerator(PreimageGenerator):
 if self._verbose:
 print()
 print('================================================================================')
- print('The optimized edit cost constants: ', self.__edit_cost_constants)
- print('SOD of the set median: ', self.__sod_set_median)
- print('SOD of the generalized median: ', self.__sod_gen_median)
+ print('Finished generalization of preimages.')
+ print('--------------------------------------------------------------------------------')
+ print('The optimized edit cost constants:', self.__edit_cost_constants)
+ print('SOD of the set median:', self.__sod_set_median)
+ print('SOD of the generalized median:', self.__sod_gen_median)
 print('Distance in kernel space for set median:', self.__k_dis_set_median)
 print('Distance in kernel space for generalized median:', self.__k_dis_gen_median)
 print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
- print('Time to pre-compute Gram matrix: ', self.__runtime_precompute_gm)
- print('Time to optimize edit costs: ', self.__runtime_optimize_ec)
- print('Time to generate pre-images: ', self.__runtime_generate_preimage)
- print('Total time: ', self.__runtime_total)
+ print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
+ print('Time to optimize edit costs:', self.__runtime_optimize_ec)
+ print('Time to generate pre-images:', self.__runtime_generate_preimage)
+ print('Total time:', self.__runtime_total)
+ print('Total number of iterations for optimizing:', self.__itrs)
+ print('Total number of edit cost updates:', self.__num_updates_ecc)
+ print('Optimization of edit costs converged:', self.__converged)
 print('================================================================================')
+ print()
-
-
- # collect return values.
# return (sod_sm, sod_gm), \
# (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \
# (time_fitting, time_generating)
+
+ def get_results(self):
+ results = {}
+ results['edit_cost_constants'] = self.__edit_cost_constants
+ results['runtime_precompute_gm'] = self.__runtime_precompute_gm
+ results['runtime_optimize_ec'] = self.__runtime_optimize_ec
+ results['runtime_generate_preimage'] = self.__runtime_generate_preimage
+ results['runtime_total'] = self.__runtime_total
+ results['sod_set_median'] = self.__sod_set_median
+ results['sod_gen_median'] = self.__sod_gen_median
+ results['k_dis_set_median'] = self.__k_dis_set_median
+ results['k_dis_gen_median'] = self.__k_dis_gen_median
+ results['k_dis_dataset'] = self.__k_dis_dataset
+ results['itrs'] = self.__itrs
+ results['converged'] = self.__converged
+ results['num_updates_ecc'] = self.__num_updates_ecc
+ return results
+
-# def __optimize_edit_cost_constants(self, dataset=None, Gn=None, Kmatrix_median=None):
 def __optimize_edit_cost_constants(self):
 """Fit edit cost constants. """
@@ -177,8 +222,6 @@ class MedianPreimageGenerator(PreimageGenerator):
 self.__init_ecc = [3, 3, 1, 3, 3, 1]
 # optimize on the k-graph subset.
 self.__optimize_ecc_by_kernel_distances()
-# fit_GED_to_kernel_distance(Gn_median,
-# dataset=dataset, Kmatrix=Kmatrix_median)
 elif self.__fit_method == 'whole-dataset':
 if self.__init_ecc is None:
 if self.__ged_options['edit_cost'] == 'LETTER':
 self.__init_ecc = [0.9, 1.7, 0.75]
 elif self.__ged_options['edit_cost'] == 'LETTER2':
 self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425]
 else:
 self.__init_ecc = [3, 3, 1, 3, 3, 1]
 # optimize on the whole set.
 self.__optimize_ecc_by_kernel_distances()
-# fit_GED_to_kernel_distance(Gn, dataset=dataset)
 elif self.__fit_method == 'precomputed':
 pass
 
- def __optimize_ecc_by_kernel_distances(self):
-# def fit_GED_to_kernel_distance(Gn, Kmatrix=None,
-# parallel=True):
- 
+ def __optimize_ecc_by_kernel_distances(self): 
 # compute distances in feature space.
- dis_k_mat, _, _, _ = self.__graph_kernel.compute_distance_matrix()
+ dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix()
 dis_k_vec = []
 for i in range(len(dis_k_mat)):
# for j in range(i, len(dis_k_mat)):
@@ -222,20 +261,25 @@ class MedianPreimageGenerator(PreimageGenerator):
 nb_cost_mat = np.array(n_edit_operations)
 nb_cost_mat_list = [nb_cost_mat]
 if self._verbose >= 2:
- print('edit_cost_constants:', self.__edit_cost_constants)
- print('residual_list:', residual_list)
-
- for itr in range(self.__max_itrs):
+ print('Current edit cost constants:', self.__edit_cost_constants)
+ print('Residual list:', residual_list)
+
+ # run iteration from initial edit costs.
+ self.__converged = False
+ itrs_without_update = 0
+ self.__itrs = 0
+ self.__num_updates_ecc = 0
+ timer = Timer(self.__time_limit_in_sec)
+ while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
 if self._verbose >= 2:
- print('\niteration', itr)
+ print('\niteration', self.__itrs + 1)
 time0 = time.time()
- # "fit" geds to distances in feature space by tuning edit costs using the
- Least Squares Method.
- np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
- nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
- n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
- ged_mat=ged_mat)
- self.__edit_cost_constants, residual = self.__update_ecc(nb_cost_mat, dis_k_vec)
+ # "fit" GEDs to distances in feature space by tuning edit costs using the Least Squares Method.
+# np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
+# nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
+# n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
+# ged_mat=ged_mat)
+ self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec)
 for i in range(len(self.__edit_cost_constants)):
 if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
 self.__edit_cost_constants[i] = 0
@@ -254,12 +298,59 @@ class MedianPreimageGenerator(PreimageGenerator):
 edit_cost_list.append(self.__edit_cost_constants)
 nb_cost_mat = np.array(n_edit_operations)
 nb_cost_mat_list.append(nb_cost_mat)
+
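Each pass of this loop re-fits the edit cost constants so that the GEDs they induce approximate the kernel-space distances. __update_ecc itself is outside this hunk; assuming its default rw_constraints='inequality' wraps a constrained least-squares solve, a minimal unconstrained sketch of the step it performs:

```python
import numpy as np

def update_ecc_sketch(nb_cost_mat, dis_k_vec):
    # nb_cost_mat[p, k] counts how often edit operation k occurs in the node
    # map of graph pair p; dis_k_vec[p] is that pair's kernel-space distance.
    # Fit costs w by minimizing ||N w - d||_2; the real __update_ecc solves a
    # constrained variant of this least-squares problem.
    N = np.asarray(nb_cost_mat, dtype=float)
    d = np.asarray(dis_k_vec, dtype=float)
    w, _, _, _ = np.linalg.lstsq(N, d, rcond=None)
    residual = np.linalg.norm(N @ w - d)
    return w, residual
```

The loop then decides whether the costs have stabilized: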
+ # check convergence.
+ ec_changed = False
+ for i, cost in enumerate(self.__edit_cost_constants):
+ if cost == 0:
+ if edit_cost_list[-2][i] > self.__epsilon_ec:
+ ec_changed = True
+ break
+ elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ec:
+ ec_changed = True
+ break
+# if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec:
+# ec_changed = True
+# break
+ residual_changed = False
+ if residual_list[-1] == 0:
+ if residual_list[-2] > self.__epsilon_residual:
+ residual_changed = True
+ elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_residual:
+ residual_changed = True
+ self.__converged = not (ec_changed or residual_changed)
+ if self.__converged:
+ itrs_without_update += 1
+ else:
+ itrs_without_update = 0
+ self.__num_updates_ecc += 1
+
+ # print the current state.
 if self._verbose >= 2:
- print('edit_cost_constants:', self.__edit_cost_constants)
- print('residual_list:', residual_list)
-
-# return residual_list, edit_cost_list, dis_k_mat, ged_mat, \
-# time_list, nb_cost_mat_list
+ print()
+ print('-------------------------------------------------------------------------')
+ print('State of iteration', self.__itrs + 1)
+ print('-------------------------------------------------------------------------')
+# print('Time spent:', self.__runtime_optimize_ec)
+ print('Total number of iterations for optimizing:', self.__itrs + 1)
+ print('Total number of edit cost updates:', self.__num_updates_ecc)
+ print('Optimization of edit costs converged:', self.__converged)
+ print('Did edit costs change:', ec_changed)
+ print('Did residual change:', residual_changed)
+ print('Iterations without update:', itrs_without_update)
+ print('Current edit cost constants:', self.__edit_cost_constants)
+ print('Residual list:', residual_list)
+ print('-------------------------------------------------------------------------')
+
+ self.__itrs += 1
+
+
+ def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
+ if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
+# if self.__state == AlgorithmState.TERMINATED:
+# self.__state = AlgorithmState.INITIALIZED
+ return True
+ return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)

 def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'):
@@ -559,11 +650,11 @@ class MedianPreimageGenerator(PreimageGenerator):
 def __compute_distances_to_true_median(self):
 # compute distance in kernel space for set median.
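The termination criterion above combines a wall-clock limit with the iteration caps configured in set_options; negative caps disable the corresponding check. Timer comes from gklearn.utils and its implementation is not part of this diff; a minimal stand-in with the behavior the loop relies on (never expiring for the default time_limit_in_sec=0) could look like:

```python
import time

class Timer(object):
    # Minimal stand-in for gklearn.utils.Timer as used above; an assumption,
    # not the library's actual code.
    def __init__(self, time_limit_in_sec):
        self._time_limit_in_sec = time_limit_in_sec
        self._start = time.time()

    def expired(self):
        # A non-positive limit means "no time limit".
        if self._time_limit_in_sec <= 0:
            return False
        return time.time() - self._start > self._time_limit_in_sec
```

The diff then resumes in __compute_distances_to_true_median, whose first hunk is introduced by the comment just above.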
- kernels_to_sm, _ = self.__graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options) - kernel_sm, _ = self.__graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options) - kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize + kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options) + kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options) + kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize # @todo: not correct kernel value - gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0) + gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0) gram_with_sm = np.concatenate((np.array([[1] + kernels_to_sm]).T, gram_with_sm), axis=1) self.__k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)), [1 / len(self._dataset.graphs)] * len(self._dataset.graphs), @@ -574,10 +665,10 @@ class MedianPreimageGenerator(PreimageGenerator): # print(set_median.edges(data=True)) # compute distance in kernel space for generalized median. - kernels_to_gm, _ = self.__graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options) - kernel_gm, _ = self.__graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options) - kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__graph_kernel.gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize - gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self.__graph_kernel.gram_matrix)), axis=0) + kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options) + kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options) + kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize + gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0) gram_with_gm = np.concatenate((np.array([[1] + kernels_to_gm]).T, gram_with_gm), axis=1) self.__k_dis_gen_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)), [1 / len(self._dataset.graphs)] * len(self._dataset.graphs), @@ -591,6 +682,7 @@ class MedianPreimageGenerator(PreimageGenerator): gram_with_gm, withterm3=False)) idx_k_dis_median_set_min = np.argmin(k_dis_median_set) self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min] + self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy() if self._verbose >= 2: print() @@ -599,18 +691,16 @@ class MedianPreimageGenerator(PreimageGenerator): print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset) print('distance in kernel space for each graph in median set:', k_dis_median_set) -# return dis_k_sm, dis_k_gm, k_dis_median_set, dis_k_gi_min, idx_dis_k_gi_min - def __set_graph_kernel_by_name(self): if self.kernel_options['name'] == 'structuralspkernel': from gklearn.kernels import StructuralSP - self.__graph_kernel = StructuralSP(node_labels=self.dataset.node_labels, - edge_labels=self.dataset.edge_labels, - node_attrs=self.dataset.node_attrs, - 
edge_attrs=self.dataset.edge_attrs,
- ds_infos=self.dataset.get_dataset_infos(keys=['directed']),
- **self.kernel_options)
+ self._graph_kernel = StructuralSP(node_labels=self._dataset.node_labels,
+ edge_labels=self._dataset.edge_labels,
+ node_attrs=self._dataset.node_attrs,
+ edge_attrs=self._dataset.edge_attrs,
+ ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
+ **self._kernel_options)
# def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
@@ -618,7 +708,7 @@ class MedianPreimageGenerator(PreimageGenerator):
 """ Cleans node and edge labels and attributes of the given graph.
 """
- G_new = nx.Graph()
+ G_new = nx.Graph(**G.graph)
 for nd, attrs in G.nodes(data=True):
 G_new.add_node(str(nd)) # @todo: should we keep this as str()?
 for l_name in self._dataset.node_labels:
@@ -670,5 +760,29 @@ class MedianPreimageGenerator(PreimageGenerator):
 return self.__init_ecc
 @init_ecc.setter
- def fit_method(self, value):
- self.__init_ecc = value
\ No newline at end of file
+ def init_ecc(self, value):
+ self.__init_ecc = value
+
+
+ @property
+ def set_median(self):
+ return self.__set_median
+
+
+ @property
+ def gen_median(self):
+ return self.__gen_median
+
+
+ @property
+ def best_from_dataset(self):
+ return self.__best_from_dataset
+
+
+ @property
+ def gram_matrix_unnorm(self):
+ return self.__gram_matrix_unnorm
+
+ @gram_matrix_unnorm.setter
+ def gram_matrix_unnorm(self, value):
+ self.__gram_matrix_unnorm = value
\ No newline at end of file
diff --git a/gklearn/preimage/preimage_generator.py b/gklearn/preimage/preimage_generator.py index d74f41c..306eb12 100644
--- a/gklearn/preimage/preimage_generator.py
+++ b/gklearn/preimage/preimage_generator.py
@@ -5,7 +5,7 @@ Created on Thu Mar 26 18:26:36 2020
 @author: ljia
 """
-from gklearn.utils import Dataset
+# from gklearn.utils import Dataset
 class PreimageGenerator(object):
@@ -32,6 +32,11 @@ class PreimageGenerator(object):
 @kernel_options.setter
 def kernel_options(self, value):
 self._kernel_options = value
+
+
+ @property
+ def graph_kernel(self):
+ return self._graph_kernel
 @property
@@ -41,3 +46,4 @@ class PreimageGenerator(object):
 @verbose.setter
 def verbose(self, value):
 self._verbose = value
+
diff --git a/gklearn/preimage/test_median_preimage_generator.py b/gklearn/preimage/test_median_preimage_generator.py index 9b0ccc4..2f458af 100644
--- a/gklearn/preimage/test_median_preimage_generator.py
+++ b/gklearn/preimage/test_median_preimage_generator.py
@@ -20,9 +20,12 @@ def test_median_preimage_generator():
 mpg = MedianPreimageGenerator()
 mpg_options = {'fit_method': 'k-graphs',
 'init_ecc': [3, 3, 1, 3, 3],
- 'max_itrs': 6,
 'ds_name': 'Letter-high',
 'parallel': True,
+ 'time_limit_in_sec': 0,
+ 'max_itrs': 100,
+ 'max_itrs_without_update': 3,
+ 'epsilon_residual': 0.01,
 'verbose': 2}
 mpg.set_options(**mpg_options)
 mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
diff --git a/gklearn/preimage/utils.py b/gklearn/preimage/utils.py index 9fd186d..cbe00a1 100644
--- a/gklearn/preimage/utils.py
+++ b/gklearn/preimage/utils.py
@@ -19,146 +19,465 @@ from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, po
 from gklearn.kernels.structuralspKernel import structuralspkernel
 from gklearn.kernels.treeletKernel import treeletkernel
 from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
+from gklearn.utils import Dataset
+import csv
+import networkx as nx
+
+
+def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options,
+ ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, load_gm='auto', dir_save='', irrelevant_labels=None):
+ import os.path
+ from gklearn.preimage import MedianPreimageGenerator
+ from gklearn.utils import split_dataset_by_target
+ from gklearn.utils.graphfiles import saveGXL
+
+ # 1. get dataset.
+ print('1. getting dataset...')
+ dataset_all = Dataset()
+ dataset_all.load_predefined_dataset(ds_name)
+ if irrelevant_labels is not None:
+ dataset_all.remove_labels(**irrelevant_labels)
+# dataset_all.cut_graphs(range(0, 100))
+ datasets = split_dataset_by_target(dataset_all)
+
+ if save_results:
+ # create result files.
+ print('creating output files...')
+ fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)
+
+ sod_sm_list = []
+ sod_gm_list = []
+ dis_k_sm_list = []
+ dis_k_gm_list = []
+ dis_k_gi_min_list = []
+ time_optimize_ec_list = []
+ time_generate_list = []
+ time_total_list = []
+ itrs_list = []
+ converged_list = []
+ num_updates_ecc_list = []
+ nb_sod_sm2gm = [0, 0, 0]
+ nb_dis_k_sm2gm = [0, 0, 0]
+ nb_dis_k_gi2sm = [0, 0, 0]
+ nb_dis_k_gi2gm = [0, 0, 0]
+ dis_k_max_list = []
+ dis_k_min_list = []
+ dis_k_mean_list = []
+ if load_gm == 'auto':
+ gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
+ gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
+ if gmfile_exist:
+ gmfile = np.load(gm_fname)
+ gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
+ time_precompute_gm_list = gmfile['run_time_list'].tolist()
+ else:
+ gram_matrix_unnorm_list = []
+ time_precompute_gm_list = []
+ elif not load_gm:
+ gram_matrix_unnorm_list = []
+ time_precompute_gm_list = []
+ else:
+ gmfile = np.load(load_gm)
+ gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
+ time_precompute_gm_list = gmfile['run_time_list'].tolist()
+# repeats_better_sod_sm2gm = []
+# repeats_better_dis_k_sm2gm = []
+# repeats_better_dis_k_gi2sm = []
+# repeats_better_dis_k_gi2gm = []
+
+
+ print('start generating preimage for each class of target...')
+ for idx, dataset in enumerate(datasets):
+ target = dataset.targets[0]
+ print('\ntarget =', target, '\n')
+# if target != 1:
+# continue
+
+ num_graphs = len(dataset.graphs)
+ if num_graphs < 2:
+ print('\nnumber of graphs = ', num_graphs, ', skip.\n')
+ continue
+
+ # 2. set parameters.
+ print('2. initializing mpg and setting parameters...')
+ if load_gm:
+ if load_gm != 'auto' or gmfile_exist:
+ mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx]
+ mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx]
+ mpg = MedianPreimageGenerator()
+ mpg.dataset = dataset
+ mpg.set_options(**mpg_options.copy())
+ mpg.kernel_options = kernel_options.copy()
+ mpg.ged_options = ged_options.copy()
+ mpg.mge_options = mge_options.copy()
+
+ # 3. compute median preimage.
+ print('3. computing median preimage...')
+ mpg.run()
+ results = mpg.get_results()
+
+ # 4. compute pairwise kernel distances.
+ print('4. computing pairwise kernel distances...')
+ _, dis_k_max, dis_k_min, dis_k_mean = mpg.graph_kernel.compute_distance_matrix()
+ dis_k_max_list.append(dis_k_max)
+ dis_k_min_list.append(dis_k_min)
+ dis_k_mean_list.append(dis_k_mean)
+
+ # 5. save results (and median graphs).
+ print('5. saving results (and median graphs)...')
+ # write result detail.
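With load_gm='auto', the function caches the per-class unnormalized Gram matrices in a single .npz file under dir_save and reloads them on later runs. A self-contained sketch of that round trip (file name and data are illustrative; same-shaped matrices are used so NumPy stores a plain array rather than an object array):

```python
import numpy as np

# Illustrative data: one unnormalized Gram matrix per target class.
gram_matrix_unnorm_list = [np.eye(3), 2 * np.eye(3)]
time_precompute_gm_list = [0.5, 0.7]

# Save; np.savez appends '.npz' when the name lacks it.
gm_fname = 'gram_matrix_unnorm.Letter-high.structuralspkernel.gm'
np.savez(gm_fname, gram_matrix_unnorm_list=gram_matrix_unnorm_list,
         run_time_list=time_precompute_gm_list)

# Reload, as in the 'auto' branch above.
gmfile = np.load(gm_fname + '.npz')
gram_matrix_unnorm_list = list(gmfile['gram_matrix_unnorm_list'])
time_precompute_gm_list = gmfile['run_time_list'].tolist()
```

The hunk resumes with the detail-row output introduced by the comment above: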
+ if save_results: + print('writing results to files...') + sod_sm2gm = get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median'])) + dis_k_sm2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_set_median'])) + dis_k_gi2sm = get_relations(np.sign(results['k_dis_set_median'] - results['k_dis_dataset'])) + dis_k_gi2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_dataset'])) + + f_detail = open(dir_save + fn_output_detail, 'a') + csv.writer(f_detail).writerow([ds_name, kernel_options['name'], + ged_options['edit_cost'], ged_options['method'], + ged_options['attr_distance'], mpg_options['fit_method'], + num_graphs, target, 1, + results['sod_set_median'], results['sod_gen_median'], + results['k_dis_set_median'], results['k_dis_gen_median'], + results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm, + dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'], + results['runtime_precompute_gm'], results['runtime_optimize_ec'], + results['runtime_generate_preimage'], results['runtime_total'], + results['itrs'], results['converged'], + results['num_updates_ecc']]) + f_detail.close() + + # compute result summary. + sod_sm_list.append(results['sod_set_median']) + sod_gm_list.append(results['sod_gen_median']) + dis_k_sm_list.append(results['k_dis_set_median']) + dis_k_gm_list.append(results['k_dis_gen_median']) + dis_k_gi_min_list.append(results['k_dis_dataset']) + time_precompute_gm_list.append(results['runtime_precompute_gm']) + time_optimize_ec_list.append(results['runtime_optimize_ec']) + time_generate_list.append(results['runtime_generate_preimage']) + time_total_list.append(results['runtime_total']) + itrs_list.append(results['itrs']) + converged_list.append(results['converged']) + num_updates_ecc_list.append(results['num_updates_ecc']) + # # SOD SM -> GM + if results['sod_set_median'] > results['sod_gen_median']: + nb_sod_sm2gm[0] += 1 + # repeats_better_sod_sm2gm.append(1) + elif results['sod_set_median'] == results['sod_gen_median']: + nb_sod_sm2gm[1] += 1 + elif results['sod_set_median'] < results['sod_gen_median']: + nb_sod_sm2gm[2] += 1 + # # dis_k SM -> GM + if results['k_dis_set_median'] > results['k_dis_gen_median']: + nb_dis_k_sm2gm[0] += 1 + # repeats_better_dis_k_sm2gm.append(1) + elif results['k_dis_set_median'] == results['k_dis_gen_median']: + nb_dis_k_sm2gm[1] += 1 + elif results['k_dis_set_median'] < results['k_dis_gen_median']: + nb_dis_k_sm2gm[2] += 1 + # # dis_k gi -> SM + if results['k_dis_dataset'] > results['k_dis_set_median']: + nb_dis_k_gi2sm[0] += 1 + # repeats_better_dis_k_gi2sm.append(1) + elif results['k_dis_dataset'] == results['k_dis_set_median']: + nb_dis_k_gi2sm[1] += 1 + elif results['k_dis_dataset'] < results['k_dis_set_median']: + nb_dis_k_gi2sm[2] += 1 + # # dis_k gi -> GM + if results['k_dis_dataset'] > results['k_dis_gen_median']: + nb_dis_k_gi2gm[0] += 1 + # repeats_better_dis_k_gi2gm.append(1) + elif results['k_dis_dataset'] == results['k_dis_gen_median']: + nb_dis_k_gi2gm[1] += 1 + elif results['k_dis_dataset'] < results['k_dis_gen_median']: + nb_dis_k_gi2gm[2] += 1 + + # write result summary for each letter. 
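Each comparison recorded above collapses to the sign of a difference, which get_relations (defined later in this file) maps to a verdict. A tiny self-contained illustration with made-up values:

```python
import numpy as np

def get_relations(sign):
    # Same mapping as the helper defined later in this file.
    return {-1: 'better', 0: 'same', 1: 'worse'}[sign]

sod_set_median, sod_gen_median = 42.0, 37.5
# A negative sign means the generalized median improved on the set median.
print(get_relations(int(np.sign(sod_gen_median - sod_set_median))))  # 'better'
```

The per-class summary row then follows: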
+ f_summary = open(dir_save + fn_output_summary, 'a')
+ csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
+ ged_options['edit_cost'], ged_options['method'],
+ ged_options['attr_distance'], mpg_options['fit_method'],
+ num_graphs, target,
+ results['sod_set_median'], results['sod_gen_median'],
+ results['k_dis_set_median'], results['k_dis_gen_median'],
+ results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
+ dis_k_gi2sm, dis_k_gi2gm,
+ results['runtime_precompute_gm'], results['runtime_optimize_ec'],
+ results['runtime_generate_preimage'], results['runtime_total'],
+ results['itrs'], results['converged'],
+ results['num_updates_ecc'], nb_sod_sm2gm,
+ nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
+ f_summary.close()
+
+ # save median graphs.
+ if save_medians:
+ print('Saving median graphs to files...')
+ fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
+ saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
+ node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+ node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
+ fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
+ saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
+ node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+ node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
+ fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
+ saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default',
+ node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+ node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
+
+ # plot median graphs.
+ if plot_medians and save_medians:
+ if ds_name in ('Letter-high', 'Letter-med', 'Letter-low'):
+ draw_Letter_graph(mpg.set_median, fn_pre_sm)
+ draw_Letter_graph(mpg.gen_median, fn_pre_gm)
+ draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)
+
+ if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
+ gram_matrix_unnorm_list.append(mpg.gram_matrix_unnorm)
+
+ # write result summary over all classes.
+ if save_results: + sod_sm_mean = np.mean(sod_sm_list) + sod_gm_mean = np.mean(sod_gm_list) + dis_k_sm_mean = np.mean(dis_k_sm_list) + dis_k_gm_mean = np.mean(dis_k_gm_list) + dis_k_gi_min_mean = np.mean(dis_k_gi_min_list) + time_precompute_gm_mean = np.mean(time_precompute_gm_list) + time_optimize_ec_mean = np.mean(time_optimize_ec_list) + time_generate_mean = np.mean(time_generate_list) + time_total_mean = np.mean(time_total_list) + itrs_mean = np.mean(itrs_list) + num_converged = np.sum(converged_list) + num_updates_ecc_mean = np.mean(num_updates_ecc_list) + sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean)) + dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) + dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) + dis_k_gi2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) + f_summary = open(dir_save + fn_output_summary, 'a') + csv.writer(f_summary).writerow([ds_name, kernel_options['name'], + ged_options['edit_cost'], ged_options['method'], + ged_options['attr_distance'], mpg_options['fit_method'], + num_graphs, 'all', + sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, + dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, + dis_k_gi2sm_mean, dis_k_gi2gm_mean, + time_precompute_gm_mean, time_optimize_ec_mean, + time_generate_mean, time_total_mean, itrs_mean, + num_converged, num_updates_ecc_mean]) + f_summary.close() + + # save total pairwise kernel distances. + dis_k_max = np.max(dis_k_max_list) + dis_k_min = np.min(dis_k_min_list) + dis_k_mean = np.mean(dis_k_mean_list) + print('The maximum pairwise distance in kernel space:', dis_k_max) + print('The minimum pairwise distance in kernel space:', dis_k_min) + print('The average pairwise distance in kernel space:', dis_k_mean) + + # write Gram matrices to file. + if (load_gm == 'auto' and not gmfile_exist) or not load_gm: + np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=time_precompute_gm_list) + + print('\ncomplete.') + + +def __init_output_file(ds_name, gkernel, fit_method, dir_output): +# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' + fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' + f_detail = open(dir_output + fn_output_detail, 'a') + csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost', + 'GED method', 'attr distance', 'fit method', 'num graphs', + 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', + 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', + 'dis_k gi -> GM', 'edit cost constants', 'time precompute gm', + 'time optimize ec', 'time generate preimage', 'time total', + 'itrs', 'converged', 'num updates ecc']) + f_detail.close() + +# fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' + fn_output_summary = 'results_summary.' + ds_name + '.' 
+ gkernel + '.csv'
+ f_summary = open(dir_output + fn_output_summary, 'a')
+ csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
+ 'GED method', 'attr distance', 'fit method', 'num graphs',
+ 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
+ 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
+ 'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
+ 'time generate preimage', 'time total', 'itrs', 'num converged',
+ 'num updates ecc', '# SOD SM -> GM', '# dis_k SM -> GM',
+ '# dis_k gi -> SM', '# dis_k gi -> GM'])
+# 'repeats better SOD SM -> GM',
+# 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
+# 'repeats better dis_k gi -> GM'])
+ f_summary.close()
+
+ return fn_output_detail, fn_output_summary
+
+
+def get_relations(sign):
+ if sign == -1:
+ return 'better'
+ elif sign == 0:
+ return 'same'
+ elif sign == 1:
+ return 'worse'
+
+
+# Draw the current median graph.
+def draw_Letter_graph(graph, file_prefix):
+ import matplotlib
+ matplotlib.use('agg')
+ import matplotlib.pyplot as plt
+ plt.figure()
+ pos = {}
+ for n in graph.nodes:
+ pos[n] = np.array([float(graph.nodes[n]['x']),float(graph.nodes[n]['y'])])
+ nx.draw_networkx(graph, pos)
+ plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
+# plt.show()
+ plt.clf()
+ plt.close()

def remove_edges(Gn):
- for G in Gn:
- for _, _, attrs in G.edges(data=True):
- attrs.clear()
-
+ for G in Gn:
+ for _, _, attrs in G.edges(data=True):
+ attrs.clear()
+
+
def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
- term1 = Kmatrix[idx_g, idx_g]
- term2 = 0
- for i, a in enumerate(alpha):
- term2 += a * Kmatrix[idx_g, idx_gi[i]]
- term2 *= 2
- if withterm3 == False:
- for i1, a1 in enumerate(alpha):
- for i2, a2 in enumerate(alpha):
- term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
- return np.sqrt(term1 - term2 + term3)
+ term1 = Kmatrix[idx_g, idx_g]
+ term2 = 0
+ for i, a in enumerate(alpha):
+ term2 += a * Kmatrix[idx_g, idx_gi[i]]
+ term2 *= 2
+ if withterm3 == False:
+ for i1, a1 in enumerate(alpha):
+ for i2, a2 in enumerate(alpha):
+ term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
+ return np.sqrt(term1 - term2 + term3)

def compute_k_dis(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
- term1 = Kmatrix[idx_g, idx_g]
- term2 = 0
- for i, a in enumerate(alpha):
- term2 += a * Kmatrix[idx_g, idx_gi[i]]
- term2 *= 2
- if withterm3 == False:
- for i1, a1 in enumerate(alpha):
- for i2, a2 in enumerate(alpha):
- term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
- return np.sqrt(term1 - term2 + term3)
+ term1 = Kmatrix[idx_g, idx_g]
+ term2 = 0
+ for i, a in enumerate(alpha):
+ term2 += a * Kmatrix[idx_g, idx_gi[i]]
+ term2 *= 2
+ if withterm3 == False:
+ for i1, a1 in enumerate(alpha):
+ for i2, a2 in enumerate(alpha):
+ term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
+ return np.sqrt(term1 - term2 + term3)

def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'):
- if graph_kernel == 'marginalizedkernel':
- Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
- p_quit=0.03, n_iteration=10, remove_totters=False,
- n_jobs=multiprocessing.cpu_count(), verbose=verbose)
- elif graph_kernel == 'untilhpathkernel':
- Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
- depth=7, k_func='MinMax', compute_method='trie',
- parallel=parallel,
- n_jobs=multiprocessing.cpu_count(), verbose=verbose)
- elif graph_kernel == 'spkernel':
- mixkernel =
functools.partial(kernelproduct, deltakernel, gaussiankernel) - Kmatrix = np.empty((len(Gn), len(Gn))) -# Kmatrix[:] = np.nan - Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels= - {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, - n_jobs=multiprocessing.cpu_count(), verbose=verbose) -# for i, row in enumerate(idx): -# for j, col in enumerate(idx): -# Kmatrix[row, col] = Kmatrix_tmp[i, j] - elif graph_kernel == 'structuralspkernel': - mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) - sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, - edge_label=edge_label, node_kernels=sub_kernels, - edge_kernels=sub_kernels, - parallel=parallel, n_jobs=multiprocessing.cpu_count(), - verbose=verbose) - elif graph_kernel == 'treeletkernel': - pkernel = functools.partial(polynomialkernel, d=2, c=1e5) -# pkernel = functools.partial(gaussiankernel, gamma=1e-6) - mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) - Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, - sub_kernel=pkernel, parallel=parallel, - n_jobs=multiprocessing.cpu_count(), verbose=verbose) - elif graph_kernel == 'weisfeilerlehmankernel': - Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label, - height=4, base_kernel='subtree', parallel=None, - n_jobs=multiprocessing.cpu_count(), verbose=verbose) - - # normalization - Kmatrix_diag = Kmatrix.diagonal().copy() - for i in range(len(Kmatrix)): - for j in range(i, len(Kmatrix)): - Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) - Kmatrix[j][i] = Kmatrix[i][j] - return Kmatrix - + if graph_kernel == 'marginalizedkernel': + Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label, + p_quit=0.03, n_iteration=10, remove_totters=False, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) + elif graph_kernel == 'untilhpathkernel': + Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, + depth=7, k_func='MinMax', compute_method='trie', + parallel=parallel, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) + elif graph_kernel == 'spkernel': + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + Kmatrix = np.empty((len(Gn), len(Gn))) +# Kmatrix[:] = np.nan + Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels= + {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) +# for i, row in enumerate(idx): +# for j, col in enumerate(idx): +# Kmatrix[row, col] = Kmatrix_tmp[i, j] + elif graph_kernel == 'structuralspkernel': + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, + edge_label=edge_label, node_kernels=sub_kernels, + edge_kernels=sub_kernels, + parallel=parallel, n_jobs=multiprocessing.cpu_count(), + verbose=verbose) + elif graph_kernel == 'treeletkernel': + pkernel = functools.partial(polynomialkernel, d=2, c=1e5) +# pkernel = functools.partial(gaussiankernel, gamma=1e-6) + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, + sub_kernel=pkernel, parallel=parallel, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) + elif graph_kernel == 
'weisfeilerlehmankernel': + Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label, + height=4, base_kernel='subtree', parallel=None, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) + + # normalization + Kmatrix_diag = Kmatrix.diagonal().copy() + for i in range(len(Kmatrix)): + for j in range(i, len(Kmatrix)): + Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) + Kmatrix[j][i] = Kmatrix[i][j] + return Kmatrix + def gram2distances(Kmatrix): - dmatrix = np.zeros((len(Kmatrix), len(Kmatrix))) - for i1 in range(len(Kmatrix)): - for i2 in range(len(Kmatrix)): - dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2] - dmatrix = np.sqrt(dmatrix) - return dmatrix + dmatrix = np.zeros((len(Kmatrix), len(Kmatrix))) + for i1 in range(len(Kmatrix)): + for i2 in range(len(Kmatrix)): + dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2] + dmatrix = np.sqrt(dmatrix) + return dmatrix def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, - gkernel=None, verbose=True): - dis_mat = np.empty((len(Gn), len(Gn))) - if Kmatrix is None: - Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose) - for i in range(len(Gn)): - for j in range(i, len(Gn)): - dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j] - if dis < 0: - if dis > -1e-10: - dis = 0 - else: - raise ValueError('The distance is negative.') - dis_mat[i, j] = np.sqrt(dis) - dis_mat[j, i] = dis_mat[i, j] - dis_max = np.max(np.max(dis_mat)) - dis_min = np.min(np.min(dis_mat[dis_mat != 0])) - dis_mean = np.mean(np.mean(dis_mat)) - return dis_mat, dis_max, dis_min, dis_mean + gkernel=None, verbose=True): + dis_mat = np.empty((len(Gn), len(Gn))) + if Kmatrix is None: + Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose) + for i in range(len(Gn)): + for j in range(i, len(Gn)): + dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j] + if dis < 0: + if dis > -1e-10: + dis = 0 + else: + raise ValueError('The distance is negative.') + dis_mat[i, j] = np.sqrt(dis) + dis_mat[j, i] = dis_mat[i, j] + dis_max = np.max(np.max(dis_mat)) + dis_min = np.min(np.min(dis_mat[dis_mat != 0])) + dis_mean = np.mean(np.mean(dis_mat)) + return dis_mat, dis_max, dis_min, dis_mean def get_same_item_indices(ls): - """Get the indices of the same items in a list. Return a dict keyed by items. - """ - idx_dict = {} - for idx, item in enumerate(ls): - if item in idx_dict: - idx_dict[item].append(idx) - else: - idx_dict[item] = [idx] - return idx_dict + """Get the indices of the same items in a list. Return a dict keyed by items. + """ + idx_dict = {} + for idx, item in enumerate(ls): + if item in idx_dict: + idx_dict[item].append(idx) + else: + idx_dict[item] = [idx] + return idx_dict def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None, - node_label=None, edge_label=None): - dis_k_all = [] # distance between g_star and each graph. - alpha = [1 / len(Gn)] * len(Gn) - if Kmatrix is None: - Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) - term3 = 0 - for i1, a1 in enumerate(alpha): - for i2, a2 in enumerate(alpha): - term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] - for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout): - dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3) - dis_all.append(dtemp) + node_label=None, edge_label=None): + dis_k_all = [] # distance between g_star and each graph. 
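For reference, dis_gstar and compute_k_dis above, as well as the k-nearest-neighbors helper being re-indented here, all evaluate the same kernel-space distance between a graph g and the weighted combination g* of the median set, expanded purely in Gram-matrix terms:

    d(g, g*) = sqrt( k(g, g) - 2 * sum_i alpha_i k(g, g_i) + sum_i sum_j alpha_i alpha_j k(g_i, g_j) )

The double sum is either passed in pre-computed as term3 or accumulated on the fly, depending on withterm3. The re-indented function body continues: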
+ alpha = [1 / len(Gn)] * len(Gn) + if Kmatrix is None: + Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) + term3 = 0 + for i1, a1 in enumerate(alpha): + for i2, a2 in enumerate(alpha): + term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] + for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout): + dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3) + dis_all.append(dtemp) def normalize_distance_matrix(D): - max_value = np.amax(D) - min_value = np.amin(D) - return (D - min_value) / (max_value - min_value) \ No newline at end of file + max_value = np.amax(D) + min_value = np.amin(D) + return (D - min_value) / (max_value - min_value) \ No newline at end of file diff --git a/gklearn/utils/__init__.py b/gklearn/utils/__init__.py index 2654b7a..d5301c6 100644 --- a/gklearn/utils/__init__.py +++ b/gklearn/utils/__init__.py @@ -15,5 +15,7 @@ __date__ = "November 2017" # from utils import graphfiles # from utils import utils -from gklearn.utils.dataset import Dataset +from gklearn.utils.dataset import Dataset, split_dataset_by_target from gklearn.utils.timer import Timer +from gklearn.utils.utils import get_graph_kernel_by_name +from gklearn.utils.utils import compute_gram_matrices_by_class diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index 7a18e40..b36205d 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -8,6 +8,7 @@ Created on Thu Mar 26 18:48:27 2020 import numpy as np import networkx as nx from gklearn.utils.graphfiles import loadDataset +import os class Dataset(object): @@ -15,7 +16,7 @@ class Dataset(object): def __init__(self, filename=None, filename_y=None, extra_params=None): if filename is None: self.__graphs = None - self.__target = None + self.__targets = None self.__node_labels = None self.__edge_labels = None self.__node_attrs = None @@ -50,33 +51,41 @@ class Dataset(object): def load_dataset(self, filename, filename_y=None, extra_params=None): - self.__graphs, self.__target = loadDataset(filename, filename_y=filename_y, extra_params=extra_params) + self.__graphs, self.__targets = loadDataset(filename, filename_y=filename_y, extra_params=extra_params) self.set_labels_attrs() + def load_graphs(self, graphs, targets=None): + # this has to be followed by set_labels(). 
+ self.__graphs = graphs + self.__targets = targets +# self.set_labels_attrs() + + def load_predefined_dataset(self, ds_name): + current_path = os.path.dirname(os.path.realpath(__file__)) + '/' if ds_name == 'Letter-high': # node non-symb - ds_file = '../../datasets/Letter-high/Letter-high_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'Letter-med': # node non-symb - ds_file = '../../datasets/Letter-high/Letter-med_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'Letter-low': # node non-symb - ds_file = '../../datasets/Letter-high/Letter-low_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'Fingerprint': - ds_file = '../../datasets/Fingerprint/Fingerprint_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'SYNTHETIC': pass elif ds_name == 'SYNTHETICnew': - ds_file = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'Synthie': pass elif ds_name == 'COIL-DEL': - ds_file = '../../datasets/COIL-DEL/COIL-DEL_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'COIL-RAG': pass elif ds_name == 'COLORS-3': @@ -86,6 +95,13 @@ class Dataset(object): self.set_labels_attrs() + + def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]): + self.__node_labels = node_labels + self.__node_attrs = node_attrs + self.__edge_labels = edge_labels + self.__edge_attrs = edge_attrs + def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None): # @todo: remove labels which have only one possible values. 
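load_graphs plus set_labels give split_dataset_by_target (added at the end of this file's diff) everything it needs to build per-class sub-datasets without re-deriving labels from the graphs. A short usage sketch, assuming the bundled datasets are available on disk:

```python
from gklearn.utils import Dataset, split_dataset_by_target

dataset = Dataset()
dataset.load_predefined_dataset('Letter-high')
# One sub-dataset per target class; node/edge labels and attributes are
# carried over via set_labels().
for sub_dataset in split_dataset_by_target(dataset):
    print(sub_dataset.targets[0], len(sub_dataset.graphs))
```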
@@ -363,9 +379,34 @@ class Dataset(object): print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0])))) + def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): + for g in self.__graphs: + for nd in g.nodes(): + for nl in node_labels: + del g.nodes[nd][nl] + for na in node_attrs: + del g.nodes[nd][na] + for ed in g.edges(): + for el in edge_labels: + del g.edges[ed][el] + for ea in edge_attrs: + del g.edges[ed][ea] + if len(node_labels) > 0: + self.__node_labels = [nl for nl in self.__node_labels if nl not in node_labels] + if len(edge_labels) > 0: + self.__edge_labels = [el for el in self.__edge_labels if el not in edge_labels] + if len(node_attrs) > 0: + self.__node_attrs = [na for na in self.__node_attrs if na not in node_attrs] + if len(edge_attrs) > 0: + self.__edge_attrs = [ea for ea in self.__edge_attrs if ea not in edge_attrs] + + def cut_graphs(self, range_): self.__graphs = [self.__graphs[i] for i in range_] - self.set_labels_attrs() + if self.__targets is not None: + self.__targets = [self.__targets[i] for i in range_] + # @todo +# self.set_labels_attrs() def __get_dataset_size(self): @@ -514,7 +555,7 @@ class Dataset(object): def __get_class_num(self): - return len(set(self.__target)) + return len(set(self.__targets)) def __get_node_attr_dim(self): @@ -529,6 +570,11 @@ class Dataset(object): def graphs(self): return self.__graphs + + @property + def targets(self): + return self.__targets + @property def node_labels(self): @@ -547,4 +593,20 @@ class Dataset(object): @property def edge_attrs(self): - return self.__edge_attrs \ No newline at end of file + return self.__edge_attrs + + +def split_dataset_by_target(dataset): + from gklearn.preimage.utils import get_same_item_indices + + graphs = dataset.graphs + targets = dataset.targets + datasets = [] + idx_targets = get_same_item_indices(targets) + for key, val in idx_targets.items(): + sub_graphs = [graphs[i] for i in val] + sub_dataset = Dataset() + sub_dataset.load_graphs(sub_graphs, [key] * len(val)) + sub_dataset.set_labels(node_labels=dataset.node_labels, node_attrs=dataset.node_attrs, edge_labels=dataset.edge_labels, edge_attrs=dataset.edge_attrs) + datasets.append(sub_dataset) + return datasets \ No newline at end of file diff --git a/gklearn/utils/graphfiles.py b/gklearn/utils/graphfiles.py index 6c0e2e9..862cda1 100644 --- a/gklearn/utils/graphfiles.py +++ b/gklearn/utils/graphfiles.py @@ -3,762 +3,760 @@ from os.path import dirname, splitext def loadCT(filename): - """load data from a Chemical Table (.ct) file. - - Notes - ------ - a typical example of data in .ct is like this: - - 3 2 <- number of nodes and edges - - 0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label) - - 0.0000 0.0000 0.0000 C - - 0.0000 0.0000 0.0000 O - - 1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo - - 2 3 1 1 - - Check `CTFile Formats file `__ - for detailed format discription. 
- """ - import networkx as nx - from os.path import basename - g = nx.Graph() - with open(filename) as f: - content = f.read().splitlines() - g = nx.Graph( - name = str(content[0]), - filename = basename(filename)) # set name of the graph - tmp = content[1].split(" ") - if tmp[0] == '': - nb_nodes = int(tmp[1]) # number of the nodes - nb_edges = int(tmp[2]) # number of the edges - else: - nb_nodes = int(tmp[0]) - nb_edges = int(tmp[1]) - # patch for compatibility : label will be removed later - for i in range(0, nb_nodes): - tmp = content[i + 2].split(" ") - tmp = [x for x in tmp if x != ''] - g.add_node(i, atom=tmp[3].strip(), - label=[item.strip() for item in tmp[3:]], - attributes=[item.strip() for item in tmp[0:3]]) - for i in range(0, nb_edges): - tmp = content[i + g.number_of_nodes() + 2].split(" ") - tmp = [x for x in tmp if x != ''] - g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, - bond_type=tmp[2].strip(), - label=[item.strip() for item in tmp[2:]]) - return g + """load data from a Chemical Table (.ct) file. + + Notes + ------ + a typical example of data in .ct is like this: + + 3 2 <- number of nodes and edges + + 0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label) + + 0.0000 0.0000 0.0000 C + + 0.0000 0.0000 0.0000 O + + 1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo + + 2 3 1 1 + + Check `CTFile Formats file `__ + for detailed format discription. + """ + import networkx as nx + from os.path import basename + g = nx.Graph() + with open(filename) as f: + content = f.read().splitlines() + g = nx.Graph( + name = str(content[0]), + filename = basename(filename)) # set name of the graph + tmp = content[1].split(" ") + if tmp[0] == '': + nb_nodes = int(tmp[1]) # number of the nodes + nb_edges = int(tmp[2]) # number of the edges + else: + nb_nodes = int(tmp[0]) + nb_edges = int(tmp[1]) + # patch for compatibility : label will be removed later + for i in range(0, nb_nodes): + tmp = content[i + 2].split(" ") + tmp = [x for x in tmp if x != ''] + g.add_node(i, atom=tmp[3].strip(), + label=[item.strip() for item in tmp[3:]], + attributes=[item.strip() for item in tmp[0:3]]) + for i in range(0, nb_edges): + tmp = content[i + g.number_of_nodes() + 2].split(" ") + tmp = [x for x in tmp if x != ''] + g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, + bond_type=tmp[2].strip(), + label=[item.strip() for item in tmp[2:]]) + return g def loadGXL(filename): - from os.path import basename - import networkx as nx - import xml.etree.ElementTree as ET - - tree = ET.parse(filename) - root = tree.getroot() - index = 0 - g = nx.Graph(filename=basename(filename), name=root[0].attrib['id']) - dic = {} # used to retrieve incident nodes of edges - for node in root.iter('node'): - dic[node.attrib['id']] = index - labels = {} - for attr in node.iter('attr'): - labels[attr.attrib['name']] = attr[0].text - if 'chem' in labels: - labels['label'] = labels['chem'] - labels['atom'] = labels['chem'] - g.add_node(index, **labels) - index += 1 - - for edge in root.iter('edge'): - labels = {} - for attr in edge.iter('attr'): - labels[attr.attrib['name']] = attr[0].text - if 'valence' in labels: - labels['label'] = labels['valence'] - labels['bond_type'] = labels['valence'] - g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) - return g - - -def saveGXL(graph, filename, method='default'): - if method == 'default': - gxl_file = open(filename, 'w') - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - for v, attrs in 
graph.nodes(data=True): - if graph.graph['node_labels'] == [] and graph.graph['node_attrs'] == []: - gxl_file.write("\n") - else: - gxl_file.write("") - for l_name in graph.graph['node_labels']: - gxl_file.write("" + - str(attrs[l_name]) + "") - for a_name in graph.graph['node_attrs']: - gxl_file.write("" + - str(attrs[a_name]) + "") - gxl_file.write("\n") - for v1, v2, attrs in graph.edges(data=True): - if graph.graph['edge_labels'] == [] and graph.graph['edge_attrs'] == []: - gxl_file.write("\n") - else: - gxl_file.write("") - for l_name in graph.graph['edge_labels']: - gxl_file.write("" + - str(attrs[l_name]) + "") - for a_name in graph.graph['edge_attrs']: - gxl_file.write("" + - str(attrs[a_name]) + "") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("") - gxl_file.close() - elif method == 'benoit': - import xml.etree.ElementTree as ET - root_node = ET.Element('gxl') - attr = dict() - attr['id'] = str(graph.graph['name']) - attr['edgeids'] = 'true' - attr['edgemode'] = 'undirected' - graph_node = ET.SubElement(root_node, 'graph', attrib=attr) - - for v in graph: - current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) - for attr in graph.nodes[v].keys(): - cur_attr = ET.SubElement( - current_node, 'attr', attrib={'name': attr}) - cur_value = ET.SubElement(cur_attr, - graph.nodes[v][attr].__class__.__name__) - cur_value.text = graph.nodes[v][attr] - - for v1 in graph: - for v2 in graph[v1]: - if (v1 < v2): # Non oriented graphs - cur_edge = ET.SubElement( - graph_node, - 'edge', - attrib={ - 'from': str(v1), - 'to': str(v2) - }) - for attr in graph[v1][v2].keys(): - cur_attr = ET.SubElement( - cur_edge, 'attr', attrib={'name': attr}) - cur_value = ET.SubElement( - cur_attr, graph[v1][v2][attr].__class__.__name__) - cur_value.text = str(graph[v1][v2][attr]) - - tree = ET.ElementTree(root_node) - tree.write(filename) - elif method == 'gedlib': - # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 -# pass - gxl_file = open(filename, 'w') - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - for v, attrs in graph.nodes(data=True): - gxl_file.write("") - gxl_file.write("" + str(attrs['chem']) + "") - gxl_file.write("\n") - for v1, v2, attrs in graph.edges(data=True): - gxl_file.write("") - gxl_file.write("" + str(attrs['valence']) + "") -# gxl_file.write("" + "1" + "") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("") - gxl_file.close() - elif method == 'gedlib-letter': - # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 - # and https://github.com/dbblumenthal/gedlib/blob/master/data/datasets/Letter/HIGH/AP1_0000.gxl - gxl_file = open(filename, 'w') - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - for v, attrs in graph.nodes(data=True): - gxl_file.write("") - gxl_file.write("" + str(attrs['attributes'][0]) + "") - gxl_file.write("" + str(attrs['attributes'][1]) + "") - gxl_file.write("\n") - for v1, v2, attrs in graph.edges(data=True): - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("") - gxl_file.close() + from os.path import basename + import networkx as nx + import xml.etree.ElementTree as ET + + tree = ET.parse(filename) + root = tree.getroot() + index = 0 + g = nx.Graph(filename=basename(filename), name=root[0].attrib['id']) + dic = {} # used to retrieve incident nodes of edges + for node in root.iter('node'): + dic[node.attrib['id']] = index + labels = 
{} + for attr in node.iter('attr'): + labels[attr.attrib['name']] = attr[0].text + if 'chem' in labels: + labels['label'] = labels['chem'] + labels['atom'] = labels['chem'] + g.add_node(index, **labels) + index += 1 + + for edge in root.iter('edge'): + labels = {} + for attr in edge.iter('attr'): + labels[attr.attrib['name']] = attr[0].text + if 'valence' in labels: + labels['label'] = labels['valence'] + labels['bond_type'] = labels['valence'] + g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) + return g + + +def saveGXL(graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): + if method == 'default': + gxl_file = open(filename, 'w') + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + if 'name' in graph.graph: + name = str(graph.graph['name']) + else: + name = 'dummy' + gxl_file.write("\n") + for v, attrs in graph.nodes(data=True): + gxl_file.write("") + for l_name in node_labels: + gxl_file.write("" + + str(attrs[l_name]) + "") + for a_name in node_attrs: + gxl_file.write("" + + str(attrs[a_name]) + "") + gxl_file.write("\n") + for v1, v2, attrs in graph.edges(data=True): + gxl_file.write("") + for l_name in edge_labels: + gxl_file.write("" + + str(attrs[l_name]) + "") + for a_name in edge_attrs: + gxl_file.write("" + + str(attrs[a_name]) + "") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("") + gxl_file.close() + elif method == 'benoit': + import xml.etree.ElementTree as ET + root_node = ET.Element('gxl') + attr = dict() + attr['id'] = str(graph.graph['name']) + attr['edgeids'] = 'true' + attr['edgemode'] = 'undirected' + graph_node = ET.SubElement(root_node, 'graph', attrib=attr) + + for v in graph: + current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) + for attr in graph.nodes[v].keys(): + cur_attr = ET.SubElement( + current_node, 'attr', attrib={'name': attr}) + cur_value = ET.SubElement(cur_attr, + graph.nodes[v][attr].__class__.__name__) + cur_value.text = graph.nodes[v][attr] + + for v1 in graph: + for v2 in graph[v1]: + if (v1 < v2): # Non oriented graphs + cur_edge = ET.SubElement( + graph_node, + 'edge', + attrib={ + 'from': str(v1), + 'to': str(v2) + }) + for attr in graph[v1][v2].keys(): + cur_attr = ET.SubElement( + cur_edge, 'attr', attrib={'name': attr}) + cur_value = ET.SubElement( + cur_attr, graph[v1][v2][attr].__class__.__name__) + cur_value.text = str(graph[v1][v2][attr]) + + tree = ET.ElementTree(root_node) + tree.write(filename) + elif method == 'gedlib': + # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 +# pass + gxl_file = open(filename, 'w') + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + for v, attrs in graph.nodes(data=True): + gxl_file.write("") + gxl_file.write("" + str(attrs['chem']) + "") + gxl_file.write("\n") + for v1, v2, attrs in graph.edges(data=True): + gxl_file.write("") + gxl_file.write("" + str(attrs['valence']) + "") +# gxl_file.write("" + "1" + "") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("") + gxl_file.close() + elif method == 'gedlib-letter': + # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 + # and https://github.com/dbblumenthal/gedlib/blob/master/data/datasets/Letter/HIGH/AP1_0000.gxl + gxl_file = open(filename, 'w') + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + for v, attrs in graph.nodes(data=True): + 
gxl_file.write("") + gxl_file.write("" + str(attrs['attributes'][0]) + "") + gxl_file.write("" + str(attrs['attributes'][1]) + "") + gxl_file.write("\n") + for v1, v2, attrs in graph.edges(data=True): + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("") + gxl_file.close() def loadSDF(filename): - """load data from structured data file (.sdf file). - - Notes - ------ - A SDF file contains a group of molecules, represented in the similar way as in MOL format. - Check `here `__ for detailed structure. - """ - import networkx as nx - from os.path import basename - from tqdm import tqdm - import sys - data = [] - with open(filename) as f: - content = f.read().splitlines() - index = 0 - pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout) - while index < len(content): - index_old = index - - g = nx.Graph(name=content[index].strip()) # set name of the graph - - tmp = content[index + 3] - nb_nodes = int(tmp[:3]) # number of the nodes - nb_edges = int(tmp[3:6]) # number of the edges - - for i in range(0, nb_nodes): - tmp = content[i + index + 4] - g.add_node(i, atom=tmp[31:34].strip()) - - for i in range(0, nb_edges): - tmp = content[i + index + g.number_of_nodes() + 4] - tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)] - g.add_edge( - int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip()) - - data.append(g) - - index += 4 + g.number_of_nodes() + g.number_of_edges() - while content[index].strip() != '$$$$': # seperator - index += 1 - index += 1 - - pbar.update(index - index_old) - pbar.update(1) - pbar.close() - - return data + """load data from structured data file (.sdf file). + + Notes + ------ + A SDF file contains a group of molecules, represented in the similar way as in MOL format. + Check `here `__ for detailed structure. + """ + import networkx as nx + from os.path import basename + from tqdm import tqdm + import sys + data = [] + with open(filename) as f: + content = f.read().splitlines() + index = 0 + pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout) + while index < len(content): + index_old = index + + g = nx.Graph(name=content[index].strip()) # set name of the graph + + tmp = content[index + 3] + nb_nodes = int(tmp[:3]) # number of the nodes + nb_edges = int(tmp[3:6]) # number of the edges + + for i in range(0, nb_nodes): + tmp = content[i + index + 4] + g.add_node(i, atom=tmp[31:34].strip()) + + for i in range(0, nb_edges): + tmp = content[i + index + g.number_of_nodes() + 4] + tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)] + g.add_edge( + int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip()) + + data.append(g) + + index += 4 + g.number_of_nodes() + g.number_of_edges() + while content[index].strip() != '$$$$': # seperator + index += 1 + index += 1 + + pbar.update(index - index_old) + pbar.update(1) + pbar.close() + + return data def loadMAT(filename, extra_params): - """Load graph data from a MATLAB (up to version 7.1) .mat file. - - Notes - ------ - A MAT file contains a struct array containing graphs, and a column vector lx containing a class label for each graph. - Check README in `downloadable file `__ for detailed structure. 
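Both versions of `loadSDF` above rely on the fixed-width counts line of the MOL/SDF format: the atom count sits in columns 1-3 and the bond count in columns 4-6. A small self-contained illustration, using a hypothetical V2000 counts line:

```python
# hypothetical counts line of an SDF/MOL block
counts = '  3  2  0  0  0  0  0  0  0  0999 V2000'
nb_nodes = int(counts[:3])   # columns 1-3: number of atoms -> 3
nb_edges = int(counts[3:6])  # columns 4-6: number of bonds -> 2
print(nb_nodes, nb_edges)    # 3 2
```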
- """ - from scipy.io import loadmat - import numpy as np - import networkx as nx - data = [] - content = loadmat(filename) - order = extra_params['am_sp_al_nl_el'] - # print(content) - # print('----') - for key, value in content.items(): - if key[0] == 'l': # class label - y = np.transpose(value)[0].tolist() - # print(y) - elif key[0] != '_': - # print(value[0][0][0]) - # print() - # print(value[0][0][1]) - # print() - # print(value[0][0][2]) - # print() - # if len(value[0][0]) > 3: - # print(value[0][0][3]) - # print('----') - # if adjacency matrix is not compressed / edge label exists - if order[1] == 0: - for i, item in enumerate(value[0]): - # print(item) - # print('------') - g = nx.Graph(name=i) # set name of the graph - nl = np.transpose(item[order[3]][0][0][0]) # node label - # print(item[order[3]]) - # print() - for index, label in enumerate(nl[0]): - g.add_node(index, atom=str(label)) - el = item[order[4]][0][0][0] # edge label - for edge in el: - g.add_edge( - edge[0] - 1, edge[1] - 1, bond_type=str(edge[2])) - data.append(g) - else: - from scipy.sparse import csc_matrix - for i, item in enumerate(value[0]): - # print(item) - # print('------') - g = nx.Graph(name=i) # set name of the graph - nl = np.transpose(item[order[3]][0][0][0]) # node label - # print(nl) - # print() - for index, label in enumerate(nl[0]): - g.add_node(index, atom=str(label)) - sam = item[order[0]] # sparse adjacency matrix - index_no0 = sam.nonzero() - for col, row in zip(index_no0[0], index_no0[1]): - # print(col) - # print(row) - g.add_edge(col, row) - data.append(g) - # print(g.edges(data=True)) - return data, y + """Load graph data from a MATLAB (up to version 7.1) .mat file. + + Notes + ------ + A MAT file contains a struct array containing graphs, and a column vector lx containing a class label for each graph. + Check README in `downloadable file `__ for detailed structure. 
+ """ + from scipy.io import loadmat + import numpy as np + import networkx as nx + data = [] + content = loadmat(filename) + order = extra_params['am_sp_al_nl_el'] + # print(content) + # print('----') + for key, value in content.items(): + if key[0] == 'l': # class label + y = np.transpose(value)[0].tolist() + # print(y) + elif key[0] != '_': + # print(value[0][0][0]) + # print() + # print(value[0][0][1]) + # print() + # print(value[0][0][2]) + # print() + # if len(value[0][0]) > 3: + # print(value[0][0][3]) + # print('----') + # if adjacency matrix is not compressed / edge label exists + if order[1] == 0: + for i, item in enumerate(value[0]): + # print(item) + # print('------') + g = nx.Graph(name=i) # set name of the graph + nl = np.transpose(item[order[3]][0][0][0]) # node label + # print(item[order[3]]) + # print() + for index, label in enumerate(nl[0]): + g.add_node(index, atom=str(label)) + el = item[order[4]][0][0][0] # edge label + for edge in el: + g.add_edge( + edge[0] - 1, edge[1] - 1, bond_type=str(edge[2])) + data.append(g) + else: + from scipy.sparse import csc_matrix + for i, item in enumerate(value[0]): + # print(item) + # print('------') + g = nx.Graph(name=i) # set name of the graph + nl = np.transpose(item[order[3]][0][0][0]) # node label + # print(nl) + # print() + for index, label in enumerate(nl[0]): + g.add_node(index, atom=str(label)) + sam = item[order[0]] # sparse adjacency matrix + index_no0 = sam.nonzero() + for col, row in zip(index_no0[0], index_no0[1]): + # print(col) + # print(row) + g.add_edge(col, row) + data.append(g) + # print(g.edges(data=True)) + return data, y def loadTXT(filename): - """Load graph data from a .txt file. - - Notes - ------ - The graph data is loaded from separate files. - Check README in `downloadable file `__, 2018 for detailed structure. - """ -# import numpy as np - import networkx as nx - from os import listdir - from os.path import dirname, basename - - - def get_label_names(frm): - """Get label names from DS_label_readme.txt file. - """ - - def get_names_from_line(line): - """Get names of labels/attributes from a line. - """ - str_names = line.split('[')[1].split(']')[0] - names = str_names.split(',') - names = [attr.strip() for attr in names] - return names - - - label_names = {'node_labels': [], 'node_attrs': [], - 'edge_labels': [], 'edge_attrs': []} - content_rm = open(frm).read().splitlines() - for line in content_rm: - line = line.strip() - if line.startswith('Node labels:'): - label_names['node_labels'] = get_names_from_line(line) - elif line.startswith('Node attributes:'): - label_names['node_attrs'] = get_names_from_line(line) - elif line.startswith('Edge labels:'): - label_names['edge_labels'] = get_names_from_line(line) - elif line.startswith('Edge attributes:'): - label_names['edge_attrs'] = get_names_from_line(line) - return label_names - - - # get dataset name. 
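`loadMAT` has to be told where each field sits in the MATLAB struct, via `extra_params['am_sp_al_nl_el']` (presumably adjacency matrix, sparse flag, adjacency list, node labels, edge labels). A hedged sketch using the MUTAG order `[0, 0, 3, 1, 2]` taken from the commented `__main__` examples near the end of this file; the path is illustrative:

```python
# loadDataset dispatches to loadMAT via the .mat extension; the order list
# is the MUTAG one from this patch, the path an assumed local layout.
Gn, y = loadDataset('../../datasets/MUTAG/MUTAG.mat',
                    extra_params={'am_sp_al_nl_el': [0, 0, 3, 1, 2]})
print(len(Gn), len(y))
```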
- dirname_dataset = dirname(filename) - filename = basename(filename) - fn_split = filename.split('_A') - ds_name = fn_split[0].strip() - - # load data file names - for name in listdir(dirname_dataset): - if ds_name + '_A' in name: - fam = dirname_dataset + '/' + name - elif ds_name + '_graph_indicator' in name: - fgi = dirname_dataset + '/' + name - elif ds_name + '_graph_labels' in name: - fgl = dirname_dataset + '/' + name - elif ds_name + '_node_labels' in name: - fnl = dirname_dataset + '/' + name - elif ds_name + '_edge_labels' in name: - fel = dirname_dataset + '/' + name - elif ds_name + '_edge_attributes' in name: - fea = dirname_dataset + '/' + name - elif ds_name + '_node_attributes' in name: - fna = dirname_dataset + '/' + name - elif ds_name + '_graph_attributes' in name: - fga = dirname_dataset + '/' + name - elif ds_name + '_label_readme' in name: - frm = dirname_dataset + '/' + name - # this is supposed to be the node attrs, make sure to put this as the last 'elif' - elif ds_name + '_attributes' in name: - fna = dirname_dataset + '/' + name - - # get labels and attributes names. - if 'frm' in locals(): - label_names = get_label_names(frm) - else: - label_names = {'node_labels': [], 'node_attrs': [], - 'edge_labels': [], 'edge_attrs': []} - - content_gi = open(fgi).read().splitlines() # graph indicator - content_am = open(fam).read().splitlines() # adjacency matrix - content_gl = open(fgl).read().splitlines() # graph labels - - # create graphs and add nodes - data = [nx.Graph(name=str(i), - node_labels=label_names['node_labels'], - node_attrs=label_names['node_attrs'], - edge_labels=label_names['edge_labels'], - edge_attrs=label_names['edge_attrs']) for i in range(0, len(content_gl))] - if 'fnl' in locals(): - content_nl = open(fnl).read().splitlines() # node labels - for idx, line in enumerate(content_gi): - # transfer to int first in case of unexpected blanks - data[int(line) - 1].add_node(idx) - labels = [l.strip() for l in content_nl[idx].split(',')] - data[int(line) - 1].nodes[idx]['atom'] = str(int(labels[0])) # @todo: this should be removed after. - if data[int(line) - 1].graph['node_labels'] == []: - for i, label in enumerate(labels): - l_name = 'label_' + str(i) - data[int(line) - 1].nodes[idx][l_name] = label - data[int(line) - 1].graph['node_labels'].append(l_name) - else: - for i, l_name in enumerate(data[int(line) - 1].graph['node_labels']): - data[int(line) - 1].nodes[idx][l_name] = labels[i] - else: - for i, line in enumerate(content_gi): - data[int(line) - 1].add_node(i) - - # add edges - for line in content_am: - tmp = line.split(',') - n1 = int(tmp[0]) - 1 - n2 = int(tmp[1]) - 1 - # ignore edge weight here. - g = int(content_gi[n1]) - 1 - data[g].add_edge(n1, n2) - - # add edge labels - if 'fel' in locals(): - content_el = open(fel).read().splitlines() - for idx, line in enumerate(content_el): - labels = [l.strip() for l in line.split(',')] - n = [int(i) - 1 for i in content_am[idx].split(',')] - g = int(content_gi[n[0]]) - 1 - data[g].edges[n[0], n[1]]['bond_type'] = labels[0] # @todo: this should be removed after. 
- if data[g].graph['edge_labels'] == []: - for i, label in enumerate(labels): - l_name = 'label_' + str(i) - data[g].edges[n[0], n[1]][l_name] = label - data[g].graph['edge_labels'].append(l_name) - else: - for i, l_name in enumerate(data[g].graph['edge_labels']): - data[g].edges[n[0], n[1]][l_name] = labels[i] - - # add node attributes - if 'fna' in locals(): - content_na = open(fna).read().splitlines() - for idx, line in enumerate(content_na): - attrs = [a.strip() for a in line.split(',')] - g = int(content_gi[idx]) - 1 - data[g].nodes[idx]['attributes'] = attrs # @todo: this should be removed after. - if data[g].graph['node_attrs'] == []: - for i, attr in enumerate(attrs): - a_name = 'attr_' + str(i) - data[g].nodes[idx][a_name] = attr - data[g].graph['node_attrs'].append(a_name) - else: - for i, a_name in enumerate(data[g].graph['node_attrs']): - data[g].nodes[idx][a_name] = attrs[i] - - # add edge attributes - if 'fea' in locals(): - content_ea = open(fea).read().splitlines() - for idx, line in enumerate(content_ea): - attrs = [a.strip() for a in line.split(',')] - n = [int(i) - 1 for i in content_am[idx].split(',')] - g = int(content_gi[n[0]]) - 1 - data[g].edges[n[0], n[1]]['attributes'] = attrs # @todo: this should be removed after. - if data[g].graph['edge_attrs'] == []: - for i, attr in enumerate(attrs): - a_name = 'attr_' + str(i) - data[g].edges[n[0], n[1]][a_name] = attr - data[g].graph['edge_attrs'].append(a_name) - else: - for i, a_name in enumerate(data[g].graph['edge_attrs']): - data[g].edges[n[0], n[1]][a_name] = attrs[i] - - # load y - y = [int(i) for i in content_gl] - - return data, y + """Load graph data from a .txt file. + + Notes + ------ + The graph data is loaded from separate files. + Check README in `downloadable file `__, 2018 for detailed structure. + """ +# import numpy as np + import networkx as nx + from os import listdir + from os.path import dirname, basename + + + def get_label_names(frm): + """Get label names from DS_label_readme.txt file. + """ + + def get_names_from_line(line): + """Get names of labels/attributes from a line. + """ + str_names = line.split('[')[1].split(']')[0] + names = str_names.split(',') + names = [attr.strip() for attr in names] + return names + + + label_names = {'node_labels': [], 'node_attrs': [], + 'edge_labels': [], 'edge_attrs': []} + content_rm = open(frm).read().splitlines() + for line in content_rm: + line = line.strip() + if line.startswith('Node labels:'): + label_names['node_labels'] = get_names_from_line(line) + elif line.startswith('Node attributes:'): + label_names['node_attrs'] = get_names_from_line(line) + elif line.startswith('Edge labels:'): + label_names['edge_labels'] = get_names_from_line(line) + elif line.startswith('Edge attributes:'): + label_names['edge_attrs'] = get_names_from_line(line) + return label_names + + + # get dataset name. 
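The `get_names_from_line` helper in the rewritten `loadTXT` extracts label and attribute names from the dataset's `_label_readme` file. A tiny illustration of the parsing on a hypothetical readme line:

```python
# hypothetical line from a DS_label_readme.txt file
line = 'Node labels: [chem, charge]'
str_names = line.split('[')[1].split(']')[0]
names = [attr.strip() for attr in str_names.split(',')]
print(names)  # ['chem', 'charge']
```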
+ dirname_dataset = dirname(filename) + filename = basename(filename) + fn_split = filename.split('_A') + ds_name = fn_split[0].strip() + + # load data file names + for name in listdir(dirname_dataset): + if ds_name + '_A' in name: + fam = dirname_dataset + '/' + name + elif ds_name + '_graph_indicator' in name: + fgi = dirname_dataset + '/' + name + elif ds_name + '_graph_labels' in name: + fgl = dirname_dataset + '/' + name + elif ds_name + '_node_labels' in name: + fnl = dirname_dataset + '/' + name + elif ds_name + '_edge_labels' in name: + fel = dirname_dataset + '/' + name + elif ds_name + '_edge_attributes' in name: + fea = dirname_dataset + '/' + name + elif ds_name + '_node_attributes' in name: + fna = dirname_dataset + '/' + name + elif ds_name + '_graph_attributes' in name: + fga = dirname_dataset + '/' + name + elif ds_name + '_label_readme' in name: + frm = dirname_dataset + '/' + name + # this is supposed to be the node attrs, make sure to put this as the last 'elif' + elif ds_name + '_attributes' in name: + fna = dirname_dataset + '/' + name + + # get labels and attributes names. + if 'frm' in locals(): + label_names = get_label_names(frm) + else: + label_names = {'node_labels': [], 'node_attrs': [], + 'edge_labels': [], 'edge_attrs': []} + + content_gi = open(fgi).read().splitlines() # graph indicator + content_am = open(fam).read().splitlines() # adjacency matrix + content_gl = open(fgl).read().splitlines() # graph labels + + # create graphs and add nodes + data = [nx.Graph(name=str(i), + node_labels=label_names['node_labels'], + node_attrs=label_names['node_attrs'], + edge_labels=label_names['edge_labels'], + edge_attrs=label_names['edge_attrs']) for i in range(0, len(content_gl))] + if 'fnl' in locals(): + content_nl = open(fnl).read().splitlines() # node labels + for idx, line in enumerate(content_gi): + # transfer to int first in case of unexpected blanks + data[int(line) - 1].add_node(idx) + labels = [l.strip() for l in content_nl[idx].split(',')] + data[int(line) - 1].nodes[idx]['atom'] = str(int(labels[0])) # @todo: this should be removed after. + if data[int(line) - 1].graph['node_labels'] == []: + for i, label in enumerate(labels): + l_name = 'label_' + str(i) + data[int(line) - 1].nodes[idx][l_name] = label + data[int(line) - 1].graph['node_labels'].append(l_name) + else: + for i, l_name in enumerate(data[int(line) - 1].graph['node_labels']): + data[int(line) - 1].nodes[idx][l_name] = labels[i] + else: + for i, line in enumerate(content_gi): + data[int(line) - 1].add_node(i) + + # add edges + for line in content_am: + tmp = line.split(',') + n1 = int(tmp[0]) - 1 + n2 = int(tmp[1]) - 1 + # ignore edge weight here. + g = int(content_gi[n1]) - 1 + data[g].add_edge(n1, n2) + + # add edge labels + if 'fel' in locals(): + content_el = open(fel).read().splitlines() + for idx, line in enumerate(content_el): + labels = [l.strip() for l in line.split(',')] + n = [int(i) - 1 for i in content_am[idx].split(',')] + g = int(content_gi[n[0]]) - 1 + data[g].edges[n[0], n[1]]['bond_type'] = labels[0] # @todo: this should be removed after. 
+ if data[g].graph['edge_labels'] == []: + for i, label in enumerate(labels): + l_name = 'label_' + str(i) + data[g].edges[n[0], n[1]][l_name] = label + data[g].graph['edge_labels'].append(l_name) + else: + for i, l_name in enumerate(data[g].graph['edge_labels']): + data[g].edges[n[0], n[1]][l_name] = labels[i] + + # add node attributes + if 'fna' in locals(): + content_na = open(fna).read().splitlines() + for idx, line in enumerate(content_na): + attrs = [a.strip() for a in line.split(',')] + g = int(content_gi[idx]) - 1 + data[g].nodes[idx]['attributes'] = attrs # @todo: this should be removed after. + if data[g].graph['node_attrs'] == []: + for i, attr in enumerate(attrs): + a_name = 'attr_' + str(i) + data[g].nodes[idx][a_name] = attr + data[g].graph['node_attrs'].append(a_name) + else: + for i, a_name in enumerate(data[g].graph['node_attrs']): + data[g].nodes[idx][a_name] = attrs[i] + + # add edge attributes + if 'fea' in locals(): + content_ea = open(fea).read().splitlines() + for idx, line in enumerate(content_ea): + attrs = [a.strip() for a in line.split(',')] + n = [int(i) - 1 for i in content_am[idx].split(',')] + g = int(content_gi[n[0]]) - 1 + data[g].edges[n[0], n[1]]['attributes'] = attrs # @todo: this should be removed after. + if data[g].graph['edge_attrs'] == []: + for i, attr in enumerate(attrs): + a_name = 'attr_' + str(i) + data[g].edges[n[0], n[1]][a_name] = attr + data[g].graph['edge_attrs'].append(a_name) + else: + for i, a_name in enumerate(data[g].graph['edge_attrs']): + data[g].edges[n[0], n[1]][a_name] = attrs[i] + + # load y + y = [int(i) for i in content_gl] + + return data, y def loadDataset(filename, filename_y=None, extra_params=None): - """Read graph data from filename and load them as NetworkX graphs. - - Parameters - ---------- - filename : string - The name of the file from where the dataset is read. - filename_y : string - The name of file of the targets corresponding to graphs. - extra_params : dict - Extra parameters only designated to '.mat' format. - - Return - ------ - data : List of NetworkX graph. - - y : List - - Targets corresponding to graphs. - - Notes - ----- - This function supports following graph dataset formats: - - 'ds': load data from .ds file. See comments of function loadFromDS for a example. - - 'cxl': load data from Graph eXchange Language file (.cxl file). See - `here `__ for detail. - - 'sdf': load data from structured data file (.sdf file). See - `here `__ - for details. - - 'mat': Load graph data from a MATLAB (up to version 7.1) .mat file. See - README in `downloadable file `__ - for details. - - 'txt': Load graph data from a special .txt file. See - `here `__ - for details. Note here filename is the name of either .txt file in - the dataset directory. 
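With the rewritten `loadTXT` above, a TUDataset-style directory is loaded by pointing `loadDataset` at the `DS_A.txt` file; label and attribute names are picked up from the optional `_label_readme` file. A sketch whose path mirrors the commented test lines at the end of this file:

```python
# loadDataset dispatches to loadTXT on the .txt extension; path assumed.
Gn, y = loadDataset('../../datasets/AIDS/AIDS_A.txt')
print(len(Gn), 'graphs,', len(set(y)), 'classes')
# label/attribute names discovered while loading:
print(Gn[0].graph['node_labels'], Gn[0].graph['node_attrs'])
```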
- """ - extension = splitext(filename)[1][1:] - if extension == "ds": - data, y = loadFromDS(filename, filename_y) - elif extension == "cxl": - import xml.etree.ElementTree as ET - - dirname_dataset = dirname(filename) - tree = ET.parse(filename) - root = tree.getroot() - data = [] - y = [] - for graph in root.iter('graph'): - mol_filename = graph.attrib['file'] - mol_class = graph.attrib['class'] - data.append(loadGXL(dirname_dataset + '/' + mol_filename)) - y.append(mol_class) - elif extension == 'xml': - data, y = loadFromXML(filename, extra_params) - elif extension == "sdf": -# import numpy as np - from tqdm import tqdm - import sys - - data = loadSDF(filename) - - y_raw = open(filename_y).read().splitlines() - y_raw.pop(0) - tmp0 = [] - tmp1 = [] - for i in range(0, len(y_raw)): - tmp = y_raw[i].split(',') - tmp0.append(tmp[0]) - tmp1.append(tmp[1].strip()) - - y = [] - for i in tqdm(range(0, len(data)), desc='ajust data', file=sys.stdout): - try: - y.append(tmp1[tmp0.index(data[i].name)].strip()) - except ValueError: # if data[i].name not in tmp0 - data[i] = [] - data = list(filter(lambda a: a != [], data)) - elif extension == "mat": - data, y = loadMAT(filename, extra_params) - elif extension == 'txt': - data, y = loadTXT(filename) - # print(len(y)) - # print(y) - # print(data[0].nodes(data=True)) - # print('----') - # print(data[0].edges(data=True)) - # for g in data: - # print(g.nodes(data=True)) - # print('----') - # print(g.edges(data=True)) - - return data, y + """Read graph data from filename and load them as NetworkX graphs. + + Parameters + ---------- + filename : string + The name of the file from where the dataset is read. + filename_y : string + The name of file of the targets corresponding to graphs. + extra_params : dict + Extra parameters only designated to '.mat' format. + + Return + ------ + data : List of NetworkX graph. + + y : List + + Targets corresponding to graphs. + + Notes + ----- + This function supports following graph dataset formats: + + 'ds': load data from .ds file. See comments of function loadFromDS for a example. + + 'cxl': load data from Graph eXchange Language file (.cxl file). See + `here `__ for detail. + + 'sdf': load data from structured data file (.sdf file). See + `here `__ + for details. + + 'mat': Load graph data from a MATLAB (up to version 7.1) .mat file. See + README in `downloadable file `__ + for details. + + 'txt': Load graph data from a special .txt file. See + `here `__ + for details. Note here filename is the name of either .txt file in + the dataset directory. 
+ """ + extension = splitext(filename)[1][1:] + if extension == "ds": + data, y = loadFromDS(filename, filename_y) + elif extension == "cxl": + import xml.etree.ElementTree as ET + + dirname_dataset = dirname(filename) + tree = ET.parse(filename) + root = tree.getroot() + data = [] + y = [] + for graph in root.iter('graph'): + mol_filename = graph.attrib['file'] + mol_class = graph.attrib['class'] + data.append(loadGXL(dirname_dataset + '/' + mol_filename)) + y.append(mol_class) + elif extension == 'xml': + data, y = loadFromXML(filename, extra_params) + elif extension == "sdf": +# import numpy as np + from tqdm import tqdm + import sys + + data = loadSDF(filename) + + y_raw = open(filename_y).read().splitlines() + y_raw.pop(0) + tmp0 = [] + tmp1 = [] + for i in range(0, len(y_raw)): + tmp = y_raw[i].split(',') + tmp0.append(tmp[0]) + tmp1.append(tmp[1].strip()) + + y = [] + for i in tqdm(range(0, len(data)), desc='ajust data', file=sys.stdout): + try: + y.append(tmp1[tmp0.index(data[i].name)].strip()) + except ValueError: # if data[i].name not in tmp0 + data[i] = [] + data = list(filter(lambda a: a != [], data)) + elif extension == "mat": + data, y = loadMAT(filename, extra_params) + elif extension == 'txt': + data, y = loadTXT(filename) + # print(len(y)) + # print(y) + # print(data[0].nodes(data=True)) + # print('----') + # print(data[0].edges(data=True)) + # for g in data: + # print(g.nodes(data=True)) + # print('----') + # print(g.edges(data=True)) + + return data, y def loadFromXML(filename, extra_params): - import xml.etree.ElementTree as ET - - if extra_params: - dirname_dataset = extra_params - else: - dirname_dataset = dirname(filename) - tree = ET.parse(filename) - root = tree.getroot() - data = [] - y = [] - for graph in root.iter('graph'): - mol_filename = graph.attrib['file'] - mol_class = graph.attrib['class'] - data.append(loadGXL(dirname_dataset + '/' + mol_filename)) - y.append(mol_class) - - return data, y - + import xml.etree.ElementTree as ET + + if extra_params: + dirname_dataset = extra_params + else: + dirname_dataset = dirname(filename) + tree = ET.parse(filename) + root = tree.getroot() + data = [] + y = [] + for graph in root.iter('graph'): + mol_filename = graph.attrib['file'] + mol_class = graph.attrib['class'] + data.append(loadGXL(dirname_dataset + '/' + mol_filename)) + y.append(mol_class) + + return data, y + def loadFromDS(filename, filename_y): - """Load data from .ds file. - - Possible graph formats include: - - '.ct': see function loadCT for detail. - - '.gxl': see dunction loadGXL for detail. - - Note these graph formats are checked automatically by the extensions of - graph files. 
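`loadFromDS` below supports two `.ds` layouts: either each line is `<graph file> <target>`, or the file lists graph files only and the targets come from a separate file passed as `filename_y`. A hedged sketch, with paths taken from the commented `__main__` examples further down:

```python
# inline targets (second entry of each line, as for Acyclic):
Gn, y = loadDataset('../../datasets/acyclic/dataset_bps.ds')
# targets in a separate file (third entry of each of its lines, as for Alkane):
Gn2, y2 = loadDataset('../../datasets/Alkane/dataset.ds',
                      filename_y='../../datasets/Alkane/dataset_boiling_point_names.txt')
```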
- """ - dirname_dataset = dirname(filename) - data = [] - y = [] - content = open(filename).read().splitlines() - extension = splitext(content[0].split(' ')[0])[1][1:] - if filename_y is None or filename_y == '': - if extension == 'ct': - for i in range(0, len(content)): - tmp = content[i].split(' ') - # remove the '#'s in file names - data.append( - loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) - y.append(float(tmp[1])) - elif extension == 'gxl': - for i in range(0, len(content)): - tmp = content[i].split(' ') - # remove the '#'s in file names - data.append( - loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) - y.append(float(tmp[1])) - else: # y in a seperate file - if extension == 'ct': - for i in range(0, len(content)): - tmp = content[i] - # remove the '#'s in file names - data.append( - loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1))) - elif extension == 'gxl': - for i in range(0, len(content)): - tmp = content[i] - # remove the '#'s in file names - data.append( - loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1))) - - content_y = open(filename_y).read().splitlines() - # assume entries in filename and filename_y have the same order. - for item in content_y: - tmp = item.split(' ') - # assume the 3rd entry in a line is y (for Alkane dataset) - y.append(float(tmp[2])) - - return data, y - + """Load data from .ds file. + + Possible graph formats include: + + '.ct': see function loadCT for detail. + + '.gxl': see dunction loadGXL for detail. + + Note these graph formats are checked automatically by the extensions of + graph files. + """ + dirname_dataset = dirname(filename) + data = [] + y = [] + content = open(filename).read().splitlines() + extension = splitext(content[0].split(' ')[0])[1][1:] + if filename_y is None or filename_y == '': + if extension == 'ct': + for i in range(0, len(content)): + tmp = content[i].split(' ') + # remove the '#'s in file names + data.append( + loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) + y.append(float(tmp[1])) + elif extension == 'gxl': + for i in range(0, len(content)): + tmp = content[i].split(' ') + # remove the '#'s in file names + data.append( + loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) + y.append(float(tmp[1])) + else: # y in a seperate file + if extension == 'ct': + for i in range(0, len(content)): + tmp = content[i] + # remove the '#'s in file names + data.append( + loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1))) + elif extension == 'gxl': + for i in range(0, len(content)): + tmp = content[i] + # remove the '#'s in file names + data.append( + loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1))) + + content_y = open(filename_y).read().splitlines() + # assume entries in filename and filename_y have the same order. + for item in content_y: + tmp = item.split(' ') + # assume the 3rd entry in a line is y (for Alkane dataset) + y.append(float(tmp[2])) + + return data, y + def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None): - """Save list of graphs. 
- """ - import os - dirname_ds = os.path.dirname(filename) - if dirname_ds != '': - dirname_ds += '/' - if not os.path.exists(dirname_ds) : - os.makedirs(dirname_ds) - - if xparams is not None and 'graph_dir' in xparams: - graph_dir = xparams['graph_dir'] + '/' - if not os.path.exists(graph_dir): - os.makedirs(graph_dir) - else: - graph_dir = dirname_ds - - if group == 'xml' and gformat == 'gxl': - kwargs = {'method': xparams['method']} if xparams is not None else {} - with open(filename + '.xml', 'w') as fgroup: - fgroup.write("") - fgroup.write("\n") - fgroup.write("\n") - for idx, g in enumerate(Gn): - fname_tmp = "graph" + str(idx) + ".gxl" - saveGXL(g, graph_dir + fname_tmp, **kwargs) - fgroup.write("\n\t") - fgroup.write("\n") - fgroup.close() - - -if __name__ == '__main__': -# ### Load dataset from .ds file. -# # .ct files. -# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', -# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} -# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) -## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb -## Gn, y = loadDataset(ds['dataset']) -## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb -## Gn, y = loadDataset(ds['dataset']) -## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled -## Gn, y = loadDataset(ds['dataset']) -# print(Gn[1].nodes(data=True)) -# print(Gn[1].edges(data=True)) -# print(y[1]) - -# # .gxl file. -# ds = {'name': 'monoterpenoides', -# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb -# Gn, y = loadDataset(ds['dataset']) -# print(Gn[1].nodes(data=True)) -# print(Gn[1].edges(data=True)) -# print(y[1]) - -# ### Convert graph from one format to another. -# # .gxl file. -# import networkx as nx -# ds = {'name': 'monoterpenoides', -# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb -# Gn, y = loadDataset(ds['dataset']) -# y = [int(i) for i in y] -# print(Gn[1].nodes(data=True)) -# print(Gn[1].edges(data=True)) -# print(y[1]) -# # Convert a graph to the proper NetworkX format that can be recognized by library gedlib. -# Gn_new = [] -# for G in Gn: -# G_new = nx.Graph() -# for nd, attrs in G.nodes(data=True): -# G_new.add_node(str(nd), chem=attrs['atom']) -# for nd1, nd2, attrs in G.edges(data=True): -# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) -## G_new.add_edge(str(nd1), str(nd2)) -# Gn_new.append(G_new) -# print(Gn_new[1].nodes(data=True)) -# print(Gn_new[1].edges(data=True)) -# print(Gn_new[1]) -# filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' -# xparams = {'method': 'gedlib'} -# saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) - - # save dataset. -# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', -# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb -# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# saveDataset(Gn, y, group='xml', filename='temp/temp') - - # test - new way to add labels and attributes. 
-# dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' -# dataset = '../../datasets/Fingerprint/Fingerprint_A.txt' -# dataset = '../../datasets/Letter-med/Letter-med_A.txt' -# dataset = '../../datasets/AIDS/AIDS_A.txt' -# dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' -# Gn, y_all = loadDataset(dataset) - pass \ No newline at end of file + """Save list of graphs. + """ + import os + dirname_ds = os.path.dirname(filename) + if dirname_ds != '': + dirname_ds += '/' + if not os.path.exists(dirname_ds) : + os.makedirs(dirname_ds) + + if xparams is not None and 'graph_dir' in xparams: + graph_dir = xparams['graph_dir'] + '/' + if not os.path.exists(graph_dir): + os.makedirs(graph_dir) + else: + graph_dir = dirname_ds + + if group == 'xml' and gformat == 'gxl': + kwargs = {'method': xparams['method']} if xparams is not None else {} + with open(filename + '.xml', 'w') as fgroup: + fgroup.write("") + fgroup.write("\n") + fgroup.write("\n") + for idx, g in enumerate(Gn): + fname_tmp = "graph" + str(idx) + ".gxl" + saveGXL(g, graph_dir + fname_tmp, **kwargs) + fgroup.write("\n\t") + fgroup.write("\n") + fgroup.close() + + +if __name__ == '__main__': +# ### Load dataset from .ds file. +# # .ct files. +# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', +# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} +# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) +## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb +## Gn, y = loadDataset(ds['dataset']) +## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb +## Gn, y = loadDataset(ds['dataset']) +## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled +## Gn, y = loadDataset(ds['dataset']) +# print(Gn[1].nodes(data=True)) +# print(Gn[1].edges(data=True)) +# print(y[1]) + +# # .gxl file. +# ds = {'name': 'monoterpenoides', +# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb +# Gn, y = loadDataset(ds['dataset']) +# print(Gn[1].nodes(data=True)) +# print(Gn[1].edges(data=True)) +# print(y[1]) + +# ### Convert graph from one format to another. +# # .gxl file. +# import networkx as nx +# ds = {'name': 'monoterpenoides', +# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb +# Gn, y = loadDataset(ds['dataset']) +# y = [int(i) for i in y] +# print(Gn[1].nodes(data=True)) +# print(Gn[1].edges(data=True)) +# print(y[1]) +# # Convert a graph to the proper NetworkX format that can be recognized by library gedlib. +# Gn_new = [] +# for G in Gn: +# G_new = nx.Graph() +# for nd, attrs in G.nodes(data=True): +# G_new.add_node(str(nd), chem=attrs['atom']) +# for nd1, nd2, attrs in G.edges(data=True): +# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) +## G_new.add_edge(str(nd1), str(nd2)) +# Gn_new.append(G_new) +# print(Gn_new[1].nodes(data=True)) +# print(Gn_new[1].edges(data=True)) +# print(Gn_new[1]) +# filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' +# xparams = {'method': 'gedlib'} +# saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) + + # save dataset. 
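The rewritten `saveDataset` writes one `.gxl` file per graph plus an `.xml` collection file when `group='xml'`. A usage sketch mirroring the commented monoterpenoides example in this hunk; the output path is hypothetical:

```python
# writes outputs/monoterpenoides.xml plus one graph<idx>.gxl per graph;
# 'gedlib' selects the corresponding saveGXL variant.
xparams = {'method': 'gedlib'}
saveDataset(Gn, y, gformat='gxl', group='xml',
            filename='outputs/monoterpenoides', xparams=xparams)
```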
+# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', +# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb +# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +# saveDataset(Gn, y, group='xml', filename='temp/temp') + + # test - new way to add labels and attributes. +# dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' +# dataset = '../../datasets/Fingerprint/Fingerprint_A.txt' +# dataset = '../../datasets/Letter-med/Letter-med_A.txt' +# dataset = '../../datasets/AIDS/AIDS_A.txt' +# dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' +# Gn, y_all = loadDataset(dataset) + pass \ No newline at end of file diff --git a/gklearn/utils/utils.py b/gklearn/utils/utils.py index 9a1e56c..2a95ea2 100644 --- a/gklearn/utils/utils.py +++ b/gklearn/utils/utils.py @@ -296,3 +296,59 @@ def get_edge_labels(Gn, edge_label): for G in Gn: el = el | set(nx.get_edge_attributes(G, edge_label).values()) return el + + +def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}): + if name == 'structuralspkernel': + from gklearn.kernels import StructuralSP + graph_kernel = StructuralSP(node_labels=node_labels, edge_labels=edge_labels, + node_attrs=node_attrs, edge_attrs=edge_attrs, + ds_infos=ds_infos, **kernel_options) + return graph_kernel + + +def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save='', irrelevant_labels=None): + from gklearn.utils import Dataset, split_dataset_by_target + + # 1. get dataset. + print('1. getting dataset...') + dataset_all = Dataset() + dataset_all.load_predefined_dataset(ds_name) + if not irrelevant_labels is None: + dataset_all.remove_labels(**irrelevant_labels) +# dataset_all.cut_graphs(range(0, 10)) + datasets = split_dataset_by_target(dataset_all) + + gram_matrix_unnorm_list = [] + run_time_list = [] + + print('start generating preimage for each class of target...') + for idx, dataset in enumerate(datasets): + target = dataset.targets[0] + print('\ntarget =', target, '\n') + + # 2. initialize graph kernel. + print('2. initializing graph kernel and setting parameters...') + graph_kernel = get_graph_kernel_by_name(kernel_options['name'], + node_labels=dataset.node_labels, + edge_labels=dataset.edge_labels, + node_attrs=dataset.node_attrs, + edge_attrs=dataset.edge_attrs, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + kernel_options=kernel_options) + + # 3. compute gram matrix. + print('3. computing gram matrix...') + gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options) + gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm + + gram_matrix_unnorm_list.append(gram_matrix_unnorm) + run_time_list.append(run_time) + + # 4. save results. + print() + print('4. saving results...') + if save_results: + np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' 
+ kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list)
+
+ print('\ncomplete.')
\ No newline at end of file
diff --git a/notebooks/tests/test_tqdm.py b/notebooks/tests/test_tqdm.py
new file mode 100644
index 0000000..e408760
--- /dev/null
+++ b/notebooks/tests/test_tqdm.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 3 10:38:59 2020
+
+@author: ljia
+"""
+
+from tqdm import tqdm
+import sys
+
+print('start')
+
+for i in tqdm(range(10000000), file=sys.stdout):
+    x = i
+# print(x)
+# =============================================================================
+# summary
+# terminal, IPython 7.0.1 (Spyder 4): works.
+# write to file: does not work; the progress bar splits as it advances.
+# Jupyter:
+# =============================================================================
+
+# for i in tqdm(range(10000000)):
+#     x = i
+#     print(x)
+# =============================================================================
+# summary
+# terminal, IPython 7.0.1 (Spyder 4): does not work. When combined with other
+# prints, the progress bar splits.
+# write to file: does not work; the progress bar cannot be written to a file.
+# Jupyter:
+# =============================================================================
diff --git a/requirements.txt b/requirements.txt
index a48620b..85aabf8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,10 @@
-numpy==1.15.2
-scipy==1.1.0
-matplotlib==3.0.0
-networkx==2.2
-scikit-learn==0.20.0
-tabulate==0.8.2
-tqdm==4.26.0
+numpy>=1.15.2
+scipy>=1.1.0
+matplotlib>=3.0.0
+networkx>=2.2
+scikit-learn>=0.20.0
+tabulate>=0.8.2
+tqdm>=4.26.0
+# cvxpy # for preimage.
+# cvxopt # for preimage.
+# mosek # for preimage.
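Finally, a hypothetical driver for the new `compute_gram_matrices_by_class` helper added to `gklearn/utils/utils.py`. Only `'structuralspkernel'` is wired up in `get_graph_kernel_by_name` so far; the dataset name `'MUTAG'` and the output directory below are assumptions, and any extra entries in `kernel_options` are forwarded to the kernel class and its `compute` method:

```python
from gklearn.utils.utils import compute_gram_matrices_by_class

kernel_options = {'name': 'structuralspkernel'}
compute_gram_matrices_by_class('MUTAG', kernel_options,
                               save_results=True, dir_save='outputs/',
                               irrelevant_labels=None)
# np.savez appends .npz, so this saves:
# outputs/gram_matrix_unnorm.MUTAG.structuralspkernel.gm.npz
```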