diff --git a/README.md b/README.md index e31046a..5f490ac 100644 --- a/README.md +++ b/README.md @@ -105,7 +105,7 @@ A comparison of performances of graph kernels on benchmark datasets can be found ## Authors -* [Linlin Jia](https://github.com/jajupmochi), LITIS, INSA Rouen Normandie +* [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie * [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie * [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py index 89ef7b0..6b93dae 100644 --- a/gklearn/preimage/median_preimage_generator.py +++ b/gklearn/preimage/median_preimage_generator.py @@ -17,6 +17,7 @@ from gklearn.ged.util import compute_geds, ged_options_to_string from gklearn.ged.median import MedianGraphEstimator from gklearn.ged.median import constant_node_costs,mge_options_to_string from gklearn.gedlib import librariesImport, gedlibpy +from gklearn.utils import Timer # from gklearn.utils.dataset import Dataset class MedianPreimageGenerator(PreimageGenerator): @@ -29,10 +30,13 @@ class MedianPreimageGenerator(PreimageGenerator): self.__mge_options = {} self.__fit_method = 'k-graphs' self.__init_ecc = None - self.__max_itrs = 100 self.__parallel = True self.__n_jobs = multiprocessing.cpu_count() self.__ds_name = None + self.__time_limit_in_sec = 0 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__epsilon_ratio = 0.01 # values to compute. self.__edit_cost_constants = [] self.__runtime_precompute_gm = None @@ -41,11 +45,15 @@ class MedianPreimageGenerator(PreimageGenerator): self.__runtime_total = None self.__set_median = None self.__gen_median = None + self.__best_from_dataset = None self.__sod_set_median = None self.__sod_gen_median = None self.__k_dis_set_median = None self.__k_dis_gen_median = None self.__k_dis_dataset = None + self.__itrs = 0 + self.__converged = False + self.__num_updates_ecc = 0 def set_options(self, **kwargs): @@ -57,10 +65,13 @@ class MedianPreimageGenerator(PreimageGenerator): self.__fit_method = kwargs.get('fit_method', 'k-graphs') self.__init_ecc = kwargs.get('init_ecc', None) self.__edit_cost_constants = kwargs.get('edit_cost_constants', []) - self.__max_itrs = kwargs.get('max_itrs', 100) self.__parallel = kwargs.get('parallel', True) self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) self.__ds_name = kwargs.get('ds_name', None) + self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0) + self.__max_itrs = kwargs.get('max_itrs', 100) + self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3) + self.__epsilon_ratio = kwargs.get('epsilon_ratio', 0.01) def run(self): @@ -75,7 +86,6 @@ class MedianPreimageGenerator(PreimageGenerator): self.__runtime_precompute_gm = end_precompute_gm - start # 2. optimize edit cost constants. 
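The hunk above imports Timer and replaces the single max_itrs stop with four termination controls (time_limit_in_sec, max_itrs, max_itrs_without_update, epsilon_ratio), all settable through set_options. A minimal configuration sketch, mirroring the updated test further below in this diff:

# Minimal sketch of configuring the new termination controls via set_options;
# dataset and kernel setup are omitted here.
from gklearn.preimage import MedianPreimageGenerator

mpg = MedianPreimageGenerator()
mpg.set_options(fit_method='k-graphs',
                time_limit_in_sec=0,         # 0 disables the wall-clock limit
                max_itrs=100,                # hard cap on optimization iterations
                max_itrs_without_update=3,   # stop after 3 iterations with no edit cost update
                epsilon_ratio=0.01,          # threshold used by the convergence check
                verbose=2)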
-# self.__optimize_edit_cost_constants(dataset=dataset, Gn=Gn, Kmatrix_median=Kmatrix_median) self.__optimize_edit_cost_constants() end_optimize_ec = time.time() self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm @@ -108,28 +118,47 @@ class MedianPreimageGenerator(PreimageGenerator): if self._verbose: print() print('================================================================================') - print('The optimized edit cost constants: ', self.__edit_cost_constants) - print('SOD of the set median: ', self.__sod_set_median) - print('SOD of the generalized median: ', self.__sod_gen_median) + print('Finished generation of preimages.') + print('--------------------------------------------------------------------------------') + print('The optimized edit cost constants:', self.__edit_cost_constants) + print('SOD of the set median:', self.__sod_set_median) + print('SOD of the generalized median:', self.__sod_gen_median) print('Distance in kernel space for set median:', self.__k_dis_set_median) print('Distance in kernel space for generalized median:', self.__k_dis_gen_median) print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset) - print('Time to pre-compute Gram matrix: ', self.__runtime_precompute_gm) - print('Time to optimize edit costs: ', self.__runtime_optimize_ec) - print('Time to generate pre-images: ', self.__runtime_generate_preimage) - print('Total time: ', self.__runtime_total) + print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm) + print('Time to optimize edit costs:', self.__runtime_optimize_ec) + print('Time to generate pre-images:', self.__runtime_generate_preimage) + print('Total time:', self.__runtime_total) + print('Total number of iterations for optimizing:', self.__itrs) + print('Total number of edit cost updates:', self.__num_updates_ecc) + print('Whether the optimization of edit costs converged:', self.__converged) + print('================================================================================') - - - # collect return values. # return (sod_sm, sod_gm), \ # (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \ # (time_fitting, time_generating) + + def get_results(self): + results = {} + results['edit_cost_constants'] = self.__edit_cost_constants + results['runtime_precompute_gm'] = self.__runtime_precompute_gm + results['runtime_optimize_ec'] = self.__runtime_optimize_ec + results['runtime_generate_preimage'] = self.__runtime_generate_preimage + results['runtime_total'] = self.__runtime_total + results['sod_set_median'] = self.__sod_set_median + results['sod_gen_median'] = self.__sod_gen_median + results['k_dis_set_median'] = self.__k_dis_set_median + results['k_dis_gen_median'] = self.__k_dis_gen_median + results['k_dis_dataset'] = self.__k_dis_dataset + results['itrs'] = self.__itrs + results['converged'] = self.__converged + results['num_updates_ecc'] = self.__num_updates_ecc + return results + -# def __optimize_edit_cost_constants(self, dataset=None, Gn=None, Kmatrix_median=None): def __optimize_edit_cost_constants(self): """fit edit cost constants. """ @@ -177,8 +206,6 @@ class MedianPreimageGenerator(PreimageGenerator): self.__init_ecc = [3, 3, 1, 3, 3, 1] # optimize on the k-graph subset.
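Since run() no longer returns the collected values (note the commented-out return above), callers read them through the new get_results() dict instead. Continuing the configuration sketch above:

# Continues the sketch above: after configuration, run and read the results dict.
mpg.run()
results = mpg.get_results()
print('iterations:', results['itrs'],
      '| edit cost updates:', results['num_updates_ecc'],
      '| converged:', results['converged'])
print('optimized edit cost constants:', results['edit_cost_constants'])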
self.__optimize_ecc_by_kernel_distances() -# fit_GED_to_kernel_distance(Gn_median, -# dataset=dataset, Kmatrix=Kmatrix_median) elif self.__fit_method == 'whole-dataset': if self.__init_ecc is None: if self.__ged_options['edit_cost'] == 'LETTER': @@ -189,15 +216,11 @@ class MedianPreimageGenerator(PreimageGenerator): self.__init_ecc = [3, 3, 1, 3, 3, 1] # optimize on the whole set. self.__optimize_ecc_by_kernel_distances() -# fit_GED_to_kernel_distance(Gn, dataset=dataset) elif self.__fit_method == 'precomputed': pass - def __optimize_ecc_by_kernel_distances(self): -# def fit_GED_to_kernel_distance(Gn, Kmatrix=None, -# parallel=True): - + def __optimize_ecc_by_kernel_distances(self): # compute distances in feature space. dis_k_mat, _, _, _ = self.__graph_kernel.compute_distance_matrix() dis_k_vec = [] @@ -222,20 +245,25 @@ class MedianPreimageGenerator(PreimageGenerator): nb_cost_mat = np.array(n_edit_operations) nb_cost_mat_list = [nb_cost_mat] if self._verbose >= 2: - print('edit_cost_constants:', self.__edit_cost_constants) - print('residual_list:', residual_list) - - for itr in range(self.__max_itrs): + print('Current edit cost constants:', self.__edit_cost_constants) + print('Residual list:', residual_list) + + # run iteration from initial edit costs. + self.__converged = False + itrs_without_update = 0 + self.__itrs = 0 + self.__num_updates_ecc = 0 + timer = Timer(self.__time_limit_in_sec) + while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update): if self._verbose >= 2: - print('\niteration', itr) + print('\niteration', self.__itrs) time0 = time.time() - # "fit" geds to distances in feature space by tuning edit costs using the - # Least Squares Method. - np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm', - nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec, - n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init, - ged_mat=ged_mat) - self.__edit_cost_constants, residual = self.__update_ecc(nb_cost_mat, dis_k_vec) + # "fit" GEDs to distances in feature space by tuning edit costs using the Least Squares Method. +# np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm', +# nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec, +# n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init, +# ged_mat=ged_mat) + self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec) for i in range(len(self.__edit_cost_constants)): if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9: self.__edit_cost_constants[i] = 0 @@ -254,12 +282,59 @@ class MedianPreimageGenerator(PreimageGenerator): edit_cost_list.append(self.__edit_cost_constants) nb_cost_mat = np.array(n_edit_operations) nb_cost_mat_list.append(nb_cost_mat) + + # check convergence.
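The loop above "fits" GEDs to kernel-space distances by re-estimating the edit costs each iteration; __update_ecc appears to solve a constrained least-squares problem over the matrix of edit-operation counts (per its rw_constraints='inequality' default), though its body is not shown in this hunk. A rough, self-contained illustration of that fitting step, using SciPy's non-negative least squares in place of the library's own solver (an approximation of the idea, not the exact method):

# Rough illustration only: find non-negative edit costs w minimizing
# || nb_cost_mat @ w - dis_k_vec ||_2, the idea behind the "fit" step.
import numpy as np
from scipy.optimize import nnls

nb_cost_mat = np.array([[1., 0., 2.],      # toy counts of each edit operation
                        [0., 3., 1.],      # per graph pair
                        [2., 1., 0.]])
dis_k_vec = np.array([0.8, 1.1, 0.9])      # toy kernel-space distances
edit_costs, residual = nnls(nb_cost_mat, dis_k_vec)
print(edit_costs, residual)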
+ ec_changed = False + for i, cost in enumerate(self.__edit_cost_constants): +# if cost == 0: +# if edit_cost_list[-2][i] > self.__epsilon_ratio: +# ec_changed = True +# break +# elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ratio: +# ec_changed = True +# break + if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ratio: + ec_changed = True + break + residual_changed = False + if residual_list[-1] == 0: + if residual_list[-2] > self.__epsilon_ratio: + residual_changed = True + elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_ratio: + residual_changed = True + self.__converged = not (ec_changed or residual_changed) + if self.__converged: + itrs_without_update += 1 + else: + itrs_without_update = 0 + self.__num_updates_ecc += 1 + + # print current states. if self._verbose >= 2: - print('edit_cost_constants:', self.__edit_cost_constants) - print('residual_list:', residual_list) - -# return residual_list, edit_cost_list, dis_k_mat, ged_mat, \ -# time_list, nb_cost_mat_list + print() + print('-------------------------------------------------------------------------') + print('State of iteration', str(self.__itrs)) + print('-------------------------------------------------------------------------') +# print('Time spent:', self.__runtime_optimize_ec) + print('Total number of iterations for optimizing:', self.__itrs) + print('Total number of edit cost updates:', self.__num_updates_ecc) + print('Whether the optimization of edit costs converged:', self.__converged) + print('Did the edit costs change:', ec_changed) + print('Did the residual change:', residual_changed) + print('Iterations without update:', itrs_without_update) + print('Current edit cost constants:', self.__edit_cost_constants) + print('Residual list:', residual_list) + print('-------------------------------------------------------------------------') + + self.__itrs += 1 + + + def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): +# if self.__state == AlgorithmState.TERMINATED: +# self.__state = AlgorithmState.INITIALIZED + return True + return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'): @@ -591,6 +666,7 @@ class MedianPreimageGenerator(PreimageGenerator): gram_with_gm, withterm3=False)) idx_k_dis_median_set_min = np.argmin(k_dis_median_set) self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min] + self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy() if self._verbose >= 2: print() @@ -599,8 +675,6 @@ class MedianPreimageGenerator(PreimageGenerator): print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset) print('distance in kernel space for each graph in median set:', k_dis_median_set) -# return dis_k_sm, dis_k_gm, k_dis_median_set, dis_k_gi_min, idx_dis_k_gi_min - def __set_graph_kernel_by_name(self): if self.kernel_options['name'] == 'structuralspkernel': @@ -670,5 +744,20 @@ class MedianPreimageGenerator(PreimageGenerator): return self.__init_ecc @init_ecc.setter - def fit_method(self, value): - self.__init_ecc = value \ No newline at end of file + def init_ecc(self, value): + self.__init_ecc = value + + + @property + def set_median(self): + return self.__set_median + + + @property + def gen_median(self): + return self.__gen_median + + + @property + def 
best_from_dataset(self): + return self.__best_from_dataset \ No newline at end of file diff --git a/gklearn/preimage/test_median_preimage_generator.py b/gklearn/preimage/test_median_preimage_generator.py index 9b0ccc4..2f458af 100644 --- a/gklearn/preimage/test_median_preimage_generator.py +++ b/gklearn/preimage/test_median_preimage_generator.py @@ -20,9 +20,12 @@ def test_median_preimage_generator(): mpg = MedianPreimageGenerator() mpg_options = {'fit_method': 'k-graphs', 'init_ecc': [3, 3, 1, 3, 3], - 'max_itrs': 6, 'ds_name': 'Letter-high', 'parallel': True, + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_ratio': 0.01, 'verbose': 2} mpg.set_options(**mpg_options) mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) diff --git a/gklearn/preimage/utils.py b/gklearn/preimage/utils.py index 9fd186d..bd4de0b 100644 --- a/gklearn/preimage/utils.py +++ b/gklearn/preimage/utils.py @@ -19,146 +19,408 @@ from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, po from gklearn.kernels.structuralspKernel import structuralspkernel from gklearn.kernels.treeletKernel import treeletkernel from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel +from gklearn.utils import Dataset +import csv +import matplotlib.pyplot as plt +import networkx as nx + + +def generate_median_preimage_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, dir_save='', ): + from gklearn.preimage import MedianPreimageGenerator + from gklearn.utils import split_dataset_by_target + from gklearn.utils.graphfiles import saveGXL + + # 1. get dataset. + print('getting dataset...') + dataset_all = Dataset() + dataset_all.load_predefined_dataset(ds_name) + datasets = split_dataset_by_target(dataset_all) +# dataset.cut_graphs(range(0, 10)) + + if save_results: + # create result files. + print('creating output files...') + fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save) + + sod_sm_list = [] + sod_gm_list = [] + dis_k_sm_list = [] + dis_k_gm_list = [] + dis_k_gi_min_list = [] + time_precompute_gm_list = [] + time_optimize_ec_list = [] + time_generate_list = [] + time_total_list = [] + itrs_list = [] + converged_list = [] + num_updates_ecc_list = [] + nb_sod_sm2gm = [0, 0, 0] + nb_dis_k_sm2gm = [0, 0, 0] + nb_dis_k_gi2sm = [0, 0, 0] + nb_dis_k_gi2gm = [0, 0, 0] +# repeats_better_sod_sm2gm = [] +# repeats_better_dis_k_sm2gm = [] +# repeats_better_dis_k_gi2sm = [] +# repeats_better_dis_k_gi2gm = [] + + + print('start generating preimage for each class of target...') + for dataset in datasets: + print('\ntarget =', dataset.targets[0], '\n') + num_graphs = len(dataset.graphs) + + if num_graphs < 2: + print('\nnumber of graphs = ', num_graphs, ', skip.\n') + continue + + # 2. set parameters. + print('1. initializing mpg and setting parameters...') + mpg = MedianPreimageGenerator() + mpg.dataset = dataset + mpg.set_options(**mpg_options.copy()) + mpg.kernel_options = kernel_options.copy() + mpg.ged_options = ged_options.copy() + mpg.mge_options = mge_options.copy() + + # 3. compute median preimage. + print('2. computing median preimage...') + mpg.run() + results = mpg.get_results() + + # write result detail. 
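The per-class loop above is driven by the new split_dataset_by_target helper (added to gklearn/utils/dataset.py later in this diff). A minimal sketch of what it yields:

# Minimal sketch of the per-class split used above.
from gklearn.utils import Dataset, split_dataset_by_target

dataset_all = Dataset()
dataset_all.load_predefined_dataset('Letter-high')   # path now resolved relative to the package
for sub_dataset in split_dataset_by_target(dataset_all):
    print('target:', sub_dataset.targets[0], '| graphs:', len(sub_dataset.graphs))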
+ if save_results: + print('writing results to files...') + sod_sm2gm = get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median'])) + dis_k_sm2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_set_median'])) + dis_k_gi2sm = get_relations(np.sign(results['k_dis_set_median'] - results['k_dis_dataset'])) + dis_k_gi2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_dataset'])) + + f_detail = open(dir_save + fn_output_detail, 'a') + csv.writer(f_detail).writerow([ds_name, kernel_options['name'], + ged_options['edit_cost'], ged_options['method'], + ged_options['attr_distance'], mpg_options['fit_method'], + num_graphs, dataset.targets[0], 1, + results['sod_set_median'], results['sod_gen_median'], + results['k_dis_set_median'], results['k_dis_gen_median'], + results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm, + dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'], + results['runtime_precompute_gm'], results['runtime_optimize_ec'], + results['runtime_generate_preimage'], results['runtime_total'], + results['itrs'], results['converged'], + results['num_updates_ecc']]) + f_detail.close() + + # compute result summary. + sod_sm_list.append(results['sod_set_median']) + sod_gm_list.append(results['sod_gen_median']) + dis_k_sm_list.append(results['k_dis_set_median']) + dis_k_gm_list.append(results['k_dis_gen_median']) + dis_k_gi_min_list.append(results['k_dis_dataset']) + time_precompute_gm_list.append(results['runtime_precompute_gm']) + time_optimize_ec_list.append(results['runtime_optimize_ec']) + time_generate_list.append(results['runtime_generate_preimage']) + time_total_list.append(results['runtime_total']) + itrs_list.append(results['itrs']) + converged_list.append(results['converged']) + num_updates_ecc_list.append(results['num_updates_ecc']) + # # SOD SM -> GM + if results['sod_set_median'] > results['sod_gen_median']: + nb_sod_sm2gm[0] += 1 + # repeats_better_sod_sm2gm.append(1) + elif results['sod_set_median'] == results['sod_gen_median']: + nb_sod_sm2gm[1] += 1 + elif results['sod_set_median'] < results['sod_gen_median']: + nb_sod_sm2gm[2] += 1 + # # dis_k SM -> GM + if results['k_dis_set_median'] > results['k_dis_gen_median']: + nb_dis_k_sm2gm[0] += 1 + # repeats_better_dis_k_sm2gm.append(1) + elif results['k_dis_set_median'] == results['k_dis_gen_median']: + nb_dis_k_sm2gm[1] += 1 + elif results['k_dis_set_median'] < results['k_dis_gen_median']: + nb_dis_k_sm2gm[2] += 1 + # # dis_k gi -> SM + if results['k_dis_dataset'] > results['k_dis_set_median']: + nb_dis_k_gi2sm[0] += 1 + # repeats_better_dis_k_gi2sm.append(1) + elif results['k_dis_dataset'] == results['k_dis_set_median']: + nb_dis_k_gi2sm[1] += 1 + elif results['k_dis_dataset'] < results['k_dis_set_median']: + nb_dis_k_gi2sm[2] += 1 + # # dis_k gi -> GM + if results['k_dis_dataset'] > results['k_dis_gen_median']: + nb_dis_k_gi2gm[0] += 1 + # repeats_better_dis_k_gi2gm.append(1) + elif results['k_dis_dataset'] == results['k_dis_gen_median']: + nb_dis_k_gi2gm[1] += 1 + elif results['k_dis_dataset'] < results['k_dis_gen_median']: + nb_dis_k_gi2gm[2] += 1 + + # write result summary for each letter. 
+ f_summary = open(dir_save + fn_output_summary, 'a') + csv.writer(f_summary).writerow([ds_name, kernel_options['name'], + ged_options['edit_cost'], ged_options['method'], + ged_options['attr_distance'], mpg_options['fit_method'], + num_graphs, dataset.targets[0], + results['sod_set_median'], results['sod_gen_median'], + results['k_dis_set_median'], results['k_dis_gen_median'], + results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm, + dis_k_gi2sm, dis_k_gi2gm, + results['runtime_precompute_gm'], results['runtime_optimize_ec'], + results['runtime_generate_preimage'], results['runtime_total'], + results['itrs'], results['converged'], + results['num_updates_ecc'], nb_sod_sm2gm, + nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm]) + f_summary.close() + + # save median graphs. + if save_medians: + fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1) + saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', + node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, + node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs) + fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1) + saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default', + node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, + node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs) + fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1) + saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', + node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, + node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs) + + # plot median graphs. + if plot_medians and save_medians: + if ds_name in ('Letter-high', 'Letter-med', 'Letter-low'): + draw_Letter_graph(mpg.set_median, fn_pre_sm) + draw_Letter_graph(mpg.gen_median, fn_pre_gm) + draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset) + + + # write result summary over all classes.
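Taken together, a hedged end-to-end sketch of driving this function; the ged_options and mge_options values below are illustrative placeholders showing only the keys read in this file, not settings prescribed by this diff:

# Hedged end-to-end sketch. Only mpg_options mirrors the updated test exactly;
# the other option values are illustrative placeholders.
from gklearn.preimage.utils import generate_median_preimage_by_class

mpg_options = {'fit_method': 'k-graphs', 'init_ecc': [3, 3, 1, 3, 3],
               'ds_name': 'Letter-high', 'parallel': True,
               'time_limit_in_sec': 0, 'max_itrs': 100,
               'max_itrs_without_update': 3, 'epsilon_ratio': 0.01,
               'verbose': 2}
kernel_options = {'name': 'structuralspkernel'}  # plus the kernel's own parameters
ged_options = {'edit_cost': 'LETTER', 'method': 'IPFP',
               'attr_distance': 'euclidean'}     # illustrative values
mge_options = {}                                 # estimator defaults
# dir_save (and a 'medians/' subfolder inside it) are assumed to exist.
generate_median_preimage_by_class('Letter-high', mpg_options, kernel_options,
                                  ged_options, mge_options, save_results=True,
                                  save_medians=True, plot_medians=True,
                                  dir_save='outputs/')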
+ if save_results: + sod_sm_mean = np.mean(sod_sm_list) + sod_gm_mean = np.mean(sod_gm_list) + dis_k_sm_mean = np.mean(dis_k_sm_list) + dis_k_gm_mean = np.mean(dis_k_gm_list) + dis_k_gi_min_mean = np.mean(dis_k_gi_min_list) + time_precompute_gm_mean = np.mean(time_precompute_gm_list) + time_optimize_ec_mean = np.mean(time_optimize_ec_list) + time_generate_mean = np.mean(time_generate_list) + time_total_mean = np.mean(time_total_list) + itrs_mean = np.mean(itrs_list) + num_converged = np.sum(converged_list) + num_updates_ecc_mean = np.mean(num_updates_ecc_list) + sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean)) + dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) + dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) + dis_k_gi2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) + f_summary = open(dir_save + fn_output_summary, 'a') + csv.writer(f_summary).writerow([ds_name, kernel_options['name'], + ged_options['edit_cost'], ged_options['method'], + ged_options['attr_distance'], mpg_options['fit_method'], + num_graphs, 'all', + sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, + dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, + dis_k_gi2sm_mean, dis_k_gi2gm_mean, + time_precompute_gm_mean, time_optimize_ec_mean, + time_generate_mean, time_total_mean, itrs_mean, + num_converged, num_updates_ecc_mean]) + f_summary.close() + + print('\ncomplete.') + + +def __init_output_file(ds_name, gkernel, fit_method, dir_output): +# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' + fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' + f_detail = open(dir_output + fn_output_detail, 'a') + csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost', + 'GED method', 'attr distance', 'fit method', 'k', + 'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', + 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', + 'dis_k gi -> GM', 'edit cost constants', 'time precompute gm', + 'time optimize ec', 'time generate preimage', 'time total', + 'itrs', 'converged', 'num updates ecc']) + f_detail.close() + +# fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' + fn_output_summary = 'results_summary.' + ds_name + '.' 
+ gkernel + '.csv' + f_summary = open(dir_output + fn_output_summary, 'a') + csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost', + 'GED method', 'attr distance', 'fit method', 'k', + 'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', + 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', + 'dis_k gi -> GM', 'time precompute gm', 'time optimize ec', + 'time generate preimage', 'time total', 'itrs', 'num converged', + 'num updates ecc', '# SOD SM -> GM', '# dis_k SM -> GM', + '# dis_k gi -> SM', '# dis_k gi -> GM']) +# 'repeats better SOD SM -> GM', +# 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', +# 'repeats better dis_k gi -> GM']) + f_summary.close() + + return fn_output_detail, fn_output_summary + + +def get_relations(sign): + if sign == -1: + return 'better' + elif sign == 0: + return 'same' + elif sign == 1: + return 'worse' + + +# draw the current median graph (used for the Letter datasets). +def draw_Letter_graph(graph, file_prefix): + plt.figure() + pos = {} + for n in graph.nodes: + pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])]) # use .nodes; G.node was removed in NetworkX 2.4 + nx.draw_networkx(graph, pos) + plt.savefig(file_prefix + '.eps', format='eps', dpi=300) +# plt.show() + plt.clf() def remove_edges(Gn): - for G in Gn: - for _, _, attrs in G.edges(data=True): - attrs.clear() - + for G in Gn: + for _, _, attrs in G.edges(data=True): + attrs.clear() + def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True): - term1 = Kmatrix[idx_g, idx_g] - term2 = 0 - for i, a in enumerate(alpha): - term2 += a * Kmatrix[idx_g, idx_gi[i]] - term2 *= 2 - if withterm3 == False: - for i1, a1 in enumerate(alpha): - for i2, a2 in enumerate(alpha): - term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] - return np.sqrt(term1 - term2 + term3) + term1 = Kmatrix[idx_g, idx_g] + term2 = 0 + for i, a in enumerate(alpha): + term2 += a * Kmatrix[idx_g, idx_gi[i]] + term2 *= 2 + if withterm3 == False: + for i1, a1 in enumerate(alpha): + for i2, a2 in enumerate(alpha): + term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] + return np.sqrt(term1 - term2 + term3) def compute_k_dis(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True): - term1 = Kmatrix[idx_g, idx_g] - term2 = 0 - for i, a in enumerate(alpha): - term2 += a * Kmatrix[idx_g, idx_gi[i]] - term2 *= 2 - if withterm3 == False: - for i1, a1 in enumerate(alpha): - for i2, a2 in enumerate(alpha): - term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] - return np.sqrt(term1 - term2 + term3) + term1 = Kmatrix[idx_g, idx_g] + term2 = 0 + for i, a in enumerate(alpha): + term2 += a * Kmatrix[idx_g, idx_gi[i]] + term2 *= 2 + if withterm3 == False: + for i1, a1 in enumerate(alpha): + for i2, a2 in enumerate(alpha): + term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] + return np.sqrt(term1 - term2 + term3) def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'): - if graph_kernel == 'marginalizedkernel': - Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label, - p_quit=0.03, n_iteration=10, remove_totters=False, - n_jobs=multiprocessing.cpu_count(), verbose=verbose) - elif graph_kernel == 'untilhpathkernel': - Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, - depth=7, k_func='MinMax', compute_method='trie', - parallel=parallel, - n_jobs=multiprocessing.cpu_count(), verbose=verbose) - elif graph_kernel == 'spkernel': - mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) - Kmatrix = np.empty((len(Gn), len(Gn))) -# Kmatrix[:] = np.nan - 
Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels= - {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, - n_jobs=multiprocessing.cpu_count(), verbose=verbose) -# for i, row in enumerate(idx): -# for j, col in enumerate(idx): -# Kmatrix[row, col] = Kmatrix_tmp[i, j] - elif graph_kernel == 'structuralspkernel': - mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) - sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, - edge_label=edge_label, node_kernels=sub_kernels, - edge_kernels=sub_kernels, - parallel=parallel, n_jobs=multiprocessing.cpu_count(), - verbose=verbose) - elif graph_kernel == 'treeletkernel': - pkernel = functools.partial(polynomialkernel, d=2, c=1e5) -# pkernel = functools.partial(gaussiankernel, gamma=1e-6) - mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) - Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, - sub_kernel=pkernel, parallel=parallel, - n_jobs=multiprocessing.cpu_count(), verbose=verbose) - elif graph_kernel == 'weisfeilerlehmankernel': - Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label, - height=4, base_kernel='subtree', parallel=None, - n_jobs=multiprocessing.cpu_count(), verbose=verbose) - - # normalization - Kmatrix_diag = Kmatrix.diagonal().copy() - for i in range(len(Kmatrix)): - for j in range(i, len(Kmatrix)): - Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) - Kmatrix[j][i] = Kmatrix[i][j] - return Kmatrix - + if graph_kernel == 'marginalizedkernel': + Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label, + p_quit=0.03, n_iteration=10, remove_totters=False, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) + elif graph_kernel == 'untilhpathkernel': + Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label, + depth=7, k_func='MinMax', compute_method='trie', + parallel=parallel, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) + elif graph_kernel == 'spkernel': + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + Kmatrix = np.empty((len(Gn), len(Gn))) +# Kmatrix[:] = np.nan + Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels= + {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) +# for i, row in enumerate(idx): +# for j, col in enumerate(idx): +# Kmatrix[row, col] = Kmatrix_tmp[i, j] + elif graph_kernel == 'structuralspkernel': + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, + edge_label=edge_label, node_kernels=sub_kernels, + edge_kernels=sub_kernels, + parallel=parallel, n_jobs=multiprocessing.cpu_count(), + verbose=verbose) + elif graph_kernel == 'treeletkernel': + pkernel = functools.partial(polynomialkernel, d=2, c=1e5) +# pkernel = functools.partial(gaussiankernel, gamma=1e-6) + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label, + sub_kernel=pkernel, parallel=parallel, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) + elif graph_kernel == 'weisfeilerlehmankernel': + Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label, + height=4, 
base_kernel='subtree', parallel=None, + n_jobs=multiprocessing.cpu_count(), verbose=verbose) + + # normalization + Kmatrix_diag = Kmatrix.diagonal().copy() + for i in range(len(Kmatrix)): + for j in range(i, len(Kmatrix)): + Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) + Kmatrix[j][i] = Kmatrix[i][j] + return Kmatrix + def gram2distances(Kmatrix): - dmatrix = np.zeros((len(Kmatrix), len(Kmatrix))) - for i1 in range(len(Kmatrix)): - for i2 in range(len(Kmatrix)): - dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2] - dmatrix = np.sqrt(dmatrix) - return dmatrix + dmatrix = np.zeros((len(Kmatrix), len(Kmatrix))) + for i1 in range(len(Kmatrix)): + for i2 in range(len(Kmatrix)): + dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2] + dmatrix = np.sqrt(dmatrix) + return dmatrix def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, - gkernel=None, verbose=True): - dis_mat = np.empty((len(Gn), len(Gn))) - if Kmatrix is None: - Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose) - for i in range(len(Gn)): - for j in range(i, len(Gn)): - dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j] - if dis < 0: - if dis > -1e-10: - dis = 0 - else: - raise ValueError('The distance is negative.') - dis_mat[i, j] = np.sqrt(dis) - dis_mat[j, i] = dis_mat[i, j] - dis_max = np.max(np.max(dis_mat)) - dis_min = np.min(np.min(dis_mat[dis_mat != 0])) - dis_mean = np.mean(np.mean(dis_mat)) - return dis_mat, dis_max, dis_min, dis_mean + gkernel=None, verbose=True): + dis_mat = np.empty((len(Gn), len(Gn))) + if Kmatrix is None: + Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose) + for i in range(len(Gn)): + for j in range(i, len(Gn)): + dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j] + if dis < 0: + if dis > -1e-10: + dis = 0 + else: + raise ValueError('The distance is negative.') + dis_mat[i, j] = np.sqrt(dis) + dis_mat[j, i] = dis_mat[i, j] + dis_max = np.max(np.max(dis_mat)) + dis_min = np.min(np.min(dis_mat[dis_mat != 0])) + dis_mean = np.mean(np.mean(dis_mat)) + return dis_mat, dis_max, dis_min, dis_mean def get_same_item_indices(ls): - """Get the indices of the same items in a list. Return a dict keyed by items. - """ - idx_dict = {} - for idx, item in enumerate(ls): - if item in idx_dict: - idx_dict[item].append(idx) - else: - idx_dict[item] = [idx] - return idx_dict + """Get the indices of the same items in a list. Return a dict keyed by items. + """ + idx_dict = {} + for idx, item in enumerate(ls): + if item in idx_dict: + idx_dict[item].append(idx) + else: + idx_dict[item] = [idx] + return idx_dict def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None, - node_label=None, edge_label=None): - dis_k_all = [] # distance between g_star and each graph. - alpha = [1 / len(Gn)] * len(Gn) - if Kmatrix is None: - Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) - term3 = 0 - for i1, a1 in enumerate(alpha): - for i2, a2 in enumerate(alpha): - term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] - for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout): - dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3) - dis_all.append(dtemp) + node_label=None, edge_label=None): + dis_k_all = [] # distance between g_star and each graph. 
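The kernel_distance_matrix and gram2distances helpers above both rely on the standard identity for distances induced by a kernel, d(g_i, g_j)^2 = K[i,i] + K[j,j] - 2*K[i,j]. A small numeric check:

# Numeric check of the Gram-to-distance identity used above.
import numpy as np

K = np.array([[1.0, 0.6],
              [0.6, 1.0]])                       # a normalized 2x2 Gram matrix
d01 = np.sqrt(K[0, 0] + K[1, 1] - 2 * K[0, 1])   # distance between graphs 0 and 1
print(d01)                                       # ~0.8944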
+ alpha = [1 / len(Gn)] * len(Gn) + if Kmatrix is None: + Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True) + term3 = 0 + for i1, a1 in enumerate(alpha): + for i2, a2 in enumerate(alpha): + term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] + for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout): + dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3) + dis_all.append(dtemp) def normalize_distance_matrix(D): - max_value = np.amax(D) - min_value = np.amin(D) - return (D - min_value) / (max_value - min_value) \ No newline at end of file + max_value = np.amax(D) + min_value = np.amin(D) + return (D - min_value) / (max_value - min_value) \ No newline at end of file diff --git a/gklearn/utils/__init__.py b/gklearn/utils/__init__.py index 2654b7a..84d54f3 100644 --- a/gklearn/utils/__init__.py +++ b/gklearn/utils/__init__.py @@ -15,5 +15,5 @@ __date__ = "November 2017" # from utils import graphfiles # from utils import utils -from gklearn.utils.dataset import Dataset +from gklearn.utils.dataset import Dataset, split_dataset_by_target from gklearn.utils.timer import Timer diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index 7a18e40..08e2718 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -8,6 +8,7 @@ Created on Thu Mar 26 18:48:27 2020 import numpy as np import networkx as nx from gklearn.utils.graphfiles import loadDataset +import os class Dataset(object): @@ -15,7 +16,7 @@ class Dataset(object): def __init__(self, filename=None, filename_y=None, extra_params=None): if filename is None: self.__graphs = None - self.__target = None + self.__targets = None self.__node_labels = None self.__edge_labels = None self.__node_attrs = None @@ -50,33 +51,40 @@ class Dataset(object): def load_dataset(self, filename, filename_y=None, extra_params=None): - self.__graphs, self.__target = loadDataset(filename, filename_y=filename_y, extra_params=extra_params) + self.__graphs, self.__targets = loadDataset(filename, filename_y=filename_y, extra_params=extra_params) + self.set_labels_attrs() + + + def load_graphs(self, graphs, targets=None): + self.__graphs = graphs + self.__targets = targets self.set_labels_attrs() def load_predefined_dataset(self, ds_name): + current_path = os.path.dirname(os.path.realpath(__file__)) + '/' if ds_name == 'Letter-high': # node non-symb - ds_file = '../../datasets/Letter-high/Letter-high_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'Letter-med': # node non-symb - ds_file = '../../datasets/Letter-high/Letter-med_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'Letter-low': # node non-symb - ds_file = '../../datasets/Letter-high/Letter-low_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'Fingerprint': - ds_file = '../../datasets/Fingerprint/Fingerprint_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'SYNTHETIC': pass elif ds_name == 'SYNTHETICnew': - ds_file = 
'../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'Synthie': pass elif ds_name == 'COIL-DEL': - ds_file = '../../datasets/COIL-DEL/COIL-DEL_A.txt' - self.__graphs, self.__target = loadDataset(ds_file) + ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' + self.__graphs, self.__targets = loadDataset(ds_file) elif ds_name == 'COIL-RAG': pass elif ds_name == 'COLORS-3': @@ -514,7 +522,7 @@ class Dataset(object): def __get_class_num(self): - return len(set(self.__target)) + return len(set(self.__targets)) def __get_node_attr_dim(self): @@ -529,6 +537,11 @@ class Dataset(object): def graphs(self): return self.__graphs + + @property + def targets(self): + return self.__targets + @property def node_labels(self): @@ -547,4 +560,19 @@ class Dataset(object): @property def edge_attrs(self): - return self.__edge_attrs \ No newline at end of file + return self.__edge_attrs + + +def split_dataset_by_target(dataset): + from gklearn.preimage.utils import get_same_item_indices + + graphs = dataset.graphs + targets = dataset.targets + datasets = [] + idx_targets = get_same_item_indices(targets) + for key, val in idx_targets.items(): + sub_graphs = [graphs[i] for i in val] + sub_dataset = Dataset() + sub_dataset.load_graphs(sub_graphs, [key] * len(val)) + datasets.append(sub_dataset) + return datasets \ No newline at end of file diff --git a/gklearn/utils/graphfiles.py b/gklearn/utils/graphfiles.py index 6c0e2e9..862cda1 100644 --- a/gklearn/utils/graphfiles.py +++ b/gklearn/utils/graphfiles.py @@ -3,762 +3,760 @@ from os.path import dirname, splitext def loadCT(filename): - """load data from a Chemical Table (.ct) file. - - Notes - ------ - a typical example of data in .ct is like this: - - 3 2 <- number of nodes and edges - - 0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label) - - 0.0000 0.0000 0.0000 C - - 0.0000 0.0000 0.0000 O - - 1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo - - 2 3 1 1 - - Check `CTFile Formats file `__ - for detailed format discription. - """ - import networkx as nx - from os.path import basename - g = nx.Graph() - with open(filename) as f: - content = f.read().splitlines() - g = nx.Graph( - name = str(content[0]), - filename = basename(filename)) # set name of the graph - tmp = content[1].split(" ") - if tmp[0] == '': - nb_nodes = int(tmp[1]) # number of the nodes - nb_edges = int(tmp[2]) # number of the edges - else: - nb_nodes = int(tmp[0]) - nb_edges = int(tmp[1]) - # patch for compatibility : label will be removed later - for i in range(0, nb_nodes): - tmp = content[i + 2].split(" ") - tmp = [x for x in tmp if x != ''] - g.add_node(i, atom=tmp[3].strip(), - label=[item.strip() for item in tmp[3:]], - attributes=[item.strip() for item in tmp[0:3]]) - for i in range(0, nb_edges): - tmp = content[i + g.number_of_nodes() + 2].split(" ") - tmp = [x for x in tmp if x != ''] - g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, - bond_type=tmp[2].strip(), - label=[item.strip() for item in tmp[2:]]) - return g + """load data from a Chemical Table (.ct) file. 
+ + Notes + ------ + a typical example of data in .ct is like this: + + 3 2 <- number of nodes and edges + + 0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label) + + 0.0000 0.0000 0.0000 C + + 0.0000 0.0000 0.0000 O + + 1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo + + 2 3 1 1 + + Check `CTFile Formats file `__ + for detailed format discription. + """ + import networkx as nx + from os.path import basename + g = nx.Graph() + with open(filename) as f: + content = f.read().splitlines() + g = nx.Graph( + name = str(content[0]), + filename = basename(filename)) # set name of the graph + tmp = content[1].split(" ") + if tmp[0] == '': + nb_nodes = int(tmp[1]) # number of the nodes + nb_edges = int(tmp[2]) # number of the edges + else: + nb_nodes = int(tmp[0]) + nb_edges = int(tmp[1]) + # patch for compatibility : label will be removed later + for i in range(0, nb_nodes): + tmp = content[i + 2].split(" ") + tmp = [x for x in tmp if x != ''] + g.add_node(i, atom=tmp[3].strip(), + label=[item.strip() for item in tmp[3:]], + attributes=[item.strip() for item in tmp[0:3]]) + for i in range(0, nb_edges): + tmp = content[i + g.number_of_nodes() + 2].split(" ") + tmp = [x for x in tmp if x != ''] + g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, + bond_type=tmp[2].strip(), + label=[item.strip() for item in tmp[2:]]) + return g def loadGXL(filename): - from os.path import basename - import networkx as nx - import xml.etree.ElementTree as ET - - tree = ET.parse(filename) - root = tree.getroot() - index = 0 - g = nx.Graph(filename=basename(filename), name=root[0].attrib['id']) - dic = {} # used to retrieve incident nodes of edges - for node in root.iter('node'): - dic[node.attrib['id']] = index - labels = {} - for attr in node.iter('attr'): - labels[attr.attrib['name']] = attr[0].text - if 'chem' in labels: - labels['label'] = labels['chem'] - labels['atom'] = labels['chem'] - g.add_node(index, **labels) - index += 1 - - for edge in root.iter('edge'): - labels = {} - for attr in edge.iter('attr'): - labels[attr.attrib['name']] = attr[0].text - if 'valence' in labels: - labels['label'] = labels['valence'] - labels['bond_type'] = labels['valence'] - g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) - return g - - -def saveGXL(graph, filename, method='default'): - if method == 'default': - gxl_file = open(filename, 'w') - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - for v, attrs in graph.nodes(data=True): - if graph.graph['node_labels'] == [] and graph.graph['node_attrs'] == []: - gxl_file.write("\n") - else: - gxl_file.write("") - for l_name in graph.graph['node_labels']: - gxl_file.write("" + - str(attrs[l_name]) + "") - for a_name in graph.graph['node_attrs']: - gxl_file.write("" + - str(attrs[a_name]) + "") - gxl_file.write("\n") - for v1, v2, attrs in graph.edges(data=True): - if graph.graph['edge_labels'] == [] and graph.graph['edge_attrs'] == []: - gxl_file.write("\n") - else: - gxl_file.write("") - for l_name in graph.graph['edge_labels']: - gxl_file.write("" + - str(attrs[l_name]) + "") - for a_name in graph.graph['edge_attrs']: - gxl_file.write("" + - str(attrs[a_name]) + "") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("") - gxl_file.close() - elif method == 'benoit': - import xml.etree.ElementTree as ET - root_node = ET.Element('gxl') - attr = dict() - attr['id'] = str(graph.graph['name']) - attr['edgeids'] = 'true' - attr['edgemode'] = 'undirected' - graph_node = 
ET.SubElement(root_node, 'graph', attrib=attr) - - for v in graph: - current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) - for attr in graph.nodes[v].keys(): - cur_attr = ET.SubElement( - current_node, 'attr', attrib={'name': attr}) - cur_value = ET.SubElement(cur_attr, - graph.nodes[v][attr].__class__.__name__) - cur_value.text = graph.nodes[v][attr] - - for v1 in graph: - for v2 in graph[v1]: - if (v1 < v2): # Non oriented graphs - cur_edge = ET.SubElement( - graph_node, - 'edge', - attrib={ - 'from': str(v1), - 'to': str(v2) - }) - for attr in graph[v1][v2].keys(): - cur_attr = ET.SubElement( - cur_edge, 'attr', attrib={'name': attr}) - cur_value = ET.SubElement( - cur_attr, graph[v1][v2][attr].__class__.__name__) - cur_value.text = str(graph[v1][v2][attr]) - - tree = ET.ElementTree(root_node) - tree.write(filename) - elif method == 'gedlib': - # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 -# pass - gxl_file = open(filename, 'w') - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - for v, attrs in graph.nodes(data=True): - gxl_file.write("") - gxl_file.write("" + str(attrs['chem']) + "") - gxl_file.write("\n") - for v1, v2, attrs in graph.edges(data=True): - gxl_file.write("") - gxl_file.write("" + str(attrs['valence']) + "") -# gxl_file.write("" + "1" + "") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("") - gxl_file.close() - elif method == 'gedlib-letter': - # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 - # and https://github.com/dbblumenthal/gedlib/blob/master/data/datasets/Letter/HIGH/AP1_0000.gxl - gxl_file = open(filename, 'w') - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("\n") - for v, attrs in graph.nodes(data=True): - gxl_file.write("") - gxl_file.write("" + str(attrs['attributes'][0]) + "") - gxl_file.write("" + str(attrs['attributes'][1]) + "") - gxl_file.write("\n") - for v1, v2, attrs in graph.edges(data=True): - gxl_file.write("\n") - gxl_file.write("\n") - gxl_file.write("") - gxl_file.close() + from os.path import basename + import networkx as nx + import xml.etree.ElementTree as ET + + tree = ET.parse(filename) + root = tree.getroot() + index = 0 + g = nx.Graph(filename=basename(filename), name=root[0].attrib['id']) + dic = {} # used to retrieve incident nodes of edges + for node in root.iter('node'): + dic[node.attrib['id']] = index + labels = {} + for attr in node.iter('attr'): + labels[attr.attrib['name']] = attr[0].text + if 'chem' in labels: + labels['label'] = labels['chem'] + labels['atom'] = labels['chem'] + g.add_node(index, **labels) + index += 1 + + for edge in root.iter('edge'): + labels = {} + for attr in edge.iter('attr'): + labels[attr.attrib['name']] = attr[0].text + if 'valence' in labels: + labels['label'] = labels['valence'] + labels['bond_type'] = labels['valence'] + g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) + return g + + +def saveGXL(graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): + if method == 'default': + gxl_file = open(filename, 'w') + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + if 'name' in graph.graph: + name = str(graph.graph['name']) + else: + name = 'dummy' + gxl_file.write("\n") + for v, attrs in graph.nodes(data=True): + gxl_file.write("") + for l_name in node_labels: + gxl_file.write("" + + str(attrs[l_name]) + "") + 
for a_name in node_attrs: + gxl_file.write("" + + str(attrs[a_name]) + "") + gxl_file.write("\n") + for v1, v2, attrs in graph.edges(data=True): + gxl_file.write("") + for l_name in edge_labels: + gxl_file.write("" + + str(attrs[l_name]) + "") + for a_name in edge_attrs: + gxl_file.write("" + + str(attrs[a_name]) + "") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("") + gxl_file.close() + elif method == 'benoit': + import xml.etree.ElementTree as ET + root_node = ET.Element('gxl') + attr = dict() + attr['id'] = str(graph.graph['name']) + attr['edgeids'] = 'true' + attr['edgemode'] = 'undirected' + graph_node = ET.SubElement(root_node, 'graph', attrib=attr) + + for v in graph: + current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) + for attr in graph.nodes[v].keys(): + cur_attr = ET.SubElement( + current_node, 'attr', attrib={'name': attr}) + cur_value = ET.SubElement(cur_attr, + graph.nodes[v][attr].__class__.__name__) + cur_value.text = graph.nodes[v][attr] + + for v1 in graph: + for v2 in graph[v1]: + if (v1 < v2): # Non oriented graphs + cur_edge = ET.SubElement( + graph_node, + 'edge', + attrib={ + 'from': str(v1), + 'to': str(v2) + }) + for attr in graph[v1][v2].keys(): + cur_attr = ET.SubElement( + cur_edge, 'attr', attrib={'name': attr}) + cur_value = ET.SubElement( + cur_attr, graph[v1][v2][attr].__class__.__name__) + cur_value.text = str(graph[v1][v2][attr]) + + tree = ET.ElementTree(root_node) + tree.write(filename) + elif method == 'gedlib': + # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 +# pass + gxl_file = open(filename, 'w') + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + for v, attrs in graph.nodes(data=True): + gxl_file.write("") + gxl_file.write("" + str(attrs['chem']) + "") + gxl_file.write("\n") + for v1, v2, attrs in graph.edges(data=True): + gxl_file.write("") + gxl_file.write("" + str(attrs['valence']) + "") +# gxl_file.write("" + "1" + "") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("") + gxl_file.close() + elif method == 'gedlib-letter': + # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 + # and https://github.com/dbblumenthal/gedlib/blob/master/data/datasets/Letter/HIGH/AP1_0000.gxl + gxl_file = open(filename, 'w') + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + for v, attrs in graph.nodes(data=True): + gxl_file.write("") + gxl_file.write("" + str(attrs['attributes'][0]) + "") + gxl_file.write("" + str(attrs['attributes'][1]) + "") + gxl_file.write("\n") + for v1, v2, attrs in graph.edges(data=True): + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("") + gxl_file.close() def loadSDF(filename): - """load data from structured data file (.sdf file). - - Notes - ------ - A SDF file contains a group of molecules, represented in the similar way as in MOL format. - Check `here `__ for detailed structure. 
- """ - import networkx as nx - from os.path import basename - from tqdm import tqdm - import sys - data = [] - with open(filename) as f: - content = f.read().splitlines() - index = 0 - pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout) - while index < len(content): - index_old = index - - g = nx.Graph(name=content[index].strip()) # set name of the graph - - tmp = content[index + 3] - nb_nodes = int(tmp[:3]) # number of the nodes - nb_edges = int(tmp[3:6]) # number of the edges - - for i in range(0, nb_nodes): - tmp = content[i + index + 4] - g.add_node(i, atom=tmp[31:34].strip()) - - for i in range(0, nb_edges): - tmp = content[i + index + g.number_of_nodes() + 4] - tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)] - g.add_edge( - int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip()) - - data.append(g) - - index += 4 + g.number_of_nodes() + g.number_of_edges() - while content[index].strip() != '$$$$': # seperator - index += 1 - index += 1 - - pbar.update(index - index_old) - pbar.update(1) - pbar.close() - - return data + """load data from structured data file (.sdf file). + + Notes + ------ + A SDF file contains a group of molecules, represented in the similar way as in MOL format. + Check `here `__ for detailed structure. + """ + import networkx as nx + from os.path import basename + from tqdm import tqdm + import sys + data = [] + with open(filename) as f: + content = f.read().splitlines() + index = 0 + pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout) + while index < len(content): + index_old = index + + g = nx.Graph(name=content[index].strip()) # set name of the graph + + tmp = content[index + 3] + nb_nodes = int(tmp[:3]) # number of the nodes + nb_edges = int(tmp[3:6]) # number of the edges + + for i in range(0, nb_nodes): + tmp = content[i + index + 4] + g.add_node(i, atom=tmp[31:34].strip()) + + for i in range(0, nb_edges): + tmp = content[i + index + g.number_of_nodes() + 4] + tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)] + g.add_edge( + int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip()) + + data.append(g) + + index += 4 + g.number_of_nodes() + g.number_of_edges() + while content[index].strip() != '$$$$': # seperator + index += 1 + index += 1 + + pbar.update(index - index_old) + pbar.update(1) + pbar.close() + + return data def loadMAT(filename, extra_params): - """Load graph data from a MATLAB (up to version 7.1) .mat file. - - Notes - ------ - A MAT file contains a struct array containing graphs, and a column vector lx containing a class label for each graph. - Check README in `downloadable file `__ for detailed structure. 
- """ - from scipy.io import loadmat - import numpy as np - import networkx as nx - data = [] - content = loadmat(filename) - order = extra_params['am_sp_al_nl_el'] - # print(content) - # print('----') - for key, value in content.items(): - if key[0] == 'l': # class label - y = np.transpose(value)[0].tolist() - # print(y) - elif key[0] != '_': - # print(value[0][0][0]) - # print() - # print(value[0][0][1]) - # print() - # print(value[0][0][2]) - # print() - # if len(value[0][0]) > 3: - # print(value[0][0][3]) - # print('----') - # if adjacency matrix is not compressed / edge label exists - if order[1] == 0: - for i, item in enumerate(value[0]): - # print(item) - # print('------') - g = nx.Graph(name=i) # set name of the graph - nl = np.transpose(item[order[3]][0][0][0]) # node label - # print(item[order[3]]) - # print() - for index, label in enumerate(nl[0]): - g.add_node(index, atom=str(label)) - el = item[order[4]][0][0][0] # edge label - for edge in el: - g.add_edge( - edge[0] - 1, edge[1] - 1, bond_type=str(edge[2])) - data.append(g) - else: - from scipy.sparse import csc_matrix - for i, item in enumerate(value[0]): - # print(item) - # print('------') - g = nx.Graph(name=i) # set name of the graph - nl = np.transpose(item[order[3]][0][0][0]) # node label - # print(nl) - # print() - for index, label in enumerate(nl[0]): - g.add_node(index, atom=str(label)) - sam = item[order[0]] # sparse adjacency matrix - index_no0 = sam.nonzero() - for col, row in zip(index_no0[0], index_no0[1]): - # print(col) - # print(row) - g.add_edge(col, row) - data.append(g) - # print(g.edges(data=True)) - return data, y + """Load graph data from a MATLAB (up to version 7.1) .mat file. + + Notes + ------ + A MAT file contains a struct array containing graphs, and a column vector lx containing a class label for each graph. + Check README in `downloadable file `__ for detailed structure. 
+ """ + from scipy.io import loadmat + import numpy as np + import networkx as nx + data = [] + content = loadmat(filename) + order = extra_params['am_sp_al_nl_el'] + # print(content) + # print('----') + for key, value in content.items(): + if key[0] == 'l': # class label + y = np.transpose(value)[0].tolist() + # print(y) + elif key[0] != '_': + # print(value[0][0][0]) + # print() + # print(value[0][0][1]) + # print() + # print(value[0][0][2]) + # print() + # if len(value[0][0]) > 3: + # print(value[0][0][3]) + # print('----') + # if adjacency matrix is not compressed / edge label exists + if order[1] == 0: + for i, item in enumerate(value[0]): + # print(item) + # print('------') + g = nx.Graph(name=i) # set name of the graph + nl = np.transpose(item[order[3]][0][0][0]) # node label + # print(item[order[3]]) + # print() + for index, label in enumerate(nl[0]): + g.add_node(index, atom=str(label)) + el = item[order[4]][0][0][0] # edge label + for edge in el: + g.add_edge( + edge[0] - 1, edge[1] - 1, bond_type=str(edge[2])) + data.append(g) + else: + from scipy.sparse import csc_matrix + for i, item in enumerate(value[0]): + # print(item) + # print('------') + g = nx.Graph(name=i) # set name of the graph + nl = np.transpose(item[order[3]][0][0][0]) # node label + # print(nl) + # print() + for index, label in enumerate(nl[0]): + g.add_node(index, atom=str(label)) + sam = item[order[0]] # sparse adjacency matrix + index_no0 = sam.nonzero() + for col, row in zip(index_no0[0], index_no0[1]): + # print(col) + # print(row) + g.add_edge(col, row) + data.append(g) + # print(g.edges(data=True)) + return data, y def loadTXT(filename): - """Load graph data from a .txt file. - - Notes - ------ - The graph data is loaded from separate files. - Check README in `downloadable file `__, 2018 for detailed structure. - """ -# import numpy as np - import networkx as nx - from os import listdir - from os.path import dirname, basename - - - def get_label_names(frm): - """Get label names from DS_label_readme.txt file. - """ - - def get_names_from_line(line): - """Get names of labels/attributes from a line. - """ - str_names = line.split('[')[1].split(']')[0] - names = str_names.split(',') - names = [attr.strip() for attr in names] - return names - - - label_names = {'node_labels': [], 'node_attrs': [], - 'edge_labels': [], 'edge_attrs': []} - content_rm = open(frm).read().splitlines() - for line in content_rm: - line = line.strip() - if line.startswith('Node labels:'): - label_names['node_labels'] = get_names_from_line(line) - elif line.startswith('Node attributes:'): - label_names['node_attrs'] = get_names_from_line(line) - elif line.startswith('Edge labels:'): - label_names['edge_labels'] = get_names_from_line(line) - elif line.startswith('Edge attributes:'): - label_names['edge_attrs'] = get_names_from_line(line) - return label_names - - - # get dataset name. 

def loadTXT(filename):
- """Load graph data from a .txt file.
-
- Notes
- ------
- The graph data is loaded from separate files.
- Check README in `downloadable file `__, 2018 for detailed structure.
- """
-# import numpy as np
- import networkx as nx
- from os import listdir
- from os.path import dirname, basename
-
-
- def get_label_names(frm):
- """Get label names from DS_label_readme.txt file.
- """
-
- def get_names_from_line(line):
- """Get names of labels/attributes from a line.
- """
- str_names = line.split('[')[1].split(']')[0]
- names = str_names.split(',')
- names = [attr.strip() for attr in names]
- return names
-
-
- label_names = {'node_labels': [], 'node_attrs': [],
- 'edge_labels': [], 'edge_attrs': []}
- content_rm = open(frm).read().splitlines()
- for line in content_rm:
- line = line.strip()
- if line.startswith('Node labels:'):
- label_names['node_labels'] = get_names_from_line(line)
- elif line.startswith('Node attributes:'):
- label_names['node_attrs'] = get_names_from_line(line)
- elif line.startswith('Edge labels:'):
- label_names['edge_labels'] = get_names_from_line(line)
- elif line.startswith('Edge attributes:'):
- label_names['edge_attrs'] = get_names_from_line(line)
- return label_names
-
-
- # get dataset name.
- dirname_dataset = dirname(filename)
- filename = basename(filename)
- fn_split = filename.split('_A')
- ds_name = fn_split[0].strip()
-
- # load data file names
- for name in listdir(dirname_dataset):
- if ds_name + '_A' in name:
- fam = dirname_dataset + '/' + name
- elif ds_name + '_graph_indicator' in name:
- fgi = dirname_dataset + '/' + name
- elif ds_name + '_graph_labels' in name:
- fgl = dirname_dataset + '/' + name
- elif ds_name + '_node_labels' in name:
- fnl = dirname_dataset + '/' + name
- elif ds_name + '_edge_labels' in name:
- fel = dirname_dataset + '/' + name
- elif ds_name + '_edge_attributes' in name:
- fea = dirname_dataset + '/' + name
- elif ds_name + '_node_attributes' in name:
- fna = dirname_dataset + '/' + name
- elif ds_name + '_graph_attributes' in name:
- fga = dirname_dataset + '/' + name
- elif ds_name + '_label_readme' in name:
- frm = dirname_dataset + '/' + name
- # this is supposed to be the node attrs, make sure to put this as the last 'elif'
- elif ds_name + '_attributes' in name:
- fna = dirname_dataset + '/' + name
-
- # get labels and attributes names.
- if 'frm' in locals():
- label_names = get_label_names(frm)
- else:
- label_names = {'node_labels': [], 'node_attrs': [],
- 'edge_labels': [], 'edge_attrs': []}
-
- content_gi = open(fgi).read().splitlines() # graph indicator
- content_am = open(fam).read().splitlines() # adjacency matrix
- content_gl = open(fgl).read().splitlines() # graph labels
-
- # create graphs and add nodes
- data = [nx.Graph(name=str(i),
- node_labels=label_names['node_labels'],
- node_attrs=label_names['node_attrs'],
- edge_labels=label_names['edge_labels'],
- edge_attrs=label_names['edge_attrs']) for i in range(0, len(content_gl))]
- if 'fnl' in locals():
- content_nl = open(fnl).read().splitlines() # node labels
- for idx, line in enumerate(content_gi):
- # transfer to int first in case of unexpected blanks
- data[int(line) - 1].add_node(idx)
- labels = [l.strip() for l in content_nl[idx].split(',')]
- data[int(line) - 1].nodes[idx]['atom'] = str(int(labels[0])) # @todo: this should be removed after.
- if data[int(line) - 1].graph['node_labels'] == []:
- for i, label in enumerate(labels):
- l_name = 'label_' + str(i)
- data[int(line) - 1].nodes[idx][l_name] = label
- data[int(line) - 1].graph['node_labels'].append(l_name)
- else:
- for i, l_name in enumerate(data[int(line) - 1].graph['node_labels']):
- data[int(line) - 1].nodes[idx][l_name] = labels[i]
- else:
- for i, line in enumerate(content_gi):
- data[int(line) - 1].add_node(i)
-
- # add edges
- for line in content_am:
- tmp = line.split(',')
- n1 = int(tmp[0]) - 1
- n2 = int(tmp[1]) - 1
- # ignore edge weight here.
- g = int(content_gi[n1]) - 1
- data[g].add_edge(n1, n2)
-
- # add edge labels
- if 'fel' in locals():
- content_el = open(fel).read().splitlines()
- for idx, line in enumerate(content_el):
- labels = [l.strip() for l in line.split(',')]
- n = [int(i) - 1 for i in content_am[idx].split(',')]
- g = int(content_gi[n[0]]) - 1
- data[g].edges[n[0], n[1]]['bond_type'] = labels[0] # @todo: this should be removed after.
- if data[g].graph['edge_labels'] == []: - for i, label in enumerate(labels): - l_name = 'label_' + str(i) - data[g].edges[n[0], n[1]][l_name] = label - data[g].graph['edge_labels'].append(l_name) - else: - for i, l_name in enumerate(data[g].graph['edge_labels']): - data[g].edges[n[0], n[1]][l_name] = labels[i] - - # add node attributes - if 'fna' in locals(): - content_na = open(fna).read().splitlines() - for idx, line in enumerate(content_na): - attrs = [a.strip() for a in line.split(',')] - g = int(content_gi[idx]) - 1 - data[g].nodes[idx]['attributes'] = attrs # @todo: this should be removed after. - if data[g].graph['node_attrs'] == []: - for i, attr in enumerate(attrs): - a_name = 'attr_' + str(i) - data[g].nodes[idx][a_name] = attr - data[g].graph['node_attrs'].append(a_name) - else: - for i, a_name in enumerate(data[g].graph['node_attrs']): - data[g].nodes[idx][a_name] = attrs[i] - - # add edge attributes - if 'fea' in locals(): - content_ea = open(fea).read().splitlines() - for idx, line in enumerate(content_ea): - attrs = [a.strip() for a in line.split(',')] - n = [int(i) - 1 for i in content_am[idx].split(',')] - g = int(content_gi[n[0]]) - 1 - data[g].edges[n[0], n[1]]['attributes'] = attrs # @todo: this should be removed after. - if data[g].graph['edge_attrs'] == []: - for i, attr in enumerate(attrs): - a_name = 'attr_' + str(i) - data[g].edges[n[0], n[1]][a_name] = attr - data[g].graph['edge_attrs'].append(a_name) - else: - for i, a_name in enumerate(data[g].graph['edge_attrs']): - data[g].edges[n[0], n[1]][a_name] = attrs[i] - - # load y - y = [int(i) for i in content_gl] - - return data, y + """Load graph data from a .txt file. + + Notes + ------ + The graph data is loaded from separate files. + Check README in `downloadable file `__, 2018 for detailed structure. + """ +# import numpy as np + import networkx as nx + from os import listdir + from os.path import dirname, basename + + + def get_label_names(frm): + """Get label names from DS_label_readme.txt file. + """ + + def get_names_from_line(line): + """Get names of labels/attributes from a line. + """ + str_names = line.split('[')[1].split(']')[0] + names = str_names.split(',') + names = [attr.strip() for attr in names] + return names + + + label_names = {'node_labels': [], 'node_attrs': [], + 'edge_labels': [], 'edge_attrs': []} + content_rm = open(frm).read().splitlines() + for line in content_rm: + line = line.strip() + if line.startswith('Node labels:'): + label_names['node_labels'] = get_names_from_line(line) + elif line.startswith('Node attributes:'): + label_names['node_attrs'] = get_names_from_line(line) + elif line.startswith('Edge labels:'): + label_names['edge_labels'] = get_names_from_line(line) + elif line.startswith('Edge attributes:'): + label_names['edge_attrs'] = get_names_from_line(line) + return label_names + + + # get dataset name. 
+ dirname_dataset = dirname(filename)
+ filename = basename(filename)
+ fn_split = filename.split('_A')
+ ds_name = fn_split[0].strip()
+
+ # load data file names
+ for name in listdir(dirname_dataset):
+ if ds_name + '_A' in name:
+ fam = dirname_dataset + '/' + name
+ elif ds_name + '_graph_indicator' in name:
+ fgi = dirname_dataset + '/' + name
+ elif ds_name + '_graph_labels' in name:
+ fgl = dirname_dataset + '/' + name
+ elif ds_name + '_node_labels' in name:
+ fnl = dirname_dataset + '/' + name
+ elif ds_name + '_edge_labels' in name:
+ fel = dirname_dataset + '/' + name
+ elif ds_name + '_edge_attributes' in name:
+ fea = dirname_dataset + '/' + name
+ elif ds_name + '_node_attributes' in name:
+ fna = dirname_dataset + '/' + name
+ elif ds_name + '_graph_attributes' in name:
+ fga = dirname_dataset + '/' + name
+ elif ds_name + '_label_readme' in name:
+ frm = dirname_dataset + '/' + name
+ # this is supposed to be the node attributes file; keep it as the last 'elif', since its pattern also matches several of the file names above
+ elif ds_name + '_attributes' in name:
+ fna = dirname_dataset + '/' + name
+
+ # get labels and attributes names.
+ if 'frm' in locals():
+ label_names = get_label_names(frm)
+ else:
+ label_names = {'node_labels': [], 'node_attrs': [],
+ 'edge_labels': [], 'edge_attrs': []}
+
+ content_gi = open(fgi).read().splitlines() # graph indicator
+ content_am = open(fam).read().splitlines() # adjacency matrix
+ content_gl = open(fgl).read().splitlines() # graph labels
+
+ # create graphs and add nodes
+ data = [nx.Graph(name=str(i),
+ node_labels=label_names['node_labels'],
+ node_attrs=label_names['node_attrs'],
+ edge_labels=label_names['edge_labels'],
+ edge_attrs=label_names['edge_attrs']) for i in range(0, len(content_gl))]
+ if 'fnl' in locals():
+ content_nl = open(fnl).read().splitlines() # node labels
+ for idx, line in enumerate(content_gi):
+ # convert to int first in case of unexpected blanks
+ data[int(line) - 1].add_node(idx)
+ labels = [l.strip() for l in content_nl[idx].split(',')]
+ data[int(line) - 1].nodes[idx]['atom'] = str(int(labels[0])) # @todo: this should be removed after.
+ if data[int(line) - 1].graph['node_labels'] == []:
+ for i, label in enumerate(labels):
+ l_name = 'label_' + str(i)
+ data[int(line) - 1].nodes[idx][l_name] = label
+ data[int(line) - 1].graph['node_labels'].append(l_name)
+ else:
+ for i, l_name in enumerate(data[int(line) - 1].graph['node_labels']):
+ data[int(line) - 1].nodes[idx][l_name] = labels[i]
+ else:
+ for i, line in enumerate(content_gi):
+ data[int(line) - 1].add_node(i)
+
+ # add edges
+ for line in content_am:
+ tmp = line.split(',')
+ n1 = int(tmp[0]) - 1
+ n2 = int(tmp[1]) - 1
+ # ignore edge weight here.
+ g = int(content_gi[n1]) - 1
+ data[g].add_edge(n1, n2)
+
+ # add edge labels
+ if 'fel' in locals():
+ content_el = open(fel).read().splitlines()
+ for idx, line in enumerate(content_el):
+ labels = [l.strip() for l in line.split(',')]
+ n = [int(i) - 1 for i in content_am[idx].split(',')]
+ g = int(content_gi[n[0]]) - 1
+ data[g].edges[n[0], n[1]]['bond_type'] = labels[0] # @todo: this should be removed after.
+ if data[g].graph['edge_labels'] == []:
+ for i, label in enumerate(labels):
+ l_name = 'label_' + str(i)
+ data[g].edges[n[0], n[1]][l_name] = label
+ data[g].graph['edge_labels'].append(l_name)
+ else:
+ for i, l_name in enumerate(data[g].graph['edge_labels']):
+ data[g].edges[n[0], n[1]][l_name] = labels[i]
+
+ # add node attributes
+ if 'fna' in locals():
+ content_na = open(fna).read().splitlines()
+ for idx, line in enumerate(content_na):
+ attrs = [a.strip() for a in line.split(',')]
+ g = int(content_gi[idx]) - 1
+ data[g].nodes[idx]['attributes'] = attrs # @todo: this should be removed after.
+ if data[g].graph['node_attrs'] == []:
+ for i, attr in enumerate(attrs):
+ a_name = 'attr_' + str(i)
+ data[g].nodes[idx][a_name] = attr
+ data[g].graph['node_attrs'].append(a_name)
+ else:
+ for i, a_name in enumerate(data[g].graph['node_attrs']):
+ data[g].nodes[idx][a_name] = attrs[i]
+
+ # add edge attributes
+ if 'fea' in locals():
+ content_ea = open(fea).read().splitlines()
+ for idx, line in enumerate(content_ea):
+ attrs = [a.strip() for a in line.split(',')]
+ n = [int(i) - 1 for i in content_am[idx].split(',')]
+ g = int(content_gi[n[0]]) - 1
+ data[g].edges[n[0], n[1]]['attributes'] = attrs # @todo: this should be removed after.
+ if data[g].graph['edge_attrs'] == []:
+ for i, attr in enumerate(attrs):
+ a_name = 'attr_' + str(i)
+ data[g].edges[n[0], n[1]][a_name] = attr
+ data[g].graph['edge_attrs'].append(a_name)
+ else:
+ for i, a_name in enumerate(data[g].graph['edge_attrs']):
+ data[g].edges[n[0], n[1]][a_name] = attrs[i]
+
+ # load y
+ y = [int(i) for i in content_gl]
+
+ return data, y
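As a concrete, hedged illustration of the file layout loadTXT expects: a dataset DS is spread over files sharing the DS_ prefix; only the first three are required by the code above, the rest are picked up when present. The AIDS path comes from the commented demo at the end of this file, and the import path assumes this module is gklearn.utils.graphfiles:

    # DS_A.txt                one 'row, col' edge per line; node ids are 1-based
    # DS_graph_indicator.txt  graph id (1-based) of each node, one per line
    # DS_graph_labels.txt     class label of each graph, one per line
    # DS_node_labels.txt      (optional) comma-separated labels per node
    # DS_edge_labels.txt      (optional) comma-separated labels per edge
    # DS_node_attributes.txt  (optional) comma-separated attributes per node
    # DS_edge_attributes.txt  (optional) comma-separated attributes per edge
    # DS_label_readme.txt     (optional) names of the labels/attributes above

    from gklearn.utils.graphfiles import loadTXT
    Gn, y = loadTXT('../../datasets/AIDS/AIDS_A.txt')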
- """ - extension = splitext(filename)[1][1:] - if extension == "ds": - data, y = loadFromDS(filename, filename_y) - elif extension == "cxl": - import xml.etree.ElementTree as ET - - dirname_dataset = dirname(filename) - tree = ET.parse(filename) - root = tree.getroot() - data = [] - y = [] - for graph in root.iter('graph'): - mol_filename = graph.attrib['file'] - mol_class = graph.attrib['class'] - data.append(loadGXL(dirname_dataset + '/' + mol_filename)) - y.append(mol_class) - elif extension == 'xml': - data, y = loadFromXML(filename, extra_params) - elif extension == "sdf": -# import numpy as np - from tqdm import tqdm - import sys - - data = loadSDF(filename) - - y_raw = open(filename_y).read().splitlines() - y_raw.pop(0) - tmp0 = [] - tmp1 = [] - for i in range(0, len(y_raw)): - tmp = y_raw[i].split(',') - tmp0.append(tmp[0]) - tmp1.append(tmp[1].strip()) - - y = [] - for i in tqdm(range(0, len(data)), desc='ajust data', file=sys.stdout): - try: - y.append(tmp1[tmp0.index(data[i].name)].strip()) - except ValueError: # if data[i].name not in tmp0 - data[i] = [] - data = list(filter(lambda a: a != [], data)) - elif extension == "mat": - data, y = loadMAT(filename, extra_params) - elif extension == 'txt': - data, y = loadTXT(filename) - # print(len(y)) - # print(y) - # print(data[0].nodes(data=True)) - # print('----') - # print(data[0].edges(data=True)) - # for g in data: - # print(g.nodes(data=True)) - # print('----') - # print(g.edges(data=True)) - - return data, y + """Read graph data from filename and load them as NetworkX graphs. + + Parameters + ---------- + filename : string + The name of the file from where the dataset is read. + filename_y : string + The name of file of the targets corresponding to graphs. + extra_params : dict + Extra parameters only designated to '.mat' format. + + Return + ------ + data : List of NetworkX graph. + + y : List + + Targets corresponding to graphs. + + Notes + ----- + This function supports following graph dataset formats: + + 'ds': load data from .ds file. See comments of function loadFromDS for a example. + + 'cxl': load data from Graph eXchange Language file (.cxl file). See + `here `__ for detail. + + 'sdf': load data from structured data file (.sdf file). See + `here `__ + for details. + + 'mat': Load graph data from a MATLAB (up to version 7.1) .mat file. See + README in `downloadable file `__ + for details. + + 'txt': Load graph data from a special .txt file. See + `here `__ + for details. Note here filename is the name of either .txt file in + the dataset directory. 
+ """ + extension = splitext(filename)[1][1:] + if extension == "ds": + data, y = loadFromDS(filename, filename_y) + elif extension == "cxl": + import xml.etree.ElementTree as ET + + dirname_dataset = dirname(filename) + tree = ET.parse(filename) + root = tree.getroot() + data = [] + y = [] + for graph in root.iter('graph'): + mol_filename = graph.attrib['file'] + mol_class = graph.attrib['class'] + data.append(loadGXL(dirname_dataset + '/' + mol_filename)) + y.append(mol_class) + elif extension == 'xml': + data, y = loadFromXML(filename, extra_params) + elif extension == "sdf": +# import numpy as np + from tqdm import tqdm + import sys + + data = loadSDF(filename) + + y_raw = open(filename_y).read().splitlines() + y_raw.pop(0) + tmp0 = [] + tmp1 = [] + for i in range(0, len(y_raw)): + tmp = y_raw[i].split(',') + tmp0.append(tmp[0]) + tmp1.append(tmp[1].strip()) + + y = [] + for i in tqdm(range(0, len(data)), desc='ajust data', file=sys.stdout): + try: + y.append(tmp1[tmp0.index(data[i].name)].strip()) + except ValueError: # if data[i].name not in tmp0 + data[i] = [] + data = list(filter(lambda a: a != [], data)) + elif extension == "mat": + data, y = loadMAT(filename, extra_params) + elif extension == 'txt': + data, y = loadTXT(filename) + # print(len(y)) + # print(y) + # print(data[0].nodes(data=True)) + # print('----') + # print(data[0].edges(data=True)) + # for g in data: + # print(g.nodes(data=True)) + # print('----') + # print(g.edges(data=True)) + + return data, y def loadFromXML(filename, extra_params): - import xml.etree.ElementTree as ET - - if extra_params: - dirname_dataset = extra_params - else: - dirname_dataset = dirname(filename) - tree = ET.parse(filename) - root = tree.getroot() - data = [] - y = [] - for graph in root.iter('graph'): - mol_filename = graph.attrib['file'] - mol_class = graph.attrib['class'] - data.append(loadGXL(dirname_dataset + '/' + mol_filename)) - y.append(mol_class) - - return data, y - + import xml.etree.ElementTree as ET + + if extra_params: + dirname_dataset = extra_params + else: + dirname_dataset = dirname(filename) + tree = ET.parse(filename) + root = tree.getroot() + data = [] + y = [] + for graph in root.iter('graph'): + mol_filename = graph.attrib['file'] + mol_class = graph.attrib['class'] + data.append(loadGXL(dirname_dataset + '/' + mol_filename)) + y.append(mol_class) + + return data, y + def loadFromDS(filename, filename_y): - """Load data from .ds file. - - Possible graph formats include: - - '.ct': see function loadCT for detail. - - '.gxl': see dunction loadGXL for detail. - - Note these graph formats are checked automatically by the extensions of - graph files. 
- """ - dirname_dataset = dirname(filename) - data = [] - y = [] - content = open(filename).read().splitlines() - extension = splitext(content[0].split(' ')[0])[1][1:] - if filename_y is None or filename_y == '': - if extension == 'ct': - for i in range(0, len(content)): - tmp = content[i].split(' ') - # remove the '#'s in file names - data.append( - loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) - y.append(float(tmp[1])) - elif extension == 'gxl': - for i in range(0, len(content)): - tmp = content[i].split(' ') - # remove the '#'s in file names - data.append( - loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) - y.append(float(tmp[1])) - else: # y in a seperate file - if extension == 'ct': - for i in range(0, len(content)): - tmp = content[i] - # remove the '#'s in file names - data.append( - loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1))) - elif extension == 'gxl': - for i in range(0, len(content)): - tmp = content[i] - # remove the '#'s in file names - data.append( - loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1))) - - content_y = open(filename_y).read().splitlines() - # assume entries in filename and filename_y have the same order. - for item in content_y: - tmp = item.split(' ') - # assume the 3rd entry in a line is y (for Alkane dataset) - y.append(float(tmp[2])) - - return data, y - + """Load data from .ds file. + + Possible graph formats include: + + '.ct': see function loadCT for detail. + + '.gxl': see dunction loadGXL for detail. + + Note these graph formats are checked automatically by the extensions of + graph files. + """ + dirname_dataset = dirname(filename) + data = [] + y = [] + content = open(filename).read().splitlines() + extension = splitext(content[0].split(' ')[0])[1][1:] + if filename_y is None or filename_y == '': + if extension == 'ct': + for i in range(0, len(content)): + tmp = content[i].split(' ') + # remove the '#'s in file names + data.append( + loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) + y.append(float(tmp[1])) + elif extension == 'gxl': + for i in range(0, len(content)): + tmp = content[i].split(' ') + # remove the '#'s in file names + data.append( + loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) + y.append(float(tmp[1])) + else: # y in a seperate file + if extension == 'ct': + for i in range(0, len(content)): + tmp = content[i] + # remove the '#'s in file names + data.append( + loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1))) + elif extension == 'gxl': + for i in range(0, len(content)): + tmp = content[i] + # remove the '#'s in file names + data.append( + loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1))) + + content_y = open(filename_y).read().splitlines() + # assume entries in filename and filename_y have the same order. + for item in content_y: + tmp = item.split(' ') + # assume the 3rd entry in a line is y (for Alkane dataset) + y.append(float(tmp[2])) + + return data, y + def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None): - """Save list of graphs. 
- """ - import os - dirname_ds = os.path.dirname(filename) - if dirname_ds != '': - dirname_ds += '/' - if not os.path.exists(dirname_ds) : - os.makedirs(dirname_ds) - - if xparams is not None and 'graph_dir' in xparams: - graph_dir = xparams['graph_dir'] + '/' - if not os.path.exists(graph_dir): - os.makedirs(graph_dir) - else: - graph_dir = dirname_ds - - if group == 'xml' and gformat == 'gxl': - kwargs = {'method': xparams['method']} if xparams is not None else {} - with open(filename + '.xml', 'w') as fgroup: - fgroup.write("") - fgroup.write("\n") - fgroup.write("\n") - for idx, g in enumerate(Gn): - fname_tmp = "graph" + str(idx) + ".gxl" - saveGXL(g, graph_dir + fname_tmp, **kwargs) - fgroup.write("\n\t") - fgroup.write("\n") - fgroup.close() - - -if __name__ == '__main__': -# ### Load dataset from .ds file. -# # .ct files. -# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', -# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} -# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) -## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb -## Gn, y = loadDataset(ds['dataset']) -## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb -## Gn, y = loadDataset(ds['dataset']) -## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled -## Gn, y = loadDataset(ds['dataset']) -# print(Gn[1].nodes(data=True)) -# print(Gn[1].edges(data=True)) -# print(y[1]) - -# # .gxl file. -# ds = {'name': 'monoterpenoides', -# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb -# Gn, y = loadDataset(ds['dataset']) -# print(Gn[1].nodes(data=True)) -# print(Gn[1].edges(data=True)) -# print(y[1]) - -# ### Convert graph from one format to another. -# # .gxl file. -# import networkx as nx -# ds = {'name': 'monoterpenoides', -# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb -# Gn, y = loadDataset(ds['dataset']) -# y = [int(i) for i in y] -# print(Gn[1].nodes(data=True)) -# print(Gn[1].edges(data=True)) -# print(y[1]) -# # Convert a graph to the proper NetworkX format that can be recognized by library gedlib. -# Gn_new = [] -# for G in Gn: -# G_new = nx.Graph() -# for nd, attrs in G.nodes(data=True): -# G_new.add_node(str(nd), chem=attrs['atom']) -# for nd1, nd2, attrs in G.edges(data=True): -# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) -## G_new.add_edge(str(nd1), str(nd2)) -# Gn_new.append(G_new) -# print(Gn_new[1].nodes(data=True)) -# print(Gn_new[1].edges(data=True)) -# print(Gn_new[1]) -# filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' -# xparams = {'method': 'gedlib'} -# saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) - - # save dataset. -# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', -# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb -# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# saveDataset(Gn, y, group='xml', filename='temp/temp') - - # test - new way to add labels and attributes. 
-# dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
-# dataset = '../../datasets/Fingerprint/Fingerprint_A.txt'
-# dataset = '../../datasets/Letter-med/Letter-med_A.txt'
-# dataset = '../../datasets/AIDS/AIDS_A.txt'
-# dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
-# Gn, y_all = loadDataset(dataset)
- pass
\ No newline at end of file
+ """Save a list of graphs.
+ """
+ import os
+ dirname_ds = os.path.dirname(filename)
+ if dirname_ds != '':
+ dirname_ds += '/'
+ if not os.path.exists(dirname_ds):
+ os.makedirs(dirname_ds)
+
+ if xparams is not None and 'graph_dir' in xparams:
+ graph_dir = xparams['graph_dir'] + '/'
+ if not os.path.exists(graph_dir):
+ os.makedirs(graph_dir)
+ else:
+ graph_dir = dirname_ds
+
+ if group == 'xml' and gformat == 'gxl':
+ kwargs = {'method': xparams['method']} if xparams is not None else {}
+ with open(filename + '.xml', 'w') as fgroup:
+ fgroup.write("<?xml version=\"1.0\"?>")
+ fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
+ fgroup.write("\n<GraphCollection>")
+ for idx, g in enumerate(Gn):
+ fname_tmp = "graph" + str(idx) + ".gxl"
+ saveGXL(g, graph_dir + fname_tmp, **kwargs)
+ fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
+ fgroup.write("\n</GraphCollection>")
+
+
+if __name__ == '__main__':
+# ### Load dataset from .ds file.
+# # .ct files.
+# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
+# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
+# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
+## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
+## Gn, y = loadDataset(ds['dataset'])
+## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
+## Gn, y = loadDataset(ds['dataset'])
+## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
+## Gn, y = loadDataset(ds['dataset'])
+# print(Gn[1].nodes(data=True))
+# print(Gn[1].edges(data=True))
+# print(y[1])
+
+# # .gxl file.
+# ds = {'name': 'monoterpenoides',
+# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
+# Gn, y = loadDataset(ds['dataset'])
+# print(Gn[1].nodes(data=True))
+# print(Gn[1].edges(data=True))
+# print(y[1])
+
+# ### Convert graph from one format to another.
+# # .gxl file.
+# import networkx as nx
+# ds = {'name': 'monoterpenoides',
+# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
+# Gn, y = loadDataset(ds['dataset'])
+# y = [int(i) for i in y]
+# print(Gn[1].nodes(data=True))
+# print(Gn[1].edges(data=True))
+# print(y[1])
+# # Convert a graph to the proper NetworkX format that can be recognized by library gedlib.
+# Gn_new = []
+# for G in Gn:
+# G_new = nx.Graph()
+# for nd, attrs in G.nodes(data=True):
+# G_new.add_node(str(nd), chem=attrs['atom'])
+# for nd1, nd2, attrs in G.edges(data=True):
+# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+## G_new.add_edge(str(nd1), str(nd2))
+# Gn_new.append(G_new)
+# print(Gn_new[1].nodes(data=True))
+# print(Gn_new[1].edges(data=True))
+# print(Gn_new[1])
+# filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides'
+# xparams = {'method': 'gedlib'}
+# saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams)
+
+ # save dataset.
+# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', +# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb +# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +# saveDataset(Gn, y, group='xml', filename='temp/temp') + + # test - new way to add labels and attributes. +# dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' +# dataset = '../../datasets/Fingerprint/Fingerprint_A.txt' +# dataset = '../../datasets/Letter-med/Letter-med_A.txt' +# dataset = '../../datasets/AIDS/AIDS_A.txt' +# dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' +# Gn, y_all = loadDataset(dataset) + pass \ No newline at end of file