diff --git a/.travis.yml b/.travis.yml index 9909b33..cb9369c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,5 @@ language: python python: -- '3' -- '3.4' - '3.5' - '3.6' - '3.7' diff --git a/gklearn/preimage/experiments/xp_median_preimage.py b/gklearn/preimage/experiments/xp_median_preimage.py new file mode 100644 index 0000000..d23a0c8 --- /dev/null +++ b/gklearn/preimage/experiments/xp_median_preimage.py @@ -0,0 +1,549 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jan 14 15:39:29 2020 + +@author: ljia +""" +import multiprocessing +import functools +from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct +from gklearn.preimage.utils import generate_median_preimages_by_class +from gklearn.utils import compute_gram_matrices_by_class + + +def xp_median_preimage_8_1(): + """xp 8_1: Monoterpenoides, sspkernel, using CONSTANT. + """ + # set parameters. + ds_name = 'Monoterpenoides' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [3, 3, 1, 3, 3, 1], # + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, # + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'structuralspkernel', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'CONSTANT', # + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = False # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_7_1(): + """xp 7_1: MUTAG, sspkernel, using CONSTANT. + """ + # set parameters. 
+ ds_name = 'MUTAG' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [4, 4, 2, 1, 1, 1], # + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, # + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'structuralspkernel', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'CONSTANT', # + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = False # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_6_1(): + """xp 6_1: COIL-RAG, sspkernel, using NON_SYMBOLIC. + """ + # set parameters. 
+ ds_name = 'COIL-RAG' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [3, 3, 1, 3, 3, 1], # + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'structuralspkernel', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'NON_SYMBOLIC', # + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = False # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'random', 'random', 'random']: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_5_1(): + """xp 5_1: FRANKENSTEIN, sspkernel, using NON_SYMBOLIC. + """ + # set parameters. 
+ ds_name = 'FRANKENSTEIN' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [3, 3, 1, 3, 3, 0], # + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'structuralspkernel', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'NON_SYMBOLIC', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = False # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'random', 'random', 'random']: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_4_1(): + """xp 4_1: COLORS-3, sspkernel, using NON_SYMBOLIC. + """ + # set parameters. 
+ ds_name = 'COLORS-3' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [3, 3, 1, 3, 3, 0], # + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'structuralspkernel', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'NON_SYMBOLIC', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = False # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'random', 'random', 'random']: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_3_1(): + """xp 3_1: Fingerprint, sspkernel, using LETTER2, only node attrs. + """ + # set parameters. 
+ ds_name = 'Fingerprint' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [0.525, 0.525, 0.001, 0.125, 0.125], # + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'structuralspkernel', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = {'edge_attrs': ['orient', 'angle']} # + edge_required = False # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'random', 'random', 'random']: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_2_1(): + """xp 2_1: COIL-DEL, sspkernel, using LETTER2, only node attrs. + """ + # set parameters. 
+ ds_name = 'COIL-DEL' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [3, 3, 1, 3, 3], + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'structuralspkernel', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = {'edge_labels': ['valence']} + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + +# # compute gram matrices for each class a priori. +# print('Compute gram matrices for each class a priori.') +# compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save=dir_save, irrelevant_labels=irrelevant_labels) + + # generate preimages. + for fit_method in ['k-graphs', 'random', 'random', 'random']: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels) + + +def xp_median_preimage_1_1(): + """xp 1_1: Letter-high, sspkernel. + """ + # set parameters. 
+ ds_name = 'Letter-high' + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [3, 3, 1, 3, 3], + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'structuralspkernel', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', +# 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 1, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + + # generate preimages. + for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save='../results/xp_median_preimage/') + + +if __name__ == "__main__": + + #### xp 1_1: Letter-high, sspkernel. + # xp_median_preimage_1_1() + + #### xp 2_1: COIL-DEL, sspkernel, using LETTER2, only node attrs. +# xp_median_preimage_2_1() + + #### xp 3_1: Fingerprint, sspkernel, using LETTER2, only node attrs. + # xp_median_preimage_3_1() + + #### xp 4_1: COLORS-3, sspkernel, using NON_SYMBOLIC. +# xp_median_preimage_4_1() + + #### xp 5_1: FRANKENSTEIN, sspkernel, using NON_SYMBOLIC. +# xp_median_preimage_5_1() + + #### xp 6_1: COIL-RAG, sspkernel, using NON_SYMBOLIC. + # xp_median_preimage_6_1() + + #### xp 7_1: MUTAG, sspkernel, using CONSTANT. + # xp_median_preimage_7_1() + + #### xp 8_1: Monoterpenoides, sspkernel, using CONSTANT. 
+    xp_median_preimage_8_1()
\ No newline at end of file
diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py
index 0c13e0f..ed84725 100644
--- a/gklearn/utils/dataset.py
+++ b/gklearn/utils/dataset.py
@@ -67,7 +67,7 @@ class Dataset(object):
 
     def load_predefined_dataset(self, ds_name):
         current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
-        if ds_name == 'acyclic':
+        if ds_name == 'Acyclic':
             pass
         elif ds_name == 'COIL-DEL':
             ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
@@ -93,6 +93,9 @@ class Dataset(object):
         elif ds_name == 'Letter-med': # node non-symb
             ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
             self.__graphs, self.__targets, label_names = load_dataset(ds_file)
+        elif ds_name == 'Monoterpenoides':
+            ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
+            self.__graphs, self.__targets, label_names = load_dataset(ds_file)
         elif ds_name == 'MUTAG':
             ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
             self.__graphs, self.__targets, label_names = load_dataset(ds_file)
diff --git a/gklearn/utils/graph_files.py b/gklearn/utils/graph_files.py
index 7f424d6..5206ee4 100644
--- a/gklearn/utils/graph_files.py
+++ b/gklearn/utils/graph_files.py
@@ -47,7 +47,7 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
     """
     extension = splitext(filename)[1][1:]
     if extension == "ds":
-        data, y = loadFromDS(filename, filename_targets)
+        data, y, label_names = load_from_ds(filename, filename_targets)
     elif extension == "cxl":
         import xml.etree.ElementTree as ET
 
@@ -59,7 +59,7 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
         for graph in root.iter('graph'):
             mol_filename = graph.attrib['file']
             mol_class = graph.attrib['class']
-            data.append(loadGXL(dirname_dataset + '/' + mol_filename))
+            data.append(load_gxl(dirname_dataset + '/' + mol_filename)[0]) # load_gxl now returns (graph, label_names); keep the graph only.
             y.append(mol_class)
     elif extension == 'xml':
         dir_dataset = kwargs.get('dirname_dataset', None)
@@ -127,7 +127,7 @@ def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=Non
         fgroup.close()
 
 
-def loadCT(filename):
+def load_ct(filename):
     """load data from a Chemical Table (.ct) file.
 
     Notes
@@ -180,7 +180,7 @@ def loadCT(filename):
     return g
 
 
-def loadGXL(filename):
+def load_gxl(filename): # @todo: directed graphs.
     from os.path import basename
     import networkx as nx
     import xml.etree.ElementTree as ET
@@ -195,9 +195,6 @@ def loadGXL(filename):
         labels = {}
         for attr in node.iter('attr'):
             labels[attr.attrib['name']] = attr[0].text
-        if 'chem' in labels:
-            labels['label'] = labels['chem']
-            labels['atom'] = labels['chem']
         g.add_node(index, **labels)
         index += 1
 
@@ -205,11 +202,26 @@ def loadGXL(filename):
         labels = {}
         for attr in edge.iter('attr'):
             labels[attr.attrib['name']] = attr[0].text
-        if 'valence' in labels:
-            labels['label'] = labels['valence']
-            labels['bond_type'] = labels['valence']
         g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels)
-    return g
+
+    # get label names.
+    label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
+    for node in root.iter('node'):
+        for attr in node.iter('attr'):
+            if attr[0].tag == 'int': # @todo: this may be wrong, and slow.
+                label_names['node_labels'].append(attr.attrib['name'])
+            else:
+                label_names['node_attrs'].append(attr.attrib['name'])
+        break
+    for edge in root.iter('edge'):
+        for attr in edge.iter('attr'):
+            if attr[0].tag == 'int': # @todo: this may be wrong, and slow.
+                label_names['edge_labels'].append(attr.attrib['name'])
+            else:
+                label_names['edge_attrs'].append(attr.attrib['name'])
+        break
+
+    return g, label_names
 
 
 def saveGXL(graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
@@ -649,43 +661,49 @@ def loadFromXML(filename, dir_dataset=None):
     for graph in root.iter('graph'):
         mol_filename = graph.attrib['file']
         mol_class = graph.attrib['class']
-        data.append(loadGXL(dir_dataset + '/' + mol_filename))
+        data.append(load_gxl(dir_dataset + '/' + mol_filename)[0]) # load_gxl now returns (graph, label_names); keep the graph only.
         y.append(mol_class)
 
     return data, y
 
 
-def loadFromDS(filename, filename_y):
+def load_from_ds(filename, filename_targets):
     """Load data from .ds file. Possible graph formats include:
 
-    '.ct': see function loadCT for detail.
+    '.ct': see function load_ct for detail.
 
-    '.gxl': see dunction loadGXL for detail.
+    '.gxl': see function load_gxl for detail.
 
     Note these graph formats are checked automatically by the extensions of
     graph files.
     """
 
+    def append_label_names(label_names, new_names):
+        for key, val in label_names.items():
+            label_names[key] += [name for name in new_names[key] if name not in val]
+
     dirname_dataset = dirname(filename)
     data = []
     y = []
+    label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
    content = open(filename).read().splitlines()
     extension = splitext(content[0].split(' ')[0])[1][1:]
 
-    if filename_y is None or filename_y == '':
+    if filename_targets is None or filename_targets == '':
         if extension == 'ct':
             for i in range(0, len(content)):
                 tmp = content[i].split(' ')
                 # remove the '#'s in file names
                 data.append(
-                    loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
+                    load_ct(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
                 y.append(float(tmp[1]))
         elif extension == 'gxl':
             for i in range(0, len(content)):
                 tmp = content[i].split(' ')
                 # remove the '#'s in file names
-                data.append(
-                    loadGXL(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
+                g, l_names = load_gxl(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
+                data.append(g)
+                append_label_names(label_names, l_names)
                 y.append(float(tmp[1]))
     else: # y in a seperate file
         if extension == 'ct':
@@ -693,22 +711,23 @@ def loadFromDS(filename, filename_y):
                 tmp = content[i]
                 # remove the '#'s in file names
                 data.append(
-                    loadCT(dirname_dataset + '/' + tmp.replace('#', '', 1)))
+                    load_ct(dirname_dataset + '/' + tmp.replace('#', '', 1)))
         elif extension == 'gxl':
             for i in range(0, len(content)):
                 tmp = content[i]
                 # remove the '#'s in file names
-                data.append(
-                    loadGXL(dirname_dataset + '/' + tmp.replace('#', '', 1)))
+                g, l_names = load_gxl(dirname_dataset + '/' + tmp.replace('#', '', 1)) # tmp is the whole line here, not a split list.
+                data.append(g)
+                append_label_names(label_names, l_names)
 
-        content_y = open(filename_y).read().splitlines()
-        # assume entries in filename and filename_y have the same order.
+        content_y = open(filename_targets).read().splitlines()
+        # assume entries in filename and filename_targets have the same order.
         for item in content_y:
             tmp = item.split(' ')
             # assume the 3rd entry in a line is y (for Alkane dataset)
             y.append(float(tmp[2]))
 
-    return data, y
+    return data, y, label_names
 
 
 if __name__ == '__main__':
@@ -727,13 +746,14 @@
 # print(Gn[1].edges(data=True))
 # print(y[1])
 
-# # .gxl file.
-# ds = {'name': 'monoterpenoides',
-# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
-# Gn, y = loadDataset(ds['dataset'])
-# print(Gn[1].nodes(data=True))
-# print(Gn[1].edges(data=True))
-# print(y[1])
+    # .gxl file.
+    ds = {'name': 'monoterpenoides',
+          'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
+    Gn, y, label_names = load_dataset(ds['dataset'])
+    print(Gn[1].graph)
+    print(Gn[1].nodes(data=True))
+    print(Gn[1].edges(data=True))
+    print(y[1])
 
 # ### Convert graph from one format to another.
 # # .gxl file.
@@ -774,5 +794,5 @@
 # dataset = '../../datasets/Letter-med/Letter-med_A.txt'
 # dataset = '../../datasets/AIDS/AIDS_A.txt'
 # dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
-# Gn, targets = load_dataset(filename)
+# Gn, targets, label_names = load_dataset(filename)
     pass
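
Usage note (not part of the patch): with this change, load_dataset and load_from_ds in gklearn/utils/graph_files.py return a third value, label_names, a dict with the keys 'node_labels', 'edge_labels', 'node_attrs' and 'edge_attrs' listing the label and attribute names found in the loaded files. Below is a minimal sketch of how a caller might consume the new return value; the dataset path is the one used in the test block above and should be adjusted to the local checkout.

    from gklearn.utils.graph_files import load_dataset

    # Load a .ds dataset; besides the graphs and targets, collect the label/attribute names.
    Gn, y, label_names = load_dataset('../../datasets/monoterpenoides/dataset_10+.ds')
    print(len(Gn), 'graphs loaded.')
    print('node labels:', label_names['node_labels'])
    print('edge labels:', label_names['edge_labels'])
    print('node attrs:', label_names['node_attrs'])
    print('edge attrs:', label_names['edge_attrs'])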