diff --git a/gklearn/preimage/experiments/xp_median_preimage.py b/gklearn/preimage/experiments/xp_median_preimage.py
index d23a0c8..7ae20ba 100644
--- a/gklearn/preimage/experiments/xp_median_preimage.py
+++ b/gklearn/preimage/experiments/xp_median_preimage.py
@@ -12,6 +12,70 @@ from gklearn.preimage.utils import generate_median_preimages_by_class
 from gklearn.utils import compute_gram_matrices_by_class
 
 
+def xp_median_preimage_9_1():
+	"""xp 9_1: MAO, sspkernel, using CONSTANT.
+	"""
+	# set parameters.
+	ds_name = 'MAO' #
+	mpg_options = {'fit_method': 'k-graphs',
+				   'init_ecc': [4, 4, 2, 1, 1, 1], #
+				   'ds_name': ds_name,
+				   'parallel': True, # False
+				   'time_limit_in_sec': 0,
+				   'max_itrs': 100, #
+				   'max_itrs_without_update': 3,
+				   'epsilon_residual': 0.01,
+				   'epsilon_ec': 0.1,
+				   'verbose': 2}
+	mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+	sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
+	kernel_options = {'name': 'structuralspkernel',
+					  'edge_weight': None,
+					  'node_kernels': sub_kernels,
+					  'edge_kernels': sub_kernels,
+					  'compute_method': 'naive',
+					  'parallel': 'imap_unordered',
+#					  'parallel': None,
+					  'n_jobs': multiprocessing.cpu_count(),
+					  'normalize': True,
+					  'verbose': 2}
+	ged_options = {'method': 'IPFP',
+				   'initialization_method': 'RANDOM', # 'NODE'
+				   'initial_solutions': 10, # 1
+				   'edit_cost': 'CONSTANT', #
+				   'attr_distance': 'euclidean',
+				   'ratio_runs_from_initial_solutions': 1,
+				   'threads': multiprocessing.cpu_count(),
+				   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'}
+	mge_options = {'init_type': 'MEDOID',
+				   'random_inits': 10,
+				   'time_limit': 600,
+				   'verbose': 2,
+				   'refine': False}
+	save_results = True
+	dir_save = '../results/xp_median_preimage/'
+	irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} #
+	edge_required = False #
+
+	# print settings.
+	print('parameters:')
+	print('dataset name:', ds_name)
+	print('mpg_options:', mpg_options)
+	print('kernel_options:', kernel_options)
+	print('ged_options:', ged_options)
+	print('mge_options:', mge_options)
+	print('save_results:', save_results)
+	print('irrelevant_labels:', irrelevant_labels)
+	print()
+
+	# generate preimages.
+	for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']:
+		print('\n-------------------------------------')
+		print('fit method:', fit_method, '\n')
+		mpg_options['fit_method'] = fit_method
+		generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required)
+
+
 def xp_median_preimage_8_1():
 	"""xp 8_1: Monoterpenoides, sspkernel, using CONSTANT.
 	"""
@@ -546,4 +610,7 @@ if __name__ == "__main__":
 #	xp_median_preimage_7_1()
 
 	#### xp 8_1: Monoterpenoides, sspkernel, using CONSTANT.
-	xp_median_preimage_8_1()
\ No newline at end of file
+#	xp_median_preimage_8_1()
+
+	#### xp 9_1: MAO, sspkernel, using CONSTANT.
+	xp_median_preimage_9_1()
\ No newline at end of file
diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py
index ed84725..e218596 100644
--- a/gklearn/utils/dataset.py
+++ b/gklearn/utils/dataset.py
@@ -68,7 +68,8 @@ class Dataset(object):
 	def load_predefined_dataset(self, ds_name):
 		current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
 		if ds_name == 'Acyclic':
-			pass
+			ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
+			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
 		elif ds_name == 'COIL-DEL':
 			ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
 			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
@@ -93,6 +94,9 @@ class Dataset(object):
 		elif ds_name == 'Letter-med': # node non-symb
 			ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
 			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
+		elif ds_name == 'MAO':
+			ds_file = current_path + '../../datasets/MAO/dataset.ds'
+			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
 		elif ds_name == 'Monoterpenoides':
 			ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
 			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
diff --git a/gklearn/utils/graph_files.py b/gklearn/utils/graph_files.py
index 5206ee4..7594110 100644
--- a/gklearn/utils/graph_files.py
+++ b/gklearn/utils/graph_files.py
@@ -49,47 +49,14 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
 	if extension == "ds":
 		data, y, label_names = load_from_ds(filename, filename_targets)
 	elif extension == "cxl":
-		import xml.etree.ElementTree as ET
-
-		dirname_dataset = dirname(filename)
-		tree = ET.parse(filename)
-		root = tree.getroot()
-		data = []
-		y = []
-		for graph in root.iter('graph'):
-			mol_filename = graph.attrib['file']
-			mol_class = graph.attrib['class']
-			data.append(load_gxl(dirname_dataset + '/' + mol_filename))
-			y.append(mol_class)
+		dir_dataset = kwargs.get('dirname_dataset', None)
+		data, y, label_names = load_from_xml(filename, dir_dataset)
 	elif extension == 'xml':
 		dir_dataset = kwargs.get('dirname_dataset', None)
-		data, y = loadFromXML(filename, dir_dataset)
-	elif extension == "sdf":
-#		import numpy as np
-		from tqdm import tqdm
-		import sys
-
-		data = loadSDF(filename)
-
-		y_raw = open(filename_targets).read().splitlines()
-		y_raw.pop(0)
-		tmp0 = []
-		tmp1 = []
-		for i in range(0, len(y_raw)):
-			tmp = y_raw[i].split(',')
-			tmp0.append(tmp[0])
-			tmp1.append(tmp[1].strip())
-
-		y = []
-		for i in tqdm(range(0, len(data)), desc='ajust data', file=sys.stdout):
-			try:
-				y.append(tmp1[tmp0.index(data[i].name)].strip())
-			except ValueError: # if data[i].name not in tmp0
-				data[i] = []
-		data = list(filter(lambda a: a != [], data))
+		data, y, label_names = load_from_xml(filename, dir_dataset)
 	elif extension == "mat":
 		order = kwargs.get('order')
-		data, y = loadMAT(filename, order)
+		data, y, label_names = load_mat(filename, order)
 	elif extension == 'txt':
 		data, y, label_names = load_tud(filename)
 
@@ -127,7 +94,7 @@ def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=Non
 	fgroup.close()
 
 
-def load_ct(filename):
+def load_ct(filename): # @todo: this function is only tested on CTFile V2000; the header is not considered; only simple cases (atoms and bonds) are considered.
 	"""load data from a Chemical Table (.ct) file.
 
 	Notes
@@ -154,30 +121,65 @@ def load_ct(filename):
 	g = nx.Graph()
 	with open(filename) as f:
 		content = f.read().splitlines()
-		g = nx.Graph(
-				name = str(content[0]),
-				filename = basename(filename)) # set name of the graph
-		tmp = content[1].split(" ")
-		if tmp[0] == '':
-			nb_nodes = int(tmp[1]) # number of the nodes
-			nb_edges = int(tmp[2]) # number of the edges
-		else:
-			nb_nodes = int(tmp[0])
-			nb_edges = int(tmp[1])
-		# patch for compatibility : label will be removed later
-		for i in range(0, nb_nodes):
-			tmp = content[i + 2].split(" ")
+		g = nx.Graph(name=str(content[0]), filename=basename(filename)) # set name of the graph
+
+		# read the counts line.
+		tmp = content[1].split(' ')
+		tmp = [x for x in tmp if x != '']
+		nb_atoms = int(tmp[0].strip()) # number of atoms
+		nb_bonds = int(tmp[1].strip()) # number of bonds
+		count_line_tags = ['number_of_atoms', 'number_of_bonds', 'number_of_atom_lists', '', 'chiral_flag', 'number_of_stext_entries', '', '', '', '', 'number_of_properties', 'CT_version']
+		i = 0
+		while i < len(tmp):
+			if count_line_tags[i] != '': # if not obsoleted
+				g.graph[count_line_tags[i]] = tmp[i].strip()
+			i += 1
+
+		# read the atom block.
+		atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag']
+		for i in range(0, nb_atoms):
+			tmp = content[i + 2].split(' ')
 			tmp = [x for x in tmp if x != '']
-			g.add_node(i, atom=tmp[3].strip(),
-					   label=[item.strip() for item in tmp[3:]],
-					   attributes=[item.strip() for item in tmp[0:3]])
-		for i in range(0, nb_edges):
-			tmp = content[i + g.number_of_nodes() + 2].split(" ")
+			g.add_node(i)
+			j = 0
+			while j < len(tmp):
+				if atom_tags[j] != '':
+					g.nodes[i][atom_tags[j]] = tmp[j].strip()
+				j += 1
+
+		# read the bond block.
+		bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status']
+		for i in range(0, nb_bonds):
+			tmp = content[i + g.number_of_nodes() + 2].split(' ')
 			tmp = [x for x in tmp if x != '']
-			g.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1,
-					   bond_type=tmp[2].strip(),
-					   label=[item.strip() for item in tmp[2:]])
-	return g
+			n1, n2 = int(tmp[0].strip()) - 1, int(tmp[1].strip()) - 1
+			g.add_edge(n1, n2)
+			j = 2
+			while j < len(tmp):
+				if bond_tags[j] != '':
+					g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip()
+				j += 1
+
+	# get label names.
+	label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
+	atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1]
+	for nd in g.nodes():
+		for key in g.nodes[nd]:
+			if atom_symbolic[atom_tags.index(key)] == 1:
+				label_names['node_labels'].append(key)
+			else:
+				label_names['node_attrs'].append(key)
+		break
+	bond_symbolic = [None, None, 1, 1, None, 1, 1]
+	for ed in g.edges():
+		for key in g.edges[ed]:
+			if bond_symbolic[bond_tags.index(key)] == 1:
+				label_names['edge_labels'].append(key)
+			else:
+				label_names['edge_attrs'].append(key)
+		break
+
+	return g, label_names
 
 
 def load_gxl(filename): # @todo: directed graphs.
@@ -333,57 +335,57 @@ def saveGXL(graph, filename, method='default', node_labels=[], edge_labels=[], n
 	gxl_file.close()
 
 
-def loadSDF(filename):
-	"""load data from structured data file (.sdf file).
+# def loadSDF(filename):
+# 	"""load data from structured data file (.sdf file).
-	Notes
-	------
-	A SDF file contains a group of molecules, represented in the similar way as in MOL format.
-	Check `here `__ for detailed structure.
-	"""
-	import networkx as nx
-	from os.path import basename
-	from tqdm import tqdm
-	import sys
-	data = []
-	with open(filename) as f:
-		content = f.read().splitlines()
-		index = 0
-		pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout)
-		while index < len(content):
-			index_old = index
+# 	Notes
+# 	------
+# 	A SDF file contains a group of molecules, represented in the similar way as in MOL format.
+# 	Check `here `__ for detailed structure.
+# 	"""
+# 	import networkx as nx
+# 	from os.path import basename
+# 	from tqdm import tqdm
+# 	import sys
+# 	data = []
+# 	with open(filename) as f:
+# 		content = f.read().splitlines()
+# 		index = 0
+# 		pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout)
+# 		while index < len(content):
+# 			index_old = index
 
-			g = nx.Graph(name=content[index].strip()) # set name of the graph
+# 			g = nx.Graph(name=content[index].strip()) # set name of the graph
 
-			tmp = content[index + 3]
-			nb_nodes = int(tmp[:3]) # number of the nodes
-			nb_edges = int(tmp[3:6]) # number of the edges
+# 			tmp = content[index + 3]
+# 			nb_nodes = int(tmp[:3]) # number of the nodes
+# 			nb_edges = int(tmp[3:6]) # number of the edges
 
-			for i in range(0, nb_nodes):
-				tmp = content[i + index + 4]
-				g.add_node(i, atom=tmp[31:34].strip())
+# 			for i in range(0, nb_nodes):
+# 				tmp = content[i + index + 4]
+# 				g.add_node(i, atom=tmp[31:34].strip())
 
-			for i in range(0, nb_edges):
-				tmp = content[i + index + g.number_of_nodes() + 4]
-				tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)]
-				g.add_edge(
-					int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip())
+# 			for i in range(0, nb_edges):
+# 				tmp = content[i + index + g.number_of_nodes() + 4]
+# 				tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)]
+# 				g.add_edge(
+# 					int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip())
 
-			data.append(g)
+# 			data.append(g)
 
-			index += 4 + g.number_of_nodes() + g.number_of_edges()
-			while content[index].strip() != '$$$$': # seperator
-				index += 1
-			index += 1
+# 			index += 4 + g.number_of_nodes() + g.number_of_edges()
+# 			while content[index].strip() != '$$$$': # seperator
+# 				index += 1
+# 			index += 1
 
-		pbar.update(index - index_old)
-		pbar.update(1)
-		pbar.close()
+# 		pbar.update(index - index_old)
+# 		pbar.update(1)
+# 		pbar.close()
 
-	return data
+# 	return data
 
 
-def loadMAT(filename, order):
+def load_mat(filename, order): # @todo: need to be updated (auto order) or deprecated.
 	"""Load graph data from a MATLAB (up to version 7.1) .mat file.
 
 	Notes
@@ -422,14 +424,13 @@ def loadMAT(filename, order):
 #				print(item[order[3]])
 #				print()
 				for index, label in enumerate(nl[0]):
-					g.add_node(index, atom=str(label))
+					g.add_node(index, label_1=str(label))
 				el = item[order[4]][0][0][0] # edge label
 				for edge in el:
-					g.add_edge(
-						edge[0] - 1, edge[1] - 1, bond_type=str(edge[2]))
+					g.add_edge(edge[0] - 1, edge[1] - 1, label_1=str(edge[2]))
 				data.append(g)
 			else:
-				from scipy.sparse import csc_matrix
+#				from scipy.sparse import csc_matrix
 				for i, item in enumerate(value[0]):
 #					print(item)
 #					print('------')
@@ -438,7 +439,7 @@
 #					print(nl)
 #					print()
 					for index, label in enumerate(nl[0]):
-						g.add_node(index, atom=str(label))
+						g.add_node(index, label_1=str(label))
 					sam = item[order[0]] # sparse adjacency matrix
 					index_no0 = sam.nonzero()
 					for col, row in zip(index_no0[0], index_no0[1]):
@@ -447,7 +448,12 @@
 						g.add_edge(col, row)
 				data.append(g)
 #				print(g.edges(data=True))
-	return data, y
+
+	label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
+	if order[1] == 0:
+		label_names['edge_labels'].append('label_1')
+
+	return data, y, label_names
 
 
 def load_tud(filename):
@@ -645,26 +651,6 @@ def load_tud(filename):
 					data[g].edges[n[0], n[1]][a_name] = attrs[i]
 
 	return data, targets, label_names
-
-
-def loadFromXML(filename, dir_dataset=None):
-	import xml.etree.ElementTree as ET
-
-	if dir_dataset is not None:
-		dir_dataset = dir_dataset
-	else:
-		dir_dataset = dirname(filename)
-	tree = ET.parse(filename)
-	root = tree.getroot()
-	data = []
-	y = []
-	for graph in root.iter('graph'):
-		mol_filename = graph.attrib['file']
-		mol_class = graph.attrib['class']
-		data.append(load_gxl(dir_dataset + '/' + mol_filename))
-		y.append(mol_class)
-
-	return data, y
 
 
 def load_from_ds(filename, filename_targets):
@@ -678,47 +664,33 @@ def load_from_ds(filename, filename_targets):
 	Note these graph formats are checked automatically by the extensions of
 	graph files.
-	"""
-	def append_label_names(label_names, new_names):
-		for key, val in label_names.items():
-			label_names[key] += [name for name in new_names[key] if name not in val]
-
+	"""
 	dirname_dataset = dirname(filename)
 	data = []
 	y = []
 	label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
 	content = open(filename).read().splitlines()
 	extension = splitext(content[0].split(' ')[0])[1][1:]
+	if extension == 'ct':
+		load_file_fun = load_ct
+	elif extension == 'gxl' or extension == 'sdf': # @todo: .sdf not tested yet.
+		load_file_fun = load_gxl
+
 	if filename_targets is None or filename_targets == '':
-		if extension == 'ct':
-			for i in range(0, len(content)):
-				tmp = content[i].split(' ')
-				# remove the '#'s in file names
-				data.append(
-					load_ct(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))
-				y.append(float(tmp[1]))
-		elif extension == 'gxl':
-			for i in range(0, len(content)):
-				tmp = content[i].split(' ')
-				# remove the '#'s in file names
-				g, l_names = load_gxl(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
-				data.append(g)
-				append_label_names(label_names, l_names)
-				y.append(float(tmp[1]))
-	else: # y in a seperate file
-		if extension == 'ct':
-			for i in range(0, len(content)):
-				tmp = content[i]
-				# remove the '#'s in file names
-				data.append(
-					load_ct(dirname_dataset + '/' + tmp.replace('#', '', 1)))
-		elif extension == 'gxl':
-			for i in range(0, len(content)):
-				tmp = content[i]
-				# remove the '#'s in file names
-				g, l_names = load_gxl(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
-				data.append(g)
-				append_label_names(label_names, l_names)
+		for i in range(0, len(content)):
+			tmp = content[i].split(' ')
+			# remove the '#'s in file names
+			g, l_names = load_file_fun(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
+			data.append(g)
+			__append_label_names(label_names, l_names)
+			y.append(float(tmp[1]))
+	else: # targets in a separate file
+		for i in range(0, len(content)):
+			tmp = content[i]
+			# remove the '#'s in file names
+			g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1))
+			data.append(g)
+			__append_label_names(label_names, l_names)
 
 		content_y = open(filename_targets).read().splitlines()
 		# assume entries in filename and filename_targets have the same order.
@@ -728,7 +700,50 @@ def load_from_ds(filename, filename_targets):
 			y.append(float(tmp[2]))
 
 	return data, y, label_names
-
+
+
+# def load_from_cxl(filename):
+# 	import xml.etree.ElementTree as ET
+#
+# 	dirname_dataset = dirname(filename)
+# 	tree = ET.parse(filename)
+# 	root = tree.getroot()
+# 	data = []
+# 	y = []
+# 	for graph in root.iter('graph'):
+# 		mol_filename = graph.attrib['file']
+# 		mol_class = graph.attrib['class']
+# 		data.append(load_gxl(dirname_dataset + '/' + mol_filename))
+# 		y.append(mol_class)
+
+
+def load_from_xml(filename, dir_dataset=None):
+	import xml.etree.ElementTree as ET
+
+	if dir_dataset is not None:
+		dir_dataset = dir_dataset
+	else:
+		dir_dataset = dirname(filename)
+	tree = ET.parse(filename)
+	root = tree.getroot()
+	data = []
+	y = []
+	label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
+	for graph in root.iter('graph'):
+		mol_filename = graph.attrib['file']
+		mol_class = graph.attrib['class']
+		g, l_names = load_gxl(dir_dataset + '/' + mol_filename)
+		data.append(g)
+		__append_label_names(label_names, l_names)
+		y.append(mol_class)
+
+	return data, y, label_names
+
+
+def __append_label_names(label_names, new_names):
+	for key, val in label_names.items():
+		label_names[key] += [name for name in new_names[key] if name not in val]
+
 
 if __name__ == '__main__':
 #	### Load dataset from .ds file.
@@ -736,25 +751,34 @@ if __name__ == '__main__':
 #	ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
 #		  'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
 #	Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
-##	ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
-##	Gn, y = loadDataset(ds['dataset'])
-##	ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
-##	Gn, y = loadDataset(ds['dataset'])
+#	ds_file = '../../datasets/Acyclic/dataset_bps.ds' # node symb
+#	Gn, targets, label_names = load_dataset(ds_file)
+#	ds_file = '../../datasets/MAO/dataset.ds' # node/edge symb
+#	Gn, targets, label_names = load_dataset(ds_file)
 ##	ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
 ##	Gn, y = loadDataset(ds['dataset'])
-#	print(Gn[1].nodes(data=True))
-#	print(Gn[1].edges(data=True))
-#	print(y[1])
+#	print(Gn[1].graph)
+#	print(Gn[1].nodes(data=True))
+#	print(Gn[1].edges(data=True))
+#	print(targets[1])
 
-	# .gxl file.
-	ds = {'name': 'monoterpenoides',
-		  'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
-	Gn, y, label_names = load_dataset(ds['dataset'])
+#	# .gxl file.
+#	ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb
+#	Gn, y, label_names = load_dataset(ds_file)
+#	print(Gn[1].graph)
+#	print(Gn[1].nodes(data=True))
+#	print(Gn[1].edges(data=True))
+#	print(y[1])
+
+	# .mat file.
+	ds_file = '../../datasets/MUTAG_mat/MUTAG.mat'
+	order = [0, 0, 3, 1, 2]
+	Gn, targets, label_names = load_dataset(ds_file, order=order)
 	print(Gn[1].graph)
 	print(Gn[1].nodes(data=True))
 	print(Gn[1].edges(data=True))
-	print(y[1])
-
+	print(targets[1])
+
 #	### Convert graph from one format to another.
 #	# .gxl file.
 #	import networkx as nx
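
Usage note (not part of the patch): after this change every supported extension goes through the single entry point load_dataset, which now returns (graphs, targets, label_names) for all formats. A minimal sketch of the new API, assuming the repository's datasets/ tree is available at the same relative paths used in __main__ above:

	# MAO is a .ds dataset whose entries point at .ct graph files; load_ct now
	# also reports which node/edge keys are symbolic labels vs. attributes.
	from gklearn.utils.graph_files import load_dataset

	Gn, targets, label_names = load_dataset('../../datasets/MAO/dataset.ds')
	print(len(Gn), targets[1])
	print(label_names) # {'node_labels': [...], 'edge_labels': [...], 'node_attrs': [...], 'edge_attrs': [...]}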