@@ -0,0 +1,22 @@ | |||||
# -*- coding: utf-8 -*-
"""gklearn - datasets module | |||||
Implement some methods to manage graph datasets | |||||
graph_fetcher.py : fetch graph datasets from the Internet. | |||||
""" | |||||
# info | |||||
__version__ = "0.2" | |||||
__author__ = "Linlin Jia" | |||||
__date__ = "October 2020" | |||||
from gklearn.dataset.metadata import DATABASES, DATASET_META | |||||
from gklearn.dataset.metadata import GREYC_META, IAM_META, TUDataset_META | |||||
from gklearn.dataset.metadata import list_of_databases, list_of_datasets | |||||
from gklearn.dataset.graph_synthesizer import GraphSynthesizer | |||||
from gklearn.dataset.data_fetcher import DataFetcher | |||||
from gklearn.dataset.file_managers import DataLoader, DataSaver | |||||
from gklearn.dataset.dataset import Dataset, split_dataset_by_target |
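# A minimal usage sketch of the API exported above (a sketch only, assuming
# the predefined datasets shipped with gklearn are available locally, as
# expected by Dataset.load_predefined_dataset):
#
#   from gklearn.dataset import Dataset
#   ds = Dataset()
#   ds.load_predefined_dataset('MUTAG')
#   print(ds.get_dataset_infos(keys=['dataset_size', 'ave_node_num']))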
@@ -0,0 +1,823 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Thu Mar 26 18:48:27 2020 | |||||
@author: ljia | |||||
""" | |||||
import numpy as np | |||||
import networkx as nx | |||||
from gklearn.utils.graph_files import load_dataset | |||||
import os | |||||
class Dataset(object): | |||||
def __init__(self, filename=None, filename_targets=None, **kwargs): | |||||
if filename is None: | |||||
self._graphs = None | |||||
self._targets = None | |||||
self._node_labels = None | |||||
self._edge_labels = None | |||||
self._node_attrs = None | |||||
self._edge_attrs = None | |||||
else: | |||||
self.load_dataset(filename, filename_targets=filename_targets, **kwargs) | |||||
self._substructures = None | |||||
self._node_label_dim = None | |||||
self._edge_label_dim = None | |||||
self._directed = None | |||||
self._dataset_size = None | |||||
self._total_node_num = None | |||||
self._ave_node_num = None | |||||
self._min_node_num = None | |||||
self._max_node_num = None | |||||
self._total_edge_num = None | |||||
self._ave_edge_num = None | |||||
self._min_edge_num = None | |||||
self._max_edge_num = None | |||||
self._ave_node_degree = None | |||||
self._min_node_degree = None | |||||
self._max_node_degree = None | |||||
self._ave_fill_factor = None | |||||
self._min_fill_factor = None | |||||
self._max_fill_factor = None | |||||
self._node_label_nums = None | |||||
self._edge_label_nums = None | |||||
self._node_attr_dim = None | |||||
self._edge_attr_dim = None | |||||
self._class_number = None | |||||
def load_dataset(self, filename, filename_targets=None, **kwargs): | |||||
self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs) | |||||
self._node_labels = label_names['node_labels'] | |||||
self._node_attrs = label_names['node_attrs'] | |||||
self._edge_labels = label_names['edge_labels'] | |||||
self._edge_attrs = label_names['edge_attrs'] | |||||
self.clean_labels() | |||||
def load_graphs(self, graphs, targets=None): | |||||
# this has to be followed by set_labels(). | |||||
self._graphs = graphs | |||||
self._targets = targets | |||||
# self.set_labels_attrs() # @todo | |||||
def load_predefined_dataset(self, ds_name): | |||||
current_path = os.path.dirname(os.path.realpath(__file__)) + '/' | |||||
if ds_name == 'Acyclic': | |||||
ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'AIDS': | |||||
ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'Alkane': | |||||
ds_file = current_path + '../../datasets/Alkane/dataset.ds' | |||||
fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets) | |||||
elif ds_name == 'COIL-DEL': | |||||
ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'COIL-RAG': | |||||
ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'COLORS-3': | |||||
ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'Cuneiform': | |||||
ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'DD': | |||||
ds_file = current_path + '../../datasets/DD/DD_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'ENZYMES': | |||||
ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'Fingerprint': | |||||
ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'FRANKENSTEIN': | |||||
ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'Letter-high': # node non-symb | |||||
ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'Letter-low': # node non-symb | |||||
ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'Letter-med': # node non-symb | |||||
ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'MAO': | |||||
ds_file = current_path + '../../datasets/MAO/dataset.ds' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'Monoterpenoides': | |||||
ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'MUTAG': | |||||
ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'NCI1': | |||||
ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'NCI109': | |||||
ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'PAH': | |||||
ds_file = current_path + '../../datasets/PAH/dataset.ds' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'SYNTHETIC': | |||||
pass | |||||
elif ds_name == 'SYNTHETICnew': | |||||
ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' | |||||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||||
elif ds_name == 'Synthie': | |||||
pass | |||||
else: | |||||
raise Exception('The dataset name "' + ds_name + '" is not pre-defined.')
self._node_labels = label_names['node_labels'] | |||||
self._node_attrs = label_names['node_attrs'] | |||||
self._edge_labels = label_names['edge_labels'] | |||||
self._edge_attrs = label_names['edge_attrs'] | |||||
self.clean_labels() | |||||
def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]): | |||||
self._node_labels = node_labels | |||||
self._node_attrs = node_attrs | |||||
self._edge_labels = edge_labels | |||||
self._edge_attrs = edge_attrs | |||||
def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None): | |||||
# @todo: remove labels which have only one possible value.
if node_labels is None: | |||||
self._node_labels = self._graphs[0].graph['node_labels'] | |||||
# # graphs are considered node unlabeled if all nodes have the same label. | |||||
# infos.update({'node_labeled': is_nl if node_label_num > 1 else False}) | |||||
if node_attrs is None: | |||||
self._node_attrs = self._graphs[0].graph['node_attrs'] | |||||
# for G in Gn: | |||||
# for n in G.nodes(data=True): | |||||
# if 'attributes' in n[1]: | |||||
# return len(n[1]['attributes']) | |||||
# return 0 | |||||
if edge_labels is None: | |||||
self._edge_labels = self._graphs[0].graph['edge_labels'] | |||||
# # graphs are considered edge unlabeled if all edges have the same label. | |||||
# infos.update({'edge_labeled': is_el if edge_label_num > 1 else False}) | |||||
if edge_attrs is None: | |||||
self._edge_attrs = self._graphs[0].graph['edge_attrs'] | |||||
# for G in Gn: | |||||
# if nx.number_of_edges(G) > 0: | |||||
# for e in G.edges(data=True): | |||||
# if 'attributes' in e[2]: | |||||
# return len(e[2]['attributes']) | |||||
# return 0 | |||||
def get_dataset_infos(self, keys=None, params=None): | |||||
"""Computes and returns the structure and property information of the graph dataset. | |||||
Parameters | |||||
---------- | |||||
keys : list, optional | |||||
A list of strings which indicate which information will be returned. The
possible choices include:
'substructures': sub-structures that graphs contain, including 'linear',
'non linear' and 'cyclic'.
'node_label_dim': the number of symbolic vertex label names (0 if vertices are unlabeled).
'edge_label_dim': the number of symbolic edge label names (0 if edges are unlabeled).
'directed': whether graphs in dataset are directed. | |||||
'dataset_size': number of graphs in dataset. | |||||
'total_node_num': total number of vertices of all graphs in dataset. | |||||
'ave_node_num': average number of vertices of graphs in dataset. | |||||
'min_node_num': minimum number of vertices of graphs in dataset. | |||||
'max_node_num': maximum number of vertices of graphs in dataset. | |||||
'total_edge_num': total number of edges of all graphs in dataset. | |||||
'ave_edge_num': average number of edges of graphs in dataset. | |||||
'min_edge_num': minimum number of edges of graphs in dataset. | |||||
'max_edge_num': maximum number of edges of graphs in dataset. | |||||
'ave_node_degree': average vertex degree of graphs in dataset. | |||||
'min_node_degree': minimum vertex degree of graphs in dataset. | |||||
'max_node_degree': maximum vertex degree of graphs in dataset. | |||||
'ave_fill_factor': average fill factor (number_of_edges / | |||||
(number_of_nodes ** 2)) of graphs in dataset. | |||||
'min_fill_factor': minimum fill factor of graphs in dataset. | |||||
'max_fill_factor': maximum fill factor of graphs in dataset. | |||||
'node_label_nums': dict of the numbers of distinct values of each symbolic vertex label over the dataset.
'edge_label_nums': dict of the numbers of distinct values of each symbolic edge label over the dataset.
'node_attr_dim': number of dimensions of non-symbolic vertex labels. | |||||
Extracted from the 'attributes' attribute of graph nodes. | |||||
'edge_attr_dim': number of dimensions of non-symbolic edge labels. | |||||
Extracted from the 'attributes' attribute of graph edges. | |||||
'class_number': number of classes. Only available for classification problems. | |||||
'all_degree_entropy': the entropy of degree distribution of each graph. | |||||
'ave_degree_entropy': the average entropy of degree distribution of all graphs. | |||||
All of the information above will be returned if `keys` is not given.
params: dict of dict, optional
A dictionary which contains extra parameters for each possible
element in ``keys``.
Returns
-------
dict | |||||
Information of the graph dataset keyed by `keys`. | |||||
""" | |||||
infos = {} | |||||
if keys is None:
keys = [ | |||||
'substructures', | |||||
'node_label_dim', | |||||
'edge_label_dim', | |||||
'directed', | |||||
'dataset_size', | |||||
'total_node_num', | |||||
'ave_node_num', | |||||
'min_node_num', | |||||
'max_node_num', | |||||
'total_edge_num', | |||||
'ave_edge_num', | |||||
'min_edge_num', | |||||
'max_edge_num', | |||||
'ave_node_degree', | |||||
'min_node_degree', | |||||
'max_node_degree', | |||||
'ave_fill_factor', | |||||
'min_fill_factor', | |||||
'max_fill_factor', | |||||
'node_label_nums', | |||||
'edge_label_nums', | |||||
'node_attr_dim', | |||||
'edge_attr_dim', | |||||
'class_number', | |||||
'all_degree_entropy', | |||||
'ave_degree_entropy' | |||||
] | |||||
# dataset size | |||||
if 'dataset_size' in keys: | |||||
if self._dataset_size is None: | |||||
self._dataset_size = self._get_dataset_size() | |||||
infos['dataset_size'] = self._dataset_size | |||||
# graph node number | |||||
if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']): | |||||
all_node_nums = self._get_all_node_nums() | |||||
if 'total_node_num' in keys: | |||||
if self._total_node_num is None: | |||||
self._total_node_num = self._get_total_node_num(all_node_nums) | |||||
infos['total_node_num'] = self._total_node_num | |||||
if 'ave_node_num' in keys: | |||||
if self._ave_node_num is None: | |||||
self._ave_node_num = self._get_ave_node_num(all_node_nums) | |||||
infos['ave_node_num'] = self._ave_node_num | |||||
if 'min_node_num' in keys: | |||||
if self._min_node_num is None: | |||||
self._min_node_num = self._get_min_node_num(all_node_nums) | |||||
infos['min_node_num'] = self._min_node_num | |||||
if 'max_node_num' in keys: | |||||
if self._max_node_num is None: | |||||
self._max_node_num = self._get_max_node_num(all_node_nums) | |||||
infos['max_node_num'] = self._max_node_num | |||||
# graph edge number | |||||
if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']): | |||||
all_edge_nums = self._get_all_edge_nums() | |||||
if 'total_edge_num' in keys: | |||||
if self._total_edge_num is None: | |||||
self._total_edge_num = self._get_total_edge_num(all_edge_nums) | |||||
infos['total_edge_num'] = self._total_edge_num | |||||
if 'ave_edge_num' in keys: | |||||
if self._ave_edge_num is None: | |||||
self._ave_edge_num = self._get_ave_edge_num(all_edge_nums) | |||||
infos['ave_edge_num'] = self._ave_edge_num | |||||
if 'max_edge_num' in keys: | |||||
if self._max_edge_num is None: | |||||
self._max_edge_num = self._get_max_edge_num(all_edge_nums) | |||||
infos['max_edge_num'] = self._max_edge_num | |||||
if 'min_edge_num' in keys: | |||||
if self._min_edge_num is None: | |||||
self._min_edge_num = self._get_min_edge_num(all_edge_nums) | |||||
infos['min_edge_num'] = self._min_edge_num | |||||
# label number | |||||
if 'node_label_dim' in keys: | |||||
if self._node_label_dim is None: | |||||
self._node_label_dim = self._get_node_label_dim() | |||||
infos['node_label_dim'] = self._node_label_dim | |||||
if 'node_label_nums' in keys: | |||||
if self._node_label_nums is None: | |||||
self._node_label_nums = {} | |||||
for node_label in self._node_labels: | |||||
self._node_label_nums[node_label] = self._get_node_label_num(node_label) | |||||
infos['node_label_nums'] = self._node_label_nums | |||||
if 'edge_label_dim' in keys: | |||||
if self._edge_label_dim is None: | |||||
self._edge_label_dim = self._get_edge_label_dim() | |||||
infos['edge_label_dim'] = self._edge_label_dim | |||||
if 'edge_label_nums' in keys: | |||||
if self._edge_label_nums is None: | |||||
self._edge_label_nums = {} | |||||
for edge_label in self._edge_labels: | |||||
self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label) | |||||
infos['edge_label_nums'] = self._edge_label_nums | |||||
if 'directed' in keys or 'substructures' in keys: | |||||
if self._directed is None: | |||||
self._directed = self._is_directed() | |||||
infos['directed'] = self._directed | |||||
# node degree | |||||
if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']): | |||||
all_node_degrees = self._get_all_node_degrees() | |||||
if 'ave_node_degree' in keys: | |||||
if self._ave_node_degree is None: | |||||
self._ave_node_degree = self._get_ave_node_degree(all_node_degrees) | |||||
infos['ave_node_degree'] = self._ave_node_degree | |||||
if 'max_node_degree' in keys: | |||||
if self._max_node_degree is None: | |||||
self._max_node_degree = self._get_max_node_degree(all_node_degrees) | |||||
infos['max_node_degree'] = self._max_node_degree | |||||
if 'min_node_degree' in keys: | |||||
if self._min_node_degree is None: | |||||
self._min_node_degree = self._get_min_node_degree(all_node_degrees) | |||||
infos['min_node_degree'] = self._min_node_degree | |||||
# fill factor | |||||
if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']): | |||||
all_fill_factors = self._get_all_fill_factors() | |||||
if 'ave_fill_factor' in keys: | |||||
if self._ave_fill_factor is None: | |||||
self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors) | |||||
infos['ave_fill_factor'] = self._ave_fill_factor | |||||
if 'max_fill_factor' in keys: | |||||
if self._max_fill_factor is None: | |||||
self._max_fill_factor = self._get_max_fill_factor(all_fill_factors) | |||||
infos['max_fill_factor'] = self._max_fill_factor | |||||
if 'min_fill_factor' in keys: | |||||
if self._min_fill_factor is None: | |||||
self._min_fill_factor = self._get_min_fill_factor(all_fill_factors) | |||||
infos['min_fill_factor'] = self._min_fill_factor | |||||
if 'substructures' in keys: | |||||
if self._substructures is None: | |||||
self._substructures = self._get_substructures() | |||||
infos['substructures'] = self._substructures | |||||
if 'class_number' in keys: | |||||
if self._class_number is None: | |||||
self._class_number = self._get_class_number() | |||||
infos['class_number'] = self._class_number | |||||
if 'node_attr_dim' in keys: | |||||
if self._node_attr_dim is None: | |||||
self._node_attr_dim = self._get_node_attr_dim() | |||||
infos['node_attr_dim'] = self._node_attr_dim | |||||
if 'edge_attr_dim' in keys: | |||||
if self._edge_attr_dim is None: | |||||
self._edge_attr_dim = self._get_edge_attr_dim() | |||||
infos['edge_attr_dim'] = self._edge_attr_dim | |||||
# entropy of degree distribution. | |||||
if 'all_degree_entropy' in keys: | |||||
if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']): | |||||
base = params['all_degree_entropy']['base'] | |||||
else: | |||||
base = None | |||||
infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base) | |||||
if 'ave_degree_entropy' in keys: | |||||
if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']): | |||||
base = params['ave_degree_entropy']['base'] | |||||
else: | |||||
base = None | |||||
infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) | |||||
return infos | |||||
def print_graph_infos(self, infos): | |||||
from collections import OrderedDict | |||||
keys = list(infos.keys()) | |||||
print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0])))) | |||||
def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): | |||||
node_labels = [item for item in node_labels if item in self._node_labels] | |||||
edge_labels = [item for item in edge_labels if item in self._edge_labels] | |||||
node_attrs = [item for item in node_attrs if item in self._node_attrs] | |||||
edge_attrs = [item for item in edge_attrs if item in self._edge_attrs] | |||||
for g in self._graphs: | |||||
for nd in g.nodes(): | |||||
for nl in node_labels: | |||||
del g.nodes[nd][nl] | |||||
for na in node_attrs: | |||||
del g.nodes[nd][na] | |||||
for ed in g.edges(): | |||||
for el in edge_labels: | |||||
del g.edges[ed][el] | |||||
for ea in edge_attrs: | |||||
del g.edges[ed][ea] | |||||
if len(node_labels) > 0: | |||||
self._node_labels = [nl for nl in self._node_labels if nl not in node_labels] | |||||
if len(edge_labels) > 0: | |||||
self._edge_labels = [el for el in self._edge_labels if el not in edge_labels] | |||||
if len(node_attrs) > 0: | |||||
self._node_attrs = [na for na in self._node_attrs if na not in node_attrs] | |||||
if len(edge_attrs) > 0: | |||||
self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs] | |||||
def clean_labels(self): | |||||
labels = [] | |||||
for name in self._node_labels: | |||||
label = set() | |||||
for G in self._graphs: | |||||
label = label | set(nx.get_node_attributes(G, name).values()) | |||||
if len(label) > 1: | |||||
labels.append(name) | |||||
break | |||||
if len(label) < 2: | |||||
for G in self._graphs: | |||||
for nd in G.nodes(): | |||||
del G.nodes[nd][name] | |||||
self._node_labels = labels | |||||
labels = [] | |||||
for name in self._edge_labels: | |||||
label = set() | |||||
for G in self._graphs: | |||||
label = label | set(nx.get_edge_attributes(G, name).values()) | |||||
if len(label) > 1: | |||||
labels.append(name) | |||||
break | |||||
if len(label) < 2: | |||||
for G in self._graphs: | |||||
for ed in G.edges(): | |||||
del G.edges[ed][name] | |||||
self._edge_labels = labels | |||||
labels = [] | |||||
for name in self._node_attrs: | |||||
label = set() | |||||
for G in self._graphs: | |||||
label = label | set(nx.get_node_attributes(G, name).values()) | |||||
if len(label) > 1: | |||||
labels.append(name) | |||||
break | |||||
if len(label) < 2: | |||||
for G in self._graphs: | |||||
for nd in G.nodes(): | |||||
del G.nodes[nd][name] | |||||
self._node_attrs = labels | |||||
labels = [] | |||||
for name in self._edge_attrs: | |||||
label = set() | |||||
for G in self._graphs: | |||||
label = label | set(nx.get_edge_attributes(G, name).values()) | |||||
if len(label) > 1: | |||||
labels.append(name) | |||||
break | |||||
if len(label) < 2: | |||||
for G in self._graphs: | |||||
for ed in G.edges(): | |||||
del G.edges[ed][name] | |||||
self._edge_attrs = labels | |||||
def cut_graphs(self, range_): | |||||
self._graphs = [self._graphs[i] for i in range_] | |||||
if self._targets is not None: | |||||
self._targets = [self._targets[i] for i in range_] | |||||
self.clean_labels() | |||||
def trim_dataset(self, edge_required=False): | |||||
if edge_required: | |||||
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)] | |||||
else: | |||||
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0] | |||||
idx = [p[0] for p in trimed_pairs] | |||||
self._graphs = [p[1] for p in trimed_pairs] | |||||
self._targets = [self._targets[i] for i in idx] | |||||
self.clean_labels() | |||||
def copy(self): | |||||
dataset = Dataset() | |||||
graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None | |||||
target = self._targets.copy() if self._targets is not None else None | |||||
node_labels = self._node_labels.copy() if self._node_labels is not None else None | |||||
node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None | |||||
edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None | |||||
edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None | |||||
dataset.load_graphs(graphs, target) | |||||
dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) | |||||
# @todo: clean_labels and add other class members? | |||||
return dataset | |||||
def get_all_node_labels(self): | |||||
node_labels = [] | |||||
for g in self._graphs: | |||||
for n in g.nodes(): | |||||
nl = tuple(g.nodes[n].items()) | |||||
if nl not in node_labels: | |||||
node_labels.append(nl) | |||||
return node_labels | |||||
def get_all_edge_labels(self): | |||||
edge_labels = [] | |||||
for g in self._graphs: | |||||
for e in g.edges(): | |||||
el = tuple(g.edges[e].items()) | |||||
if el not in edge_labels: | |||||
edge_labels.append(el) | |||||
return edge_labels | |||||
def _get_dataset_size(self): | |||||
return len(self._graphs) | |||||
def _get_all_node_nums(self): | |||||
return [nx.number_of_nodes(G) for G in self._graphs] | |||||
def _get_total_node_num(self, all_node_nums):
return np.sum(all_node_nums) | |||||
def _get_ave_node_num(self, all_node_nums): | |||||
return np.mean(all_node_nums) | |||||
def _get_min_node_num(self, all_node_nums): | |||||
return np.amin(all_node_nums) | |||||
def _get_max_node_num(self, all_node_nums): | |||||
return np.amax(all_node_nums) | |||||
def _get_all_edge_nums(self): | |||||
return [nx.number_of_edges(G) for G in self._graphs] | |||||
def _get_total_edge_num(self, all_edge_nums):
return np.sum(all_edge_nums) | |||||
def _get_ave_edge_num(self, all_edge_nums): | |||||
return np.mean(all_edge_nums) | |||||
def _get_min_edge_num(self, all_edge_nums): | |||||
return np.amin(all_edge_nums) | |||||
def _get_max_edge_num(self, all_edge_nums): | |||||
return np.amax(all_edge_nums) | |||||
def _get_node_label_dim(self): | |||||
return len(self._node_labels) | |||||
def _get_node_label_num(self, node_label): | |||||
nl = set() | |||||
for G in self._graphs: | |||||
nl = nl | set(nx.get_node_attributes(G, node_label).values()) | |||||
return len(nl) | |||||
def _get_edge_label_dim(self): | |||||
return len(self._edge_labels) | |||||
def _get_edge_label_num(self, edge_label): | |||||
el = set() | |||||
for G in self._graphs: | |||||
el = el | set(nx.get_edge_attributes(G, edge_label).values()) | |||||
return len(el) | |||||
def _is_directed(self): | |||||
return nx.is_directed(self._graphs[0]) | |||||
def _get_all_node_degrees(self): | |||||
return [np.mean(list(dict(G.degree()).values())) for G in self._graphs] | |||||
def _get_ave_node_degree(self, all_node_degrees): | |||||
return np.mean(all_node_degrees) | |||||
def _get_max_node_degree(self, all_node_degrees): | |||||
return np.amax(all_node_degrees) | |||||
def _get_min_node_degree(self, all_node_degrees): | |||||
return np.amin(all_node_degrees) | |||||
def _get_all_fill_factors(self): | |||||
"""Get fill factor, the number of non-zero entries in the adjacency matrix. | |||||
Returns | |||||
------- | |||||
list[float] | |||||
List of fill factors for all graphs. | |||||
""" | |||||
return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs] | |||||
def _get_ave_fill_factor(self, all_fill_factors): | |||||
return np.mean(all_fill_factors) | |||||
def _get_max_fill_factor(self, all_fill_factors): | |||||
return np.amax(all_fill_factors) | |||||
def _get_min_fill_factor(self, all_fill_factors): | |||||
return np.amin(all_fill_factors) | |||||
def _get_substructures(self): | |||||
subs = set() | |||||
for G in self._graphs: | |||||
degrees = list(dict(G.degree()).values()) | |||||
if any(i == 2 for i in degrees): | |||||
subs.add('linear') | |||||
if np.amax(degrees) >= 3: | |||||
subs.add('non linear') | |||||
if 'linear' in subs and 'non linear' in subs: | |||||
break | |||||
if self._directed:
for G in self._graphs:
try:
# nx.find_cycle raises NetworkXNoCycle when the graph is acyclic.
nx.find_cycle(G)
subs.add('cyclic')
break
except nx.NetworkXNoCycle:
pass
# else: | |||||
# # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way. | |||||
# upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10 | |||||
# for G in Gn: | |||||
# if (nx.number_of_edges(G) < upper): | |||||
# cyc = list(nx.simple_cycles(G.to_directed())) | |||||
# if any(len(i) > 2 for i in cyc): | |||||
# subs.add('cyclic') | |||||
# break | |||||
# if 'cyclic' not in subs: | |||||
# for G in Gn: | |||||
# cyc = list(nx.simple_cycles(G.to_directed())) | |||||
# if any(len(i) > 2 for i in cyc): | |||||
# subs.add('cyclic') | |||||
# break | |||||
return subs | |||||
def _get_class_number(self):
return len(set(self._targets)) | |||||
def _get_node_attr_dim(self): | |||||
return len(self._node_attrs) | |||||
def _get_edge_attr_dim(self): | |||||
return len(self._edge_attrs) | |||||
def _compute_all_degree_entropy(self, base=None): | |||||
"""Compute the entropy of degree distribution of each graph. | |||||
Parameters | |||||
---------- | |||||
base : float, optional | |||||
The logarithmic base to use. The default is ``e`` (natural logarithm). | |||||
Returns | |||||
------- | |||||
degree_entropy : list of float
The calculated entropy of the degree distribution of each graph.
""" | |||||
from gklearn.utils.stats import entropy | |||||
degree_entropy = [] | |||||
for g in self._graphs: | |||||
degrees = list(dict(g.degree()).values()) | |||||
en = entropy(degrees, base=base) | |||||
degree_entropy.append(en) | |||||
return degree_entropy | |||||
@property | |||||
def graphs(self): | |||||
return self._graphs | |||||
@property | |||||
def targets(self): | |||||
return self._targets | |||||
@property | |||||
def node_labels(self): | |||||
return self._node_labels | |||||
@property | |||||
def edge_labels(self): | |||||
return self._edge_labels | |||||
@property | |||||
def node_attrs(self): | |||||
return self._node_attrs | |||||
@property | |||||
def edge_attrs(self): | |||||
return self._edge_attrs | |||||
def split_dataset_by_target(dataset): | |||||
from gklearn.preimage.utils import get_same_item_indices | |||||
graphs = dataset.graphs | |||||
targets = dataset.targets | |||||
datasets = [] | |||||
idx_targets = get_same_item_indices(targets) | |||||
for key, val in idx_targets.items(): | |||||
sub_graphs = [graphs[i] for i in val] | |||||
sub_dataset = Dataset() | |||||
sub_dataset.load_graphs(sub_graphs, [key] * len(val)) | |||||
node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None | |||||
node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None | |||||
edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None | |||||
edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None | |||||
sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) | |||||
datasets.append(sub_dataset) | |||||
# @todo: clean_labels? | |||||
return datasets |
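# A usage sketch of split_dataset_by_target (hypothetical; assumes `dataset`
# is a loaded classification Dataset):
#   sub_datasets = split_dataset_by_target(dataset)
#   for sub_ds in sub_datasets: # one sub-dataset per class
#       print(sub_ds.targets[0], len(sub_ds.graphs))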
@@ -0,0 +1,824 @@ | |||||
""" Utilities function to manage graph files | |||||
""" | |||||
from os.path import dirname, splitext | |||||
class DataLoader(): | |||||
def __init__(self, filename, filename_targets=None, gformat=None, **kwargs): | |||||
"""Read graph data from filename and load them as NetworkX graphs. | |||||
Parameters | |||||
---------- | |||||
filename : string | |||||
The name of the file from where the dataset is read. | |||||
filename_targets : string | |||||
The name of file of the targets corresponding to graphs. | |||||
Notes | |||||
----- | |||||
This function supports the following graph dataset formats:
'ds': load data from a .ds file. See the comments of the load_from_ds method for an example.
'cxl': load data from Graph eXchange Language file (.cxl file). See | |||||
`here <http://www.gupro.de/GXL/Introduction/background.html>`__ for detail. | |||||
'sdf': load data from structured data file (.sdf file). See | |||||
`here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__ | |||||
for details. | |||||
'mat': Load graph data from a MATLAB (up to version 7.1) .mat file. See | |||||
README in `downloadable file <http://mlcb.is.tuebingen.mpg.de/Mitarbeiter/Nino/WL/>`__ | |||||
for details. | |||||
'txt': Load graph data from the TUDataset. See | |||||
`here <https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets>`__ | |||||
for details. Note that here `filename` is the name of the <DS>_A.txt file in
the dataset directory.
""" | |||||
extension = splitext(filename)[1][1:] | |||||
if extension == "ds": | |||||
self._graphs, self._targets, self._label_names = self.load_from_ds(filename, filename_targets) | |||||
elif extension == "cxl": | |||||
dir_dataset = kwargs.get('dirname_dataset', None) | |||||
self._graphs, self._targets, self._label_names = self.load_from_xml(filename, dir_dataset) | |||||
elif extension == 'xml': | |||||
dir_dataset = kwargs.get('dirname_dataset', None) | |||||
self._graphs, self._targets, self._label_names = self.load_from_xml(filename, dir_dataset) | |||||
elif extension == "mat": | |||||
order = kwargs.get('order') | |||||
self._graphs, self._targets, self._label_names = self.load_mat(filename, order) | |||||
elif extension == 'txt': | |||||
self._graphs, self._targets, self._label_names = self.load_tud(filename) | |||||
else: | |||||
raise ValueError('The input file with the extension ".' + extension + '" is not supported. The supported extensions include: ".ds", ".cxl", ".xml", ".mat", ".txt".')
def load_from_ds(self, filename, filename_targets): | |||||
"""Load data from .ds file. | |||||
Possible graph formats include: | |||||
'.ct': see the load_ct method for details.
'.gxl': see the load_gxl method for details.
Note these graph formats are checked automatically by the extensions of | |||||
graph files. | |||||
""" | |||||
dirname_dataset = dirname(filename) | |||||
data = [] | |||||
y = [] | |||||
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} | |||||
with open(filename) as fn: | |||||
content = fn.read().splitlines() | |||||
extension = splitext(content[0].split(' ')[0])[1][1:] | |||||
if extension == 'ct':
load_file_fun = self.load_ct
elif extension == 'gxl' or extension == 'sdf': # @todo: .sdf not tested yet.
load_file_fun = self.load_gxl
else:
raise ValueError('The graph file extension ".' + extension + '" listed in "' + filename + '" is not supported.')
if filename_targets is None or filename_targets == '': | |||||
for i in range(0, len(content)): | |||||
tmp = content[i].split(' ') | |||||
# remove the '#'s in file names | |||||
g, l_names = load_file_fun(dirname_dataset + '/' + tmp[0].replace('#', '', 1)) | |||||
data.append(g) | |||||
self._append_label_names(label_names, l_names) | |||||
y.append(float(tmp[1])) | |||||
else: # targets are in a separate file
for i in range(0, len(content)): | |||||
tmp = content[i] | |||||
# remove the '#'s in file names | |||||
g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1)) | |||||
data.append(g) | |||||
self._append_label_names(label_names, l_names) | |||||
with open(filename_targets) as fnt: | |||||
content_y = fnt.read().splitlines() | |||||
# assume entries in filename and filename_targets have the same order. | |||||
for item in content_y: | |||||
tmp = item.split(' ') | |||||
# assume the 3rd entry in a line is y (for Alkane dataset) | |||||
y.append(float(tmp[2])) | |||||
return data, y, label_names | |||||
def load_from_xml(self, filename, dir_dataset=None): | |||||
import xml.etree.ElementTree as ET | |||||
if dir_dataset is not None: | |||||
dir_dataset = dir_dataset | |||||
else: | |||||
dir_dataset = dirname(filename) | |||||
tree = ET.parse(filename) | |||||
root = tree.getroot() | |||||
data = [] | |||||
y = [] | |||||
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} | |||||
for graph in root.iter('graph'): | |||||
mol_filename = graph.attrib['file'] | |||||
mol_class = graph.attrib['class'] | |||||
g, l_names = self.load_gxl(dir_dataset + '/' + mol_filename) | |||||
data.append(g) | |||||
self._append_label_names(label_names, l_names) | |||||
y.append(mol_class) | |||||
return data, y, label_names | |||||
def load_mat(self, filename, order): # @todo: need to be updated (auto order) or deprecated. | |||||
"""Load graph data from a MATLAB (up to version 7.1) .mat file. | |||||
Notes | |||||
------ | |||||
A MAT file contains a struct array containing graphs, and a column vector lx containing a class label for each graph. | |||||
Check README in `downloadable file <http://mlcb.is.tuebingen.mpg.de/Mitarbeiter/Nino/WL/>`__ for detailed structure. | |||||
""" | |||||
from scipy.io import loadmat | |||||
import numpy as np | |||||
import networkx as nx | |||||
data = [] | |||||
content = loadmat(filename) | |||||
for key, value in content.items(): | |||||
if key[0] == 'l': # class label | |||||
y = np.transpose(value)[0].tolist() | |||||
elif key[0] != '_': | |||||
# if adjacency matrix is not compressed / edge label exists | |||||
if order[1] == 0: | |||||
for i, item in enumerate(value[0]): | |||||
g = nx.Graph(name=i) # set name of the graph | |||||
nl = np.transpose(item[order[3]][0][0][0]) # node label | |||||
for index, label in enumerate(nl[0]): | |||||
g.add_node(index, label_1=str(label)) | |||||
el = item[order[4]][0][0][0] # edge label | |||||
for edge in el: | |||||
g.add_edge(edge[0] - 1, edge[1] - 1, label_1=str(edge[2])) | |||||
data.append(g) | |||||
else: | |||||
for i, item in enumerate(value[0]): | |||||
g = nx.Graph(name=i) # set name of the graph | |||||
nl = np.transpose(item[order[3]][0][0][0]) # node label | |||||
for index, label in enumerate(nl[0]): | |||||
g.add_node(index, label_1=str(label)) | |||||
sam = item[order[0]] # sparse adjacency matrix | |||||
index_no0 = sam.nonzero() | |||||
for col, row in zip(index_no0[0], index_no0[1]): | |||||
g.add_edge(col, row) | |||||
data.append(g) | |||||
label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} | |||||
if order[1] == 0: | |||||
label_names['edge_labels'].append('label_1') | |||||
return data, y, label_names | |||||
def load_tud(self, filename): | |||||
"""Load graph data from TUD dataset files. | |||||
Notes | |||||
------ | |||||
The graph data is loaded from separate files. | |||||
Check README in `downloadable file <http://tiny.cc/PK_MLJ_data>`__, 2018 for detailed structure. | |||||
""" | |||||
import networkx as nx | |||||
from os import listdir | |||||
from os.path import dirname, basename | |||||
def get_infos_from_readme(frm): # @todo: add README (Cuneiform), maybe node/edge label maps.
"""Get information from DS_label_readme.txt file. | |||||
""" | |||||
def get_label_names_from_line(line): | |||||
"""Get names of labels/attributes from a line. | |||||
""" | |||||
str_names = line.split('[')[1].split(']')[0] | |||||
names = str_names.split(',') | |||||
names = [attr.strip() for attr in names] | |||||
return names | |||||
def get_class_label_map(label_map_strings): | |||||
label_map = {} | |||||
for string in label_map_strings: | |||||
integer, label = string.split('\t') | |||||
label_map[int(integer.strip())] = label.strip() | |||||
return label_map | |||||
label_names = {'node_labels': [], 'node_attrs': [], | |||||
'edge_labels': [], 'edge_attrs': []} | |||||
class_label_map = None | |||||
class_label_map_strings = [] | |||||
with open(frm) as rm: | |||||
content_rm = rm.read().splitlines() | |||||
i = 0 | |||||
while i < len(content_rm): | |||||
line = content_rm[i].strip() | |||||
# get node/edge labels and attributes. | |||||
if line.startswith('Node labels:'): | |||||
label_names['node_labels'] = get_label_names_from_line(line) | |||||
elif line.startswith('Node attributes:'): | |||||
label_names['node_attrs'] = get_label_names_from_line(line) | |||||
elif line.startswith('Edge labels:'): | |||||
label_names['edge_labels'] = get_label_names_from_line(line) | |||||
elif line.startswith('Edge attributes:'): | |||||
label_names['edge_attrs'] = get_label_names_from_line(line) | |||||
# get class label map. | |||||
elif line.startswith('Class labels were converted to integer values using this map:'): | |||||
i += 2 | |||||
line = content_rm[i].strip() | |||||
while line != '' and i < len(content_rm):
class_label_map_strings.append(line)
i += 1
# guard against reading past the end of the README.
line = content_rm[i].strip() if i < len(content_rm) else ''
class_label_map = get_class_label_map(class_label_map_strings) | |||||
i += 1 | |||||
return label_names, class_label_map | |||||
# get dataset name. | |||||
dirname_dataset = dirname(filename) | |||||
filename = basename(filename) | |||||
fn_split = filename.split('_A') | |||||
ds_name = fn_split[0].strip() | |||||
# load data file names | |||||
for name in listdir(dirname_dataset): | |||||
if ds_name + '_A' in name: | |||||
fam = dirname_dataset + '/' + name | |||||
elif ds_name + '_graph_indicator' in name: | |||||
fgi = dirname_dataset + '/' + name | |||||
elif ds_name + '_graph_labels' in name: | |||||
fgl = dirname_dataset + '/' + name | |||||
elif ds_name + '_node_labels' in name: | |||||
fnl = dirname_dataset + '/' + name | |||||
elif ds_name + '_edge_labels' in name: | |||||
fel = dirname_dataset + '/' + name | |||||
elif ds_name + '_edge_attributes' in name: | |||||
fea = dirname_dataset + '/' + name | |||||
elif ds_name + '_node_attributes' in name: | |||||
fna = dirname_dataset + '/' + name | |||||
elif ds_name + '_graph_attributes' in name: | |||||
fga = dirname_dataset + '/' + name | |||||
elif ds_name + '_label_readme' in name: | |||||
frm = dirname_dataset + '/' + name | |||||
# this is supposed to be the node attrs, make sure to put this as the last 'elif' | |||||
elif ds_name + '_attributes' in name: | |||||
fna = dirname_dataset + '/' + name | |||||
# get labels and attributes names. | |||||
if 'frm' in locals(): | |||||
label_names, class_label_map = get_infos_from_readme(frm) | |||||
else: | |||||
label_names = {'node_labels': [], 'node_attrs': [], | |||||
'edge_labels': [], 'edge_attrs': []} | |||||
class_label_map = None | |||||
with open(fgi) as gi: | |||||
content_gi = gi.read().splitlines() # graph indicator | |||||
with open(fam) as am: | |||||
content_am = am.read().splitlines() # adjacency matrix | |||||
# load targets. | |||||
if 'fgl' in locals(): | |||||
with open(fgl) as gl: | |||||
content_targets = gl.read().splitlines() # targets (classification) | |||||
targets = [int(i) for i in content_targets] # class labels are integers.
elif 'fga' in locals(): | |||||
with open(fga) as ga: | |||||
content_targets = ga.read().splitlines() # targets (regression) | |||||
targets = [float(i) for i in content_targets] # regression targets are floats.
else: | |||||
raise Exception('Cannot find the targets file. Please make sure there is a "' + ds_name + '_graph_labels.txt" or "' + ds_name + '_graph_attributes.txt" file in your dataset folder.')
if class_label_map is not None: | |||||
targets = [class_label_map[t] for t in targets] | |||||
# create graphs and add nodes | |||||
data = [nx.Graph(name=str(i)) for i in range(0, len(content_targets))] | |||||
if 'fnl' in locals(): | |||||
with open(fnl) as nl: | |||||
content_nl = nl.read().splitlines() # node labels | |||||
for idx, line in enumerate(content_gi): | |||||
# transfer to int first in case of unexpected blanks | |||||
data[int(line) - 1].add_node(idx) | |||||
labels = [l.strip() for l in content_nl[idx].split(',')] | |||||
if label_names['node_labels'] == []: # @todo: need fix bug. | |||||
for i, label in enumerate(labels): | |||||
l_name = 'label_' + str(i) | |||||
data[int(line) - 1].nodes[idx][l_name] = label | |||||
label_names['node_labels'].append(l_name) | |||||
else: | |||||
for i, l_name in enumerate(label_names['node_labels']): | |||||
data[int(line) - 1].nodes[idx][l_name] = labels[i] | |||||
else: | |||||
for i, line in enumerate(content_gi): | |||||
data[int(line) - 1].add_node(i) | |||||
# add edges | |||||
for line in content_am: | |||||
tmp = line.split(',') | |||||
n1 = int(tmp[0]) - 1 | |||||
n2 = int(tmp[1]) - 1 | |||||
# ignore edge weight here. | |||||
g = int(content_gi[n1]) - 1 | |||||
data[g].add_edge(n1, n2) | |||||
# add edge labels | |||||
if 'fel' in locals(): | |||||
with open(fel) as el: | |||||
content_el = el.read().splitlines() | |||||
for idx, line in enumerate(content_el): | |||||
labels = [l.strip() for l in line.split(',')] | |||||
n = [int(i) - 1 for i in content_am[idx].split(',')] | |||||
g = int(content_gi[n[0]]) - 1 | |||||
if label_names['edge_labels'] == []: | |||||
for i, label in enumerate(labels): | |||||
l_name = 'label_' + str(i) | |||||
data[g].edges[n[0], n[1]][l_name] = label | |||||
label_names['edge_labels'].append(l_name) | |||||
else: | |||||
for i, l_name in enumerate(label_names['edge_labels']): | |||||
data[g].edges[n[0], n[1]][l_name] = labels[i] | |||||
# add node attributes | |||||
if 'fna' in locals(): | |||||
with open(fna) as na: | |||||
content_na = na.read().splitlines() | |||||
for idx, line in enumerate(content_na): | |||||
attrs = [a.strip() for a in line.split(',')] | |||||
g = int(content_gi[idx]) - 1 | |||||
if label_names['node_attrs'] == []: | |||||
for i, attr in enumerate(attrs): | |||||
a_name = 'attr_' + str(i) | |||||
data[g].nodes[idx][a_name] = attr | |||||
label_names['node_attrs'].append(a_name) | |||||
else: | |||||
for i, a_name in enumerate(label_names['node_attrs']): | |||||
data[g].nodes[idx][a_name] = attrs[i] | |||||
# add edge attributes | |||||
if 'fea' in locals(): | |||||
with open(fea) as ea: | |||||
content_ea = ea.read().splitlines() | |||||
for idx, line in enumerate(content_ea): | |||||
attrs = [a.strip() for a in line.split(',')] | |||||
n = [int(i) - 1 for i in content_am[idx].split(',')] | |||||
g = int(content_gi[n[0]]) - 1 | |||||
if label_names['edge_attrs'] == []: | |||||
for i, attr in enumerate(attrs): | |||||
a_name = 'attr_' + str(i) | |||||
data[g].edges[n[0], n[1]][a_name] = attr | |||||
label_names['edge_attrs'].append(a_name) | |||||
else: | |||||
for i, a_name in enumerate(label_names['edge_attrs']): | |||||
data[g].edges[n[0], n[1]][a_name] = attrs[i] | |||||
return data, targets, label_names | |||||
def load_ct(self, filename): # @todo: this function is only tested on CTfile V2000; the header is not considered; only simple cases (atoms and bonds) are considered.
"""Load data from a Chemical Table (.ct) file.
Notes | |||||
------ | |||||
a typical example of data in .ct is like this: | |||||
3 2 <- number of nodes and edges | |||||
0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label) | |||||
0.0000 0.0000 0.0000 C | |||||
0.0000 0.0000 0.0000 O | |||||
1 3 1 1 <- each line describes an edge: first atom, second atom, bond type, bond stereo
2 3 1 1 | |||||
Check the `CTfile Formats document <https://www.daylight.com/meetings/mug05/Kappler/ctfile.pdf>`__
for a detailed format description.
""" | |||||
import networkx as nx | |||||
from os.path import basename | |||||
g = nx.Graph() | |||||
with open(filename) as f: | |||||
content = f.read().splitlines() | |||||
g = nx.Graph(name=str(content[0]), filename=basename(filename)) # set name of the graph | |||||
# read the counts line. | |||||
tmp = content[1].split(' ') | |||||
tmp = [x for x in tmp if x != ''] | |||||
nb_atoms = int(tmp[0].strip()) # number of atoms | |||||
nb_bonds = int(tmp[1].strip()) # number of bonds | |||||
count_line_tags = ['number_of_atoms', 'number_of_bonds', 'number_of_atom_lists', '', 'chiral_flag', 'number_of_stext_entries', '', '', '', '', 'number_of_properties', 'CT_version'] | |||||
i = 0 | |||||
while i < len(tmp): | |||||
if count_line_tags[i] != '': # if not obsoleted | |||||
g.graph[count_line_tags[i]] = tmp[i].strip() | |||||
i += 1 | |||||
# read the atom block. | |||||
atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag'] | |||||
for i in range(0, nb_atoms): | |||||
tmp = content[i + 2].split(' ') | |||||
tmp = [x for x in tmp if x != ''] | |||||
g.add_node(i) | |||||
j = 0 | |||||
while j < len(tmp): | |||||
if atom_tags[j] != '': | |||||
g.nodes[i][atom_tags[j]] = tmp[j].strip() | |||||
j += 1 | |||||
# read the bond block. | |||||
bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status'] | |||||
for i in range(0, nb_bonds): | |||||
tmp = content[i + g.number_of_nodes() + 2].split(' ') | |||||
tmp = [x for x in tmp if x != ''] | |||||
n1, n2 = int(tmp[0].strip()) - 1, int(tmp[1].strip()) - 1 | |||||
g.add_edge(n1, n2) | |||||
j = 2 | |||||
while j < len(tmp): | |||||
if bond_tags[j] != '': | |||||
g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip() | |||||
j += 1 | |||||
# get label names. | |||||
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} | |||||
atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1] | |||||
for nd in g.nodes(): | |||||
for key in g.nodes[nd]: | |||||
if atom_symbolic[atom_tags.index(key)] == 1: | |||||
label_names['node_labels'].append(key) | |||||
else: | |||||
label_names['node_attrs'].append(key) | |||||
break | |||||
bond_symbolic = [None, None, 1, 1, None, 1, 1] | |||||
for ed in g.edges(): | |||||
for key in g.edges[ed]: | |||||
if bond_symbolic[bond_tags.index(key)] == 1: | |||||
label_names['edge_labels'].append(key) | |||||
else: | |||||
label_names['edge_attrs'].append(key) | |||||
break | |||||
return g, label_names | |||||
def load_gxl(self, filename): # @todo: directed graphs. | |||||
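"""Load a single graph from a GXL file.
Returns the networkx graph and a dict of label names; an attribute stored
with an <int> tag is treated as a symbolic label, anything else as a
non-symbolic attribute (see the loops over 'attr' elements below).
"""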
from os.path import basename | |||||
import networkx as nx | |||||
import xml.etree.ElementTree as ET | |||||
tree = ET.parse(filename) | |||||
root = tree.getroot() | |||||
index = 0 | |||||
g = nx.Graph(filename=basename(filename), name=root[0].attrib['id']) | |||||
dic = {} # used to retrieve incident nodes of edges | |||||
for node in root.iter('node'): | |||||
dic[node.attrib['id']] = index | |||||
labels = {} | |||||
for attr in node.iter('attr'): | |||||
labels[attr.attrib['name']] = attr[0].text | |||||
g.add_node(index, **labels) | |||||
index += 1 | |||||
for edge in root.iter('edge'): | |||||
labels = {} | |||||
for attr in edge.iter('attr'): | |||||
labels[attr.attrib['name']] = attr[0].text | |||||
g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) | |||||
# get label names. | |||||
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} | |||||
for node in root.iter('node'): | |||||
for attr in node.iter('attr'): | |||||
if attr[0].tag == 'int': # @todo: this may be wrong, and slow.
label_names['node_labels'].append(attr.attrib['name']) | |||||
else: | |||||
label_names['node_attrs'].append(attr.attrib['name']) | |||||
break | |||||
for edge in root.iter('edge'): | |||||
for attr in edge.iter('attr'): | |||||
if attr[0].tag == 'int': # @todo: this may be wrong, and slow.
label_names['edge_labels'].append(attr.attrib['name']) | |||||
else: | |||||
label_names['edge_attrs'].append(attr.attrib['name']) | |||||
break | |||||
return g, label_names | |||||
def _append_label_names(self, label_names, new_names): | |||||
for key, val in label_names.items(): | |||||
label_names[key] += [name for name in new_names[key] if name not in val] | |||||
@property | |||||
def data(self): | |||||
return self._graphs, self._targets, self._label_names | |||||
@property | |||||
def graphs(self): | |||||
return self._graphs | |||||
@property | |||||
def targets(self): | |||||
return self._targets | |||||
@property | |||||
def label_names(self): | |||||
return self._label_names | |||||
class DataSaver(): | |||||
def __init__(self, graphs, targets=None, filename='gfile', gformat='gxl', group=None, **kwargs): | |||||
"""Save list of graphs. | |||||
""" | |||||
import os | |||||
dirname_ds = os.path.dirname(filename) | |||||
if dirname_ds != '': | |||||
dirname_ds += '/' | |||||
os.makedirs(dirname_ds, exist_ok=True) | |||||
if 'graph_dir' in kwargs: | |||||
graph_dir = kwargs['graph_dir'] + '/' | |||||
os.makedirs(graph_dir, exist_ok=True) | |||||
del kwargs['graph_dir'] | |||||
else: | |||||
graph_dir = dirname_ds | |||||
if group == 'xml' and gformat == 'gxl': | |||||
with open(filename + '.xml', 'w') as fgroup: | |||||
fgroup.write("<?xml version=\"1.0\"?>") | |||||
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">") | |||||
fgroup.write("\n<GraphCollection>") | |||||
for idx, g in enumerate(graphs): | |||||
fname_tmp = "graph" + str(idx) + ".gxl" | |||||
self.save_gxl(g, graph_dir + fname_tmp, **kwargs) | |||||
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(targets[idx]) + "\"/>") | |||||
fgroup.write("\n</GraphCollection>") | |||||
def save_gxl(self, graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): | |||||
if method == 'default': | |||||
gxl_file = open(filename, 'w') | |||||
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||||
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||||
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||||
if 'name' in graph.graph: | |||||
name = str(graph.graph['name']) | |||||
else: | |||||
name = 'dummy' | |||||
gxl_file.write("<graph id=\"" + name + "\" edgeids=\"false\" edgemode=\"undirected\">\n") | |||||
for v, attrs in graph.nodes(data=True): | |||||
gxl_file.write("<node id=\"_" + str(v) + "\">") | |||||
for l_name in node_labels: | |||||
gxl_file.write("<attr name=\"" + l_name + "\"><int>" + | |||||
str(attrs[l_name]) + "</int></attr>") | |||||
for a_name in node_attrs: | |||||
gxl_file.write("<attr name=\"" + a_name + "\"><float>" + | |||||
str(attrs[a_name]) + "</float></attr>") | |||||
gxl_file.write("</node>\n") | |||||
for v1, v2, attrs in graph.edges(data=True): | |||||
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | |||||
for l_name in edge_labels: | |||||
gxl_file.write("<attr name=\"" + l_name + "\"><int>" + | |||||
str(attrs[l_name]) + "</int></attr>") | |||||
for a_name in edge_attrs: | |||||
gxl_file.write("<attr name=\"" + a_name + "\"><float>" + | |||||
str(attrs[a_name]) + "</float></attr>") | |||||
gxl_file.write("</edge>\n") | |||||
gxl_file.write("</graph>\n") | |||||
gxl_file.write("</gxl>") | |||||
gxl_file.close() | |||||
elif method == 'benoit': | |||||
import xml.etree.ElementTree as ET | |||||
root_node = ET.Element('gxl') | |||||
attr = dict() | |||||
attr['id'] = str(graph.graph['name']) | |||||
attr['edgeids'] = 'true' | |||||
attr['edgemode'] = 'undirected' | |||||
graph_node = ET.SubElement(root_node, 'graph', attrib=attr) | |||||
for v in graph: | |||||
current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) | |||||
for attr in graph.nodes[v].keys(): | |||||
cur_attr = ET.SubElement( | |||||
current_node, 'attr', attrib={'name': attr}) | |||||
cur_value = ET.SubElement(cur_attr, | |||||
graph.nodes[v][attr].__class__.__name__) | |||||
cur_value.text = str(graph.nodes[v][attr])
for v1 in graph: | |||||
for v2 in graph[v1]: | |||||
if (v1 < v2): # Non oriented graphs | |||||
cur_edge = ET.SubElement( | |||||
graph_node, | |||||
'edge', | |||||
attrib={ | |||||
'from': str(v1), | |||||
'to': str(v2) | |||||
}) | |||||
for attr in graph[v1][v2].keys(): | |||||
cur_attr = ET.SubElement( | |||||
cur_edge, 'attr', attrib={'name': attr}) | |||||
cur_value = ET.SubElement( | |||||
cur_attr, graph[v1][v2][attr].__class__.__name__) | |||||
cur_value.text = str(graph[v1][v2][attr]) | |||||
tree = ET.ElementTree(root_node) | |||||
tree.write(filename) | |||||
elif method == 'gedlib': | |||||
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | |||||
# pass | |||||
gxl_file = open(filename, 'w') | |||||
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||||
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||||
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||||
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") | |||||
for v, attrs in graph.nodes(data=True): | |||||
gxl_file.write("<node id=\"_" + str(v) + "\">") | |||||
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['chem']) + "</int></attr>") | |||||
gxl_file.write("</node>\n") | |||||
for v1, v2, attrs in graph.edges(data=True): | |||||
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | |||||
gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>") | |||||
# gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>") | |||||
gxl_file.write("</edge>\n") | |||||
gxl_file.write("</graph>\n") | |||||
gxl_file.write("</gxl>") | |||||
gxl_file.close() | |||||
elif method == 'gedlib-letter': | |||||
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | |||||
# and https://github.com/dbblumenthal/gedlib/blob/master/data/datasets/Letter/HIGH/AP1_0000.gxl | |||||
gxl_file = open(filename, 'w') | |||||
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") | |||||
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||||
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") | |||||
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n") | |||||
for v, attrs in graph.nodes(data=True): | |||||
gxl_file.write("<node id=\"_" + str(v) + "\">") | |||||
gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>") | |||||
gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>") | |||||
gxl_file.write("</node>\n") | |||||
for v1, v2, attrs in graph.edges(data=True): | |||||
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>\n") | |||||
gxl_file.write("</graph>\n") | |||||
gxl_file.write("</gxl>") | |||||
gxl_file.close() | |||||
# def loadSDF(filename): | |||||
# """load data from structured data file (.sdf file). | |||||
# Notes | |||||
# ------ | |||||
# An SDF file contains a group of molecules, each represented in a similar way to the MOL format. | |||||
# Check `here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__ for detailed structure. | |||||
# """ | |||||
# import networkx as nx | |||||
# from os.path import basename | |||||
# from tqdm import tqdm | |||||
# import sys | |||||
# data = [] | |||||
# with open(filename) as f: | |||||
# content = f.read().splitlines() | |||||
# index = 0 | |||||
# pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout) | |||||
# while index < len(content): | |||||
# index_old = index | |||||
# g = nx.Graph(name=content[index].strip()) # set name of the graph | |||||
# tmp = content[index + 3] | |||||
# nb_nodes = int(tmp[:3]) # number of the nodes | |||||
# nb_edges = int(tmp[3:6]) # number of the edges | |||||
# for i in range(0, nb_nodes): | |||||
# tmp = content[i + index + 4] | |||||
# g.add_node(i, atom=tmp[31:34].strip()) | |||||
# for i in range(0, nb_edges): | |||||
# tmp = content[i + index + g.number_of_nodes() + 4] | |||||
# tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)] | |||||
# g.add_edge( | |||||
# int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip()) | |||||
# data.append(g) | |||||
# index += 4 + g.number_of_nodes() + g.number_of_edges() | |||||
# while content[index].strip() != '$$$$': # separator | |||||
# index += 1 | |||||
# index += 1 | |||||
# pbar.update(index - index_old) | |||||
# pbar.update(1) | |||||
# pbar.close() | |||||
# return data | |||||
# def load_from_cxl(filename): | |||||
# import xml.etree.ElementTree as ET | |||||
# | |||||
# dirname_dataset = dirname(filename) | |||||
# tree = ET.parse(filename) | |||||
# root = tree.getroot() | |||||
# data = [] | |||||
# y = [] | |||||
# for graph in root.iter('graph'): | |||||
# mol_filename = graph.attrib['file'] | |||||
# mol_class = graph.attrib['class'] | |||||
# data.append(load_gxl(dirname_dataset + '/' + mol_filename)) | |||||
# y.append(mol_class) | |||||
if __name__ == '__main__': | |||||
# ### Load dataset from .ds file. | |||||
# # .ct files. | |||||
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||||
# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||||
# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||||
# ds_file = '../../datasets/Acyclic/dataset_bps.ds' # node symb | |||||
# Gn, targets, label_names = load_dataset(ds_file) | |||||
# ds_file = '../../datasets/MAO/dataset.ds' # node/edge symb | |||||
# Gn, targets, label_names = load_dataset(ds_file) | |||||
## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||||
## Gn, y = loadDataset(ds['dataset']) | |||||
# print(Gn[1].graph) | |||||
# print(Gn[1].nodes(data=True)) | |||||
# print(Gn[1].edges(data=True)) | |||||
# print(targets[1]) | |||||
# # .gxl file. | |||||
# ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb | |||||
# Gn, y, label_names = load_dataset(ds_file) | |||||
# print(Gn[1].graph) | |||||
# print(Gn[1].nodes(data=True)) | |||||
# print(Gn[1].edges(data=True)) | |||||
# print(y[1]) | |||||
# .mat file. | |||||
ds_file = '../../datasets/MUTAG_mat/MUTAG.mat' | |||||
order = [0, 0, 3, 1, 2] | |||||
gloader = DataLoader(ds_file, order=order) | |||||
Gn, targets, label_names = gloader.data | |||||
print(Gn[1].graph) | |||||
print(Gn[1].nodes(data=True)) | |||||
print(Gn[1].edges(data=True)) | |||||
print(targets[1]) | |||||
# ### Convert graph from one format to another. | |||||
# # .gxl file. | |||||
# import networkx as nx | |||||
# ds = {'name': 'monoterpenoides', | |||||
# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||||
# Gn, y = loadDataset(ds['dataset']) | |||||
# y = [int(i) for i in y] | |||||
# print(Gn[1].nodes(data=True)) | |||||
# print(Gn[1].edges(data=True)) | |||||
# print(y[1]) | |||||
# # Convert a graph to the proper NetworkX format that can be recognized by library gedlib. | |||||
# Gn_new = [] | |||||
# for G in Gn: | |||||
# G_new = nx.Graph() | |||||
# for nd, attrs in G.nodes(data=True): | |||||
# G_new.add_node(str(nd), chem=attrs['atom']) | |||||
# for nd1, nd2, attrs in G.edges(data=True): | |||||
# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||||
## G_new.add_edge(str(nd1), str(nd2)) | |||||
# Gn_new.append(G_new) | |||||
# print(Gn_new[1].nodes(data=True)) | |||||
# print(Gn_new[1].edges(data=True)) | |||||
# print(Gn_new[1]) | |||||
# filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' | |||||
# xparams = {'method': 'gedlib'} | |||||
# saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) | |||||
# save dataset. | |||||
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | |||||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||||
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||||
# saveDataset(Gn, y, group='xml', filename='temp/temp') | |||||
# test - new way to add labels and attributes. | |||||
# dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' | |||||
# filename = '../../datasets/Fingerprint/Fingerprint_A.txt' | |||||
# dataset = '../../datasets/Letter-med/Letter-med_A.txt' | |||||
# dataset = '../../datasets/AIDS/AIDS_A.txt' | |||||
# dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' | |||||
# Gn, targets, label_names = load_dataset(filename) | |||||
pass |
@@ -0,0 +1,61 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Fri Sep 11 18:10:06 2020 | |||||
@author: ljia | |||||
""" | |||||
import numpy as np | |||||
import networkx as nx | |||||
import random | |||||
class GraphSynthesizer(object): | |||||
def __init__(self, g_type=None, *args, **kwargs): | |||||
if g_type == 'unified': | |||||
self._graphs = self.unified_graphs(*args, **kwargs) | |||||
else: | |||||
self._graphs = None | |||||
def random_graph(self, num_nodes, num_edges, num_node_labels=0, num_edge_labels=0, seed=None, directed=False, max_num_edges=None, all_edges=None): | |||||
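# Build one random graph: sample `num_edges` distinct index pairs from `all_edges` (precomputed by the caller); the `seed` and `directed` arguments are currently unused. | |||||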
g = nx.Graph() | |||||
if num_node_labels > 0: | |||||
node_labels = np.random.randint(0, high=num_node_labels, size=num_nodes) | |||||
for i in range(0, num_nodes): | |||||
g.add_node(str(i), atom=node_labels[i]) # @todo: update "atom". | |||||
else: | |||||
for i in range(0, num_nodes): | |||||
g.add_node(str(i)) | |||||
if num_edge_labels > 0: | |||||
edge_labels = np.random.randint(0, high=num_edge_labels, size=num_edges) | |||||
for idx, i in enumerate(random.sample(range(0, max_num_edges), num_edges)): | |||||
node1, node2 = all_edges[i] | |||||
g.add_edge(str(node1), str(node2), bond_type=edge_labels[idx]) # @todo: update "bond_type". | |||||
else: | |||||
for i in random.sample(range(0, max_num_edges), num_edges): | |||||
node1, node2 = all_edges[i] | |||||
g.add_edge(str(node1), str(node2)) | |||||
return g | |||||
def unified_graphs(self, num_graphs=1000, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=0, seed=None, directed=False): | |||||
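# Generate `num_graphs` random graphs that all share the same numbers of nodes and edges (hence "unified"). | |||||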
max_num_edges = int((num_nodes - 1) * num_nodes / 2) | |||||
if num_edges > max_num_edges: | |||||
raise Exception('Too many edges.') | |||||
all_edges = [(i, j) for i in range(0, num_nodes) for j in range(i + 1, num_nodes)] # @todo: optimize. No directed graphs. | |||||
graphs = [] | |||||
for idx in range(0, num_graphs): | |||||
graphs.append(self.random_graph(num_nodes, num_edges, num_node_labels=num_node_labels, num_edge_labels=num_edge_labels, seed=seed, directed=directed, max_num_edges=max_num_edges, all_edges=all_edges)) | |||||
return graphs | |||||
@property | |||||
def graphs(self): | |||||
return self._graphs |
@@ -0,0 +1,142 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Wed Oct 20 11:48:02 2020 | |||||
@author: ljia | |||||
""" | |||||
# This script tests the influence of the maximum number of solutions and of the number of graphs N on the stability of the GED computation with the BIPARTITE heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1] and the node/edge cost ratio takes the values 10, 1 and 0.1. | |||||
import os | |||||
import multiprocessing | |||||
import pickle | |||||
import logging | |||||
from gklearn.ged.util import compute_geds | |||||
import time | |||||
import sys | |||||
from group_results import group_trials | |||||
def generate_graphs(): | |||||
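# Synthesize a fixed pool of 100 unlabeled random graphs (20 nodes and 20 edges each) that is shared by all trials. | |||||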
from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||||
gsyzer = GraphSynthesizer() | |||||
graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||||
return graphs | |||||
def xp_compute_ged_matrix(graphs, N, max_num_solutions, ratio, trial): | |||||
save_file_suffix = '.' + str(N) + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||||
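# The file name encodes N, the maximum number of solutions, the cost ratio and the trial index. | |||||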
# Return if the file exists. | |||||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
return None, None | |||||
"""**2. Set parameters.**""" | |||||
# Parameters for GED computation. | |||||
ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic. | |||||
# 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) | |||||
'lsape_model': 'ECBP', # | |||||
# When larger than 1, the method may be considered mIPFP (uncertain); | |||||
# the actual number of computed solutions might be smaller than the specified value. | |||||
'max_num_solutions': max_num_solutions, | |||||
'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||||
'greedy_method': 'BASIC', # | |||||
# The distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||||
'attr_distance': 'euclidean', | |||||
'optimal': True, # if TRUE, the option --greedy-method has no effect | |||||
# Parallel threads. Does not work if mpg_options['parallel'] = False. | |||||
'threads': multiprocessing.cpu_count(), | |||||
'centrality_method': 'NONE', | |||||
'centrality_weight': 0.7, | |||||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||||
} | |||||
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||||
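# Only the first three edit cost constants are scaled by `ratio`; the last three stay at 1 (presumably the node vs. edge costs, per the ratio described above). | |||||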
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||||
options = ged_options.copy() | |||||
options['edit_cost_constants'] = edit_cost_constants | |||||
options['node_labels'] = [] | |||||
options['edge_labels'] = [] | |||||
options['node_attrs'] = [] | |||||
options['edge_attrs'] = [] | |||||
parallel = True # if num_solutions == 1 else False | |||||
"""**5. Compute GED matrix.**""" | |||||
ged_mat = 'error' | |||||
runtime = 0 | |||||
try: | |||||
time0 = time.time() | |||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||||
runtime = time.time() - time0 | |||||
except Exception as exp: | |||||
print('An exception occurred when running this experiment:') | |||||
LOG_FILENAME = save_dir + 'error.txt' | |||||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||||
logging.exception(save_file_suffix) | |||||
print(repr(exp)) | |||||
"""**6. Get results.**""" | |||||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(ged_mat, f) | |||||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
return ged_mat, runtime | |||||
def save_trials_as_group(graphs, N, max_num_solutions, ratio): | |||||
# Return if the group file exists. | |||||
name_middle = '.' + str(N) + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
if os.path.isfile(name_group): | |||||
return | |||||
ged_mats = [] | |||||
runtimes = [] | |||||
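# Run 100 trials for this parameter setting and collect the GED matrices and runtimes. | |||||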
for trial in range(1, 101): | |||||
print() | |||||
print('Trial:', trial) | |||||
ged_mat, runtime = xp_compute_ged_matrix(graphs, N, max_num_solutions, ratio, trial) | |||||
ged_mats.append(ged_mat) | |||||
runtimes.append(runtime) | |||||
# Group trials and remove single files. | |||||
name_prefix = 'ged_matrix' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
name_prefix = 'runtime' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
def results_for_a_ratio(ratio): | |||||
for N in N_list: | |||||
print() | |||||
print('# of graphs:', N) | |||||
for max_num_solutions in [1, 20, 40, 60, 80, 100]: | |||||
print() | |||||
print('Max # of solutions:', max_num_solutions) | |||||
save_trials_as_group(graphs[:N], N, max_num_solutions, ratio) | |||||
if __name__ == '__main__': | |||||
if len(sys.argv) > 1: | |||||
N_list = [int(i) for i in sys.argv[1:]] | |||||
else: | |||||
N_list = [10, 50, 100] | |||||
# Generate graphs. | |||||
graphs = generate_graphs() | |||||
save_dir = 'outputs/edit_costs.max_num_sols.N.bipartite/' | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
for ratio in [10, 1, 0.1]: | |||||
print() | |||||
print('Ratio:', ratio) | |||||
results_for_a_ratio(ratio) |
@@ -12,18 +12,19 @@ import multiprocessing | |||||
import pickle | import pickle | ||||
import logging | import logging | ||||
from gklearn.ged.util import compute_geds | from gklearn.ged.util import compute_geds | ||||
import numpy as np | |||||
import time | import time | ||||
from utils import get_dataset | from utils import get_dataset | ||||
import sys | import sys | ||||
from group_results import group_trials | |||||
def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): | def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): | ||||
save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | ||||
"""**1. Get dataset.**""" | |||||
dataset = get_dataset(ds_name) | |||||
# Return if the file exists. | |||||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
return None, None | |||||
"""**2. Set parameters.**""" | """**2. Set parameters.**""" | ||||
@@ -83,6 +84,12 @@ def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): | |||||
def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): | def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): | ||||
# Return if the group file exists. | |||||
name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
if os.path.isfile(name_group): | |||||
return | |||||
ged_mats = [] | ged_mats = [] | ||||
runtimes = [] | runtimes = [] | ||||
for trial in range(1, 101): | for trial in range(1, 101): | ||||
@@ -92,25 +99,36 @@ def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): | |||||
ged_mats.append(ged_mat) | ged_mats.append(ged_mat) | ||||
runtimes.append(runtime) | runtimes.append(runtime) | ||||
save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) | |||||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||||
np.save(f, np.array(ged_mats)) | |||||
with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
# Group trials and remove single files. | |||||
name_prefix = 'ged_matrix' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
name_prefix = 'runtime' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
def results_for_a_dataset(ds_name): | def results_for_a_dataset(ds_name): | ||||
"""**1. Get dataset.**""" | """**1. Get dataset.**""" | ||||
dataset = get_dataset(ds_name) | dataset = get_dataset(ds_name) | ||||
for max_num_solutions in [1, 20, 40, 60, 80, 100]: | |||||
for max_num_solutions in mnum_solutions_list: | |||||
print() | print() | ||||
print('Max # of solutions:', max_num_solutions) | print('Max # of solutions:', max_num_solutions) | ||||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||||
for ratio in ratio_list: | |||||
print() | print() | ||||
print('Ratio:', ratio) | print('Ratio:', ratio) | ||||
save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) | save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) | ||||
def get_param_lists(ds_name): | |||||
if ds_name == 'AIDS_symb': | |||||
mnum_solutions_list = [1, 20, 40, 60, 80, 100] | |||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
else: | |||||
mnum_solutions_list = [1, 20, 40, 60, 80, 100] | |||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
return mnum_solutions_list, ratio_list | |||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
if len(sys.argv) > 1: | if len(sys.argv) > 1: | ||||
@@ -119,12 +137,11 @@ if __name__ == '__main__': | |||||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ||||
save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' | save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
if not os.path.exists(save_dir + 'groups/'): | |||||
os.makedirs(save_dir + 'groups/') | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
for ds_name in ds_name_list: | for ds_name in ds_name_list: | ||||
print() | print() | ||||
print('Dataset:', ds_name) | print('Dataset:', ds_name) | ||||
mnum_solutions_list, ratio_list = get_param_lists(ds_name) | |||||
results_for_a_dataset(ds_name) | results_for_a_dataset(ds_name) |
@@ -0,0 +1,137 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Wed Oct 20 11:48:02 2020 | |||||
@author: ljia | |||||
""" | |||||
# This script tests the influence of the number of solutions and of the number of graphs N on the stability of the GED computation with the IPFP heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1] and the node/edge cost ratio takes the values 10, 1 and 0.1. | |||||
import os | |||||
import multiprocessing | |||||
import pickle | |||||
import logging | |||||
from gklearn.ged.util import compute_geds | |||||
import time | |||||
import sys | |||||
from group_results import group_trials | |||||
def generate_graphs(): | |||||
from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||||
gsyzer = GraphSynthesizer() | |||||
graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||||
return graphs | |||||
def xp_compute_ged_matrix(graphs, N, num_solutions, ratio, trial): | |||||
save_file_suffix = '.' + str(N) + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||||
# Return if the file exists. | |||||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
return None, None | |||||
"""**2. Set parameters.**""" | |||||
# Parameters for GED computation. | |||||
ged_options = {'method': 'IPFP', # use the IPFP heuristic. | |||||
'initialization_method': 'RANDOM', # or 'NODE', etc. | |||||
# When larger than 1, the method is considered mIPFP. | |||||
'initial_solutions': int(num_solutions * 4), | |||||
'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||||
# The distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||||
'attr_distance': 'euclidean', | |||||
'ratio_runs_from_initial_solutions': 0.25, | |||||
# Parallel threads. Does not work if mpg_options['parallel'] = False. | |||||
'threads': multiprocessing.cpu_count(), | |||||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||||
} | |||||
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||||
options = ged_options.copy() | |||||
options['edit_cost_constants'] = edit_cost_constants | |||||
options['node_labels'] = [] | |||||
options['edge_labels'] = [] | |||||
options['node_attrs'] = [] | |||||
options['edge_attrs'] = [] | |||||
parallel = True # if num_solutions == 1 else False | |||||
"""**5. Compute GED matrix.**""" | |||||
ged_mat = 'error' | |||||
runtime = 0 | |||||
try: | |||||
time0 = time.time() | |||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||||
runtime = time.time() - time0 | |||||
except Exception as exp: | |||||
print('An exception occurred when running this experiment:') | |||||
LOG_FILENAME = save_dir + 'error.txt' | |||||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||||
logging.exception(save_file_suffix) | |||||
print(repr(exp)) | |||||
"""**6. Get results.**""" | |||||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(ged_mat, f) | |||||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
return ged_mat, runtime | |||||
def save_trials_as_group(graphs, N, num_solutions, ratio): | |||||
# Return if the group file exists. | |||||
name_middle = '.' + str(N) + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
if os.path.isfile(name_group): | |||||
return | |||||
ged_mats = [] | |||||
runtimes = [] | |||||
for trial in range(1, 101): | |||||
print() | |||||
print('Trial:', trial) | |||||
ged_mat, runtime = xp_compute_ged_matrix(graphs, N, num_solutions, ratio, trial) | |||||
ged_mats.append(ged_mat) | |||||
runtimes.append(runtime) | |||||
# Group trials and remove single files. | |||||
name_prefix = 'ged_matrix' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
name_prefix = 'runtime' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
def results_for_a_ratio(ratio): | |||||
for N in N_list: | |||||
print() | |||||
print('# of graphs:', N) | |||||
for num_solutions in [1, 20, 40, 60, 80, 100]: | |||||
print() | |||||
print('# of solutions:', num_solutions) | |||||
save_trials_as_group(graphs[:N], N, num_solutions, ratio) | |||||
if __name__ == '__main__': | |||||
if len(sys.argv) > 1: | |||||
N_list = [int(i) for i in sys.argv[1:]] | |||||
else: | |||||
N_list = [10, 50, 100] | |||||
# Generate graphs. | |||||
graphs = generate_graphs() | |||||
save_dir = 'outputs/edit_costs.num_sols.N.IPFP/' | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
for ratio in [10, 1, 0.1]: | |||||
print() | |||||
print('Ratio:', ratio) | |||||
results_for_a_ratio(ratio) |
@@ -12,15 +12,19 @@ import multiprocessing | |||||
import pickle | import pickle | ||||
import logging | import logging | ||||
from gklearn.ged.util import compute_geds | from gklearn.ged.util import compute_geds | ||||
import numpy as np | |||||
import time | import time | ||||
from utils import get_dataset | from utils import get_dataset | ||||
import sys | import sys | ||||
from group_results import group_trials | |||||
def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | ||||
save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | ||||
# Return if the file exists. | |||||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
return None, None | |||||
"""**2. Set parameters.**""" | """**2. Set parameters.**""" | ||||
@@ -39,8 +43,8 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||||
} | } | ||||
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | ||||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||||
options = ged_options.copy() | options = ged_options.copy() | ||||
options['edit_cost_constants'] = edit_cost_constants | options['edit_cost_constants'] = edit_cost_constants | ||||
@@ -55,7 +59,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||||
runtime = 0 | runtime = 0 | ||||
try: | try: | ||||
time0 = time.time() | time0 = time.time() | ||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True) | |||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||||
runtime = time.time() - time0 | runtime = time.time() - time0 | ||||
except Exception as exp: | except Exception as exp: | ||||
print('An exception occured when running this experiment:') | print('An exception occured when running this experiment:') | ||||
@@ -70,11 +74,17 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||||
pickle.dump(ged_mat, f) | pickle.dump(ged_mat, f) | ||||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | ||||
pickle.dump(runtime, f) | pickle.dump(runtime, f) | ||||
return ged_mat, runtime | return ged_mat, runtime | ||||
def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | ||||
# Return if the group file exists. | |||||
name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
if os.path.isfile(name_group): | |||||
return | |||||
ged_mats = [] | ged_mats = [] | ||||
runtimes = [] | runtimes = [] | ||||
for trial in range(1, 101): | for trial in range(1, 101): | ||||
@@ -84,24 +94,35 @@ def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||||
ged_mats.append(ged_mat) | ged_mats.append(ged_mat) | ||||
runtimes.append(runtime) | runtimes.append(runtime) | ||||
save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) | |||||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||||
np.save(f, np.array(ged_mats)) | |||||
with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
# Group trials and remove single files. | |||||
name_prefix = 'ged_matrix' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
name_prefix = 'runtime' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
def results_for_a_dataset(ds_name): | def results_for_a_dataset(ds_name): | ||||
"""**1. Get dataset.**""" | """**1. Get dataset.**""" | ||||
dataset = get_dataset(ds_name) | dataset = get_dataset(ds_name) | ||||
for num_solutions in [1, 20, 40, 60, 80, 100]: | |||||
for num_solutions in num_solutions_list: | |||||
print() | print() | ||||
print('# of solutions:', num_solutions) | print('# of solutions:', num_solutions) | ||||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||||
for ratio in ratio_list: | |||||
print() | print() | ||||
print('Ratio:', ratio) | print('Ratio:', ratio) | ||||
save_trials_as_group(dataset, ds_name, num_solutions, ratio) | save_trials_as_group(dataset, ds_name, num_solutions, ratio) | ||||
def get_param_lists(ds_name): | |||||
if ds_name == 'AIDS_symb': | |||||
num_solutions_list = [1, 20, 40, 60, 80, 100] | |||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
else: | |||||
num_solutions_list = [1, 20, 40, 60, 80, 100] | |||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
return num_solutions_list, ratio_list | |||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
@@ -111,12 +132,11 @@ if __name__ == '__main__': | |||||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ||||
save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' | save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
if not os.path.exists(save_dir + 'groups/'): | |||||
os.makedirs(save_dir + 'groups/') | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
for ds_name in ds_name_list: | for ds_name in ds_name_list: | ||||
print() | print() | ||||
print('Dataset:', ds_name) | print('Dataset:', ds_name) | ||||
num_solutions_list, ratio_list = get_param_lists(ds_name) | |||||
results_for_a_dataset(ds_name) | results_for_a_dataset(ds_name) |
@@ -0,0 +1,137 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Wed Oct 20 11:48:02 2020 | |||||
@author: ljia | |||||
""" | |||||
# This script tests the influence of the number of repeats and of the number of graphs N on the stability of the GED computation with the IPFP heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1] and the node/edge cost ratio takes the values 10, 1 and 0.1. | |||||
import os | |||||
import multiprocessing | |||||
import pickle | |||||
import logging | |||||
from gklearn.ged.util import compute_geds | |||||
import time | |||||
import sys | |||||
from group_results import group_trials | |||||
def generate_graphs(): | |||||
from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||||
gsyzer = GraphSynthesizer() | |||||
graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||||
return graphs | |||||
def xp_compute_ged_matrix(graphs, N, repeats, ratio, trial): | |||||
save_file_suffix = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||||
# Return if the file exists. | |||||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
return None, None | |||||
"""**2. Set parameters.**""" | |||||
# Parameters for GED computation. | |||||
ged_options = {'method': 'IPFP', # use the IPFP heuristic. | |||||
'initialization_method': 'RANDOM', # or 'NODE', etc. | |||||
# When larger than 1, the method is considered mIPFP. | |||||
'initial_solutions': 1, | |||||
'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||||
# The distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||||
'attr_distance': 'euclidean', | |||||
'ratio_runs_from_initial_solutions': 1, | |||||
# Parallel threads. Does not work if mpg_options['parallel'] = False. | |||||
'threads': multiprocessing.cpu_count(), | |||||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||||
} | |||||
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||||
options = ged_options.copy() | |||||
options['edit_cost_constants'] = edit_cost_constants | |||||
options['node_labels'] = [] | |||||
options['edge_labels'] = [] | |||||
options['node_attrs'] = [] | |||||
options['edge_attrs'] = [] | |||||
parallel = True # if num_solutions == 1 else False | |||||
"""**5. Compute GED matrix.**""" | |||||
ged_mat = 'error' | |||||
runtime = 0 | |||||
try: | |||||
time0 = time.time() | |||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||||
runtime = time.time() - time0 | |||||
except Exception as exp: | |||||
print('An exception occured when running this experiment:') | |||||
LOG_FILENAME = save_dir + 'error.txt' | |||||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||||
logging.exception(save_file_suffix) | |||||
print(repr(exp)) | |||||
"""**6. Get results.**""" | |||||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(ged_mat, f) | |||||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
return ged_mat, runtime | |||||
def save_trials_as_group(graphs, N, repeats, ratio): | |||||
# Return if the group file exists. | |||||
name_middle = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
if os.path.isfile(name_group): | |||||
return | |||||
ged_mats = [] | |||||
runtimes = [] | |||||
for trial in range(1, 101): | |||||
print() | |||||
print('Trial:', trial) | |||||
ged_mat, runtime = xp_compute_ged_matrix(graphs, N, repeats, ratio, trial) | |||||
ged_mats.append(ged_mat) | |||||
runtimes.append(runtime) | |||||
# Group trials and remove single files. | |||||
name_prefix = 'ged_matrix' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
name_prefix = 'runtime' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
def results_for_a_ratio(ratio): | |||||
for N in N_list: | |||||
print() | |||||
print('# of graphs:', N) | |||||
for repeats in [1, 20, 40, 60, 80, 100]: | |||||
print() | |||||
print('Repeats:', repeats) | |||||
save_trials_as_group(graphs[:N], N, repeats, ratio) | |||||
if __name__ == '__main__': | |||||
if len(sys.argv) > 1: | |||||
N_list = [int(i) for i in sys.argv[1:]] | |||||
else: | |||||
N_list = [10, 50, 100] | |||||
# Generate graphs. | |||||
graphs = generate_graphs() | |||||
save_dir = 'outputs/edit_costs.repeats.N.IPFP/' | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
for ratio in [10, 1, 0.1]: | |||||
print() | |||||
print('Ratio:', ratio) | |||||
results_for_a_ratio(ratio) |
@@ -0,0 +1,142 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Wed Oct 20 11:48:02 2020 | |||||
@author: ljia | |||||
""" | |||||
# This script tests the influence of the number of repeats and of the number of graphs N on the stability of the GED computation with the BIPARTITE heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1] and the node/edge cost ratio takes the values 10, 1 and 0.1. | |||||
import os | |||||
import multiprocessing | |||||
import pickle | |||||
import logging | |||||
from gklearn.ged.util import compute_geds | |||||
import time | |||||
import sys | |||||
from group_results import group_trials | |||||
def generate_graphs(): | |||||
from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||||
gsyzer = GraphSynthesizer() | |||||
graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||||
return graphs | |||||
def xp_compute_ged_matrix(graphs, N, repeats, ratio, trial): | |||||
save_file_suffix = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||||
# Return if the file exists. | |||||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
return None, None | |||||
"""**2. Set parameters.**""" | |||||
# Parameters for GED computation. | |||||
ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic. | |||||
# 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) | |||||
'lsape_model': 'ECBP', # | |||||
# When larger than 1, the method may be considered mIPFP (uncertain); | |||||
# the actual number of computed solutions might be smaller than the specified value. | |||||
'max_num_solutions': 1, | |||||
'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||||
'greedy_method': 'BASIC', # | |||||
# The distance between non-symbolic node/edge labels is computed as the Euclidean distance. | |||||
'attr_distance': 'euclidean', | |||||
'optimal': True, # if TRUE, the option --greedy-method has no effect | |||||
# Parallel threads. Does not work if mpg_options['parallel'] = False. | |||||
'threads': multiprocessing.cpu_count(), | |||||
'centrality_method': 'NONE', | |||||
'centrality_weight': 0.7, | |||||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||||
} | |||||
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||||
options = ged_options.copy() | |||||
options['edit_cost_constants'] = edit_cost_constants | |||||
options['node_labels'] = [] | |||||
options['edge_labels'] = [] | |||||
options['node_attrs'] = [] | |||||
options['edge_attrs'] = [] | |||||
parallel = True # if num_solutions == 1 else False | |||||
"""**5. Compute GED matrix.**""" | |||||
ged_mat = 'error' | |||||
runtime = 0 | |||||
try: | |||||
time0 = time.time() | |||||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||||
runtime = time.time() - time0 | |||||
except Exception as exp: | |||||
print('An exception occurred when running this experiment:') | |||||
LOG_FILENAME = save_dir + 'error.txt' | |||||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||||
logging.exception(save_file_suffix) | |||||
print(repr(exp)) | |||||
"""**6. Get results.**""" | |||||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(ged_mat, f) | |||||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
return ged_mat, runtime | |||||
def save_trials_as_group(graphs, N, repeats, ratio): | |||||
# Return if the group file exists. | |||||
name_middle = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
if os.path.isfile(name_group): | |||||
return | |||||
ged_mats = [] | |||||
runtimes = [] | |||||
for trial in range(1, 101): | |||||
print() | |||||
print('Trial:', trial) | |||||
ged_mat, runtime = xp_compute_ged_matrix(graphs, N, repeats, ratio, trial) | |||||
ged_mats.append(ged_mat) | |||||
runtimes.append(runtime) | |||||
# Group trials and remove single files. | |||||
name_prefix = 'ged_matrix' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
name_prefix = 'runtime' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
def results_for_a_ratio(ratio): | |||||
for N in N_list: | |||||
print() | |||||
print('# of graphs:', N) | |||||
for repeats in [1, 20, 40, 60, 80, 100]: | |||||
print() | |||||
print('Repeats:', repeats) | |||||
save_trials_as_group(graphs[:N], N, repeats, ratio) | |||||
if __name__ == '__main__': | |||||
if len(sys.argv) > 1: | |||||
N_list = [int(i) for i in sys.argv[1:]] | |||||
else: | |||||
N_list = [10, 50, 100] | |||||
# Generate graphs. | |||||
graphs = generate_graphs() | |||||
save_dir = 'outputs/edit_costs.repeats.N.bipartite/' | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
for ratio in [10, 1, 0.1]: | |||||
print() | |||||
print('Ratio:', ratio) | |||||
results_for_a_ratio(ratio) |
@@ -12,18 +12,19 @@ import multiprocessing | |||||
import pickle | import pickle | ||||
import logging | import logging | ||||
from gklearn.ged.util import compute_geds | from gklearn.ged.util import compute_geds | ||||
import numpy as np | |||||
import time | import time | ||||
from utils import get_dataset | from utils import get_dataset | ||||
import sys | import sys | ||||
from group_results import group_trials | |||||
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | ||||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | ||||
"""**1. Get dataset.**""" | |||||
dataset = get_dataset(ds_name) | |||||
# Return if the file exists. | |||||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
return None, None | |||||
"""**2. Set parameters.**""" | """**2. Set parameters.**""" | ||||
@@ -78,6 +79,12 @@ def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||||
def save_trials_as_group(dataset, ds_name, repeats, ratio): | def save_trials_as_group(dataset, ds_name, repeats, ratio): | ||||
# Return if the group file exists. | |||||
name_middle = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
if os.path.isfile(name_group): | |||||
return | |||||
ged_mats = [] | ged_mats = [] | ||||
runtimes = [] | runtimes = [] | ||||
for trial in range(1, 101): | for trial in range(1, 101): | ||||
@@ -87,25 +94,36 @@ def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||||
ged_mats.append(ged_mat) | ged_mats.append(ged_mat) | ||||
runtimes.append(runtime) | runtimes.append(runtime) | ||||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) | |||||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||||
np.save(f, np.array(ged_mats)) | |||||
with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
# Group trials and remove single files. | |||||
name_prefix = 'ged_matrix' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
name_prefix = 'runtime' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
def results_for_a_dataset(ds_name): | def results_for_a_dataset(ds_name): | ||||
"""**1. Get dataset.**""" | """**1. Get dataset.**""" | ||||
dataset = get_dataset(ds_name) | dataset = get_dataset(ds_name) | ||||
for repeats in [1, 20, 40, 60, 80, 100]: | |||||
for repeats in repeats_list: | |||||
print() | print() | ||||
print('Repeats:', repeats) | print('Repeats:', repeats) | ||||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||||
for ratio in ratio_list: | |||||
print() | print() | ||||
print('Ratio:', ratio) | print('Ratio:', ratio) | ||||
save_trials_as_group(dataset, ds_name, repeats, ratio) | save_trials_as_group(dataset, ds_name, repeats, ratio) | ||||
def get_param_lists(ds_name): | |||||
if ds_name == 'AIDS_symb': | |||||
repeats_list = [1, 20, 40, 60, 80, 100] | |||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
else: | |||||
repeats_list = [1, 20, 40, 60, 80, 100] | |||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
return repeats_list, ratio_list | |||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
if len(sys.argv) > 1: | if len(sys.argv) > 1: | ||||
@@ -114,12 +132,11 @@ if __name__ == '__main__': | |||||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ||||
save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/' | save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
if not os.path.exists(save_dir + 'groups/'): | |||||
os.makedirs(save_dir + 'groups/') | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
for ds_name in ds_name_list: | for ds_name in ds_name_list: | ||||
print() | print() | ||||
print('Dataset:', ds_name) | print('Dataset:', ds_name) | ||||
repeats_list, ratio_list = get_param_lists(ds_name) | |||||
results_for_a_dataset(ds_name) | results_for_a_dataset(ds_name) |
@@ -12,18 +12,19 @@ import multiprocessing | |||||
import pickle | import pickle | ||||
import logging | import logging | ||||
from gklearn.ged.util import compute_geds | from gklearn.ged.util import compute_geds | ||||
import numpy as np | |||||
import time | import time | ||||
from utils import get_dataset | from utils import get_dataset | ||||
import sys | import sys | ||||
from group_results import group_trials | |||||
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | ||||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | ||||
"""**1. Get dataset.**""" | |||||
dataset = get_dataset(ds_name) | |||||
# Return if the file exists. | |||||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||||
return None, None | |||||
"""**2. Set parameters.**""" | """**2. Set parameters.**""" | ||||
@@ -83,6 +84,12 @@ def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||||
def save_trials_as_group(dataset, ds_name, repeats, ratio): | def save_trials_as_group(dataset, ds_name, repeats, ratio): | ||||
# Return if the group file exists. | |||||
name_middle = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||||
if os.path.isfile(name_group): | |||||
return | |||||
ged_mats = [] | ged_mats = [] | ||||
runtimes = [] | runtimes = [] | ||||
for trial in range(1, 101): | for trial in range(1, 101): | ||||
@@ -92,25 +99,36 @@ def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||||
ged_mats.append(ged_mat) | ged_mats.append(ged_mat) | ||||
runtimes.append(runtime) | runtimes.append(runtime) | ||||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) | |||||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||||
np.save(f, np.array(ged_mats)) | |||||
with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||||
pickle.dump(runtime, f) | |||||
# Group trials and remove single files. | |||||
name_prefix = 'ged_matrix' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
name_prefix = 'runtime' + name_middle | |||||
group_trials(save_dir, name_prefix, True, True, False) | |||||
def results_for_a_dataset(ds_name): | def results_for_a_dataset(ds_name): | ||||
"""**1. Get dataset.**""" | """**1. Get dataset.**""" | ||||
dataset = get_dataset(ds_name) | dataset = get_dataset(ds_name) | ||||
for repeats in [1, 20, 40, 60, 80, 100]: | |||||
for repeats in repeats_list: | |||||
print() | print() | ||||
print('Repeats:', repeats) | print('Repeats:', repeats) | ||||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||||
for ratio in ratio_list: | |||||
print() | print() | ||||
print('Ratio:', ratio) | print('Ratio:', ratio) | ||||
save_trials_as_group(dataset, ds_name, repeats, ratio) | save_trials_as_group(dataset, ds_name, repeats, ratio) | ||||
def get_param_lists(ds_name): | |||||
if ds_name == 'AIDS_symb': | |||||
repeats_list = [1, 20, 40, 60, 80, 100] | |||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
else: | |||||
repeats_list = [1, 20, 40, 60, 80, 100] | |||||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||||
return repeats_list, ratio_list | |||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
if len(sys.argv) > 1: | if len(sys.argv) > 1: | ||||
@@ -119,12 +137,11 @@ if __name__ == '__main__': | |||||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | ||||
save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/' | save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
if not os.path.exists(save_dir + 'groups/'): | |||||
os.makedirs(save_dir + 'groups/') | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||||
for ds_name in ds_name_list: | for ds_name in ds_name_list: | ||||
print() | print() | ||||
print('Dataset:', ds_name) | print('Dataset:', ds_name) | ||||
repeats_list, ratio_list = get_param_lists(ds_name) | |||||
results_for_a_dataset(ds_name) | results_for_a_dataset(ds_name) |
@@ -16,6 +16,7 @@ from tqdm import tqdm | |||||
import sys | import sys | ||||
# This function is used by other scripts. Modify it carefully. | |||||
def group_trials(dir_folder, name_prefix, override, clear, backup): | def group_trials(dir_folder, name_prefix, override, clear, backup): | ||||
# Get group name. | # Get group name. | ||||
@@ -47,8 +48,20 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||||
file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | ||||
if os.path.isfile(file_name): | if os.path.isfile(file_name): | ||||
with open(file_name, 'rb') as f: | with open(file_name, 'rb') as f: | ||||
data = pickle.load(f) | |||||
try: | |||||
data = pickle.load(f) | |||||
except EOFError: | |||||
print('EOF Error occurred.') | |||||
return | |||||
data_group.append(data) | data_group.append(data) | ||||
# unpickler = pickle.Unpickler(f) | |||||
# data = unpickler.load() | |||||
# if not isinstance(data, np.array): | |||||
# return | |||||
# else: | |||||
# data_group.append(data) | |||||
else: # Not all trials are completed. | else: # Not all trials are completed. | ||||
return | return | ||||
@@ -81,11 +94,9 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||||
def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): | def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): | ||||
# Create folders. | # Create folders. | ||||
if not os.path.exists(dir_folder + 'groups/'): | |||||
os.makedirs(dir_folder + 'groups/') | |||||
os.makedirs(dir_folder + 'groups/', exist_ok=True) | |||||
if backup: | if backup: | ||||
if not os.path.exists(dir_folder + 'backups'): | |||||
os.makedirs(dir_folder + 'backups') | |||||
os.makedirs(dir_folder + 'backups', exist_ok=True) | |||||
# Iterate all files. | # Iterate all files. | ||||
cur_file_prefix = '' | cur_file_prefix = '' | ||||
@@ -105,4 +116,10 @@ if __name__ == '__main__': | |||||
group_all_in_folder(dir_folder) | group_all_in_folder(dir_folder) | ||||
dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' | dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' | ||||
group_all_in_folder(dir_folder) | |||||
dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' | |||||
group_all_in_folder(dir_folder) | |||||
dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' | |||||
group_all_in_folder(dir_folder) | group_all_in_folder(dir_folder) |
@@ -0,0 +1,56 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Tue Nov 3 20:23:25 2020 | |||||
@author: ljia | |||||
""" | |||||
import os | |||||
import re | |||||
def get_job_script(arg, params): | |||||
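# Build a SLURM batch script that runs edit_costs.<multi_method>.N.<ged_method>.py for the given N (passed in as `arg`). | |||||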
ged_method = params[0] | |||||
multi_method = params[1] | |||||
job_name_label = r"rep." if multi_method == 'repeats' else r"" | |||||
script = r""" | |||||
#!/bin/bash | |||||
#SBATCH --exclusive | |||||
#SBATCH --job-name="st.""" + job_name_label + r"N" + arg + r"." + ged_method + r"""" | |||||
#SBATCH --partition=tlong | |||||
#SBATCH --mail-type=ALL | |||||
#SBATCH --mail-user=jajupmochi@gmail.com | |||||
#SBATCH --output="outputs/output_edit_costs.""" + multi_method + r".N." + ged_method + r"." + arg + r""".txt" | |||||
#SBATCH --error="errors/error_edit_costs.""" + multi_method + r".N." + ged_method + r"." + arg + r""".txt" | |||||
# | |||||
#SBATCH --ntasks=1 | |||||
#SBATCH --nodes=1 | |||||
#SBATCH --cpus-per-task=1 | |||||
#SBATCH --time=300:00:00 | |||||
#SBATCH --mem-per-cpu=4000 | |||||
srun hostname | |||||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||||
srun python3 edit_costs.""" + multi_method + r".N." + ged_method + r".py " + arg | |||||
script = script.strip() | |||||
script = re.sub('\n\t+', '\n', script) | |||||
script = re.sub('\n +', '\n', script) | |||||
return script | |||||
if __name__ == '__main__': | |||||
params_list = [('IPFP', 'nums_sols'), | |||||
('IPFP', 'repeats'), | |||||
('bipartite', 'max_num_sols'), | |||||
('bipartite', 'repeats')] | |||||
N_list = [10, 50, 100] | |||||
for params in params_list[1:]: | |||||
for N in [N_list[i] for i in [0, 1, 2]]: | |||||
job_script = get_job_script(str(N), params) | |||||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||||
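# Submit the generated script to SLURM through a here-document, so no temporary file is needed. | |||||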
# print(command) | |||||
os.system(command) | |||||
# os.popen(command) | |||||
# output = stream.readlines() |
@@ -0,0 +1,47 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Tue Nov 3 20:23:25 2020 | |||||
@author: ljia | |||||
""" | |||||
import os | |||||
import re | |||||
def get_job_script(arg): | |||||
script = r""" | |||||
#!/bin/bash | |||||
#SBATCH --exclusive | |||||
#SBATCH --job-name="st.""" + arg + r""".bp" | |||||
#SBATCH --partition=tlong | |||||
#SBATCH --mail-type=ALL | |||||
#SBATCH --mail-user=jajupmochi@gmail.com | |||||
#SBATCH --output="outputs/output_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" | |||||
#SBATCH --error="errors/error_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" | |||||
# | |||||
#SBATCH --ntasks=1 | |||||
#SBATCH --nodes=1 | |||||
#SBATCH --cpus-per-task=1 | |||||
#SBATCH --time=300:00:00 | |||||
#SBATCH --mem-per-cpu=4000 | |||||
srun hostname | |||||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||||
srun python3 edit_costs.max_nums_sols.ratios.bipartite.py """ + arg | |||||
script = script.strip() | |||||
script = re.sub('\n\t+', '\n', script) | |||||
script = re.sub('\n +', '\n', script) | |||||
return script | |||||
if __name__ == '__main__': | |||||
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||||
for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]: | |||||
job_script = get_job_script(ds_name) | |||||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||||
# print(command) | |||||
os.system(command) | |||||
# os.popen(command) | |||||
# output = stream.readlines() |
@@ -0,0 +1,47 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Tue Nov 3 20:23:25 2020 | |||||
@author: ljia | |||||
""" | |||||
import os | |||||
import re | |||||
def get_job_script(arg): | |||||
script = r""" | |||||
#!/bin/bash | |||||
#SBATCH --exclusive | |||||
#SBATCH --job-name="st.""" + arg + r""".IPFP" | |||||
#SBATCH --partition=tlong | |||||
#SBATCH --mail-type=ALL | |||||
#SBATCH --mail-user=jajupmochi@gmail.com | |||||
#SBATCH --output="outputs/output_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||||
#SBATCH --error="errors/error_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||||
# | |||||
#SBATCH --ntasks=1 | |||||
#SBATCH --nodes=1 | |||||
#SBATCH --cpus-per-task=1 | |||||
#SBATCH --time=300:00:00 | |||||
#SBATCH --mem-per-cpu=4000 | |||||
srun hostname | |||||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||||
srun python3 edit_costs.nums_sols.ratios.IPFP.py """ + arg | |||||
script = script.strip() | |||||
script = re.sub('\n\t+', '\n', script) | |||||
script = re.sub('\n +', '\n', script) | |||||
return script | |||||
if __name__ == '__main__': | |||||
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||||
for ds_name in [ds_list[i] for i in [0, 3]]: | |||||
job_script = get_job_script(ds_name) | |||||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||||
# print(command) | |||||
os.system(command) | |||||
# os.popen(command) | |||||
# output = stream.readlines() |
@@ -0,0 +1,47 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Tue Nov 3 20:23:25 2020 | |||||
@author: ljia | |||||
""" | |||||
import os | |||||
import re | |||||
def get_job_script(arg): | |||||
script = r""" | |||||
#!/bin/bash | |||||
#SBATCH --exclusive | |||||
#SBATCH --job-name="st.rep.""" + arg + r""".IPFP" | |||||
#SBATCH --partition=tlong | |||||
#SBATCH --mail-type=ALL | |||||
#SBATCH --mail-user=jajupmochi@gmail.com | |||||
#SBATCH --output="outputs/output_edit_costs.repeats.ratios.IPFP.""" + arg + """.txt" | |||||
#SBATCH --error="errors/error_edit_costs.repeats.ratios.IPFP.""" + arg + """.txt" | |||||
# | |||||
#SBATCH --ntasks=1 | |||||
#SBATCH --nodes=1 | |||||
#SBATCH --cpus-per-task=1 | |||||
#SBATCH --time=300:00:00 | |||||
#SBATCH --mem-per-cpu=4000 | |||||
srun hostname | |||||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||||
srun python3 edit_costs.repeats.ratios.IPFP.py """ + arg | |||||
script = script.strip() | |||||
script = re.sub('\n\t+', '\n', script) | |||||
script = re.sub('\n +', '\n', script) | |||||
return script | |||||
if __name__ == '__main__': | |||||
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||||
for ds_name in [ds_list[i] for i in [0, 3]]: | |||||
job_script = get_job_script(ds_name) | |||||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||||
# print(command) | |||||
os.system(command) | |||||
# os.popen(command) | |||||
# output = stream.readlines() |
@@ -0,0 +1,47 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Tue Nov 3 20:23:25 2020 | |||||
@author: ljia | |||||
""" | |||||
import os | |||||
import re | |||||
def get_job_script(arg): | |||||
script = r""" | |||||
#!/bin/bash | |||||
#SBATCH --exclusive | |||||
#SBATCH --job-name="st.rep.""" + arg + r""".bp" | |||||
#SBATCH --partition=tlong | |||||
#SBATCH --mail-type=ALL | |||||
#SBATCH --mail-user=jajupmochi@gmail.com | |||||
#SBATCH --output="outputs/output_edit_costs.repeats.ratios.bipartite.""" + arg + """.txt" | |||||
#SBATCH --error="errors/error_edit_costs.repeats.ratios.bipartite.""" + arg + """.txt" | |||||
# | |||||
#SBATCH --ntasks=1 | |||||
#SBATCH --nodes=1 | |||||
#SBATCH --cpus-per-task=1 | |||||
#SBATCH --time=300:00:00 | |||||
#SBATCH --mem-per-cpu=4000 | |||||
srun hostname | |||||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||||
srun python3 edit_costs.repeats.ratios.bipartite.py """ + arg | |||||
script = script.strip() | |||||
script = re.sub('\n\t+', '\n', script) | |||||
script = re.sub('\n +', '\n', script) | |||||
return script | |||||
if __name__ == '__main__': | |||||
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||||
for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]: | |||||
job_script = get_job_script(ds_name) | |||||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||||
# print(command) | |||||
os.system(command) | |||||
# os.popen(command) | |||||
# output = stream.readlines() |
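The four launchers above are identical apart from the job-name suffix, the experiment tag in the output/error paths, and the experiment script they run. A hypothetical consolidation sketch (not part of the repository; the parameter names are illustrative) covering all of them with one generator:

import os

def get_job_script(experiment, method, ds_name):
	# experiment: e.g. 'nums_sols', 'repeats' or 'max_num_sols'; method: 'IPFP' or 'bipartite'.
	tag = 'edit_costs.' + experiment + '.ratios.' + method
	lines = [
		'#!/bin/bash',
		'#SBATCH --exclusive',
		'#SBATCH --job-name="st.' + ds_name + '.' + method + '"',
		'#SBATCH --partition=tlong',
		'#SBATCH --output="outputs/output_' + tag + '.' + ds_name + '.txt"',
		'#SBATCH --error="errors/error_' + tag + '.' + ds_name + '.txt"',
		'#SBATCH --ntasks=1',
		'#SBATCH --nodes=1',
		'#SBATCH --cpus-per-task=1',
		'#SBATCH --time=300:00:00',
		'#SBATCH --mem-per-cpu=4000',
		# The working directory could be fixed with '#SBATCH --chdir=...' instead of 'srun cd'.
		'srun python3 ' + tag + '.py ' + ds_name,
	]
	return '\n'.join(lines)

if __name__ == '__main__':
	for ds_name in ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']:
		os.system('sbatch <<EOF\n' + get_job_script('repeats', 'bipartite', ds_name) + '\nEOF')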
@@ -150,8 +150,7 @@ def xp_accuracy_diff_entropy(): | |||||
import pickle | import pickle | ||||
import os | import os | ||||
save_dir = 'outputs/accuracy_diff_entropy/' | save_dir = 'outputs/accuracy_diff_entropy/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
accuracies = {} | accuracies = {} | ||||
confidences = {} | confidences = {} | ||||
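This hunk, and the analogous ones below, replaces the two-step existence check with os.makedirs(..., exist_ok=True), which creates any missing parent directories and no longer races between the check and the creation. For reference, a minimal sketch of the idiom:

import os

os.makedirs('outputs/accuracy_diff_entropy/', exist_ok=True)  # creates the tree if needed
os.makedirs('outputs/accuracy_diff_entropy/', exist_ok=True)  # no FileExistsError on repeat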
@@ -16,8 +16,7 @@ def xp_runtimes_of_all_28cores(): | |||||
import pickle | import pickle | ||||
import os | import os | ||||
save_dir = 'outputs/runtimes_of_all_28cores/' | save_dir = 'outputs/runtimes_of_all_28cores/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
run_times = {} | run_times = {} | ||||
@@ -16,8 +16,7 @@ def xp_runtimes_diff_chunksizes(): | |||||
import pickle | import pickle | ||||
import os | import os | ||||
save_dir = 'outputs/runtimes_diff_chunksizes/' | save_dir = 'outputs/runtimes_diff_chunksizes/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
run_times = {} | run_times = {} | ||||
@@ -25,8 +25,7 @@ def xp_synthesized_graphs_dataset_size(): | |||||
import pickle | import pickle | ||||
import os | import os | ||||
save_dir = 'outputs/synthesized_graphs_N/' | save_dir = 'outputs/synthesized_graphs_N/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
run_times = {} | run_times = {} | ||||
@@ -22,8 +22,7 @@ def xp_synthesized_graphs_degrees(): | |||||
import pickle | import pickle | ||||
import os | import os | ||||
save_dir = 'outputs/synthesized_graphs_degrees/' | save_dir = 'outputs/synthesized_graphs_degrees/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
run_times = {} | run_times = {} | ||||
@@ -22,8 +22,7 @@ def xp_synthesized_graphs_num_node_label_alphabet(): | |||||
import pickle | import pickle | ||||
import os | import os | ||||
save_dir = 'outputs/synthesized_graphs_num_node_label_alphabet/' | save_dir = 'outputs/synthesized_graphs_num_node_label_alphabet/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
run_times = {} | run_times = {} | ||||
@@ -22,8 +22,7 @@ def xp_synthesized_graphs_num_nodes(): | |||||
import pickle | import pickle | ||||
import os | import os | ||||
save_dir = 'outputs/synthesized_graphs_num_nodes/' | save_dir = 'outputs/synthesized_graphs_num_nodes/' | ||||
if not os.path.exists(save_dir): | |||||
os.makedirs(save_dir) | |||||
os.makedirs(save_dir, exist_ok=True) | |||||
run_times = {} | run_times = {} | ||||
@@ -154,6 +154,6 @@ def test_median_graph_estimator_symb(): | |||||
return set_median, gen_median | return set_median, gen_median | ||||
if __name__ == '__main__': | |||||
if __name__ == '__main__': | |||||
# set_median, gen_median = test_median_graph_estimator() | # set_median, gen_median = test_median_graph_estimator() | ||||
set_median, gen_median = test_median_graph_estimator_symb() | set_median, gen_median = test_median_graph_estimator_symb() |
@@ -7,6 +7,8 @@ __version__ = "0.1" | |||||
__author__ = "Linlin Jia" | __author__ = "Linlin Jia" | ||||
__date__ = "November 2018" | __date__ = "November 2018" | ||||
from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels | |||||
from gklearn.kernels.graph_kernel import GraphKernel | from gklearn.kernels.graph_kernel import GraphKernel | ||||
from gklearn.kernels.common_walk import CommonWalk | from gklearn.kernels.common_walk import CommonWalk | ||||
from gklearn.kernels.marginalized import Marginalized | from gklearn.kernels.marginalized import Marginalized | ||||
@@ -0,0 +1,36 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Fri Nov 6 10:11:08 2020 | |||||
@author: ljia | |||||
""" | |||||
# The metadata of all graph kernels. | |||||
GRAPH_KERNELS = { | |||||
### based on walks. | |||||
'common walk': '', | |||||
'marginalized': '', | |||||
'sylvester equation': '', | |||||
'fixed_point': '', | |||||
'conjugate gradient': '', | |||||
'spectral decomposition': '', | |||||
### based on paths. | |||||
'shortest path': '', | |||||
'structural shortest path': '', | |||||
'path up to length h': '', | |||||
### based on non-linear patterns. | |||||
'weisfeiler-lehman subtree': '', | |||||
'treelet': '', | |||||
} | |||||
def list_of_graph_kernels(): | |||||
"""List names of all graph kernels. | |||||
Returns | |||||
------- | |||||
list | |||||
The names of all graph kernels. | |||||
""" | |||||
return [i for i in GRAPH_KERNELS] |
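list_of_graph_kernels() simply enumerates the dictionary keys (it is equivalent to list(GRAPH_KERNELS)), so callers can discover the registered kernel names without importing any kernel class. A short usage sketch, relying on the re-export added to gklearn/kernels/__init__.py above:

from gklearn.kernels import GRAPH_KERNELS, list_of_graph_kernels

print(list_of_graph_kernels())
# ['common walk', 'marginalized', 'sylvester equation', 'fixed_point', ...]
assert list_of_graph_kernels() == list(GRAPH_KERNELS)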
@@ -126,8 +126,7 @@ def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, sav | |||||
# save median graphs. | # save median graphs. | ||||
if save_preimages: | if save_preimages: | ||||
if not os.path.exists(dir_save + 'preimages/'): | |||||
os.makedirs(dir_save + 'preimages/') | |||||
os.makedirs(dir_save + 'preimages/', exist_ok=True) | |||||
print('Saving preimages to files...') | print('Saving preimages to files...') | ||||
fn_best_dataset = dir_save + 'preimages/g_best_dataset.' + 'nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | fn_best_dataset = dir_save + 'preimages/g_best_dataset.' + 'nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | ||||
saveGXL(rpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', | saveGXL(rpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', | ||||
@@ -167,8 +166,7 @@ def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, sav | |||||
def _init_output_file_preimage(ds_name, gkernel, dir_output): | def _init_output_file_preimage(ds_name, gkernel, dir_output): | ||||
if not os.path.exists(dir_output): | |||||
os.makedirs(dir_output) | |||||
os.makedirs(dir_output, exist_ok=True) | |||||
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | ||||
f_detail = open(dir_output + fn_output_detail, 'a') | f_detail = open(dir_output + fn_output_detail, 'a') | ||||
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'num graphs', | csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'num graphs', | ||||
@@ -218,8 +218,7 @@ def remove_best_graph(ds_name, mpg_options, kernel_options, ged_options, mge_opt | |||||
# save median graphs. | # save median graphs. | ||||
if save_medians: | if save_medians: | ||||
if not os.path.exists(dir_save + 'medians/'): | |||||
os.makedirs(dir_save + 'medians/') | |||||
os.makedirs(dir_save + 'medians/', exist_ok=True) | |||||
print('Saving median graphs to files...') | print('Saving median graphs to files...') | ||||
fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | ||||
saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', | saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', | ||||
@@ -375,8 +374,7 @@ def _compute_gram_matrix_unnorm(dataset, kernel_options): | |||||
def _init_output_file(ds_name, gkernel, fit_method, dir_output): | def _init_output_file(ds_name, gkernel, fit_method, dir_output): | ||||
if not os.path.exists(dir_output): | |||||
os.makedirs(dir_output) | |||||
os.makedirs(dir_output, exist_ok=True) | |||||
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | ||||
f_detail = open(dir_output + fn_output_detail, 'a') | f_detail = open(dir_output + fn_output_detail, 'a') | ||||
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost', | csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost', | ||||
@@ -230,8 +230,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged | |||||
# save median graphs. | # save median graphs. | ||||
if save_medians: | if save_medians: | ||||
if not os.path.exists(dir_save + 'medians/'): | |||||
os.makedirs(dir_save + 'medians/') | |||||
os.makedirs(dir_save + 'medians/', exist_ok=True) | |||||
print('Saving median graphs to files...') | print('Saving median graphs to files...') | ||||
fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | ||||
saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', | saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', | ||||
@@ -308,8 +307,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged | |||||
def _init_output_file_preimage(ds_name, gkernel, fit_method, dir_output): | def _init_output_file_preimage(ds_name, gkernel, fit_method, dir_output): | ||||
if not os.path.exists(dir_output): | |||||
os.makedirs(dir_output) | |||||
os.makedirs(dir_output, exist_ok=True) | |||||
# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | # fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' | ||||
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | ||||
f_detail = open(dir_output + fn_output_detail, 'a') | f_detail = open(dir_output + fn_output_detail, 'a') | ||||
@@ -52,6 +52,14 @@ def chooseDataset(ds_name): | |||||
return dataset | return dataset | ||||
def test_list_graph_kernels(): | |||||
""" | |||||
""" | |||||
from gklearn.kernels import GRAPH_KERNELS, list_of_graph_kernels | |||||
assert list_of_graph_kernels() == [i for i in GRAPH_KERNELS] | |||||
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) | ||||
@pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) | @pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) | ||||
@pytest.mark.parametrize('parallel', ['imap_unordered', None]) | @pytest.mark.parametrize('parallel', ['imap_unordered', None]) | ||||
@@ -433,10 +441,11 @@ def test_WLSubtree(ds_name, parallel): | |||||
if __name__ == "__main__": | if __name__ == "__main__": | ||||
test_list_graph_kernels() | |||||
# test_spkernel('Alkane', 'imap_unordered') | # test_spkernel('Alkane', 'imap_unordered') | ||||
# test_StructuralSP('Fingerprint_edge', 'imap_unordered') | # test_StructuralSP('Fingerprint_edge', 'imap_unordered') | ||||
test_WLSubtree('Acyclic', 'imap_unordered') | |||||
# test_WLSubtree('Acyclic', 'imap_unordered') | |||||
# test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') | # test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') | ||||
# test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') | # test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') | ||||
# test_RandomWalk('Acyclic', 'fp', None, None) | # test_RandomWalk('Acyclic', 'fp', None, None) | ||||
# test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered') | |||||
# test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered') |
@@ -13,6 +13,10 @@ import os | |||||
class Dataset(object): | class Dataset(object): | ||||
import warnings | |||||
warnings.simplefilter('always', DeprecationWarning) | |||||
warnings.warn('This class has been moved to the "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed in version 0.4.0.', DeprecationWarning) | |||||
def __init__(self, filename=None, filename_targets=None, **kwargs): | def __init__(self, filename=None, filename_targets=None, **kwargs): | ||||
if filename is None: | if filename is None: | ||||
@@ -803,6 +807,10 @@ class Dataset(object): | |||||
def split_dataset_by_target(dataset): | def split_dataset_by_target(dataset): | ||||
import warnings | |||||
warnings.simplefilter('always', DeprecationWarning) | |||||
warnings.warn('This function has been moved to the "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed in version 0.4.0.', DeprecationWarning) | |||||
from gklearn.preimage.utils import get_same_item_indices | from gklearn.preimage.utils import get_same_item_indices | ||||
graphs = dataset.graphs | graphs = dataset.graphs | ||||
@@ -1,5 +1,9 @@ | |||||
""" Utilities function to manage graph files | """ Utilities function to manage graph files | ||||
""" | """ | ||||
import warnings | |||||
warnings.simplefilter('always', DeprecationWarning) | |||||
warnings.warn('The functions in the module "gklearn.utils.graph_files" are deprecated and will be removed in version 0.4.0. Use the corresponding functions in the module "gklearn.dataset" instead.', DeprecationWarning) | |||||
from os.path import dirname, splitext | from os.path import dirname, splitext | ||||
@@ -45,6 +49,10 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs): | |||||
for details. Note here filename is the name of either .txt file in | for details. Note here filename is the name of either .txt file in | ||||
the dataset directory. | the dataset directory. | ||||
""" | """ | ||||
import warnings | |||||
warnings.simplefilter('always', DeprecationWarning) | |||||
warnings.warn('The function "gklearn.utils.load_dataset" is deprecated and will be removed in version 0.4.0. Use the class "gklearn.dataset.DataLoader" instead.', DeprecationWarning) | |||||
extension = splitext(filename)[1][1:] | extension = splitext(filename)[1][1:] | ||||
if extension == "ds": | if extension == "ds": | ||||
data, y, label_names = load_from_ds(filename, filename_targets) | data, y, label_names = load_from_ds(filename, filename_targets) | ||||
@@ -66,17 +74,19 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs): | |||||
def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', **kwargs): | def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', **kwargs): | ||||
"""Save list of graphs. | """Save list of graphs. | ||||
""" | """ | ||||
import warnings | |||||
warnings.simplefilter('always', DeprecationWarning) | |||||
warnings.warn('The function "gklearn.utils.save_dataset" is deprecated and will be removed in version 0.4.0. Use the class "gklearn.dataset.DataSaver" instead.', DeprecationWarning) | |||||
import os | import os | ||||
dirname_ds = os.path.dirname(filename) | dirname_ds = os.path.dirname(filename) | ||||
if dirname_ds != '': | if dirname_ds != '': | ||||
dirname_ds += '/' | dirname_ds += '/' | ||||
if not os.path.exists(dirname_ds) : | |||||
os.makedirs(dirname_ds) | |||||
os.makedirs(dirname_ds, exist_ok=True) | |||||
if 'graph_dir' in kwargs: | if 'graph_dir' in kwargs: | ||||
graph_dir = kwargs['graph_dir'] + '/' | graph_dir = kwargs['graph_dir'] + '/' | ||||
if not os.path.exists(graph_dir): | |||||
os.makedirs(graph_dir) | |||||
os.makedirs(graph_dir, exist_ok=True) | |||||
del kwargs['graph_dir'] | del kwargs['graph_dir'] | ||||
else: | else: | ||||
graph_dir = dirname_ds | graph_dir = dirname_ds | ||||
@@ -13,6 +13,11 @@ import random | |||||
class GraphSynthesizer(object): | class GraphSynthesizer(object): | ||||
import warnings | |||||
warnings.simplefilter('always', DeprecationWarning) | |||||
warnings.warn('This class has been moved to the "gklearn.dataset" module. The class "gklearn.utils.graph_synthesizer.GraphSynthesizer" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed in version 0.2.2.', DeprecationWarning) | |||||
def __init__(self): | def __init__(self): | ||||
pass | pass | ||||
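These warnings are intentionally loud: each deprecated entry point re-installs an 'always' filter for DeprecationWarning immediately before warning, so in practice callers see the message on every use regardless of their own filter settings. Following the message text, migration is a matter of switching imports; a minimal sketch, assuming the replacements are importable from gklearn.dataset as the messages state:

# Legacy entry points (still work, but warn on every use):
#   gklearn.utils.dataset.Dataset, gklearn.utils.graph_files.load_dataset / save_dataset,
#   gklearn.utils.graph_synthesizer.GraphSynthesizer
# Maintained replacements named in the deprecation messages:
from gklearn.dataset import Dataset, GraphSynthesizer
from gklearn.dataset import DataLoader, DataSaver  # replace load_dataset / save_dataset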
@@ -671,13 +671,11 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None | |||||
dirname_ds = os.path.dirname(filename) | dirname_ds = os.path.dirname(filename) | ||||
if dirname_ds != '': | if dirname_ds != '': | ||||
dirname_ds += '/' | dirname_ds += '/' | ||||
if not os.path.exists(dirname_ds) : | |||||
os.makedirs(dirname_ds) | |||||
os.makedirs(dirname_ds, exist_ok=True) | |||||
if xparams is not None and 'graph_dir' in xparams: | if xparams is not None and 'graph_dir' in xparams: | ||||
graph_dir = xparams['graph_dir'] + '/' | graph_dir = xparams['graph_dir'] + '/' | ||||
if not os.path.exists(graph_dir): | |||||
os.makedirs(graph_dir) | |||||
os.makedirs(graph_dir, exist_ok=True) | |||||
else: | else: | ||||
graph_dir = dirname_ds | graph_dir = dirname_ds | ||||
@@ -91,8 +91,7 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
tqdm.monitor_interval = 0 | tqdm.monitor_interval = 0 | ||||
output_dir += estimator.__name__ | output_dir += estimator.__name__ | ||||
if not os.path.exists(output_dir): | |||||
os.makedirs(output_dir) | |||||
os.makedirs(output_dir, exist_ok=True) | |||||
# a string to save all the results. | # a string to save all the results. | ||||
str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' | str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' | ||||
str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' | str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' | ||||
@@ -604,8 +603,7 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster) | str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster) | ||||
# open file to save all results for this dataset. | # open file to save all results for this dataset. | ||||
if not os.path.exists(output_dir): | |||||
os.makedirs(output_dir) | |||||
os.makedirs(output_dir, exist_ok=True) | |||||
# print out results as table. | # print out results as table. | ||||
str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores, | str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores, | ||||
@@ -458,8 +458,7 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d | |||||
print() | print() | ||||
print('4. saving results...') | print('4. saving results...') | ||||
if save_results: | if save_results: | ||||
if not os.path.exists(dir_save): | |||||
os.makedirs(dir_save) | |||||
os.makedirs(dir_save, exist_ok=True) | |||||
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list) | np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list) | ||||
print('\ncomplete.') | print('\ncomplete.') | ||||