@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
"""gklearn - dataset module

Implements methods to manage graph datasets.

graph_fetcher.py : fetch graph datasets from the Internet.
"""

# info
__version__ = "0.2"
__author__ = "Linlin Jia"
__date__ = "October 2020"

from gklearn.dataset.metadata import DATABASES, DATASET_META
from gklearn.dataset.metadata import GREYC_META, IAM_META, TUDataset_META
from gklearn.dataset.metadata import list_of_databases, list_of_datasets
from gklearn.dataset.graph_synthesizer import GraphSynthesizer
from gklearn.dataset.data_fetcher import DataFetcher
from gklearn.dataset.file_managers import DataLoader, DataSaver
from gklearn.dataset.dataset import Dataset, split_dataset_by_target
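
# A minimal usage sketch (assumes the toy datasets bundled with the
# repository are present under datasets/):
#
#   >>> from gklearn.dataset import Dataset
#   >>> ds = Dataset()
#   >>> ds.load_predefined_dataset('MUTAG')
#   >>> ds.get_dataset_infos(keys=['dataset_size'])
#   {'dataset_size': 188}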
@@ -0,0 +1,823 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:48:27 2020

@author: ljia
"""
import os

import networkx as nx
import numpy as np

from gklearn.utils.graph_files import load_dataset
class Dataset(object):

    def __init__(self, filename=None, filename_targets=None, **kwargs):
        if filename is None:
            self._graphs = None
            self._targets = None
            self._node_labels = None
            self._edge_labels = None
            self._node_attrs = None
            self._edge_attrs = None
        else:
            self.load_dataset(filename, filename_targets=filename_targets, **kwargs)

        self._substructures = None
        self._node_label_dim = None
        self._edge_label_dim = None
        self._directed = None
        self._dataset_size = None
        self._total_node_num = None
        self._ave_node_num = None
        self._min_node_num = None
        self._max_node_num = None
        self._total_edge_num = None
        self._ave_edge_num = None
        self._min_edge_num = None
        self._max_edge_num = None
        self._ave_node_degree = None
        self._min_node_degree = None
        self._max_node_degree = None
        self._ave_fill_factor = None
        self._min_fill_factor = None
        self._max_fill_factor = None
        self._node_label_nums = None
        self._edge_label_nums = None
        self._node_attr_dim = None
        self._edge_attr_dim = None
        self._class_number = None

    def load_dataset(self, filename, filename_targets=None, **kwargs):
        self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs)
        self._node_labels = label_names['node_labels']
        self._node_attrs = label_names['node_attrs']
        self._edge_labels = label_names['edge_labels']
        self._edge_attrs = label_names['edge_attrs']
        self.clean_labels()

    def load_graphs(self, graphs, targets=None):
        # This has to be followed by set_labels().
        self._graphs = graphs
        self._targets = targets
        # self.set_labels_attrs() # @todo
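
    # A sketch of the intended call sequence after load_graphs() (the label
    # names below are illustrative; use the ones actually present on your graphs):
    #
    #   >>> ds = Dataset()
    #   >>> ds.load_graphs(graphs, targets=targets)
    #   >>> ds.set_labels(node_labels=['atom'], edge_labels=['bond_type'])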
    def load_predefined_dataset(self, ds_name):
        current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
        # Map each pre-defined dataset name to its data file and, when the
        # targets are stored separately, its targets file (paths relative to
        # this file).
        predefined = {
            'Acyclic': ('../../datasets/Acyclic/dataset_bps.ds', None),
            'AIDS': ('../../datasets/AIDS/AIDS_A.txt', None),
            'Alkane': ('../../datasets/Alkane/dataset.ds',
                       '../../datasets/Alkane/dataset_boiling_point_names.txt'),
            'COIL-DEL': ('../../datasets/COIL-DEL/COIL-DEL_A.txt', None),
            'COIL-RAG': ('../../datasets/COIL-RAG/COIL-RAG_A.txt', None),
            'COLORS-3': ('../../datasets/COLORS-3/COLORS-3_A.txt', None),
            'Cuneiform': ('../../datasets/Cuneiform/Cuneiform_A.txt', None),
            'DD': ('../../datasets/DD/DD_A.txt', None),
            'ENZYMES': ('../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt', None),
            'Fingerprint': ('../../datasets/Fingerprint/Fingerprint_A.txt', None),
            'FRANKENSTEIN': ('../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt', None),
            'Letter-high': ('../../datasets/Letter-high/Letter-high_A.txt', None),  # node non-symb
            'Letter-low': ('../../datasets/Letter-low/Letter-low_A.txt', None),  # node non-symb
            'Letter-med': ('../../datasets/Letter-med/Letter-med_A.txt', None),  # node non-symb
            'MAO': ('../../datasets/MAO/dataset.ds', None),
            'Monoterpenoides': ('../../datasets/Monoterpenoides/dataset_10+.ds', None),
            'MUTAG': ('../../datasets/MUTAG/MUTAG_A.txt', None),
            'NCI1': ('../../datasets/NCI1/NCI1_A.txt', None),
            'NCI109': ('../../datasets/NCI109/NCI109_A.txt', None),
            'PAH': ('../../datasets/PAH/dataset.ds', None),
            'SYNTHETICnew': ('../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt', None),
        }
        if ds_name in predefined:
            ds_file, fn_targets = predefined[ds_name]
            ds_file = current_path + ds_file
            if fn_targets is not None:
                fn_targets = current_path + fn_targets
            self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets)
        elif ds_name in ('SYNTHETIC', 'Synthie'):
            raise NotImplementedError('The dataset "' + ds_name + '" is not available yet.')  # @todo
        else:
            raise Exception('The dataset name "' + ds_name + '" is not pre-defined.')

        self._node_labels = label_names['node_labels']
        self._node_attrs = label_names['node_attrs']
        self._edge_labels = label_names['edge_labels']
        self._edge_attrs = label_names['edge_attrs']
        self.clean_labels()
    def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
        self._node_labels = node_labels
        self._node_attrs = node_attrs
        self._edge_labels = edge_labels
        self._edge_attrs = edge_attrs

    def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
        # @todo: remove labels which have only one possible value.
        if node_labels is None:
            self._node_labels = self._graphs[0].graph['node_labels']
            # # graphs are considered node unlabeled if all nodes have the same label.
            # infos.update({'node_labeled': is_nl if node_label_num > 1 else False})
        if node_attrs is None:
            self._node_attrs = self._graphs[0].graph['node_attrs']
            # for G in Gn:
            #     for n in G.nodes(data=True):
            #         if 'attributes' in n[1]:
            #             return len(n[1]['attributes'])
            # return 0
        if edge_labels is None:
            self._edge_labels = self._graphs[0].graph['edge_labels']
            # # graphs are considered edge unlabeled if all edges have the same label.
            # infos.update({'edge_labeled': is_el if edge_label_num > 1 else False})
        if edge_attrs is None:
            self._edge_attrs = self._graphs[0].graph['edge_attrs']
            # for G in Gn:
            #     if nx.number_of_edges(G) > 0:
            #         for e in G.edges(data=True):
            #             if 'attributes' in e[2]:
            #                 return len(e[2]['attributes'])
            # return 0
    def get_dataset_infos(self, keys=None, params=None):
        """Compute and return the structure and property information of the graph dataset.

        Parameters
        ----------
        keys : list, optional
            A list of strings indicating which information will be returned. The
            possible choices include:

            'substructures': sub-structures the graphs contain, including 'linear',
                'non linear' and 'cyclic'.
            'node_label_dim': number of symbolic vertex labels.
            'edge_label_dim': number of symbolic edge labels.
            'directed': whether graphs in dataset are directed.
            'dataset_size': number of graphs in dataset.
            'total_node_num': total number of vertices of all graphs in dataset.
            'ave_node_num': average number of vertices of graphs in dataset.
            'min_node_num': minimum number of vertices of graphs in dataset.
            'max_node_num': maximum number of vertices of graphs in dataset.
            'total_edge_num': total number of edges of all graphs in dataset.
            'ave_edge_num': average number of edges of graphs in dataset.
            'min_edge_num': minimum number of edges of graphs in dataset.
            'max_edge_num': maximum number of edges of graphs in dataset.
            'ave_node_degree': average vertex degree of graphs in dataset.
            'min_node_degree': minimum vertex degree of graphs in dataset.
            'max_node_degree': maximum vertex degree of graphs in dataset.
            'ave_fill_factor': average fill factor (number_of_edges /
                (number_of_nodes ** 2)) of graphs in dataset.
            'min_fill_factor': minimum fill factor of graphs in dataset.
            'max_fill_factor': maximum fill factor of graphs in dataset.
            'node_label_nums': numbers of symbolic vertex labels of graphs in
                dataset, keyed by label name.
            'edge_label_nums': numbers of symbolic edge labels of graphs in
                dataset, keyed by label name.
            'node_attr_dim': number of dimensions of non-symbolic vertex labels.
                Extracted from the 'attributes' attribute of graph nodes.
            'edge_attr_dim': number of dimensions of non-symbolic edge labels.
                Extracted from the 'attributes' attribute of graph edges.
            'class_number': number of classes. Only available for classification problems.
            'all_degree_entropy': the entropy of the degree distribution of each graph.
            'ave_degree_entropy': the average entropy of the degree distribution of all graphs.

            All information above will be returned if `keys` is not given.

        params : dict of dict, optional
            A dictionary which contains extra parameters for each possible
            element in ``keys``.

        Returns
        -------
        dict
            Information of the graph dataset keyed by `keys`.
        """
        infos = {}

        if keys is None:
            keys = [
                'substructures',
                'node_label_dim',
                'edge_label_dim',
                'directed',
                'dataset_size',
                'total_node_num',
                'ave_node_num',
                'min_node_num',
                'max_node_num',
                'total_edge_num',
                'ave_edge_num',
                'min_edge_num',
                'max_edge_num',
                'ave_node_degree',
                'min_node_degree',
                'max_node_degree',
                'ave_fill_factor',
                'min_fill_factor',
                'max_fill_factor',
                'node_label_nums',
                'edge_label_nums',
                'node_attr_dim',
                'edge_attr_dim',
                'class_number',
                'all_degree_entropy',
                'ave_degree_entropy'
            ]

        # dataset size
        if 'dataset_size' in keys:
            if self._dataset_size is None:
                self._dataset_size = self._get_dataset_size()
            infos['dataset_size'] = self._dataset_size

        # graph node number
        if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']):
            all_node_nums = self._get_all_node_nums()

        if 'total_node_num' in keys:
            if self._total_node_num is None:
                self._total_node_num = self._get_total_node_num(all_node_nums)
            infos['total_node_num'] = self._total_node_num

        if 'ave_node_num' in keys:
            if self._ave_node_num is None:
                self._ave_node_num = self._get_ave_node_num(all_node_nums)
            infos['ave_node_num'] = self._ave_node_num

        if 'min_node_num' in keys:
            if self._min_node_num is None:
                self._min_node_num = self._get_min_node_num(all_node_nums)
            infos['min_node_num'] = self._min_node_num

        if 'max_node_num' in keys:
            if self._max_node_num is None:
                self._max_node_num = self._get_max_node_num(all_node_nums)
            infos['max_node_num'] = self._max_node_num

        # graph edge number
        if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']):
            all_edge_nums = self._get_all_edge_nums()

        if 'total_edge_num' in keys:
            if self._total_edge_num is None:
                self._total_edge_num = self._get_total_edge_num(all_edge_nums)
            infos['total_edge_num'] = self._total_edge_num

        if 'ave_edge_num' in keys:
            if self._ave_edge_num is None:
                self._ave_edge_num = self._get_ave_edge_num(all_edge_nums)
            infos['ave_edge_num'] = self._ave_edge_num

        if 'max_edge_num' in keys:
            if self._max_edge_num is None:
                self._max_edge_num = self._get_max_edge_num(all_edge_nums)
            infos['max_edge_num'] = self._max_edge_num

        if 'min_edge_num' in keys:
            if self._min_edge_num is None:
                self._min_edge_num = self._get_min_edge_num(all_edge_nums)
            infos['min_edge_num'] = self._min_edge_num

        # label number
        if 'node_label_dim' in keys:
            if self._node_label_dim is None:
                self._node_label_dim = self._get_node_label_dim()
            infos['node_label_dim'] = self._node_label_dim

        if 'node_label_nums' in keys:
            if self._node_label_nums is None:
                self._node_label_nums = {}
                for node_label in self._node_labels:
                    self._node_label_nums[node_label] = self._get_node_label_num(node_label)
            infos['node_label_nums'] = self._node_label_nums

        if 'edge_label_dim' in keys:
            if self._edge_label_dim is None:
                self._edge_label_dim = self._get_edge_label_dim()
            infos['edge_label_dim'] = self._edge_label_dim

        if 'edge_label_nums' in keys:
            if self._edge_label_nums is None:
                self._edge_label_nums = {}
                for edge_label in self._edge_labels:
                    self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label)
            infos['edge_label_nums'] = self._edge_label_nums

        if 'directed' in keys or 'substructures' in keys:
            if self._directed is None:
                self._directed = self._is_directed()
            infos['directed'] = self._directed

        # node degree
        if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']):
            all_node_degrees = self._get_all_node_degrees()

        if 'ave_node_degree' in keys:
            if self._ave_node_degree is None:
                self._ave_node_degree = self._get_ave_node_degree(all_node_degrees)
            infos['ave_node_degree'] = self._ave_node_degree

        if 'max_node_degree' in keys:
            if self._max_node_degree is None:
                self._max_node_degree = self._get_max_node_degree(all_node_degrees)
            infos['max_node_degree'] = self._max_node_degree

        if 'min_node_degree' in keys:
            if self._min_node_degree is None:
                self._min_node_degree = self._get_min_node_degree(all_node_degrees)
            infos['min_node_degree'] = self._min_node_degree

        # fill factor
        if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']):
            all_fill_factors = self._get_all_fill_factors()

        if 'ave_fill_factor' in keys:
            if self._ave_fill_factor is None:
                self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors)
            infos['ave_fill_factor'] = self._ave_fill_factor

        if 'max_fill_factor' in keys:
            if self._max_fill_factor is None:
                self._max_fill_factor = self._get_max_fill_factor(all_fill_factors)
            infos['max_fill_factor'] = self._max_fill_factor

        if 'min_fill_factor' in keys:
            if self._min_fill_factor is None:
                self._min_fill_factor = self._get_min_fill_factor(all_fill_factors)
            infos['min_fill_factor'] = self._min_fill_factor

        if 'substructures' in keys:
            if self._substructures is None:
                self._substructures = self._get_substructures()
            infos['substructures'] = self._substructures

        if 'class_number' in keys:
            if self._class_number is None:
                self._class_number = self._get_class_number()
            infos['class_number'] = self._class_number

        if 'node_attr_dim' in keys:
            if self._node_attr_dim is None:
                self._node_attr_dim = self._get_node_attr_dim()
            infos['node_attr_dim'] = self._node_attr_dim

        if 'edge_attr_dim' in keys:
            if self._edge_attr_dim is None:
                self._edge_attr_dim = self._get_edge_attr_dim()
            infos['edge_attr_dim'] = self._edge_attr_dim

        # entropy of degree distribution.
        if 'all_degree_entropy' in keys:
            if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
                base = params['all_degree_entropy']['base']
            else:
                base = None
            infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base)

        if 'ave_degree_entropy' in keys:
            if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
                base = params['ave_degree_entropy']['base']
            else:
                base = None
            infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))

        return infos
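
    # A usage sketch, including the `params` mechanism for the entropy keys
    # (the base value here is illustrative):
    #
    #   >>> infos = ds.get_dataset_infos(
    #   ...     keys=['dataset_size', 'ave_degree_entropy'],
    #   ...     params={'ave_degree_entropy': {'base': 2}})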
    def print_graph_infos(self, infos):
        from collections import OrderedDict
        keys = list(infos.keys())
        print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))

    def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
        node_labels = [item for item in node_labels if item in self._node_labels]
        edge_labels = [item for item in edge_labels if item in self._edge_labels]
        node_attrs = [item for item in node_attrs if item in self._node_attrs]
        edge_attrs = [item for item in edge_attrs if item in self._edge_attrs]

        for g in self._graphs:
            for nd in g.nodes():
                for nl in node_labels:
                    del g.nodes[nd][nl]
                for na in node_attrs:
                    del g.nodes[nd][na]
            for ed in g.edges():
                for el in edge_labels:
                    del g.edges[ed][el]
                for ea in edge_attrs:
                    del g.edges[ed][ea]

        if len(node_labels) > 0:
            self._node_labels = [nl for nl in self._node_labels if nl not in node_labels]
        if len(edge_labels) > 0:
            self._edge_labels = [el for el in self._edge_labels if el not in edge_labels]
        if len(node_attrs) > 0:
            self._node_attrs = [na for na in self._node_attrs if na not in node_attrs]
        if len(edge_attrs) > 0:
            self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs]
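
    # For instance, to drop a symbolic node label and a non-symbolic edge
    # attribute in one call (the names are illustrative):
    #
    #   >>> ds.remove_labels(node_labels=['chem'], edge_attrs=['weight'])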
    def clean_labels(self):
        # Drop node labels that take fewer than two distinct values over the
        # whole dataset, since they carry no information.
        labels = []
        for name in self._node_labels:
            label = set()
            for G in self._graphs:
                label = label | set(nx.get_node_attributes(G, name).values())
                if len(label) > 1:
                    labels.append(name)
                    break
            if len(label) < 2:
                for G in self._graphs:
                    for nd in G.nodes():
                        del G.nodes[nd][name]
        self._node_labels = labels

        # The same for edge labels ...
        labels = []
        for name in self._edge_labels:
            label = set()
            for G in self._graphs:
                label = label | set(nx.get_edge_attributes(G, name).values())
                if len(label) > 1:
                    labels.append(name)
                    break
            if len(label) < 2:
                for G in self._graphs:
                    for ed in G.edges():
                        del G.edges[ed][name]
        self._edge_labels = labels

        # ... node attributes ...
        labels = []
        for name in self._node_attrs:
            label = set()
            for G in self._graphs:
                label = label | set(nx.get_node_attributes(G, name).values())
                if len(label) > 1:
                    labels.append(name)
                    break
            if len(label) < 2:
                for G in self._graphs:
                    for nd in G.nodes():
                        del G.nodes[nd][name]
        self._node_attrs = labels

        # ... and edge attributes.
        labels = []
        for name in self._edge_attrs:
            label = set()
            for G in self._graphs:
                label = label | set(nx.get_edge_attributes(G, name).values())
                if len(label) > 1:
                    labels.append(name)
                    break
            if len(label) < 2:
                for G in self._graphs:
                    for ed in G.edges():
                        del G.edges[ed][name]
        self._edge_attrs = labels
    def cut_graphs(self, range_):
        self._graphs = [self._graphs[i] for i in range_]
        if self._targets is not None:
            self._targets = [self._targets[i] for i in range_]
        self.clean_labels()

    def trim_dataset(self, edge_required=False):
        # Remove empty graphs (and, optionally, graphs without edges).
        if edge_required:
            trimmed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
        else:
            trimmed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
        idx = [p[0] for p in trimmed_pairs]
        self._graphs = [p[1] for p in trimmed_pairs]
        self._targets = [self._targets[i] for i in idx]
        self.clean_labels()

    def copy(self):
        dataset = Dataset()
        graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None
        target = self._targets.copy() if self._targets is not None else None
        node_labels = self._node_labels.copy() if self._node_labels is not None else None
        node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None
        edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None
        edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None
        dataset.load_graphs(graphs, target)
        dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
        # @todo: clean_labels and add other class members?
        return dataset
    def get_all_node_labels(self):
        node_labels = []
        for g in self._graphs:
            for n in g.nodes():
                nl = tuple(g.nodes[n].items())
                if nl not in node_labels:
                    node_labels.append(nl)
        return node_labels

    def get_all_edge_labels(self):
        edge_labels = []
        for g in self._graphs:
            for e in g.edges():
                el = tuple(g.edges[e].items())
                if el not in edge_labels:
                    edge_labels.append(el)
        return edge_labels

    def _get_dataset_size(self):
        return len(self._graphs)

    def _get_all_node_nums(self):
        return [nx.number_of_nodes(G) for G in self._graphs]

    def _get_total_node_num(self, all_node_nums):
        return np.sum(all_node_nums)

    def _get_ave_node_num(self, all_node_nums):
        return np.mean(all_node_nums)

    def _get_min_node_num(self, all_node_nums):
        return np.amin(all_node_nums)

    def _get_max_node_num(self, all_node_nums):
        return np.amax(all_node_nums)

    def _get_all_edge_nums(self):
        return [nx.number_of_edges(G) for G in self._graphs]

    def _get_total_edge_num(self, all_edge_nums):
        return np.sum(all_edge_nums)

    def _get_ave_edge_num(self, all_edge_nums):
        return np.mean(all_edge_nums)

    def _get_min_edge_num(self, all_edge_nums):
        return np.amin(all_edge_nums)

    def _get_max_edge_num(self, all_edge_nums):
        return np.amax(all_edge_nums)
    def _get_node_label_dim(self):
        return len(self._node_labels)

    def _get_node_label_num(self, node_label):
        nl = set()
        for G in self._graphs:
            nl = nl | set(nx.get_node_attributes(G, node_label).values())
        return len(nl)

    def _get_edge_label_dim(self):
        return len(self._edge_labels)

    def _get_edge_label_num(self, edge_label):
        el = set()
        for G in self._graphs:
            el = el | set(nx.get_edge_attributes(G, edge_label).values())
        return len(el)

    def _is_directed(self):
        return nx.is_directed(self._graphs[0])

    def _get_all_node_degrees(self):
        # Average vertex degree of each graph.
        return [np.mean(list(dict(G.degree()).values())) for G in self._graphs]

    def _get_ave_node_degree(self, all_node_degrees):
        return np.mean(all_node_degrees)

    def _get_max_node_degree(self, all_node_degrees):
        return np.amax(all_node_degrees)

    def _get_min_node_degree(self, all_node_degrees):
        return np.amin(all_node_degrees)

    def _get_all_fill_factors(self):
        """Get the fill factor of each graph, i.e., the ratio
        number_of_edges / (number_of_nodes ** 2), a density measure of the
        adjacency matrix.

        Returns
        -------
        list[float]
            List of fill factors for all graphs.
        """
        return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs]

    def _get_ave_fill_factor(self, all_fill_factors):
        return np.mean(all_fill_factors)

    def _get_max_fill_factor(self, all_fill_factors):
        return np.amax(all_fill_factors)

    def _get_min_fill_factor(self, all_fill_factors):
        return np.amin(all_fill_factors)
    def _get_substructures(self):
        subs = set()
        for G in self._graphs:
            degrees = list(dict(G.degree()).values())
            if any(i == 2 for i in degrees):
                subs.add('linear')
            if np.amax(degrees) >= 3:
                subs.add('non linear')
            if 'linear' in subs and 'non linear' in subs:
                break

        if self._directed:
            for G in self._graphs:
                # nx.find_cycle raises NetworkXNoCycle when the graph is acyclic.
                try:
                    nx.find_cycle(G)
                except nx.NetworkXNoCycle:
                    continue
                subs.add('cyclic')
                break
        # else:
        #     # @todo: this method does not work for big graphs with a large amount of edges like D&D; try a better way.
        #     upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10
        #     for G in Gn:
        #         if (nx.number_of_edges(G) < upper):
        #             cyc = list(nx.simple_cycles(G.to_directed()))
        #             if any(len(i) > 2 for i in cyc):
        #                 subs.add('cyclic')
        #                 break
        #     if 'cyclic' not in subs:
        #         for G in Gn:
        #             cyc = list(nx.simple_cycles(G.to_directed()))
        #             if any(len(i) > 2 for i in cyc):
        #                 subs.add('cyclic')
        #                 break

        return subs

    def _get_class_number(self):
        return len(set(self._targets))

    def _get_node_attr_dim(self):
        return len(self._node_attrs)

    def _get_edge_attr_dim(self):
        return len(self._edge_attrs)
    def _compute_all_degree_entropy(self, base=None):
        """Compute the entropy of the degree distribution of each graph.

        Parameters
        ----------
        base : float, optional
            The logarithmic base to use. The default is ``e`` (natural logarithm).

        Returns
        -------
        degree_entropy : list[float]
            The computed entropy of each graph.
        """
        from gklearn.utils.stats import entropy

        degree_entropy = []
        for g in self._graphs:
            degrees = list(dict(g.degree()).values())
            en = entropy(degrees, base=base)
            degree_entropy.append(en)
        return degree_entropy
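
    # The entropy above is the Shannon entropy of the empirical degree
    # distribution, H = -sum_i p_i * log(p_i) with p_i the relative frequency
    # of the i-th distinct degree value; `base` switches the logarithm base
    # (see gklearn.utils.stats.entropy for the exact implementation).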
    @property
    def graphs(self):
        return self._graphs

    @property
    def targets(self):
        return self._targets

    @property
    def node_labels(self):
        return self._node_labels

    @property
    def edge_labels(self):
        return self._edge_labels

    @property
    def node_attrs(self):
        return self._node_attrs

    @property
    def edge_attrs(self):
        return self._edge_attrs
def split_dataset_by_target(dataset):
    from gklearn.preimage.utils import get_same_item_indices

    graphs = dataset.graphs
    targets = dataset.targets
    datasets = []
    idx_targets = get_same_item_indices(targets)
    for key, val in idx_targets.items():
        sub_graphs = [graphs[i] for i in val]
        sub_dataset = Dataset()
        sub_dataset.load_graphs(sub_graphs, [key] * len(val))
        node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None
        node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None
        edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None
        edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None
        sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
        datasets.append(sub_dataset)
        # @todo: clean_labels?
    return datasets
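
# A usage sketch: split a classification dataset into one sub-dataset per
# class value found in `dataset.targets`:
#
#   >>> sub_datasets = split_dataset_by_target(dataset)
#   >>> [d.get_dataset_infos(keys=['dataset_size']) for d in sub_datasets]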
@@ -0,0 +1,824 @@ | |||
""" Utilities function to manage graph files | |||
""" | |||
from os.path import dirname, splitext | |||
class DataLoader(): | |||
def __init__(self, filename, filename_targets=None, gformat=None, **kwargs): | |||
"""Read graph data from filename and load them as NetworkX graphs. | |||
Parameters | |||
---------- | |||
filename : string | |||
The name of the file from where the dataset is read. | |||
filename_targets : string | |||
The name of file of the targets corresponding to graphs. | |||
Notes | |||
----- | |||
This function supports following graph dataset formats: | |||
'ds': load data from .ds file. See comments of function loadFromDS for a example. | |||
'cxl': load data from Graph eXchange Language file (.cxl file). See | |||
`here <http://www.gupro.de/GXL/Introduction/background.html>`__ for detail. | |||
'sdf': load data from structured data file (.sdf file). See | |||
`here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__ | |||
for details. | |||
'mat': Load graph data from a MATLAB (up to version 7.1) .mat file. See | |||
README in `downloadable file <http://mlcb.is.tuebingen.mpg.de/Mitarbeiter/Nino/WL/>`__ | |||
for details. | |||
'txt': Load graph data from the TUDataset. See | |||
`here <https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets>`__ | |||
for details. Note here filename is the name of either .txt file in | |||
the dataset directory. | |||
""" | |||
extension = splitext(filename)[1][1:] | |||
if extension == "ds": | |||
self._graphs, self._targets, self._label_names = self.load_from_ds(filename, filename_targets) | |||
elif extension == "cxl": | |||
dir_dataset = kwargs.get('dirname_dataset', None) | |||
self._graphs, self._targets, self._label_names = self.load_from_xml(filename, dir_dataset) | |||
elif extension == 'xml': | |||
dir_dataset = kwargs.get('dirname_dataset', None) | |||
self._graphs, self._targets, self._label_names = self.load_from_xml(filename, dir_dataset) | |||
elif extension == "mat": | |||
order = kwargs.get('order') | |||
self._graphs, self._targets, self._label_names = self.load_mat(filename, order) | |||
elif extension == 'txt': | |||
self._graphs, self._targets, self._label_names = self.load_tud(filename) | |||
else: | |||
raise ValueError('The input file with the extension ".', extension, '" is not supported. The supported extensions includes: ".ds", ".cxl", ".xml", ".mat", ".txt".') | |||
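
    # A usage sketch (the path and `order` follow the demo under __main__ at
    # the bottom of this file):
    #
    #   >>> loader = DataLoader('../../datasets/MUTAG_mat/MUTAG.mat',
    #   ...                     order=[0, 0, 3, 1, 2])
    #   >>> graphs, targets, label_names = loader.data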
    def load_from_ds(self, filename, filename_targets):
        """Load data from a .ds file.

        Possible graph formats include:

        '.ct': see the function load_ct for details.

        '.gxl': see the function load_gxl for details.

        Note that these graph formats are detected automatically from the
        extensions of the graph files.
        """
        dirname_dataset = dirname(filename)
        data = []
        y = []
        label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
        with open(filename) as fn:
            content = fn.read().splitlines()
        extension = splitext(content[0].split(' ')[0])[1][1:]
        if extension == 'ct':
            load_file_fun = self.load_ct
        elif extension == 'gxl' or extension == 'sdf':  # @todo: .sdf not tested yet.
            load_file_fun = self.load_gxl
        else:
            raise ValueError('The graph format ".' + extension + '" referenced in the .ds file is not supported.')

        if filename_targets is None or filename_targets == '':
            for i in range(0, len(content)):
                tmp = content[i].split(' ')
                # remove the '#'s in file names
                g, l_names = load_file_fun(dirname_dataset + '/' + tmp[0].replace('#', '', 1))
                data.append(g)
                self._append_label_names(label_names, l_names)
                y.append(float(tmp[1]))
        else:  # targets in a separate file
            for i in range(0, len(content)):
                tmp = content[i]
                # remove the '#'s in file names
                g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1))
                data.append(g)
                self._append_label_names(label_names, l_names)

            with open(filename_targets) as fnt:
                content_y = fnt.read().splitlines()
            # assume entries in filename and filename_targets have the same order.
            for item in content_y:
                tmp = item.split(' ')
                # assume the 3rd entry in a line is y (for the Alkane dataset)
                y.append(float(tmp[2]))
        return data, y, label_names
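
    # A .ds file lists one graph file per line; when no separate targets file
    # is given, each line also carries the target value. An illustrative
    # two-line example:
    #
    #   molecule1.ct 1
    #   molecule2.ct -1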
    def load_from_xml(self, filename, dir_dataset=None):
        import xml.etree.ElementTree as ET

        if dir_dataset is None:
            dir_dataset = dirname(filename)
        tree = ET.parse(filename)
        root = tree.getroot()
        data = []
        y = []
        label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
        for graph in root.iter('graph'):
            mol_filename = graph.attrib['file']
            mol_class = graph.attrib['class']
            g, l_names = self.load_gxl(dir_dataset + '/' + mol_filename)
            data.append(g)
            self._append_label_names(label_names, l_names)
            y.append(mol_class)
        return data, y, label_names
    def load_mat(self, filename, order):  # @todo: needs to be updated (auto order) or deprecated.
        """Load graph data from a MATLAB (up to version 7.1) .mat file.

        Notes
        -----
        A MAT file contains a struct array containing graphs, and a column
        vector lx containing a class label for each graph. Check the README in
        the `downloadable file <http://mlcb.is.tuebingen.mpg.de/Mitarbeiter/Nino/WL/>`__
        for the detailed structure.
        """
        from scipy.io import loadmat
        import numpy as np
        import networkx as nx

        data = []
        content = loadmat(filename)
        for key, value in content.items():
            if key[0] == 'l':  # class label
                y = np.transpose(value)[0].tolist()
            elif key[0] != '_':
                # if the adjacency matrix is not compressed / edge labels exist
                if order[1] == 0:
                    for i, item in enumerate(value[0]):
                        g = nx.Graph(name=i)  # set name of the graph
                        nl = np.transpose(item[order[3]][0][0][0])  # node label
                        for index, label in enumerate(nl[0]):
                            g.add_node(index, label_1=str(label))
                        el = item[order[4]][0][0][0]  # edge label
                        for edge in el:
                            g.add_edge(edge[0] - 1, edge[1] - 1, label_1=str(edge[2]))
                        data.append(g)
                else:
                    for i, item in enumerate(value[0]):
                        g = nx.Graph(name=i)  # set name of the graph
                        nl = np.transpose(item[order[3]][0][0][0])  # node label
                        for index, label in enumerate(nl[0]):
                            g.add_node(index, label_1=str(label))
                        sam = item[order[0]]  # sparse adjacency matrix
                        index_no0 = sam.nonzero()
                        for col, row in zip(index_no0[0], index_no0[1]):
                            g.add_edge(col, row)
                        data.append(g)

        label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
        if order[1] == 0:
            label_names['edge_labels'].append('label_1')
        return data, y, label_names
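
    # How `order` is consumed above (a hedged reading, based on the
    # 'am_sp_al_nl_el' hint in the commented demo under __main__): order[0]
    # indexes the sparse adjacency matrix, order[1] == 0 signals that edge
    # labels are stored explicitly, order[3] indexes the node labels and
    # order[4] the edge labels; order[2] is not used by this loader.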
    def load_tud(self, filename):
        """Load graph data from TUD dataset files.

        Notes
        -----
        The graph data is loaded from separate files. Check the README in the
        `downloadable file <http://tiny.cc/PK_MLJ_data>`__, 2018 for the
        detailed structure.
        """
        import networkx as nx
        from os import listdir
        from os.path import dirname, basename

        def get_infos_from_readme(frm):  # @todo: add README (Cuneiform), maybe node/edge label maps.
            """Get information from the DS_label_readme.txt file.
            """

            def get_label_names_from_line(line):
                """Get names of labels/attributes from a line.
                """
                str_names = line.split('[')[1].split(']')[0]
                names = str_names.split(',')
                names = [attr.strip() for attr in names]
                return names

            def get_class_label_map(label_map_strings):
                label_map = {}
                for string in label_map_strings:
                    integer, label = string.split('\t')
                    label_map[int(integer.strip())] = label.strip()
                return label_map

            label_names = {'node_labels': [], 'node_attrs': [],
                           'edge_labels': [], 'edge_attrs': []}
            class_label_map = None
            class_label_map_strings = []
            with open(frm) as rm:
                content_rm = rm.read().splitlines()
            i = 0
            while i < len(content_rm):
                line = content_rm[i].strip()
                # get node/edge labels and attributes.
                if line.startswith('Node labels:'):
                    label_names['node_labels'] = get_label_names_from_line(line)
                elif line.startswith('Node attributes:'):
                    label_names['node_attrs'] = get_label_names_from_line(line)
                elif line.startswith('Edge labels:'):
                    label_names['edge_labels'] = get_label_names_from_line(line)
                elif line.startswith('Edge attributes:'):
                    label_names['edge_attrs'] = get_label_names_from_line(line)
                # get the class label map.
                elif line.startswith('Class labels were converted to integer values using this map:'):
                    i += 2
                    line = content_rm[i].strip()
                    while line != '' and i < len(content_rm):
                        class_label_map_strings.append(line)
                        i += 1
                        # guard against the map running to the end of the file.
                        line = content_rm[i].strip() if i < len(content_rm) else ''
                    class_label_map = get_class_label_map(class_label_map_strings)
                i += 1
            return label_names, class_label_map
        # get the dataset name.
        dirname_dataset = dirname(filename)
        filename = basename(filename)
        fn_split = filename.split('_A')
        ds_name = fn_split[0].strip()

        # load data file names
        for name in listdir(dirname_dataset):
            if ds_name + '_A' in name:
                fam = dirname_dataset + '/' + name
            elif ds_name + '_graph_indicator' in name:
                fgi = dirname_dataset + '/' + name
            elif ds_name + '_graph_labels' in name:
                fgl = dirname_dataset + '/' + name
            elif ds_name + '_node_labels' in name:
                fnl = dirname_dataset + '/' + name
            elif ds_name + '_edge_labels' in name:
                fel = dirname_dataset + '/' + name
            elif ds_name + '_edge_attributes' in name:
                fea = dirname_dataset + '/' + name
            elif ds_name + '_node_attributes' in name:
                fna = dirname_dataset + '/' + name
            elif ds_name + '_graph_attributes' in name:
                fga = dirname_dataset + '/' + name
            elif ds_name + '_label_readme' in name:
                frm = dirname_dataset + '/' + name
            # this is supposed to be the node attrs; make sure to keep it as the last 'elif'
            elif ds_name + '_attributes' in name:
                fna = dirname_dataset + '/' + name

        # get label and attribute names.
        if 'frm' in locals():
            label_names, class_label_map = get_infos_from_readme(frm)
        else:
            label_names = {'node_labels': [], 'node_attrs': [],
                           'edge_labels': [], 'edge_attrs': []}
            class_label_map = None

        with open(fgi) as gi:
            content_gi = gi.read().splitlines()  # graph indicator
        with open(fam) as am:
            content_am = am.read().splitlines()  # adjacency matrix

        # load targets.
        if 'fgl' in locals():
            with open(fgl) as gl:
                content_targets = gl.read().splitlines()  # targets (classification)
            targets = [int(i) for i in content_targets]
        elif 'fga' in locals():
            with open(fga) as ga:
                content_targets = ga.read().splitlines()  # targets (regression)
            targets = [float(i) for i in content_targets]
        else:
            raise Exception('Cannot find the targets file. Please make sure there is a "' + ds_name + '_graph_labels.txt" or "' + ds_name + '_graph_attributes.txt" file in your dataset folder.')
        if class_label_map is not None:
            targets = [class_label_map[t] for t in targets]
        # create graphs and add nodes
        data = [nx.Graph(name=str(i)) for i in range(0, len(content_targets))]
        if 'fnl' in locals():
            with open(fnl) as nl:
                content_nl = nl.read().splitlines()  # node labels
            for idx, line in enumerate(content_gi):
                # transfer to int first in case of unexpected blanks
                data[int(line) - 1].add_node(idx)
                labels = [l.strip() for l in content_nl[idx].split(',')]
                if label_names['node_labels'] == []:  # @todo: need to fix bug.
                    for i, label in enumerate(labels):
                        l_name = 'label_' + str(i)
                        data[int(line) - 1].nodes[idx][l_name] = label
                        label_names['node_labels'].append(l_name)
                else:
                    for i, l_name in enumerate(label_names['node_labels']):
                        data[int(line) - 1].nodes[idx][l_name] = labels[i]
        else:
            for i, line in enumerate(content_gi):
                data[int(line) - 1].add_node(i)

        # add edges
        for line in content_am:
            tmp = line.split(',')
            n1 = int(tmp[0]) - 1
            n2 = int(tmp[1]) - 1
            # ignore the edge weight here.
            g = int(content_gi[n1]) - 1
            data[g].add_edge(n1, n2)

        # add edge labels
        if 'fel' in locals():
            with open(fel) as el:
                content_el = el.read().splitlines()
            for idx, line in enumerate(content_el):
                labels = [l.strip() for l in line.split(',')]
                n = [int(i) - 1 for i in content_am[idx].split(',')]
                g = int(content_gi[n[0]]) - 1
                if label_names['edge_labels'] == []:
                    for i, label in enumerate(labels):
                        l_name = 'label_' + str(i)
                        data[g].edges[n[0], n[1]][l_name] = label
                        label_names['edge_labels'].append(l_name)
                else:
                    for i, l_name in enumerate(label_names['edge_labels']):
                        data[g].edges[n[0], n[1]][l_name] = labels[i]

        # add node attributes
        if 'fna' in locals():
            with open(fna) as na:
                content_na = na.read().splitlines()
            for idx, line in enumerate(content_na):
                attrs = [a.strip() for a in line.split(',')]
                g = int(content_gi[idx]) - 1
                if label_names['node_attrs'] == []:
                    for i, attr in enumerate(attrs):
                        a_name = 'attr_' + str(i)
                        data[g].nodes[idx][a_name] = attr
                        label_names['node_attrs'].append(a_name)
                else:
                    for i, a_name in enumerate(label_names['node_attrs']):
                        data[g].nodes[idx][a_name] = attrs[i]

        # add edge attributes
        if 'fea' in locals():
            with open(fea) as ea:
                content_ea = ea.read().splitlines()
            for idx, line in enumerate(content_ea):
                attrs = [a.strip() for a in line.split(',')]
                n = [int(i) - 1 for i in content_am[idx].split(',')]
                g = int(content_gi[n[0]]) - 1
                if label_names['edge_attrs'] == []:
                    for i, attr in enumerate(attrs):
                        a_name = 'attr_' + str(i)
                        data[g].edges[n[0], n[1]][a_name] = attr
                        label_names['edge_attrs'].append(a_name)
                else:
                    for i, a_name in enumerate(label_names['edge_attrs']):
                        data[g].edges[n[0], n[1]][a_name] = attrs[i]

        return data, targets, label_names
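
    # Expected file layout (TUDataset convention; node ids are 1-based and
    # global across the whole dataset):
    #
    #   DS_A.txt                one "i, j" edge per line
    #   DS_graph_indicator.txt  graph id of node i on line i
    #   DS_graph_labels.txt     class label of graph i on line i
    #   DS_node_labels.txt      (optional) labels of node i on line i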
    def load_ct(self, filename):  # @todo: this function is only tested on CTFile V2000; the header is not considered; only simple cases (atoms and bonds) are considered.
        """Load data from a Chemical Table (.ct) file.

        Notes
        -----
        A typical example of data in a .ct file is like this:

            3 2                      <- number of nodes and edges
            0.0000 0.0000 0.0000 C   <- each line describes a node (x, y, z + label)
            0.0000 0.0000 0.0000 C
            0.0000 0.0000 0.0000 O
            1 3 1 1                  <- each line describes an edge: first atom, second atom, bond type, bond stereo
            2 3 1 1

        Check the `CTFile Formats document
        <https://www.daylight.com/meetings/mug05/Kappler/ctfile.pdf>`__
        for a detailed format description.
        """
        import networkx as nx
        from os.path import basename

        g = nx.Graph()
        with open(filename) as f:
            content = f.read().splitlines()
        g = nx.Graph(name=str(content[0]), filename=basename(filename))  # set name of the graph

        # read the counts line.
        tmp = content[1].split(' ')
        tmp = [x for x in tmp if x != '']
        nb_atoms = int(tmp[0].strip())  # number of atoms
        nb_bonds = int(tmp[1].strip())  # number of bonds
        count_line_tags = ['number_of_atoms', 'number_of_bonds', 'number_of_atom_lists', '', 'chiral_flag', 'number_of_stext_entries', '', '', '', '', 'number_of_properties', 'CT_version']
        i = 0
        while i < len(tmp):
            if count_line_tags[i] != '':  # if not obsolete
                g.graph[count_line_tags[i]] = tmp[i].strip()
            i += 1

        # read the atom block.
        atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag']
        for i in range(0, nb_atoms):
            tmp = content[i + 2].split(' ')
            tmp = [x for x in tmp if x != '']
            g.add_node(i)
            j = 0
            while j < len(tmp):
                if atom_tags[j] != '':
                    g.nodes[i][atom_tags[j]] = tmp[j].strip()
                j += 1

        # read the bond block.
        bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status']
        for i in range(0, nb_bonds):
            tmp = content[i + g.number_of_nodes() + 2].split(' ')
            tmp = [x for x in tmp if x != '']
            n1, n2 = int(tmp[0].strip()) - 1, int(tmp[1].strip()) - 1
            g.add_edge(n1, n2)
            j = 2
            while j < len(tmp):
                if bond_tags[j] != '':
                    g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip()
                j += 1

        # get label names.
        label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
        atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1]
        for nd in g.nodes():
            for key in g.nodes[nd]:
                if atom_symbolic[atom_tags.index(key)] == 1:
                    label_names['node_labels'].append(key)
                else:
                    label_names['node_attrs'].append(key)
            break
        bond_symbolic = [None, None, 1, 1, None, 1, 1]
        for ed in g.edges():
            for key in g.edges[ed]:
                if bond_symbolic[bond_tags.index(key)] == 1:
                    label_names['edge_labels'].append(key)
                else:
                    label_names['edge_attrs'].append(key)
            break
        return g, label_names
    def load_gxl(self, filename):  # @todo: directed graphs.
        from os.path import basename
        import networkx as nx
        import xml.etree.ElementTree as ET

        tree = ET.parse(filename)
        root = tree.getroot()
        index = 0
        g = nx.Graph(filename=basename(filename), name=root[0].attrib['id'])
        dic = {}  # used to retrieve incident nodes of edges
        for node in root.iter('node'):
            dic[node.attrib['id']] = index
            labels = {}
            for attr in node.iter('attr'):
                labels[attr.attrib['name']] = attr[0].text
            g.add_node(index, **labels)
            index += 1
        for edge in root.iter('edge'):
            labels = {}
            for attr in edge.iter('attr'):
                labels[attr.attrib['name']] = attr[0].text
            g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels)

        # get label names.
        label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
        for node in root.iter('node'):
            for attr in node.iter('attr'):
                if attr[0].tag == 'int':  # @todo: this may be wrong, and slow.
                    label_names['node_labels'].append(attr.attrib['name'])
                else:
                    label_names['node_attrs'].append(attr.attrib['name'])
            break
        for edge in root.iter('edge'):
            for attr in edge.iter('attr'):
                if attr[0].tag == 'int':  # @todo: this may be wrong, and slow.
                    label_names['edge_labels'].append(attr.attrib['name'])
                else:
                    label_names['edge_attrs'].append(attr.attrib['name'])
            break
        return g, label_names
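
    # The shape of a GXL node this parser expects (the value sits in the first
    # child of each <attr>; an <int> child is treated as a symbolic label,
    # anything else as a non-symbolic attribute):
    #
    #   <node id="_0">
    #     <attr name="chem"><int>6</int></attr>
    #   </node>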
    def _append_label_names(self, label_names, new_names):
        for key, val in label_names.items():
            label_names[key] += [name for name in new_names[key] if name not in val]

    @property
    def data(self):
        return self._graphs, self._targets, self._label_names

    @property
    def graphs(self):
        return self._graphs

    @property
    def targets(self):
        return self._targets

    @property
    def label_names(self):
        return self._label_names
class DataSaver():

    def __init__(self, graphs, targets=None, filename='gfile', gformat='gxl', group=None, **kwargs):
        """Save a list of graphs.
        """
        import os

        dirname_ds = os.path.dirname(filename)
        if dirname_ds != '':
            dirname_ds += '/'
            os.makedirs(dirname_ds, exist_ok=True)

        if 'graph_dir' in kwargs:
            graph_dir = kwargs['graph_dir'] + '/'
            os.makedirs(graph_dir, exist_ok=True)
            del kwargs['graph_dir']
        else:
            graph_dir = dirname_ds

        if group == 'xml' and gformat == 'gxl':
            with open(filename + '.xml', 'w') as fgroup:
                fgroup.write("<?xml version=\"1.0\"?>")
                fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
                fgroup.write("\n<GraphCollection>")
                for idx, g in enumerate(graphs):
                    fname_tmp = "graph" + str(idx) + ".gxl"
                    self.save_gxl(g, graph_dir + fname_tmp, **kwargs)
                    fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(targets[idx]) + "\"/>")
                fgroup.write("\n</GraphCollection>")
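
    # A usage sketch: write each graph as graphN.gxl plus a grouping .xml file
    # (targets must be provided when group='xml'):
    #
    #   >>> DataSaver(graphs, targets=targets, filename='outdir/ds',
    #   ...           gformat='gxl', group='xml')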
    def save_gxl(self, graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
        if method == 'default':
            gxl_file = open(filename, 'w')
            gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
            gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
            gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
            if 'name' in graph.graph:
                name = str(graph.graph['name'])
            else:
                name = 'dummy'
            gxl_file.write("<graph id=\"" + name + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
            for v, attrs in graph.nodes(data=True):
                gxl_file.write("<node id=\"_" + str(v) + "\">")
                for l_name in node_labels:
                    gxl_file.write("<attr name=\"" + l_name + "\"><int>" +
                                   str(attrs[l_name]) + "</int></attr>")
                for a_name in node_attrs:
                    gxl_file.write("<attr name=\"" + a_name + "\"><float>" +
                                   str(attrs[a_name]) + "</float></attr>")
                gxl_file.write("</node>\n")
            for v1, v2, attrs in graph.edges(data=True):
                gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
                for l_name in edge_labels:
                    gxl_file.write("<attr name=\"" + l_name + "\"><int>" +
                                   str(attrs[l_name]) + "</int></attr>")
                for a_name in edge_attrs:
                    gxl_file.write("<attr name=\"" + a_name + "\"><float>" +
                                   str(attrs[a_name]) + "</float></attr>")
                gxl_file.write("</edge>\n")
            gxl_file.write("</graph>\n")
            gxl_file.write("</gxl>")
            gxl_file.close()

        elif method == 'benoit':
            import xml.etree.ElementTree as ET
            root_node = ET.Element('gxl')
            attr = dict()
            attr['id'] = str(graph.graph['name'])
            attr['edgeids'] = 'true'
            attr['edgemode'] = 'undirected'
            graph_node = ET.SubElement(root_node, 'graph', attrib=attr)

            for v in graph:
                current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
                for attr in graph.nodes[v].keys():
                    cur_attr = ET.SubElement(
                        current_node, 'attr', attrib={'name': attr})
                    cur_value = ET.SubElement(cur_attr,
                                              graph.nodes[v][attr].__class__.__name__)
                    cur_value.text = str(graph.nodes[v][attr])

            for v1 in graph:
                for v2 in graph[v1]:
                    if (v1 < v2):  # non-oriented graphs
                        cur_edge = ET.SubElement(
                            graph_node,
                            'edge',
                            attrib={
                                'from': str(v1),
                                'to': str(v2)
                            })
                        for attr in graph[v1][v2].keys():
                            cur_attr = ET.SubElement(
                                cur_edge, 'attr', attrib={'name': attr})
                            cur_value = ET.SubElement(
                                cur_attr, graph[v1][v2][attr].__class__.__name__)
                            cur_value.text = str(graph[v1][v2][attr])

            tree = ET.ElementTree(root_node)
            tree.write(filename)

        elif method == 'gedlib':
            # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
            # pass
            gxl_file = open(filename, 'w')
            gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
            gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
            gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
            gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n")
            for v, attrs in graph.nodes(data=True):
                gxl_file.write("<node id=\"_" + str(v) + "\">")
                gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['chem']) + "</int></attr>")
                gxl_file.write("</node>\n")
            for v1, v2, attrs in graph.edges(data=True):
                gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
                gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>")
                # gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
                gxl_file.write("</edge>\n")
            gxl_file.write("</graph>\n")
            gxl_file.write("</gxl>")
            gxl_file.close()

        elif method == 'gedlib-letter':
            # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
            # and https://github.com/dbblumenthal/gedlib/blob/master/data/datasets/Letter/HIGH/AP1_0000.gxl
            gxl_file = open(filename, 'w')
            gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
            gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
            gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
            gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
            for v, attrs in graph.nodes(data=True):
                gxl_file.write("<node id=\"_" + str(v) + "\">")
                gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>")
                gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>")
                gxl_file.write("</node>\n")
            for v1, v2, attrs in graph.edges(data=True):
                gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>\n")
            gxl_file.write("</graph>\n")
            gxl_file.write("</gxl>")
            gxl_file.close()
# def loadSDF(filename):
#     """load data from a structured data file (.sdf file).
#
#     Notes
#     -----
#     An SDF file contains a group of molecules, represented in a similar way as in the MOL format.
#     Check `here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__ for the detailed structure.
#     """
#     import networkx as nx
#     from os.path import basename
#     from tqdm import tqdm
#     import sys
#     data = []
#     with open(filename) as f:
#         content = f.read().splitlines()
#         index = 0
#         pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout)
#         while index < len(content):
#             index_old = index
#             g = nx.Graph(name=content[index].strip())  # set name of the graph
#             tmp = content[index + 3]
#             nb_nodes = int(tmp[:3])  # number of the nodes
#             nb_edges = int(tmp[3:6])  # number of the edges
#             for i in range(0, nb_nodes):
#                 tmp = content[i + index + 4]
#                 g.add_node(i, atom=tmp[31:34].strip())
#             for i in range(0, nb_edges):
#                 tmp = content[i + index + g.number_of_nodes() + 4]
#                 tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)]
#                 g.add_edge(
#                     int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip())
#             data.append(g)
#             index += 4 + g.number_of_nodes() + g.number_of_edges()
#             while content[index].strip() != '$$$$':  # separator
#                 index += 1
#             index += 1
#             pbar.update(index - index_old)
#         pbar.update(1)
#         pbar.close()
#     return data

# def load_from_cxl(filename):
#     import xml.etree.ElementTree as ET
#
#     dirname_dataset = dirname(filename)
#     tree = ET.parse(filename)
#     root = tree.getroot()
#     data = []
#     y = []
#     for graph in root.iter('graph'):
#         mol_filename = graph.attrib['file']
#         mol_class = graph.attrib['class']
#         data.append(load_gxl(dirname_dataset + '/' + mol_filename))
#         y.append(mol_class)
if __name__ == '__main__': | |||
# ### Load dataset from .ds file. | |||
# # .ct files. | |||
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||
# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} | |||
# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) | |||
# ds_file = '../../datasets/Acyclic/dataset_bps.ds' # node symb | |||
# Gn, targets, label_names = load_dataset(ds_file) | |||
# ds_file = '../../datasets/MAO/dataset.ds' # node/edge symb | |||
# Gn, targets, label_names = load_dataset(ds_file) | |||
## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled | |||
## Gn, y = loadDataset(ds['dataset']) | |||
# print(Gn[1].graph) | |||
# print(Gn[1].nodes(data=True)) | |||
# print(Gn[1].edges(data=True)) | |||
# print(targets[1]) | |||
# # .gxl file. | |||
# ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb | |||
# Gn, y, label_names = load_dataset(ds_file) | |||
# print(Gn[1].graph) | |||
# print(Gn[1].nodes(data=True)) | |||
# print(Gn[1].edges(data=True)) | |||
# print(y[1]) | |||
# .mat file. | |||
from gklearn.dataset.file_managers import DataLoader # import needed by the demo below.
ds_file = '../../datasets/MUTAG_mat/MUTAG.mat'
order = [0, 0, 3, 1, 2] | |||
gloader = DataLoader(ds_file, order=order) | |||
Gn, targets, label_names = gloader.data | |||
print(Gn[1].graph) | |||
print(Gn[1].nodes(data=True)) | |||
print(Gn[1].edges(data=True)) | |||
print(targets[1]) | |||
# ### Convert graph from one format to another. | |||
# # .gxl file. | |||
# import networkx as nx | |||
# ds = {'name': 'monoterpenoides', | |||
# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
# Gn, y = loadDataset(ds['dataset']) | |||
# y = [int(i) for i in y] | |||
# print(Gn[1].nodes(data=True)) | |||
# print(Gn[1].edges(data=True)) | |||
# print(y[1]) | |||
# # Convert a graph to the proper NetworkX format that can be recognized by library gedlib. | |||
# Gn_new = [] | |||
# for G in Gn: | |||
# G_new = nx.Graph() | |||
# for nd, attrs in G.nodes(data=True): | |||
# G_new.add_node(str(nd), chem=attrs['atom']) | |||
# for nd1, nd2, attrs in G.edges(data=True): | |||
# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) | |||
## G_new.add_edge(str(nd1), str(nd2)) | |||
# Gn_new.append(G_new) | |||
# print(Gn_new[1].nodes(data=True)) | |||
# print(Gn_new[1].edges(data=True)) | |||
# print(Gn_new[1]) | |||
# filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' | |||
# xparams = {'method': 'gedlib'} | |||
# saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) | |||
# save dataset. | |||
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
# saveDataset(Gn, y, group='xml', filename='temp/temp') | |||
# test - new way to add labels and attributes. | |||
# dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' | |||
# filename = '../../datasets/Fingerprint/Fingerprint_A.txt' | |||
# dataset = '../../datasets/Letter-med/Letter-med_A.txt' | |||
# dataset = '../../datasets/AIDS/AIDS_A.txt' | |||
# dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' | |||
# Gn, targets, label_names = load_dataset(filename) | |||
pass |
@@ -0,0 +1,61 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Fri Sep 11 18:10:06 2020 | |||
@author: ljia | |||
""" | |||
import numpy as np | |||
import networkx as nx | |||
import random | |||
class GraphSynthesizer(object): | |||
def __init__(self, g_type=None, *args, **kwargs): | |||
if g_type == 'unified': | |||
self._graphs = self.unified_graphs(*args, **kwargs)
else: | |||
self._graphs = None | |||
def random_graph(self, num_nodes, num_edges, num_node_labels=0, num_edge_labels=0, seed=None, directed=False, max_num_edges=None, all_edges=None): | |||
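# Note: the "seed" and "directed" arguments are accepted but not used in the current implementation.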
g = nx.Graph() | |||
if num_node_labels > 0: | |||
node_labels = np.random.randint(0, high=num_node_labels, size=num_nodes) | |||
for i in range(0, num_nodes): | |||
g.add_node(str(i), atom=node_labels[i]) # @todo: update "atom". | |||
else: | |||
for i in range(0, num_nodes): | |||
g.add_node(str(i)) | |||
if num_edge_labels > 0: | |||
edge_labels = np.random.randint(0, high=num_edge_labels, size=num_edges) | |||
for idx, i in enumerate(random.sample(range(0, max_num_edges), num_edges)): | |||
node1, node2 = all_edges[i] | |||
g.add_edge(str(node1), str(node2), bond_type=edge_labels[idx]) # @todo: update "bond_type". | |||
else: | |||
for i in random.sample(range(0, max_num_edges), num_edges): | |||
node1, node2 = all_edges[i] | |||
g.add_edge(str(node1), str(node2)) | |||
return g | |||
def unified_graphs(self, num_graphs=1000, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=0, seed=None, directed=False): | |||
max_num_edges = int((num_nodes - 1) * num_nodes / 2) | |||
if num_edges > max_num_edges: | |||
raise ValueError('Too many edges: num_edges cannot exceed num_nodes * (num_nodes - 1) / 2.')
all_edges = [(i, j) for i in range(0, num_nodes) for j in range(i + 1, num_nodes)] # @todo: optimize. Directed graphs are not supported; this enumerates all unordered node pairs.
graphs = [] | |||
for idx in range(0, num_graphs): | |||
graphs.append(self.random_graph(num_nodes, num_edges, num_node_labels=num_node_labels, num_edge_labels=num_edge_labels, seed=seed, directed=directed, max_num_edges=max_num_edges, all_edges=all_edges)) | |||
return graphs | |||
@property | |||
def graphs(self): | |||
return self._graphs |
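# A minimal usage sketch (added for illustration; the parameter values are
# arbitrary and only meant to show the API of this class):
if __name__ == '__main__':
	gsyzer = GraphSynthesizer()
	# Synthesize 10 graphs of identical size: 20 nodes and 40 edges each,
	# with 3 node labels and 2 edge labels drawn uniformly at random.
	graphs = gsyzer.unified_graphs(num_graphs=10, num_nodes=20, num_edges=40, num_node_labels=3, num_edge_labels=2, seed=None, directed=False)
	print(len(graphs)) # 10
	print(graphs[0].number_of_nodes(), graphs[0].number_of_edges()) # 20 40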
@@ -0,0 +1,142 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Wed Oct 20 11:48:02 2020 | |||
@author: ljia | |||
""" | |||
# This script tests the influence of the maximum number of solutions and the dataset size N, across several node/edge edit cost ratios, on the stability of the GED computation with the BIPARTITE heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1].
import os | |||
import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import time | |||
import sys | |||
from group_results import group_trials | |||
def generate_graphs(): | |||
from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
gsyzer = GraphSynthesizer() | |||
graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||
return graphs | |||
def xp_compute_ged_matrix(graphs, N, max_num_solutions, ratio, trial): | |||
save_file_suffix = '.' + str(N) + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
# Return if the file exists. | |||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
return None, None | |||
"""**2. Set parameters.**""" | |||
# Parameters for GED computation. | |||
ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic.
# 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
'lsape_model': 'ECBP',
# When bigger than 1, the method is considered mIPFP.
# The actual number of computed solutions might be smaller than the specified value.
'max_num_solutions': max_num_solutions,
'edit_cost': 'CONSTANT', # use CONSTANT cost.
'greedy_method': 'BASIC',
# The distance between non-symbolic node/edge labels is computed by Euclidean distance.
'attr_distance': 'euclidean',
'optimal': True, # if True, the option --greedy-method has no effect.
# Parallel threads. Does not work if mpg_options['parallel'] = False.
'threads': multiprocessing.cpu_count(),
'centrality_method': 'NONE',
'centrality_weight': 0.7,
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
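# The first three edit cost constants (taken here as the node costs) are scaled by ratio; the remaining three (edge costs) stay at 1.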
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
options = ged_options.copy() | |||
options['edit_cost_constants'] = edit_cost_constants | |||
options['node_labels'] = [] | |||
options['edge_labels'] = [] | |||
options['node_attrs'] = [] | |||
options['edge_attrs'] = [] | |||
parallel = True # if num_solutions == 1 else False | |||
"""**5. Compute GED matrix.**""" | |||
ged_mat = 'error' # sentinel value saved if the computation below fails.
runtime = 0
try: | |||
time0 = time.time() | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||
runtime = time.time() - time0 | |||
except Exception as exp: | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
"""**6. Get results.**""" | |||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
return ged_mat, runtime | |||
def save_trials_as_group(graphs, N, max_num_solutions, ratio): | |||
# Return if the group file exists. | |||
name_middle = '.' + str(N) + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
if os.path.isfile(name_group): | |||
return | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
print() | |||
print('Trial:', trial) | |||
ged_mat, runtime = xp_compute_ged_matrix(graphs, N, max_num_solutions, ratio, trial) | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
# Group trials and remove single files.
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
name_prefix = 'runtime' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
def results_for_a_ratio(ratio): | |||
for N in N_list: | |||
print() | |||
print('# of graphs:', N) | |||
for max_num_solutions in [1, 20, 40, 60, 80, 100]: | |||
print() | |||
print('Max # of solutions:', max_num_solutions) | |||
save_trials_as_group(graphs[:N], N, max_num_solutions, ratio) | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
N_list = [int(i) for i in sys.argv[1:]] | |||
else: | |||
N_list = [10, 50, 100] | |||
# Generate graphs. | |||
graphs = generate_graphs() | |||
save_dir = 'outputs/edit_costs.max_num_sols.N.bipartite/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
for ratio in [10, 1, 0.1]: | |||
print() | |||
print('Ratio:', ratio) | |||
results_for_a_ratio(ratio) |
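# Usage sketch: the dataset sizes N can be passed on the command line; each
# positional argument is one N, defaulting to 10 50 100. The script file name
# below is inferred from save_dir and the job generator's naming scheme, so
# treat it as an assumption:
# python3 edit_costs.max_num_sols.N.bipartite.py 10 50 100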
@@ -12,18 +12,19 @@ import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import numpy as np | |||
import time | |||
from utils import get_dataset | |||
import sys | |||
from group_results import group_trials | |||
def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): | |||
save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
# Return if the file exists. | |||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
return None, None | |||
"""**2. Set parameters.**""" | |||
@@ -83,6 +84,12 @@ def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): | |||
def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): | |||
# Return if the group file exists. | |||
name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
if os.path.isfile(name_group): | |||
return | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
@@ -92,25 +99,36 @@ def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) | |||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
np.save(f, np.array(ged_mats)) | |||
with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
# Group trials and remove single files.
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
name_prefix = 'runtime' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for max_num_solutions in [1, 20, 40, 60, 80, 100]: | |||
for max_num_solutions in mnum_solutions_list: | |||
print() | |||
print('Max # of solutions:', max_num_solutions) | |||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
for ratio in ratio_list: | |||
print() | |||
print('Ratio:', ratio) | |||
save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) | |||
def get_param_lists(ds_name): | |||
if ds_name == 'AIDS_symb': | |||
mnum_solutions_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
else: | |||
mnum_solutions_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
return mnum_solutions_list, ratio_list | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
@@ -119,12 +137,11 @@ if __name__ == '__main__': | |||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
if not os.path.exists(save_dir + 'groups/'): | |||
os.makedirs(save_dir + 'groups/') | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
mnum_solutions_list, ratio_list = get_param_lists(ds_name) | |||
results_for_a_dataset(ds_name) |
@@ -0,0 +1,137 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Wed Oct 20 11:48:02 2020 | |||
@author: ljia | |||
""" | |||
# This script tests the influence of the number of solutions and the dataset size N, across several node/edge edit cost ratios, on the stability of the GED computation with the IPFP heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1].
import os | |||
import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import time | |||
import sys | |||
from group_results import group_trials | |||
def generate_graphs(): | |||
from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
gsyzer = GraphSynthesizer() | |||
graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||
return graphs | |||
def xp_compute_ged_matrix(graphs, N, num_solutions, ratio, trial): | |||
save_file_suffix = '.' + str(N) + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
# Return if the file exists. | |||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
return None, None | |||
"""**2. Set parameters.**""" | |||
# Parameters for GED computation. | |||
ged_options = {'method': 'IPFP', # use the IPFP heuristic.
'initialization_method': 'RANDOM', # or 'NODE', etc.
# When bigger than 1, the method is considered mIPFP.
'initial_solutions': int(num_solutions * 4),
'edit_cost': 'CONSTANT', # use CONSTANT cost.
# The distance between non-symbolic node/edge labels is computed by Euclidean distance.
'attr_distance': 'euclidean',
'ratio_runs_from_initial_solutions': 0.25,
# Parallel threads. Does not work if mpg_options['parallel'] = False.
'threads': multiprocessing.cpu_count(),
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
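# Note on the options above: with initial_solutions = 4 * num_solutions and
# ratio_runs_from_initial_solutions = 0.25, presumably num_solutions runs are
# kept per graph pair (an assumption about the gedlib IPFP options, not verified here).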
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
options = ged_options.copy() | |||
options['edit_cost_constants'] = edit_cost_constants | |||
options['node_labels'] = [] | |||
options['edge_labels'] = [] | |||
options['node_attrs'] = [] | |||
options['edge_attrs'] = [] | |||
parallel = True # if num_solutions == 1 else False | |||
"""**5. Compute GED matrix.**""" | |||
ged_mat = 'error' | |||
runtime = 0 | |||
try: | |||
time0 = time.time() | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||
runtime = time.time() - time0 | |||
except Exception as exp: | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
"""**6. Get results.**""" | |||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
return ged_mat, runtime | |||
def save_trials_as_group(graphs, N, num_solutions, ratio): | |||
# Return if the group file exists. | |||
name_middle = '.' + str(N) + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
if os.path.isfile(name_group): | |||
return | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
print() | |||
print('Trial:', trial) | |||
ged_mat, runtime = xp_compute_ged_matrix(graphs, N, num_solutions, ratio, trial) | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
# Group trials and remove single files.
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
name_prefix = 'runtime' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
def results_for_a_ratio(ratio): | |||
for N in N_list: | |||
print() | |||
print('# of graphs:', N) | |||
for num_solutions in [1, 20, 40, 60, 80, 100]: | |||
print() | |||
print('# of solutions:', num_solutions) | |||
save_trials_as_group(graphs[:N], N, num_solutions, ratio) | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
N_list = [int(i) for i in sys.argv[1:]] | |||
else: | |||
N_list = [10, 50, 100] | |||
# Generate graphs. | |||
graphs = generate_graphs() | |||
save_dir = 'outputs/edit_costs.num_sols.N.IPFP/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
for ratio in [10, 1, 0.1]: | |||
print() | |||
print('Ratio:', ratio) | |||
results_for_a_ratio(ratio) |
@@ -12,15 +12,19 @@ import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import numpy as np | |||
import time | |||
from utils import get_dataset | |||
import sys | |||
from group_results import group_trials | |||
def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
# Return if the file exists. | |||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
return None, None | |||
"""**2. Set parameters.**""" | |||
@@ -39,8 +43,8 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
} | |||
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
options = ged_options.copy() | |||
options['edit_cost_constants'] = edit_cost_constants | |||
@@ -55,7 +59,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
runtime = 0 | |||
try: | |||
time0 = time.time() | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True) | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) | |||
runtime = time.time() - time0 | |||
except Exception as exp: | |||
print('An exception occurred when running this experiment:')
@@ -70,11 +74,17 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
return ged_mat, runtime | |||
def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||
# Return if the group file exists. | |||
name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
if os.path.isfile(name_group): | |||
return | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
@@ -84,24 +94,35 @@ def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) | |||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
np.save(f, np.array(ged_mats)) | |||
with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
# Group trials and remove single files.
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
name_prefix = 'runtime' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for num_solutions in [1, 20, 40, 60, 80, 100]: | |||
for num_solutions in num_solutions_list: | |||
print() | |||
print('# of solutions:', num_solutions) | |||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
for ratio in ratio_list: | |||
print() | |||
print('Ratio:', ratio) | |||
save_trials_as_group(dataset, ds_name, num_solutions, ratio) | |||
def get_param_lists(ds_name): | |||
if ds_name == 'AIDS_symb': | |||
num_solutions_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
else: | |||
num_solutions_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
return num_solutions_list, ratio_list | |||
if __name__ == '__main__': | |||
@@ -111,12 +132,11 @@ if __name__ == '__main__': | |||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
if not os.path.exists(save_dir + 'groups/'): | |||
os.makedirs(save_dir + 'groups/') | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
num_solutions_list, ratio_list = get_param_lists(ds_name) | |||
results_for_a_dataset(ds_name) |
@@ -0,0 +1,137 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Wed Oct 20 11:48:02 2020 | |||
@author: ljia | |||
""" | |||
# This script tests the influence of the number of repeats and the dataset size N, across several node/edge edit cost ratios, on the stability of the GED computation with the IPFP heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1].
import os | |||
import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import time | |||
import sys | |||
from group_results import group_trials | |||
def generate_graphs(): | |||
from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
gsyzer = GraphSynthesizer() | |||
graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||
return graphs | |||
def xp_compute_ged_matrix(graphs, N, repeats, ratio, trial): | |||
save_file_suffix = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
# Return if the file exists. | |||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
return None, None | |||
"""**2. Set parameters.**""" | |||
# Parameters for GED computation. | |||
ged_options = {'method': 'IPFP', # use the IPFP heuristic.
'initialization_method': 'RANDOM', # or 'NODE', etc.
# When bigger than 1, the method is considered mIPFP.
'initial_solutions': 1,
'edit_cost': 'CONSTANT', # use CONSTANT cost.
# The distance between non-symbolic node/edge labels is computed by Euclidean distance.
'attr_distance': 'euclidean',
'ratio_runs_from_initial_solutions': 1,
# Parallel threads. Does not work if mpg_options['parallel'] = False.
'threads': multiprocessing.cpu_count(),
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
options = ged_options.copy() | |||
options['edit_cost_constants'] = edit_cost_constants | |||
options['node_labels'] = [] | |||
options['edge_labels'] = [] | |||
options['node_attrs'] = [] | |||
options['edge_attrs'] = [] | |||
parallel = True # if num_solutions == 1 else False | |||
"""**5. Compute GED matrix.**""" | |||
ged_mat = 'error' | |||
runtime = 0 | |||
try: | |||
time0 = time.time() | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||
runtime = time.time() - time0 | |||
except Exception as exp: | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
"""**6. Get results.**""" | |||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
return ged_mat, runtime | |||
def save_trials_as_group(graphs, N, repeats, ratio): | |||
# Return if the group file exists. | |||
name_middle = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
if os.path.isfile(name_group): | |||
return | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
print() | |||
print('Trial:', trial) | |||
ged_mat, runtime = xp_compute_ged_matrix(graphs, N, repeats, ratio, trial) | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
# Group trials and remove single files.
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
name_prefix = 'runtime' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
def results_for_a_ratio(ratio): | |||
for N in N_list: | |||
print() | |||
print('# of graphs:', N) | |||
for repeats in [1, 20, 40, 60, 80, 100]: | |||
print() | |||
print('Repeats:', repeats) | |||
save_trials_as_group(graphs[:N], N, repeats, ratio) | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
N_list = [int(i) for i in sys.argv[1:]] | |||
else: | |||
N_list = [10, 50, 100] | |||
# Generate graphs. | |||
graphs = generate_graphs() | |||
save_dir = 'outputs/edit_costs.repeats.N.IPFP/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
for ratio in [10, 1, 0.1]: | |||
print() | |||
print('Ratio:', ratio) | |||
results_for_a_ratio(ratio) |
@@ -0,0 +1,142 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Wed Oct 20 11:48:02 2020 | |||
@author: ljia | |||
""" | |||
# This script tests the influence of the number of repeats and the dataset size N, across several node/edge edit cost ratios, on the stability of the GED computation with the BIPARTITE heuristic, where the base edit costs are [1, 1, 1, 1, 1, 1].
import os | |||
import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import time | |||
import sys | |||
from group_results import group_trials | |||
def generate_graphs(): | |||
from gklearn.utils.graph_synthesizer import GraphSynthesizer | |||
gsyzer = GraphSynthesizer() | |||
graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) | |||
return graphs | |||
def xp_compute_ged_matrix(graphs, N, repeats, ratio, trial): | |||
save_file_suffix = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
# Return if the file exists. | |||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
return None, None | |||
"""**2. Set parameters.**""" | |||
# Parameters for GED computation. | |||
ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic.
# 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
'lsape_model': 'ECBP',
# When bigger than 1, the method is considered mIPFP.
# The actual number of computed solutions might be smaller than the specified value.
'max_num_solutions': 1,
'edit_cost': 'CONSTANT', # use CONSTANT cost.
'greedy_method': 'BASIC',
# The distance between non-symbolic node/edge labels is computed by Euclidean distance.
'attr_distance': 'euclidean',
'optimal': True, # if True, the option --greedy-method has no effect.
# Parallel threads. Does not work if mpg_options['parallel'] = False.
'threads': multiprocessing.cpu_count(),
'centrality_method': 'NONE',
'centrality_weight': 0.7,
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
options = ged_options.copy() | |||
options['edit_cost_constants'] = edit_cost_constants | |||
options['node_labels'] = [] | |||
options['edge_labels'] = [] | |||
options['node_attrs'] = [] | |||
options['edge_attrs'] = [] | |||
parallel = True # if num_solutions == 1 else False | |||
"""**5. Compute GED matrix.**""" | |||
ged_mat = 'error' | |||
runtime = 0 | |||
try: | |||
time0 = time.time() | |||
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) | |||
runtime = time.time() - time0 | |||
except Exception as exp: | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
"""**6. Get results.**""" | |||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
return ged_mat, runtime | |||
def save_trials_as_group(graphs, N, repeats, ratio): | |||
# Return if the group file exists. | |||
name_middle = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
if os.path.isfile(name_group): | |||
return | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
print() | |||
print('Trial:', trial) | |||
ged_mat, runtime = xp_compute_ged_matrix(graphs, N, repeats, ratio, trial) | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
# Group trials and remove single files.
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
name_prefix = 'runtime' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
def results_for_a_ratio(ratio): | |||
for N in N_list: | |||
print() | |||
print('# of graphs:', N) | |||
for repeats in [1, 20, 40, 60, 80, 100]: | |||
print() | |||
print('Repeats:', repeats) | |||
save_trials_as_group(graphs[:N], N, repeats, ratio) | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
N_list = [int(i) for i in sys.argv[1:]] | |||
else: | |||
N_list = [10, 50, 100] | |||
# Generate graphs. | |||
graphs = generate_graphs() | |||
save_dir = 'outputs/edit_costs.repeats.N.bipartite/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
for ratio in [10, 1, 0.1]: | |||
print() | |||
print('Ratio:', ratio) | |||
results_for_a_ratio(ratio) |
@@ -12,18 +12,19 @@ import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import numpy as np | |||
import time | |||
from utils import get_dataset | |||
import sys | |||
from group_results import group_trials | |||
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
# Return if the file exists. | |||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
return None, None | |||
"""**2. Set parameters.**""" | |||
@@ -78,6 +79,12 @@ def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
# Return if the group file exists. | |||
name_middle = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
if os.path.isfile(name_group): | |||
return | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
@@ -87,25 +94,36 @@ def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) | |||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
np.save(f, np.array(ged_mats)) | |||
with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
# Group trials and remove single files.
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
name_prefix = 'runtime' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for repeats in [1, 20, 40, 60, 80, 100]: | |||
for repeats in repeats_list: | |||
print() | |||
print('Repeats:', repeats) | |||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
for ratio in ratio_list: | |||
print() | |||
print('Ratio:', ratio) | |||
save_trials_as_group(dataset, ds_name, repeats, ratio) | |||
def get_param_lists(ds_name): | |||
if ds_name == 'AIDS_symb': | |||
repeats_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
else: | |||
repeats_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
return repeats_list, ratio_list | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
@@ -114,12 +132,11 @@ if __name__ == '__main__': | |||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
if not os.path.exists(save_dir + 'groups/'): | |||
os.makedirs(save_dir + 'groups/') | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
repeats_list, ratio_list = get_param_lists(ds_name) | |||
results_for_a_dataset(ds_name) |
@@ -12,18 +12,19 @@ import multiprocessing | |||
import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import numpy as np | |||
import time | |||
from utils import get_dataset | |||
import sys | |||
from group_results import group_trials | |||
def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
# Return if the file exists. | |||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
return None, None | |||
"""**2. Set parameters.**""" | |||
@@ -83,6 +84,12 @@ def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): | |||
def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
# Return if the group file exists. | |||
name_middle = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
if os.path.isfile(name_group): | |||
return | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
@@ -92,25 +99,36 @@ def save_trials_as_group(dataset, ds_name, repeats, ratio): | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) | |||
with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: | |||
np.save(f, np.array(ged_mats)) | |||
with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(runtime, f) | |||
# Group trials and remove single files.
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
name_prefix = 'runtime' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for repeats in [1, 20, 40, 60, 80, 100]: | |||
for repeats in repeats_list: | |||
print() | |||
print('Repeats:', repeats) | |||
for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: | |||
for ratio in ratio_list: | |||
print() | |||
print('Ratio:', ratio) | |||
save_trials_as_group(dataset, ds_name, repeats, ratio) | |||
def get_param_lists(ds_name): | |||
if ds_name == 'AIDS_symb': | |||
repeats_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
else: | |||
repeats_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
return repeats_list, ratio_list | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
@@ -119,12 +137,11 @@ if __name__ == '__main__': | |||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
if not os.path.exists(save_dir + 'groups/'): | |||
os.makedirs(save_dir + 'groups/') | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
repeats_list, ratio_list = get_param_lists(ds_name) | |||
results_for_a_dataset(ds_name) |
@@ -16,6 +16,7 @@ from tqdm import tqdm | |||
import sys | |||
# This function is used by other scripts. Modify it carefully. | |||
def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
# Get group name. | |||
@@ -47,8 +48,20 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
data = pickle.load(f) | |||
try:
data = pickle.load(f)
except EOFError: # e.g., a truncated pickle from an interrupted trial.
print('EOF Error occurred when loading', file_name)
return
data_group.append(data) | |||
# unpickler = pickle.Unpickler(f) | |||
# data = unpickler.load() | |||
# if not isinstance(data, np.array): | |||
# return | |||
# else: | |||
# data_group.append(data) | |||
else: # Not all trials are completed. | |||
return | |||
@@ -81,11 +94,9 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): | |||
# Create folders. | |||
if not os.path.exists(dir_folder + 'groups/'): | |||
os.makedirs(dir_folder + 'groups/') | |||
os.makedirs(dir_folder + 'groups/', exist_ok=True) | |||
if backup: | |||
if not os.path.exists(dir_folder + 'backups'): | |||
os.makedirs(dir_folder + 'backups') | |||
os.makedirs(dir_folder + 'backups', exist_ok=True) | |||
# Iterate all files. | |||
cur_file_prefix = '' | |||
@@ -105,4 +116,10 @@ if __name__ == '__main__': | |||
group_all_in_folder(dir_folder) | |||
dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' | |||
group_all_in_folder(dir_folder) | |||
dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' | |||
group_all_in_folder(dir_folder) | |||
dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' | |||
group_all_in_folder(dir_folder) |
@@ -0,0 +1,56 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Nov 3 20:23:25 2020 | |||
@author: ljia | |||
""" | |||
import os | |||
import re | |||
def get_job_script(arg, params): | |||
ged_method = params[0] | |||
multi_method = params[1] | |||
job_name_label = r"rep." if multi_method == 'repeats' else r"" | |||
script = r""" | |||
#!/bin/bash | |||
#SBATCH --exclusive | |||
#SBATCH --job-name="st.""" + job_name_label + r"N" + arg + r"." + ged_method + r"""" | |||
#SBATCH --partition=tlong | |||
#SBATCH --mail-type=ALL | |||
#SBATCH --mail-user=jajupmochi@gmail.com | |||
#SBATCH --output="outputs/output_edit_costs.""" + multi_method + r".N." + ged_method + r"." + arg + r""".txt" | |||
#SBATCH --error="errors/error_edit_costs.""" + multi_method + r".N." + ged_method + r"." + arg + r""".txt" | |||
# | |||
#SBATCH --ntasks=1 | |||
#SBATCH --nodes=1 | |||
#SBATCH --cpus-per-task=1 | |||
#SBATCH --time=300:00:00 | |||
#SBATCH --mem-per-cpu=4000 | |||
srun hostname | |||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
srun python3 edit_costs.""" + multi_method + r".N." + ged_method + r".py " + arg | |||
script = script.strip() | |||
script = re.sub('\n\t+', '\n', script) | |||
script = re.sub('\n +', '\n', script) | |||
return script | |||
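# For example (reconstructed from the template above, not captured output),
# get_job_script('50', ('IPFP', 'repeats')) produces a script containing:
# #SBATCH --job-name="st.rep.N50.IPFP"
# #SBATCH --output="outputs/output_edit_costs.repeats.N.IPFP.50.txt"
# srun python3 edit_costs.repeats.N.IPFP.py 50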
if __name__ == '__main__': | |||
params_list = [('IPFP', 'nums_sols'), | |||
('IPFP', 'repeats'), | |||
('bipartite', 'max_num_sols'), | |||
('bipartite', 'repeats')] | |||
N_list = [10, 50, 100] | |||
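# Note: params_list[1:] submits only the last three configurations; ('IPFP', 'nums_sols') is skipped here.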
for params in params_list[1:]: | |||
for N in [N_list[i] for i in [0, 1, 2]]: | |||
job_script = get_job_script(str(N), params) | |||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
# print(command) | |||
os.system(command) | |||
# os.popen(command) | |||
# output = stream.readlines() |
@@ -0,0 +1,47 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Nov 3 20:23:25 2020 | |||
@author: ljia | |||
""" | |||
import os | |||
import re | |||
def get_job_script(arg): | |||
script = r""" | |||
#!/bin/bash | |||
#SBATCH --exclusive | |||
#SBATCH --job-name="st.""" + arg + r""".bp" | |||
#SBATCH --partition=tlong | |||
#SBATCH --mail-type=ALL | |||
#SBATCH --mail-user=jajupmochi@gmail.com | |||
#SBATCH --output="outputs/output_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" | |||
#SBATCH --error="errors/error_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt" | |||
# | |||
#SBATCH --ntasks=1 | |||
#SBATCH --nodes=1 | |||
#SBATCH --cpus-per-task=1 | |||
#SBATCH --time=300:00:00 | |||
#SBATCH --mem-per-cpu=4000 | |||
srun hostname | |||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
srun python3 edit_costs.max_nums_sols.ratios.bipartite.py """ + arg | |||
script = script.strip() | |||
script = re.sub('\n\t+', '\n', script) | |||
script = re.sub('\n +', '\n', script) | |||
return script | |||
if __name__ == '__main__': | |||
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]: | |||
job_script = get_job_script(ds_name) | |||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
# print(command) | |||
os.system(command) | |||
# os.popen(command) | |||
# output = stream.readlines() |
@@ -0,0 +1,47 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Nov 3 20:23:25 2020 | |||
@author: ljia | |||
""" | |||
import os | |||
import re | |||
def get_job_script(arg): | |||
script = r""" | |||
#!/bin/bash | |||
#SBATCH --exclusive | |||
#SBATCH --job-name="st.""" + arg + r""".IPFP" | |||
#SBATCH --partition=tlong | |||
#SBATCH --mail-type=ALL | |||
#SBATCH --mail-user=jajupmochi@gmail.com | |||
#SBATCH --output="outputs/output_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||
#SBATCH --error="errors/error_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||
# | |||
#SBATCH --ntasks=1 | |||
#SBATCH --nodes=1 | |||
#SBATCH --cpus-per-task=1 | |||
#SBATCH --time=300:00:00 | |||
#SBATCH --mem-per-cpu=4000 | |||
srun hostname | |||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
srun python3 edit_costs.nums_sols.ratios.IPFP.py """ + arg | |||
script = script.strip() | |||
script = re.sub('\n\t+', '\n', script) | |||
script = re.sub('\n +', '\n', script) | |||
return script | |||
if __name__ == '__main__': | |||
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
for ds_name in [ds_list[i] for i in [0, 3]]: | |||
job_script = get_job_script(ds_name) | |||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
# print(command) | |||
os.system(command) | |||
# os.popen(command) | |||
# output = stream.readlines() |
@@ -0,0 +1,47 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Nov 3 20:23:25 2020 | |||
@author: ljia | |||
""" | |||
import os | |||
import re | |||
def get_job_script(arg): | |||
script = r""" | |||
#!/bin/bash | |||
#SBATCH --exclusive | |||
#SBATCH --job-name="st.rep.""" + arg + r""".IPFP" | |||
#SBATCH --partition=tlong | |||
#SBATCH --mail-type=ALL | |||
#SBATCH --mail-user=jajupmochi@gmail.com | |||
#SBATCH --output="outputs/output_edit_costs.repeats.ratios.IPFP.""" + arg + """.txt" | |||
#SBATCH --error="errors/error_edit_costs.repeats.ratios.IPFP.""" + arg + """.txt" | |||
# | |||
#SBATCH --ntasks=1 | |||
#SBATCH --nodes=1 | |||
#SBATCH --cpus-per-task=1 | |||
#SBATCH --time=300:00:00 | |||
#SBATCH --mem-per-cpu=4000 | |||
srun hostname | |||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
srun python3 edit_costs.repeats.ratios.IPFP.py """ + arg | |||
script = script.strip() | |||
script = re.sub('\n\t+', '\n', script) | |||
script = re.sub('\n +', '\n', script) | |||
return script | |||
if __name__ == '__main__': | |||
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
for ds_name in [ds_list[i] for i in [0, 3]]: | |||
job_script = get_job_script(ds_name) | |||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
# print(command) | |||
os.system(command) | |||
# os.popen(command) | |||
# output = stream.readlines() |
@@ -0,0 +1,47 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Nov 3 20:23:25 2020 | |||
@author: ljia | |||
""" | |||
import os | |||
import re | |||
def get_job_script(arg): | |||
script = r""" | |||
#!/bin/bash | |||
#SBATCH --exclusive | |||
#SBATCH --job-name="st.rep.""" + arg + r""".bp" | |||
#SBATCH --partition=tlong | |||
#SBATCH --mail-type=ALL | |||
#SBATCH --mail-user=jajupmochi@gmail.com | |||
#SBATCH --output="outputs/output_edit_costs.repeats.ratios.bipartite.""" + arg + """.txt" | |||
#SBATCH --error="errors/error_edit_costs.repeats.ratios.bipartite.""" + arg + """.txt" | |||
# | |||
#SBATCH --ntasks=1 | |||
#SBATCH --nodes=1 | |||
#SBATCH --cpus-per-task=1 | |||
#SBATCH --time=300:00:00 | |||
#SBATCH --mem-per-cpu=4000 | |||
srun hostname | |||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
srun python3 edit_costs.repeats.ratios.bipartite.py """ + arg | |||
script = script.strip() | |||
script = re.sub('\n\t+', '\n', script) | |||
script = re.sub('\n +', '\n', script) | |||
return script | |||
if __name__ == '__main__': | |||
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]: | |||
job_script = get_job_script(ds_name) | |||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
# print(command) | |||
os.system(command) | |||
# os.popen(command) | |||
# output = stream.readlines() |
@@ -150,8 +150,7 @@ def xp_accuracy_diff_entropy(): | |||
import pickle | |||
import os | |||
save_dir = 'outputs/accuracy_diff_entropy/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
os.makedirs(save_dir, exist_ok=True) | |||
accuracies = {} | |||
confidences = {} | |||
@@ -16,8 +16,7 @@ def xp_runtimes_of_all_28cores(): | |||
import pickle | |||
import os | |||
save_dir = 'outputs/runtimes_of_all_28cores/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
os.makedirs(save_dir, exist_ok=True) | |||
run_times = {} | |||
@@ -16,8 +16,7 @@ def xp_runtimes_diff_chunksizes(): | |||
import pickle | |||
import os | |||
save_dir = 'outputs/runtimes_diff_chunksizes/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
os.makedirs(save_dir, exist_ok=True) | |||
run_times = {} | |||
@@ -25,8 +25,7 @@ def xp_synthesized_graphs_dataset_size(): | |||
import pickle | |||
import os | |||
save_dir = 'outputs/synthesized_graphs_N/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
os.makedirs(save_dir, exist_ok=True) | |||
run_times = {} | |||
@@ -22,8 +22,7 @@ def xp_synthesized_graphs_degrees(): | |||
import pickle | |||
import os | |||
save_dir = 'outputs/synthesized_graphs_degrees/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
os.makedirs(save_dir, exist_ok=True) | |||
run_times = {} | |||
@@ -22,8 +22,7 @@ def xp_synthesized_graphs_num_node_label_alphabet(): | |||
import pickle | |||
import os | |||
save_dir = 'outputs/synthesized_graphs_num_node_label_alphabet/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
os.makedirs(save_dir, exist_ok=True) | |||
run_times = {} | |||
@@ -22,8 +22,7 @@ def xp_synthesized_graphs_num_nodes(): | |||
import pickle | |||
import os | |||
save_dir = 'outputs/synthesized_graphs_num_nodes/' | |||
if not os.path.exists(save_dir): | |||
os.makedirs(save_dir) | |||
os.makedirs(save_dir, exist_ok=True) | |||
run_times = {} | |||
@@ -154,6 +154,6 @@ def test_median_graph_estimator_symb(): | |||
return set_median, gen_median | |||
if __name__ == '__main__': | |||
if __name__ == '__main__':
# set_median, gen_median = test_median_graph_estimator() | |||
set_median, gen_median = test_median_graph_estimator_symb() |
@@ -7,6 +7,8 @@ __version__ = "0.1" | |||
__author__ = "Linlin Jia" | |||
__date__ = "November 2018" | |||
from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels | |||
from gklearn.kernels.graph_kernel import GraphKernel | |||
from gklearn.kernels.common_walk import CommonWalk | |||
from gklearn.kernels.marginalized import Marginalized | |||
@@ -0,0 +1,36 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Fri Nov 6 10:11:08 2020 | |||
@author: ljia | |||
""" | |||
# The metadata of all graph kernels. | |||
GRAPH_KERNELS = { | |||
### based on walks. | |||
'common walk': '', | |||
'marginalized': '', | |||
'sylvester equation': '', | |||
'fixed_point': '', | |||
'conjugate gradient': '', | |||
'spectral decomposition': '', | |||
### based on paths. | |||
'shortest path': '', | |||
'structural shortest path': '', | |||
'path up to length h': '', | |||
### based on non-linear patterns. | |||
'weisfeiler-lehman subtree': '', | |||
'treelet': '', | |||
} | |||
def list_of_graph_kernels():
"""List the names of all graph kernels.
Returns
-------
list
The list of names of all graph kernels.
"""
return list(GRAPH_KERNELS)
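# A minimal usage sketch (list_of_graph_kernels is re-exported via
# gklearn.kernels, as in this package's __init__):
# >>> from gklearn.kernels import list_of_graph_kernels
# >>> list_of_graph_kernels()[:3]
# ['common walk', 'marginalized', 'sylvester equation']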
@@ -126,8 +126,7 @@ def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, sav | |||
# save median graphs. | |||
if save_preimages: | |||
if not os.path.exists(dir_save + 'preimages/'): | |||
os.makedirs(dir_save + 'preimages/') | |||
os.makedirs(dir_save + 'preimages/', exist_ok=True) | |||
print('Saving preimages to files...') | |||
fn_best_dataset = dir_save + 'preimages/g_best_dataset.' + 'nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | |||
saveGXL(rpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', | |||
@@ -167,8 +166,7 @@ def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, sav | |||
def _init_output_file_preimage(ds_name, gkernel, dir_output): | |||
if not os.path.exists(dir_output): | |||
os.makedirs(dir_output) | |||
os.makedirs(dir_output, exist_ok=True) | |||
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | |||
f_detail = open(dir_output + fn_output_detail, 'a') | |||
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'num graphs', | |||
@@ -218,8 +218,7 @@ def remove_best_graph(ds_name, mpg_options, kernel_options, ged_options, mge_opt | |||
# save median graphs. | |||
if save_medians: | |||
if not os.path.exists(dir_save + 'medians/'): | |||
os.makedirs(dir_save + 'medians/') | |||
os.makedirs(dir_save + 'medians/', exist_ok=True) | |||
print('Saving median graphs to files...') | |||
fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | |||
saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', | |||
@@ -375,8 +374,7 @@ def _compute_gram_matrix_unnorm(dataset, kernel_options): | |||
def _init_output_file(ds_name, gkernel, fit_method, dir_output): | |||
if not os.path.exists(dir_output): | |||
os.makedirs(dir_output) | |||
os.makedirs(dir_output, exist_ok=True) | |||
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | |||
f_detail = open(dir_output + fn_output_detail, 'a') | |||
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost', | |||
@@ -230,8 +230,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
# save median graphs.
if save_medians:
-if not os.path.exists(dir_save + 'medians/'):
-os.makedirs(dir_save + 'medians/')
+os.makedirs(dir_save + 'medians/', exist_ok=True)
print('Saving median graphs to files...')
fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
@@ -308,8 +307,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
def _init_output_file_preimage(ds_name, gkernel, fit_method, dir_output):
-if not os.path.exists(dir_output):
-os.makedirs(dir_output)
+os.makedirs(dir_output, exist_ok=True)
# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
@@ -52,6 +52,14 @@ def chooseDataset(ds_name):
return dataset
+def test_list_graph_kernels():
+"""Test list_of_graph_kernels."""
+from gklearn.kernels import GRAPH_KERNELS, list_of_graph_kernels
+assert list_of_graph_kernels() == [i for i in GRAPH_KERNELS]
@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
@@ -433,10 +441,11 @@ def test_WLSubtree(ds_name, parallel):
if __name__ == "__main__":
+test_list_graph_kernels()
# test_spkernel('Alkane', 'imap_unordered')
# test_StructuralSP('Fingerprint_edge', 'imap_unordered')
-test_WLSubtree('Acyclic', 'imap_unordered')
+# test_WLSubtree('Acyclic', 'imap_unordered')
# test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered')
# test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered')
# test_RandomWalk('Acyclic', 'fp', None, None)
# test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered')
# test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered')
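# A self-contained sketch of the stacked parametrize pattern used above
# (test_grid is a hypothetical test, not part of the diff): pytest takes the
# Cartesian product of the decorators, so two datasets times two parallel
# modes generate four test cases.
import pytest

@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_grid(ds_name, parallel):
	assert ds_name in ('Alkane', 'AIDS')
	assert parallel in ('imap_unordered', None)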
@@ -13,6 +13,10 @@ import os
class Dataset(object):
+import warnings
+warnings.simplefilter('always', DeprecationWarning)
+warnings.warn('This class has been moved to the "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed in version 0.4.0.', DeprecationWarning)
def __init__(self, filename=None, filename_targets=None, **kwargs):
if filename is None:
@@ -803,6 +807,10 @@ class Dataset(object):
def split_dataset_by_target(dataset):
+import warnings
+warnings.simplefilter('always', DeprecationWarning)
+warnings.warn('This function has been moved to the "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed in version 0.4.0.', DeprecationWarning)
from gklearn.preimage.utils import get_same_item_indices
graphs = dataset.graphs
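# A generic sketch of the split performed by split_dataset_by_target,
# independent of gklearn's get_same_item_indices helper: group parallel
# lists of graphs and targets into one bucket per distinct target value.
from collections import defaultdict

def split_by_target(graphs, targets):
	groups = defaultdict(list)
	for graph, target in zip(graphs, targets):
		groups[target].append(graph)
	return dict(groups)  # {target value: [graphs with that target]}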
@@ -1,5 +1,9 @@
"""Utility functions to manage graph files
"""
+import warnings
+warnings.simplefilter('always', DeprecationWarning)
+warnings.warn('The functions in the module "gklearn.utils.graph_files" are deprecated and will be removed in version 0.4.0. Use the corresponding functions in the module "gklearn.dataset" instead.', DeprecationWarning)
from os.path import dirname, splitext
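# A minimal sketch of the module-level deprecation pattern added above
# (module names here are hypothetical): simplefilter('always', ...) keeps
# the warning from being deduplicated away, and warn(...) fires when the
# module body first executes, i.e. on first import.
import warnings

warnings.simplefilter('always', DeprecationWarning)
warnings.warn('old_module is deprecated; use new_module instead.',
              DeprecationWarning)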
@@ -45,6 +49,10 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
for details. Note that filename here can be the name of any of the .txt files in
the dataset directory.
"""
+import warnings
+warnings.simplefilter('always', DeprecationWarning)
+warnings.warn('The function "gklearn.utils.load_dataset" is deprecated and will be removed in version 0.4.0. Use the class "gklearn.dataset.DataLoader" instead.', DeprecationWarning)
extension = splitext(filename)[1][1:]
if extension == "ds":
data, y, label_names = load_from_ds(filename, filename_targets)
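# A sketch of the extension dispatch above: os.path.splitext keeps the
# leading dot in the extension, hence the [1:] slice.
from os.path import splitext

assert splitext('dataset_bps.ds')[1][1:] == 'ds'
assert splitext('AIDS_A.txt')[1][1:] == 'txt'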
@@ -66,17 +74,19 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', **kwargs):
"""Save a list of graphs.
"""
+import warnings
+warnings.simplefilter('always', DeprecationWarning)
+warnings.warn('The function "gklearn.utils.save_dataset" is deprecated and will be removed in version 0.4.0. Use the class "gklearn.dataset.DataSaver" instead.', DeprecationWarning)
import os
dirname_ds = os.path.dirname(filename)
if dirname_ds != '':
dirname_ds += '/'
-if not os.path.exists(dirname_ds) :
-os.makedirs(dirname_ds)
+os.makedirs(dirname_ds, exist_ok=True)
if 'graph_dir' in kwargs:
graph_dir = kwargs['graph_dir'] + '/'
-if not os.path.exists(graph_dir):
-os.makedirs(graph_dir)
+os.makedirs(graph_dir, exist_ok=True)
del kwargs['graph_dir']
else:
graph_dir = dirname_ds
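# A sketch of the dirname guard above: os.path.dirname returns '' for a bare
# filename, so the trailing '/' is only appended when a directory component
# actually exists.
import os

assert os.path.dirname('gfile') == ''
assert os.path.dirname('out/gfile') == 'out'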
@@ -13,6 +13,11 @@ import random
class GraphSynthesizer(object):
+import warnings
+warnings.simplefilter('always', DeprecationWarning)
+warnings.warn('This class has been moved to the "gklearn.dataset" module. The class "gklearn.utils.graph_synthesizer.GraphSynthesizer" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed in version 0.2.2.', DeprecationWarning)
def __init__(self):
pass
@@ -671,13 +671,11 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
dirname_ds = os.path.dirname(filename)
if dirname_ds != '':
dirname_ds += '/'
-if not os.path.exists(dirname_ds) :
-os.makedirs(dirname_ds)
+os.makedirs(dirname_ds, exist_ok=True)
if xparams is not None and 'graph_dir' in xparams:
graph_dir = xparams['graph_dir'] + '/'
-if not os.path.exists(graph_dir):
-os.makedirs(graph_dir)
+os.makedirs(graph_dir, exist_ok=True)
else:
graph_dir = dirname_ds
@@ -91,8 +91,7 @@ def model_selection_for_precomputed_kernel(datafile,
tqdm.monitor_interval = 0
output_dir += estimator.__name__
-if not os.path.exists(output_dir):
-os.makedirs(output_dir)
+os.makedirs(output_dir, exist_ok=True)
# a string to save all the results.
str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
@@ -604,8 +603,7 @@ def model_selection_for_precomputed_kernel(datafile,
str_fw += 'training time with hyper-param choices which did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)
# open file to save all results for this dataset.
-if not os.path.exists(output_dir):
-os.makedirs(output_dir)
+os.makedirs(output_dir, exist_ok=True)
# print out results as table.
str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
@@ -458,8 +458,7 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d
print()
print('4. saving results...')
if save_results:
-if not os.path.exists(dir_save):
-os.makedirs(dir_save)
+os.makedirs(dir_save, exist_ok=True)
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list)
print('\ncomplete.')
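# A minimal sketch of reading the archive written above (the path is
# hypothetical): np.savez appends '.npz' when the name lacks it, and the
# saved lists are stored as object arrays, hence allow_pickle=True on load.
import numpy as np

data = np.load('gram_matrix_unnorm.Alkane.treelet.gm.npz', allow_pickle=True)
gram_matrices = data['gram_matrix_unnorm_list']  # one Gram matrix per class
run_times = data['run_time_list']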