diff --git a/gklearn/dataset/__init__.py b/gklearn/dataset/__init__.py
new file mode 100644
index 0000000..ba6715e
--- /dev/null
+++ b/gklearn/dataset/__init__.py
@@ -0,0 +1,22 @@
+# -*-coding:utf-8 -*-
+"""gklearn - datasets module
+
+Implements methods to manage graph datasets:
+	data_fetcher.py : fetch graph datasets from the Internet.
+
+
+"""
+
+# info
+__version__ = "0.2"
+__author__ = "Linlin Jia"
+__date__ = "October 2020"
+
+
+from gklearn.dataset.metadata import DATABASES, DATASET_META
+from gklearn.dataset.metadata import GREYC_META, IAM_META, TUDataset_META
+from gklearn.dataset.metadata import list_of_databases, list_of_datasets
+from gklearn.dataset.graph_synthesizer import GraphSynthesizer
+from gklearn.dataset.data_fetcher import DataFetcher
+from gklearn.dataset.file_managers import DataLoader, DataSaver
+from gklearn.dataset.dataset import Dataset, split_dataset_by_target
\ No newline at end of file
diff --git a/gklearn/dataset/data_fetcher.py b/gklearn/dataset/data_fetcher.py
new file mode 100644
index 0000000..8f3f167
--- /dev/null
+++ b/gklearn/dataset/data_fetcher.py
@@ -0,0 +1,1896 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Oct 20 14:25:49 2020
+
+@author:
+	Paul Zanoncelli, paul.zanoncelli@ecole.ensicaen.fr
+	Luc Brun luc.brun@ensicaen.fr
+	Sebastien Bougleux sebastien.bougleux@unicaen.fr
+	Benoit Gaüzère benoit.gauzere@insa-rouen.fr
+	Linlin Jia linlin.jia@insa-rouen.fr
+"""
+import os
+import os.path as osp
+import urllib.request, urllib.error
+import tarfile
+from zipfile import ZipFile
+# from gklearn.utils.graphfiles import loadDataset
+import torch.nn.functional as F
+import networkx as nx
+import torch
+import random
+import sys
+from lxml import etree
+import re
+from tqdm import tqdm
+from gklearn.dataset import DATABASES, DATASET_META
+
+
+class DataFetcher():
+
+	def __init__(self, name=None, root='datasets', reload=False, verbose=False):
+		self._name = name
+		self._root = root
+		if not osp.exists(self._root):
+			os.makedirs(self._root)
+		self._reload = reload
+		self._verbose = verbose
+#		self.has_train_valid_test = {
+#			"Coil_Del" : ('COIL-DEL/data/test.cxl','COIL-DEL/data/train.cxl','COIL-DEL/data/valid.cxl'),
+#			"Coil_Rag" : ('COIL-RAG/data/test.cxl','COIL-RAG/data/train.cxl','COIL-RAG/data/valid.cxl'),
+#			"Fingerprint" : ('Fingerprint/data/test.cxl','Fingerprint/data/train.cxl','Fingerprint/data/valid.cxl'),
+# #			"Grec" : ('GREC/data/test.cxl','GREC/data/train.cxl','GREC/data/valid.cxl'),
+#			"Letter" : {'HIGH' : ('Letter/HIGH/test.cxl','Letter/HIGH/train.cxl','Letter/HIGH/validation.cxl'),
+#						'MED' : ('Letter/MED/test.cxl','Letter/MED/train.cxl','Letter/MED/validation.cxl'),
+#						'LOW' : ('Letter/LOW/test.cxl','Letter/LOW/train.cxl','Letter/LOW/validation.cxl')
+#						},
+#			"Mutagenicity" : ('Mutagenicity/data/test.cxl','Mutagenicity/data/train.cxl','Mutagenicity/data/validation.cxl'),
+# #			"Pah" : ['PAH/testset_0.ds','PAH/trainset_0.ds'],
+#			"Protein" : ('Protein/data/test.cxl','Protein/data/train.cxl','Protein/data/valid.cxl'),
+# #			"Web" : ('Web/data/test.cxl','Web/data/train.cxl','Web/data/valid.cxl')
+#			}
+
+		if self._name is None:
+			if self._verbose:
+				print('No dataset name entered. All possible datasets will be loaded.')
+			self._name, self._path = [], []
+			for idx, ds_name in enumerate(DATASET_META):
+				if self._verbose:
+					print(str(idx + 1), '/', str(len(DATASET_META)), 'Fetching', ds_name, end='... 
') + self._name.append(ds_name) + success = self.write_archive_file(ds_name) + if success: + self._path.append(self.open_files(ds_name)) + else: + self._path.append(None) + if self._verbose and self._path[-1] is not None and not self._reload: + print('Fetched.') + + if self._verbose: + print('Finished.', str(sum(v is not None for v in self._path)), 'of', str(len(self._path)), 'datasets are successfully fetched.') + + elif self._name not in DATASET_META: + message = 'Invalid Dataset name "' + self._name + '".' + message += '\nAvailable datasets are as follows: \n\n' + message += '\n'.join(ds for ds in sorted(DATASET_META)) + raise ValueError(message) + else: + self.write_archive_file(self._name) + self._path = self.open_files(self._name) + +# self.max_for_letter = 0 +# if mode == 'Pytorch': +# if self._name in self.data_to_use_in_datasets : +# Gs,y = self.dataset +# inputs,adjs,y = self.from_networkx_to_pytorch(Gs,y) +# #print(inputs,adjs) +# self.pytorch_dataset = inputs,adjs,y +# elif self._name == "Pah": +# self.pytorch_dataset = [] +# test,train = self.dataset +# Gs_test,y_test = test +# Gs_train,y_train = train +# self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_test,y_test)) +# self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_train,y_train)) +# elif self._name in self.has_train_valid_test: +# self.pytorch_dataset = [] +# #[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs]) +# test,train,valid = self.dataset +# Gs_test,y_test = test +# +# Gs_train,y_train = train +# Gs_valid,y_valid = valid +# self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_test,y_test)) +# self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_train,y_train)) +# self.pytorch_dataset.append(self.from_networkx_to_pytorch(Gs_valid,y_valid)) +# ############# +# """ +# for G in Gs : +# for e in G.edges(): +# print(G[e[0]]) +# """ +# ############## + + + def download_file(self, url): + try : + response = urllib.request.urlopen(url) + except urllib.error.HTTPError: + print('"', url.split('/')[-1], '" is not available or incorrect http link.') + return + except urllib.error.URLError: + print('Network is unreachable.') + return + return response + + + def write_archive_file(self, ds_name): + path = osp.join(self._root, ds_name) + url = DATASET_META[ds_name]['url'] +# filename_dir = osp.join(path,filename) + if not osp.exists(path) or self._reload: + response = self.download_file(url) + if response is None: + return False + os.makedirs(path, exist_ok=True) + with open(os.path.join(path, url.split('/')[-1]), 'wb') as outfile: + outfile.write(response.read()) + + return True + + + def open_files(self, ds_name=None): + if ds_name is None: + ds_name = (self._name if isinstance(self._name, str) else self._name[0]) + filename = DATASET_META[ds_name]['url'].split('/')[-1] + path = osp.join(self._root, ds_name) + filename_archive = osp.join(path, filename) + + if filename.endswith('gz'): + if tarfile.is_tarfile(filename_archive): + with tarfile.open(filename_archive, 'r:gz') as tar: + if self._reload and self._verbose: + print(filename + ' Downloaded.') + subpath = os.path.join(path, tar.getnames()[0]) + if not osp.exists(subpath) or self._reload: + tar.extractall(path = path) + return subpath + elif filename.endswith('.tar'): + if tarfile.is_tarfile(filename_archive): + with tarfile.open(filename_archive, 'r:') as tar: + if self._reload and self._verbose: + print(filename + ' Downloaded.') + subpath = os.path.join(path, tar.getnames()[0]) + if not osp.exists(subpath) or 
self._reload: + tar.extractall(path = path) + return subpath + elif filename.endswith('.zip'): + with ZipFile(filename_archive, 'r') as zip_ref: + if self._reload and self._verbose: + print(filename + ' Downloaded.') + subpath = os.path.join(path, zip_ref.namelist()[0]) + if not osp.exists(subpath) or self._reload: + zip_ref.extractall(path) + return subpath + else: + raise ValueError(filename + ' Unsupported file.') + + + def get_all_ds_infos(self, database): + """Get information of all datasets from a database. + + Parameters + ---------- + database : string + DESCRIPTION. + + Returns + ------- + None. + """ + if database.lower() == 'tudataset': + infos = self.get_all_tud_ds_infos() + elif database.lower() == 'iam': + pass + else: + msg = 'Invalid Database name "' + database + '"' + msg += '\n Available databases are as follows: \n\n' + msg += '\n'.join(db for db in sorted(DATABASES)) + msg += 'Check "gklearn.dataset.DATASET_META" for more details.' + raise ValueError(msg) + + return infos + + + def get_all_tud_ds_infos(self): + """Get information of all datasets from database TUDataset. + + Returns + ------- + None. + """ + try: + response = urllib.request.urlopen(DATABASES['tudataset']) + except urllib.error.HTTPError: + print('The URL of the database "TUDataset" is not available:\n' + DATABASES['tudataset']) + + infos = {} + + # Get tables. + h_str = response.read() + tree = etree.HTML(h_str) + tables = tree.xpath('//table') + for table in tables: + # Get the domain of the datasets. + h2_nodes = table.getprevious() + if h2_nodes is not None and h2_nodes.tag == 'h2': + domain = h2_nodes.text.strip().lower() + else: + domain = '' + + # Get each line in the table. + tr_nodes = table.xpath('tbody/tr') + for tr in tr_nodes[1:]: + # Get each element in the line. + td_node = tr.xpath('td') + + # task type. + cls_txt = td_node[3].text.strip() + if not cls_txt.startswith('R'): + class_number = int(cls_txt) + task_type = 'classification' + else: + class_number = None + task_type = 'regression' + + # node attrs. + na_text = td_node[8].text.strip() + if not na_text.startswith('+'): + node_attr_dim = 0 + else: + node_attr_dim = int(re.findall('\((.*)\)', na_text)[0]) + + # edge attrs. + ea_text = td_node[10].text.strip() + if ea_text == 'temporal': + edge_attr_dim = ea_text + elif not ea_text.startswith('+'): + edge_attr_dim = 0 + else: + edge_attr_dim = int(re.findall('\((.*)\)', ea_text)[0]) + + # geometry. + geo_txt = td_node[9].text.strip() + if geo_txt == '–': + geometry = None + else: + geometry = geo_txt + + # url. + url = td_node[11].xpath('a')[0].attrib['href'].strip() + pos_zip = url.rfind('.zip') + url = url[:pos_zip + 4] + + infos[td_node[0].xpath('strong')[0].text.strip()] = { + 'database': 'tudataset', + 'reference': td_node[1].text.strip(), + 'dataset_size': int(td_node[2].text.strip()), + 'class_number': class_number, + 'task_type': task_type, + 'ave_node_num': float(td_node[4].text.strip()), + 'ave_edge_num': float(td_node[5].text.strip()), + 'node_labeled': True if td_node[6].text.strip() == '+' else False, + 'edge_labeled': True if td_node[7].text.strip() == '+' else False, + 'node_attr_dim': node_attr_dim, + 'geometry': geometry, + 'edge_attr_dim': edge_attr_dim, + 'url': url, + 'domain': domain + } + + return infos + + + def pretty_ds_infos(self, infos): + """Get the string that pretty prints the information of datasets. + + Parameters + ---------- + datasets : dict + The datasets' information. 
+ + Returns + ------- + p_str : string + The pretty print of the datasets' information. + """ + p_str = '{\n' + for key, val in infos.items(): + p_str += '\t\'' + str(key) + '\': {\n' + for k, v in val.items(): + p_str += '\t\t\'' + str(k) + '\': ' + if isinstance(v, str): + p_str += '\'' + str(v) + '\',\n' + else: + p_str += '' + str(v) + ',\n' + p_str += '\t},\n' + p_str += '}' + + return p_str + + + @property + def path(self): + return self._path + + + + + + + + + + + + + + + + + + + + + def dataset(self): + if self.mode == "Tensorflow": + return #something + if self.mode == "Pytorch": + return self.pytorch_dataset + return self.dataset + + + def info(self): + print(self.info_dataset[self._name]) + + + def iter_load_dataset(self,data): + results = [] + for datasets in data : + results.append(loadDataset(osp.join(self._root,self._name,datasets))) + return results + + + def load_dataset(self,list_files): + if self._name == "Ptc": + if type(self.option) != str or self.option.upper() not in ['FR','FM','MM','MR']: + raise ValueError('option for Ptc dataset needs to be one of : \n fr fm mm mr') + results = [] + results.append(loadDataset(osp.join(self.root,self._name,'PTC/Test',self.gender + '.ds'))) + results.append(loadDataset(osp.join(self.root,self._name,'PTC/Train',self.gender + '.ds'))) + return results + if self.name == "Pah": + maximum_sets = 0 + for file in list_files: + if file.endswith('ds'): + maximum_sets = max(maximum_sets,int(file.split('_')[1].split('.')[0])) + self.max_for_letter = maximum_sets + if not type(self.option) == int or self.option > maximum_sets or self.option < 0: + raise ValueError('option needs to be an integer between 0 and ' + str(maximum_sets)) + data = self.has_train_valid_test["Pah"] + data[0] = self.has_train_valid_test["Pah"][0].split('_')[0] + '_' + str(self.option) + '.ds' + data[1] = self.has_train_valid_test["Pah"][1].split('_')[0] + '_' + str(self.option) + '.ds' + return self.iter_load_dataset(data) + if self.name == "Letter": + if type(self.option) == str and self.option.upper() in self.has_train_valid_test["Letter"]: + data = self.has_train_valid_test["Letter"][self.option.upper()] + else: + message = "The parameter for letter is incorrect choose between : " + message += "\nhigh med low" + raise ValueError(message) + return self.iter_load_dataset(data) + if self.name in self.has_train_valid_test : #common IAM dataset with train, valid and test + data = self.has_train_valid_test[self.name] + return self.iter_load_dataset(data) + else: #common dataset without train,valid and test, only dataset.ds file + data = self.data_to_use_in_datasets[self.name] + if len(data) > 1 and data[0] in list_files and data[1] in list_files: #case for Alkane + return loadDataset(osp.join(self.root,self.name,data[0]),filename_y = osp.join(self.root,self.name,data[1])) + if data in list_files: + return loadDataset(osp.join(self.root,self.name,data)) + + + def build_dictionary(self,Gs): + labels = set() + #next line : from DeepGraphWithNNTorch + #bond_type_number_maxi = int(max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs]))) + sizes = set() + for G in Gs : + for _,node in G.nodes(data = True): # or for node in nx.nodes(G) + #print(_,node) + labels.add(node["label"][0]) # labels.add(G.nodes[node]["label"][0]) #what do we use for IAM datasets (they don't have bond_type or event label) ? 
+ sizes.add(G.order()) + label_dict = {} + #print("labels : ", labels, bond_type_number_maxi) + for i,label in enumerate(labels): + label_dict[label] = [0.]*len(labels) + label_dict[label][i] = 1. + return label_dict + + + def from_networkx_to_pytorch(self,Gs,y): + #exemple for MAO: atom_to_onehot = {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]} + # code from https://github.com/bgauzere/pygnn/blob/master/utils.py + atom_to_onehot = self.build_dictionary(Gs) + max_size = 30 + adjs = [] + inputs = [] + for i, G in enumerate(Gs): + I = torch.eye(G.order(), G.order()) + #A = torch.Tensor(nx.adjacency_matrix(G).todense()) + #A = torch.Tensor(nx.to_numpy_matrix(G)) + A = torch.tensor(nx.to_scipy_sparse_matrix(G,dtype = int,weight = 'bond_type').todense(),dtype = torch.int) #what do we use for IAM datasets (they don't have bond_type or event label) ? + adj = F.pad(A, pad=(0, max_size-G.order(), 0, max_size-G.order())) #add I now ? if yes : F.pad(A + I,pad = (...)) + adjs.append(adj) + + f_0 = [] + for _, label in G.nodes(data=True): + #print(_,label) + cur_label = atom_to_onehot[label['label'][0]].copy() + f_0.append(cur_label) + + X = F.pad(torch.Tensor(f_0), pad=(0, 0, 0, max_size-G.order())) + inputs.append(X) + return inputs,adjs,y + + + def from_pytorch_to_tensorflow(self,batch_size): + seed = random.randrange(sys.maxsize) + random.seed(seed) + tf_inputs = random.sample(self.pytorch_dataset[0],batch_size) + random.seed(seed) + tf_y = random.sample(self.pytorch_dataset[2],batch_size) + + + def from_networkx_to_tensor(self,G,dict): + A=nx.to_numpy_matrix(G) + lab=[dict[G.nodes[v]['label'][0]] for v in nx.nodes(G)] + return (torch.tensor(A).view(1,A.shape[0]*A.shape[1]),torch.tensor(lab)) + + + + + #dataset= selfopen_files() + #print(build_dictionary(Gs)) + #dic={'C':0,'N':1,'O':2} + #A,labels=from_networkx_to_tensor(Gs[13],dic) + #print(nx.to_numpy_matrix(Gs[13]),labels) + #print(A,labels) + + #@todo : from_networkx_to_tensorflow + + +# dataloader = DataLoader('Acyclic',root = "database",option = 'high',mode = "Pytorch") +# dataloader.info() +# inputs,adjs,y = dataloader.pytorch_dataset + +# """ +# test,train,valid = dataloader.dataset +# Gs,y = test +# Gs2,y2 = train +# Gs3,y3 = valid +# """ +# #Gs,y = dataloader. 
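# A minimal usage sketch of the DataFetcher class defined above, kept commented out
# like the surrounding examples; it assumes 'MUTAG' is one of the keys in DATASET_META
# (any name listed there works):
#
#   fetcher = DataFetcher(name='MUTAG', root='datasets', reload=False, verbose=True)
#   print(fetcher.path)  # local path of the extracted archive for that dataset
#
#   # With no name given, every dataset in DATASET_META is fetched; `path` is then
#   # a list holding one entry per dataset (the extracted path, or None on failure).
#   fetcher_all = DataFetcher(verbose=True)
#   print(fetcher_all.path)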
+# #print(Gs,y) +# """ +# Gs,y = dataloader.dataset +# for G in Gs : +# for e in G.edges(): +# print(G[e[0]]) + +# """ + +# #for e in Gs[13].edges(): +# # print(Gs[13][e[0]]) + +# #print(from_networkx_to_tensor(Gs[7],{'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]})) + +# #dataset.open_files() + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# import os +# import os.path as osp +# import urllib +# import tarfile +# from zipfile import ZipFile +# from gklearn.utils.graphfiles import loadDataset +# import torch +# import torch.nn.functional as F +# import networkx as nx +# import matplotlib.pyplot as plt +# import numpy as np + + +# +# def DataLoader(name,root = 'data',mode = "Networkx",downloadAll = False,reload = False,letter = "High",number = 0,gender = "MM"): +# dir_name = "_".join(name.split("-")) +# if not osp.exists(root) : +# os.makedirs(root) +# url = "https://brunl01.users.greyc.fr/CHEMISTRY/" +# urliam = "https://iapr-tc15.greyc.fr/IAM/" +# list_database = { +# "Ace" : (url,"ACEDataset.tar"), +# "Acyclic" : (url,"Acyclic.tar.gz"), +# "Aids" : (urliam,"AIDS.zip"), +# "Alkane" : (url,"alkane_dataset.tar.gz"), +# "Chiral" : (url,"DatasetAcyclicChiral.tar"), +# "Coil_Del" : (urliam,"COIL-DEL.zip"), +# "Coil_Rag" : (urliam,"COIL-RAG.zip"), +# "Fingerprint" : (urliam,"Fingerprint.zip"), +# "Grec" : (urliam,"GREC.zip"), +# "Letter" : (urliam,"Letter.zip"), +# "Mao" : (url,"mao.tgz"), +# "Monoterpenoides" : (url,"monoterpenoides.tar.gz"), +# "Mutagenicity" : (urliam,"Mutagenicity.zip"), +# "Pah" : (url,"PAH.tar.gz"), +# "Protein" : (urliam,"Protein.zip"), +# "Ptc" : (url,"ptc.tgz"), +# "Steroid" : (url,"SteroidDataset.tar"), +# "Vitamin" : (url,"DatasetVitamin.tar"), +# "Web" : (urliam,"Web.zip") +# } +# +# data_to_use_in_datasets = { +# "Acyclic" : ("Acyclic/dataset_bps.ds"), +# "Aids" : ("AIDS_A.txt"), +# "Alkane" : ("Alkane/dataset.ds","Alkane/dataset_boiling_point_names.txt"), +# "Mao" : ("MAO/dataset.ds"), +# "Monoterpenoides" : ("monoterpenoides/dataset_10+.ds"), #('monoterpenoides/dataset.ds'),('monoterpenoides/dataset_9.ds'),('monoterpenoides/trainset_9.ds') +# +# } +# has_train_valid_test = { +# "Coil_Del" : ('COIL-DEL/data/test.cxl','COIL-DEL/data/train.cxl','COIL-DEL/data/valid.cxl'), +# "Coil_Rag" : ('COIL-RAG/data/test.cxl','COIL-RAG/data/train.cxl','COIL-RAG/data/valid.cxl'), +# "Fingerprint" : ('Fingerprint/data/test.cxl','Fingerprint/data/train.cxl','Fingerprint/data/valid.cxl'), +# "Grec" : ('GREC/data/test.cxl','GREC/data/train.cxl','GREC/data/valid.cxl'), +# "Letter" : {'HIGH' : ('Letter/HIGH/test.cxl','Letter/HIGH/train.cxl','Letter/HIGH/validation.cxl'), +# 'MED' : ('Letter/MED/test.cxl','Letter/MED/train.cxl','Letter/MED/validation.cxl'), +# 'LOW' : ('Letter/LOW/test.cxl','Letter/LOW/train.cxl','Letter/LOW/validation.cxl') +# }, +# "Mutagenicity" : ('Mutagenicity/data/test.cxl','Mutagenicity/data/train.cxl','Mutagenicity/data/validation.cxl'), +# "Pah" : ['PAH/testset_0.ds','PAH/trainset_0.ds'], +# "Protein" : ('Protein/data/test.cxl','Protein/data/train.cxl','Protein/data/valid.cxl'), +# "Web" : ('Web/data/test.cxl','Web/data/train.cxl','Web/data/valid.cxl') +# } +# +# if not name : +# raise ValueError("No dataset entered") +# if name not in list_database: +# message = "Invalid Dataset name " + name +# message += '\n Available datasets are as follows : \n\n' + +# message += '\n'.join(database for database in list_database) +# raise ValueError(message) +# +# def download_file(url,filename): +# try : +# response = urllib.request.urlopen(url + 
filename) +# except urllib.error.HTTPError: +# print(filename + " not available or incorrect http link") +# return +# return response +# +# def write_archive_file(root,database): +# path = osp.join(root,database) +# url,filename = list_database[database] +# filename_dir = osp.join(path,filename) +# if not osp.exists(filename_dir) or reload: +# response = download_file(url,filename) +# if response is None : +# return +# if not osp.exists(path) : +# os.makedirs(path) +# with open(filename_dir,'wb') as outfile : +# outfile.write(response.read()) +# +# if downloadAll : +# print('Waiting...') +# for database in list_database : +# write_archive_file(root,database) +# print('Downloading finished') +# else: +# write_archive_file(root,name) +# +# def iter_load_dataset(data): +# results = [] +# for datasets in data : +# results.append(loadDataset(osp.join(root,name,datasets))) +# return results +# +# def load_dataset(list_files): +# if name == "Ptc": +# if gender.upper() not in ['FR','FM','MM','MR']: +# raise ValueError('gender chosen needs to be one of \n fr fm mm mr') +# results = [] +# results.append(loadDataset(osp.join(root,name,'PTC/Test',gender.upper() + '.ds'))) +# results.append(loadDataset(osp.join(root,name,'PTC/Train',gender.upper() + '.ds'))) +# return results +# if name == "Pah": +# maximum_sets = 0 +# for file in list_files: +# if file.endswith('ds'): +# maximum_sets = max(maximum_sets,int(file.split('_')[1].split('.')[0])) +# if number > maximum_sets : +# raise ValueError("Please select a dataset with number less than " + str(maximum_sets + 1)) +# data = has_train_valid_test["Pah"] +# data[0] = has_train_valid_test["Pah"][0].split('_')[0] + '_' + str(number) + '.ds' +# data[1] = has_train_valid_test["Pah"][1].split('_')[0] + '_' + str(number) + '.ds' +# #print(data) +# return iter_load_dataset(data) +# if name == "Letter": +# if letter.upper() in has_train_valid_test["Letter"]: +# data = has_train_valid_test["Letter"][letter.upper()] +# else: +# message = "The parameter for letter is incorrect choose between : " +# message += "\nhigh med low" +# raise ValueError(message) +# results = [] +# for datasets in data: +# results.append(loadDataset(osp.join(root,name,datasets))) +# return results +# if name in has_train_valid_test : #common IAM dataset with train, valid and test +# data = has_train_valid_test[name] +# results = [] +# for datasets in data : +# results.append(loadDataset(osp.join(root,name,datasets))) +# return results +# else: #common dataset without train,valid and test, only dataset.ds file +# data = data_to_use_in_datasets[name] +# if len(data) > 1 and data[0] in list_files and data[1] in list_files: +# return loadDataset(osp.join(root,name,data[0]),filename_y = osp.join(root,name,data[1])) +# if data in list_files: +# return loadDataset(osp.join(root,name,data)) + +# def open_files(): +# filename = list_database[name][1] +# path = osp.join(root,name) +# filename_archive = osp.join(root,name,filename) +# +# if filename.endswith('gz'): +# if tarfile.is_tarfile(filename_archive): +# with tarfile.open(filename_archive,"r:gz") as tar: +# if reload: +# print(filename + " Downloaded") +# tar.extractall(path = path) +# return load_dataset(tar.getnames()) +# #raise ValueError("dataset not available") +# +# +# elif filename.endswith('.tar'): +# if tarfile.is_tarfile(filename_archive): +# with tarfile.open(filename_archive,"r:") as tar: +# if reload : +# print(filename + " Downloaded") +# tar.extractall(path = path) +# return load_dataset(tar.getnames()) +# elif 
filename.endswith('.zip'): +# with ZipFile(filename_archive,"r") as zip_ref: +# if reload : +# print(filename + " Downloaded") +# zip_ref.extractall(path) +# return load_dataset(zip_ref.namelist()) +# else: +# print(filename + " Unsupported file") +# """ +# with tarfile.open(osp.join(root,name,list_database[name][1]),"r:gz") as files: +# for file in files.getnames(): +# print(file) +# """ +# +# def build_dictionary(Gs): +# labels = set() +# bond_type_number_maxi = int(max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs]))) +# print(bond_type_number_maxi) +# sizes = set() +# for G in Gs : +# for _,node in G.nodes(data = True): # or for node in nx.nodes(G) +# #print(node) +# labels.add(node["label"][0]) # labels.add(G.nodes[node]["label"][0]) +# sizes.add(G.order()) +# if len(labels) >= bond_type_number_maxi: +# break +# label_dict = {} +# for i,label in enumerate(labels): +# label_dict[label] = [0.]*bond_type_number_maxi +# label_dict[label][i] = 1. +# return label_dict +# +# def from_networkx_to_pytorch(Gs): +# #exemple : atom_to_onehot = {'C': [1., 0., 0.], 'N': [0., 1., 0.], 'O': [0., 0., 1.]} +# # code from https://github.com/bgauzere/pygnn/blob/master/utils.py +# atom_to_onehot = build_dictionary(Gs) +# max_size = 30 +# adjs = [] +# inputs = [] +# for i, G in enumerate(Gs): +# I = torch.eye(G.order(), G.order()) +# A = torch.Tensor(nx.adjacency_matrix(G).todense()) +# A = torch.tensor(nx.to_scipy_sparse_matrix(G,dtype = int,weight = 'bond_type').todense(),dtype = torch.int) +# adj = F.pad(A+I, pad=(0, max_size-G.order(), 0, max_size-G.order())) #add I now ? +# adjs.append(adj) + +# f_0 = [] +# for _, label in G.nodes(data=True): +# #print(_,label) +# cur_label = atom_to_onehot[label['label'][0]].copy() +# f_0.append(cur_label) + +# X = F.pad(torch.Tensor(f_0), pad=(0, 0, 0, max_size-G.order())) +# inputs.append(X) +# return inputs,adjs,y +# +# def from_networkx_to_tensor(G,dict): + +# A=nx.to_numpy_matrix(G) +# lab=[dict[G.nodes[v]['label'][0]] for v in nx.nodes(G)] +# return (torch.tensor(A).view(1,A.shape[0]*A.shape[1]),torch.tensor(lab)) +# +# dataset= open_files() +# #print(build_dictionary(Gs)) +# #dic={'C':0,'N':1,'O':2} +# #A,labels=from_networkx_to_tensor(Gs[13],dic) +# #print(nx.to_numpy_matrix(Gs[13]),labels) +# #print(A,labels) +# +# """ +# for G in Gs : +# for node in nx.nodes(G): +# print(G.nodes[node]) +# """ +# if mode == "pytorch": +# Gs,y = dataset +# inputs,adjs,y = from_networkx_to_pytorch(Gs) +# print(inputs,adjs) +# return inputs,adjs,y +# +# +# """ +# dic = dict() +# for i,l in enumerate(label): +# dic[l] = i +# dic = {'C': 0, 'N': 1, 'O': 2} +# A,labels=from_networkx_to_tensor(Gs[0],dic) +# #print(A,labels) +# return A,labels +# """ +# +# return dataset +# +# #open_files() +# + +# def label_to_color(label): +# if label == 'C': +# return 0.1 +# elif label == 'O': +# return 0.8 +# +# def nodes_to_color_sequence(G): +# return [label_to_color(c[1]['label'][0]) for c in G.nodes(data=True)] + + +# ############## +# """ +# dataset = DataLoader('Mao',root = "database") +# print(dataset) +# Gs,y = dataset +# """ + +# """ +# dataset = DataLoader('Alkane',root = "database") # Gs is empty here whereas y isn't -> not working +# Gs,y = dataset +# """ + +# """ +# dataset = DataLoader('Acyclic', root = "database") +# Gs,y = dataset +# """ + +# """ +# dataset = DataLoader('Monoterpenoides', root = "database") +# Gs,y = dataset +# """ + +# """ +# dataset = DataLoader('Pah',root = 'database', number = 8) +# test_set,train_set = dataset +# Gs,y = test_set +# Gs2,y2 
= train_set +# """ + +# """ +# dataset = DataLoader('Coil_Del',root = "database") +# test,train,valid = dataset +# Gs,y = test +# Gs2,y2 = train +# Gs3, y3 = valid +# """ + +# """ +# dataset = DataLoader('Coil_Rag',root = "database") +# test,train,valid = dataset +# Gs,y = test +# Gs2,y2 = train +# Gs3, y3 = valid +# """ + +# """ +# dataset = DataLoader('Fingerprint',root = "database") +# test,train,valid = dataset +# Gs,y = test +# Gs2,y2 = train +# Gs3, y3 = valid +# """ + +# """ +# dataset = DataLoader('Grec',root = "database") +# test,train,valid = dataset +# Gs,y = test +# Gs2,y2 = train +# Gs3, y3 = valid +# """ + +# """ +# dataset = DataLoader('Letter',root = "database",letter = 'low') #high low med +# test,train,valid = dataset +# Gs,y = test +# Gs2,y2 = train +# Gs3, y3 = valid +# """ + +# """ +# dataset = DataLoader('Mutagenicity',root = "database") +# test,train,valid = dataset +# Gs,y = test +# Gs2,y2 = train +# Gs3, y3 = valid +# """ +# """ +# dataset = DataLoader('Protein',root = "database") +# test,train,valid = dataset +# Gs,y = test +# Gs2,y2 = train +# Gs3, y3 = valid +# """ + + +# """ +# dataset = DataLoader('Ptc', root = "database",gender = 'fm') # not working, Gs and y are empty perhaps issue coming from loadDataset +# valid,train = dataset +# Gs,y = valid +# Gs2,y2 = train +# """ + +# """ +# dataset = DataLoader('Web', root = "database") +# test,train,valid = dataset +# Gs,y = test +# Gs2,y2 = train +# Gs3,y3 = valid +# """ +# print(Gs,y) +# print(len(dataset)) +# ############## +# #print('edge max label',max(max([[G[e[0]][e[1]]['bond_type'] for e in G.edges()] for G in Gs]))) +# G1 = Gs[13] +# G2 = Gs[23] +# """ +# nx.draw_networkx(G1,with_labels=True,node_color = nodes_to_color_sequence(G1),cmap='autumn') +# plt.figure() + +# nx.draw_networkx(G2,with_labels=True,node_color = nodes_to_color_sequence(G2),cmap='autumn') +# """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# from pathlib import Path + +# DATA_PATH = Path("data") + +# def import_datasets(): +# +# import urllib +# import tarfile +# from zipfile import ZipFile + +# URL = "https://brunl01.users.greyc.fr/CHEMISTRY/" +# URLIAM = "https://iapr-tc15.greyc.fr/IAM/" +# + +# LIST_DATABASE = { +# "Pah" : (URL,"PAH.tar.gz"), +# "Mao" : (URL,"mao.tgz"), +# "Ptc" : (URL,"ptc.tgz"), +# "Aids" : (URLIAM,"AIDS.zip"), +# "Acyclic" : (URL,"Acyclic.tar.gz"), +# "Alkane" : (URL,"alkane_dataset.tar.gz"), +# "Chiral" : (URL,"DatasetAcyclicChiral.tar"), +# "Vitamin" : (URL,"DatasetVitamin.tar"), +# "Ace" : (URL,"ACEDataset.tar"), +# "Steroid" : (URL,"SteroidDataset.tar"), +# "Monoterpenoides" : (URL,"monoterpenoides.tar.gz"), +# "Letter" : (URLIAM,"Letter.zip"), +# "Grec" : (URLIAM,"GREC.zip"), +# "Fingerprint" : (URLIAM,"Fingerprint.zip"), +# "Coil_Rag" : (URLIAM,"COIL-RAG.zip"), +# "Coil_Del" : (URLIAM,"COIL-DEL.zip"), +# "Web" : (URLIAM,"Web.zip"), +# "Mutagenicity" : (URLIAM,"Mutagenicity.zip"), +# "Protein" : (URLIAM,"Protein.zip") +# } +# print("Select databases in the list. 
Select multiple, split by white spaces .\nWrite All to select all of them.\n") +# print(', '.join(database for database in LIST_DATABASE)) + +# print("Choice : ",end = ' ') +# selected_databases = input().split() + +# +# def download_file(url,filename): +# try : +# response = urllib.request.urlopen(url + filename) +# except urllib.error.HTTPError: +# print(filename + " not available or incorrect http link") +# return +# return response +# +# def write_archive_file(database): +# +# PATH = DATA_PATH / database +# url,filename = LIST_DATABASE[database] +# if not (PATH / filename).exists(): +# response = download_file(url,filename) +# if response is None : +# return +# if not PATH.exists() : +# PATH.mkdir(parents=True, exist_ok=True) +# with open(PATH/filename,'wb') as outfile : +# outfile.write(response.read()) +# +# if filename[-2:] == 'gz': +# if tarfile.is_tarfile(PATH/filename): +# with tarfile.open(PATH/filename,"r:gz") as tar: +# tar.extractall(path = PATH) +# print(filename + ' Downloaded') +# elif filename[-3:] == 'tar': +# if tarfile.is_tarfile(PATH/filename): +# with tarfile.open(PATH/filename,"r:") as tar: +# tar.extractall(path = PATH) +# print(filename + ' Downloaded') +# elif filename[-3:] == 'zip': +# with ZipFile(PATH/filename,"r") as zip_ref: +# zip_ref.extractall(PATH) +# print(filename + ' Downloaded') +# else: +# print("Unsupported file") + +# if 'All' in selected_databases: +# print('Waiting...') +# for database in LIST_DATABASE : +# write_archive_file(database) +# print('Finished') +# else: +# print('Waiting...') +# for database in selected_databases : +# if database in LIST_DATABASE : +# write_archive_file(database) +# print('Finished') +# import_datasets() + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# class GraphFetcher(object): +# +# +# def __init__(self, filename=None, filename_targets=None, **kwargs): +# if filename is None: +# self._graphs = None +# self._targets = None +# self._node_labels = None +# self._edge_labels = None +# self._node_attrs = None +# self._edge_attrs = None +# else: +# self.load_dataset(filename, filename_targets=filename_targets, **kwargs) +# +# self._substructures = None +# self._node_label_dim = None +# self._edge_label_dim = None +# self._directed = None +# self._dataset_size = None +# self._total_node_num = None +# self._ave_node_num = None +# self._min_node_num = None +# self._max_node_num = None +# self._total_edge_num = None +# self._ave_edge_num = None +# self._min_edge_num = None +# self._max_edge_num = None +# self._ave_node_degree = None +# self._min_node_degree = None +# self._max_node_degree = None +# self._ave_fill_factor = None +# self._min_fill_factor = None +# self._max_fill_factor = None +# self._node_label_nums = None +# self._edge_label_nums = None +# self._node_attr_dim = None +# self._edge_attr_dim = None +# self._class_number = None +# +# +# def load_dataset(self, filename, filename_targets=None, **kwargs): +# self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs) +# self._node_labels = label_names['node_labels'] +# self._node_attrs = label_names['node_attrs'] +# self._edge_labels = label_names['edge_labels'] +# self._edge_attrs = label_names['edge_attrs'] +# self.clean_labels() +# +# +# def load_graphs(self, graphs, targets=None): +# # this has to be followed by set_labels(). 
+# self._graphs = graphs +# self._targets = targets +# # self.set_labels_attrs() # @todo +# +# +# def load_predefined_dataset(self, ds_name): +# current_path = os.path.dirname(os.path.realpath(__file__)) + '/' +# if ds_name == 'Acyclic': +# ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'AIDS': +# ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'Alkane': +# ds_file = current_path + '../../datasets/Alkane/dataset.ds' +# fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets) +# elif ds_name == 'COIL-DEL': +# ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'COIL-RAG': +# ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'COLORS-3': +# ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'Cuneiform': +# ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'DD': +# ds_file = current_path + '../../datasets/DD/DD_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'ENZYMES': +# ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'Fingerprint': +# ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'FRANKENSTEIN': +# ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'Letter-high': # node non-symb +# ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'Letter-low': # node non-symb +# ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'Letter-med': # node non-symb +# ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'MAO': +# ds_file = current_path + '../../datasets/MAO/dataset.ds' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'Monoterpenoides': +# ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'MUTAG': +# ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'NCI1': +# ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'NCI109': +# ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# 
elif ds_name == 'PAH': +# ds_file = current_path + '../../datasets/PAH/dataset.ds' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'SYNTHETIC': +# pass +# elif ds_name == 'SYNTHETICnew': +# ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' +# self._graphs, self._targets, label_names = load_dataset(ds_file) +# elif ds_name == 'Synthie': +# pass +# else: +# raise Exception('The dataset name "', ds_name, '" is not pre-defined.') +# +# self._node_labels = label_names['node_labels'] +# self._node_attrs = label_names['node_attrs'] +# self._edge_labels = label_names['edge_labels'] +# self._edge_attrs = label_names['edge_attrs'] +# self.clean_labels() +# + +# def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]): +# self._node_labels = node_labels +# self._node_attrs = node_attrs +# self._edge_labels = edge_labels +# self._edge_attrs = edge_attrs + +# +# def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None): +# # @todo: remove labels which have only one possible values. +# if node_labels is None: +# self._node_labels = self._graphs[0].graph['node_labels'] +# # # graphs are considered node unlabeled if all nodes have the same label. +# # infos.update({'node_labeled': is_nl if node_label_num > 1 else False}) +# if node_attrs is None: +# self._node_attrs = self._graphs[0].graph['node_attrs'] +# # for G in Gn: +# # for n in G.nodes(data=True): +# # if 'attributes' in n[1]: +# # return len(n[1]['attributes']) +# # return 0 +# if edge_labels is None: +# self._edge_labels = self._graphs[0].graph['edge_labels'] +# # # graphs are considered edge unlabeled if all edges have the same label. +# # infos.update({'edge_labeled': is_el if edge_label_num > 1 else False}) +# if edge_attrs is None: +# self._edge_attrs = self._graphs[0].graph['edge_attrs'] +# # for G in Gn: +# # if nx.number_of_edges(G) > 0: +# # for e in G.edges(data=True): +# # if 'attributes' in e[2]: +# # return len(e[2]['attributes']) +# # return 0 +# +# +# def get_dataset_infos(self, keys=None, params=None): +# """Computes and returns the structure and property information of the graph dataset. +# +# Parameters +# ---------- +# keys : list, optional +# A list of strings which indicate which informations will be returned. The +# possible choices includes: +# +# 'substructures': sub-structures graphs contains, including 'linear', 'non +# linear' and 'cyclic'. +# +# 'node_label_dim': whether vertices have symbolic labels. +# +# 'edge_label_dim': whether egdes have symbolic labels. +# +# 'directed': whether graphs in dataset are directed. +# +# 'dataset_size': number of graphs in dataset. +# +# 'total_node_num': total number of vertices of all graphs in dataset. +# +# 'ave_node_num': average number of vertices of graphs in dataset. +# +# 'min_node_num': minimum number of vertices of graphs in dataset. +# +# 'max_node_num': maximum number of vertices of graphs in dataset. +# +# 'total_edge_num': total number of edges of all graphs in dataset. +# +# 'ave_edge_num': average number of edges of graphs in dataset. +# +# 'min_edge_num': minimum number of edges of graphs in dataset. +# +# 'max_edge_num': maximum number of edges of graphs in dataset. +# +# 'ave_node_degree': average vertex degree of graphs in dataset. +# +# 'min_node_degree': minimum vertex degree of graphs in dataset. +# +# 'max_node_degree': maximum vertex degree of graphs in dataset. 
+# +# 'ave_fill_factor': average fill factor (number_of_edges / +# (number_of_nodes ** 2)) of graphs in dataset. +# +# 'min_fill_factor': minimum fill factor of graphs in dataset. +# +# 'max_fill_factor': maximum fill factor of graphs in dataset. +# +# 'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset. +# +# 'edge_label_nums': list number of symbolic edge labels of graphs in dataset. +# +# 'node_attr_dim': number of dimensions of non-symbolic vertex labels. +# Extracted from the 'attributes' attribute of graph nodes. +# +# 'edge_attr_dim': number of dimensions of non-symbolic edge labels. +# Extracted from the 'attributes' attribute of graph edges. +# +# 'class_number': number of classes. Only available for classification problems. +# +# 'all_degree_entropy': the entropy of degree distribution of each graph. +# +# 'ave_degree_entropy': the average entropy of degree distribution of all graphs. +# +# All informations above will be returned if `keys` is not given. +# +# params: dict of dict, optional +# A dictinary which contains extra parameters for each possible +# element in ``keys``. +# +# Return +# ------ +# dict +# Information of the graph dataset keyed by `keys`. +# """ +# infos = {} +# +# if keys == None: +# keys = [ +# 'substructures', +# 'node_label_dim', +# 'edge_label_dim', +# 'directed', +# 'dataset_size', +# 'total_node_num', +# 'ave_node_num', +# 'min_node_num', +# 'max_node_num', +# 'total_edge_num', +# 'ave_edge_num', +# 'min_edge_num', +# 'max_edge_num', +# 'ave_node_degree', +# 'min_node_degree', +# 'max_node_degree', +# 'ave_fill_factor', +# 'min_fill_factor', +# 'max_fill_factor', +# 'node_label_nums', +# 'edge_label_nums', +# 'node_attr_dim', +# 'edge_attr_dim', +# 'class_number', +# 'all_degree_entropy', +# 'ave_degree_entropy' +# ] +# +# # dataset size +# if 'dataset_size' in keys: +# if self._dataset_size is None: +# self._dataset_size = self._get_dataset_size() +# infos['dataset_size'] = self._dataset_size +# +# # graph node number +# if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']): +# all_node_nums = self._get_all_node_nums() + +# if 'total_node_num' in keys: +# if self._total_node_num is None: +# self._total_node_num = self._get_total_node_num(all_node_nums) +# infos['total_node_num'] = self._total_node_num +# +# if 'ave_node_num' in keys: +# if self._ave_node_num is None: +# self._ave_node_num = self._get_ave_node_num(all_node_nums) +# infos['ave_node_num'] = self._ave_node_num +# +# if 'min_node_num' in keys: +# if self._min_node_num is None: +# self._min_node_num = self._get_min_node_num(all_node_nums) +# infos['min_node_num'] = self._min_node_num +# +# if 'max_node_num' in keys: +# if self._max_node_num is None: +# self._max_node_num = self._get_max_node_num(all_node_nums) +# infos['max_node_num'] = self._max_node_num +# +# # graph edge number +# if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']): +# all_edge_nums = self._get_all_edge_nums() + +# if 'total_edge_num' in keys: +# if self._total_edge_num is None: +# self._total_edge_num = self._get_total_edge_num(all_edge_nums) +# infos['total_edge_num'] = self._total_edge_num +# +# if 'ave_edge_num' in keys: +# if self._ave_edge_num is None: +# self._ave_edge_num = self._get_ave_edge_num(all_edge_nums) +# infos['ave_edge_num'] = self._ave_edge_num +# +# if 'max_edge_num' in keys: +# if self._max_edge_num is None: +# self._max_edge_num = self._get_max_edge_num(all_edge_nums) +# 
infos['max_edge_num'] = self._max_edge_num + +# if 'min_edge_num' in keys: +# if self._min_edge_num is None: +# self._min_edge_num = self._get_min_edge_num(all_edge_nums) +# infos['min_edge_num'] = self._min_edge_num +# +# # label number +# if 'node_label_dim' in keys: +# if self._node_label_dim is None: +# self._node_label_dim = self._get_node_label_dim() +# infos['node_label_dim'] = self._node_label_dim +# +# if 'node_label_nums' in keys: +# if self._node_label_nums is None: +# self._node_label_nums = {} +# for node_label in self._node_labels: +# self._node_label_nums[node_label] = self._get_node_label_num(node_label) +# infos['node_label_nums'] = self._node_label_nums +# +# if 'edge_label_dim' in keys: +# if self._edge_label_dim is None: +# self._edge_label_dim = self._get_edge_label_dim() +# infos['edge_label_dim'] = self._edge_label_dim +# +# if 'edge_label_nums' in keys: +# if self._edge_label_nums is None: +# self._edge_label_nums = {} +# for edge_label in self._edge_labels: +# self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label) +# infos['edge_label_nums'] = self._edge_label_nums +# +# if 'directed' in keys or 'substructures' in keys: +# if self._directed is None: +# self._directed = self._is_directed() +# infos['directed'] = self._directed +# +# # node degree +# if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']): +# all_node_degrees = self._get_all_node_degrees() +# +# if 'ave_node_degree' in keys: +# if self._ave_node_degree is None: +# self._ave_node_degree = self._get_ave_node_degree(all_node_degrees) +# infos['ave_node_degree'] = self._ave_node_degree +# +# if 'max_node_degree' in keys: +# if self._max_node_degree is None: +# self._max_node_degree = self._get_max_node_degree(all_node_degrees) +# infos['max_node_degree'] = self._max_node_degree +# +# if 'min_node_degree' in keys: +# if self._min_node_degree is None: +# self._min_node_degree = self._get_min_node_degree(all_node_degrees) +# infos['min_node_degree'] = self._min_node_degree +# +# # fill factor +# if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']): +# all_fill_factors = self._get_all_fill_factors() +# +# if 'ave_fill_factor' in keys: +# if self._ave_fill_factor is None: +# self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors) +# infos['ave_fill_factor'] = self._ave_fill_factor +# +# if 'max_fill_factor' in keys: +# if self._max_fill_factor is None: +# self._max_fill_factor = self._get_max_fill_factor(all_fill_factors) +# infos['max_fill_factor'] = self._max_fill_factor +# +# if 'min_fill_factor' in keys: +# if self._min_fill_factor is None: +# self._min_fill_factor = self._get_min_fill_factor(all_fill_factors) +# infos['min_fill_factor'] = self._min_fill_factor +# +# if 'substructures' in keys: +# if self._substructures is None: +# self._substructures = self._get_substructures() +# infos['substructures'] = self._substructures +# +# if 'class_number' in keys: +# if self._class_number is None: +# self._class_number = self._get_class_number() +# infos['class_number'] = self._class_number +# +# if 'node_attr_dim' in keys: +# if self._node_attr_dim is None: +# self._node_attr_dim = self._get_node_attr_dim() +# infos['node_attr_dim'] = self._node_attr_dim +# +# if 'edge_attr_dim' in keys: +# if self._edge_attr_dim is None: +# self._edge_attr_dim = self._get_edge_attr_dim() +# infos['edge_attr_dim'] = self._edge_attr_dim +# +# # entropy of degree distribution. 
+# +# if 'all_degree_entropy' in keys: +# if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']): +# base = params['all_degree_entropy']['base'] +# else: +# base = None +# infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base) +# +# if 'ave_degree_entropy' in keys: +# if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']): +# base = params['ave_degree_entropy']['base'] +# else: +# base = None +# infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) +# +# return infos +# +# +# def print_graph_infos(self, infos): +# from collections import OrderedDict +# keys = list(infos.keys()) +# print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0])))) +# +# +# def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): +# node_labels = [item for item in node_labels if item in self._node_labels] +# edge_labels = [item for item in edge_labels if item in self._edge_labels] +# node_attrs = [item for item in node_attrs if item in self._node_attrs] +# edge_attrs = [item for item in edge_attrs if item in self._edge_attrs] + +# for g in self._graphs: +# for nd in g.nodes(): +# for nl in node_labels: +# del g.nodes[nd][nl] +# for na in node_attrs: +# del g.nodes[nd][na] +# for ed in g.edges(): +# for el in edge_labels: +# del g.edges[ed][el] +# for ea in edge_attrs: +# del g.edges[ed][ea] +# if len(node_labels) > 0: +# self._node_labels = [nl for nl in self._node_labels if nl not in node_labels] +# if len(edge_labels) > 0: +# self._edge_labels = [el for el in self._edge_labels if el not in edge_labels] +# if len(node_attrs) > 0: +# self._node_attrs = [na for na in self._node_attrs if na not in node_attrs] +# if len(edge_attrs) > 0: +# self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs] +# +# +# def clean_labels(self): +# labels = [] +# for name in self._node_labels: +# label = set() +# for G in self._graphs: +# label = label | set(nx.get_node_attributes(G, name).values()) +# if len(label) > 1: +# labels.append(name) +# break +# if len(label) < 2: +# for G in self._graphs: +# for nd in G.nodes(): +# del G.nodes[nd][name] +# self._node_labels = labels + +# labels = [] +# for name in self._edge_labels: +# label = set() +# for G in self._graphs: +# label = label | set(nx.get_edge_attributes(G, name).values()) +# if len(label) > 1: +# labels.append(name) +# break +# if len(label) < 2: +# for G in self._graphs: +# for ed in G.edges(): +# del G.edges[ed][name] +# self._edge_labels = labels + +# labels = [] +# for name in self._node_attrs: +# label = set() +# for G in self._graphs: +# label = label | set(nx.get_node_attributes(G, name).values()) +# if len(label) > 1: +# labels.append(name) +# break +# if len(label) < 2: +# for G in self._graphs: +# for nd in G.nodes(): +# del G.nodes[nd][name] +# self._node_attrs = labels + +# labels = [] +# for name in self._edge_attrs: +# label = set() +# for G in self._graphs: +# label = label | set(nx.get_edge_attributes(G, name).values()) +# if len(label) > 1: +# labels.append(name) +# break +# if len(label) < 2: +# for G in self._graphs: +# for ed in G.edges(): +# del G.edges[ed][name] +# self._edge_attrs = labels +# +# +# def cut_graphs(self, range_): +# self._graphs = [self._graphs[i] for i in range_] +# if self._targets is not None: +# self._targets = [self._targets[i] for i in range_] +# self.clean_labels() + + +# def trim_dataset(self, edge_required=False): +# if 
edge_required: +# trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)] +# else: +# trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0] +# idx = [p[0] for p in trimed_pairs] +# self._graphs = [p[1] for p in trimed_pairs] +# self._targets = [self._targets[i] for i in idx] +# self.clean_labels() +# +# +# def copy(self): +# dataset = Dataset() +# graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None +# target = self._targets.copy() if self._targets is not None else None +# node_labels = self._node_labels.copy() if self._node_labels is not None else None +# node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None +# edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None +# edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None +# dataset.load_graphs(graphs, target) +# dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) +# # @todo: clean_labels and add other class members? +# return dataset +# +# +# def get_all_node_labels(self): +# node_labels = [] +# for g in self._graphs: +# for n in g.nodes(): +# nl = tuple(g.nodes[n].items()) +# if nl not in node_labels: +# node_labels.append(nl) +# return node_labels +# +# +# def get_all_edge_labels(self): +# edge_labels = [] +# for g in self._graphs: +# for e in g.edges(): +# el = tuple(g.edges[e].items()) +# if el not in edge_labels: +# edge_labels.append(el) +# return edge_labels +# +# +# def _get_dataset_size(self): +# return len(self._graphs) +# +# +# def _get_all_node_nums(self): +# return [nx.number_of_nodes(G) for G in self._graphs] +# +# +# def _get_total_node_nums(self, all_node_nums): +# return np.sum(all_node_nums) +# +# +# def _get_ave_node_num(self, all_node_nums): +# return np.mean(all_node_nums) +# +# +# def _get_min_node_num(self, all_node_nums): +# return np.amin(all_node_nums) +# +# +# def _get_max_node_num(self, all_node_nums): +# return np.amax(all_node_nums) +# +# +# def _get_all_edge_nums(self): +# return [nx.number_of_edges(G) for G in self._graphs] +# +# +# def _get_total_edge_nums(self, all_edge_nums): +# return np.sum(all_edge_nums) +# +# +# def _get_ave_edge_num(self, all_edge_nums): +# return np.mean(all_edge_nums) +# +# +# def _get_min_edge_num(self, all_edge_nums): +# return np.amin(all_edge_nums) +# +# +# def _get_max_edge_num(self, all_edge_nums): +# return np.amax(all_edge_nums) +# +# +# def _get_node_label_dim(self): +# return len(self._node_labels) +# +# +# def _get_node_label_num(self, node_label): +# nl = set() +# for G in self._graphs: +# nl = nl | set(nx.get_node_attributes(G, node_label).values()) +# return len(nl) +# +# +# def _get_edge_label_dim(self): +# return len(self._edge_labels) +# +# +# def _get_edge_label_num(self, edge_label): +# el = set() +# for G in self._graphs: +# el = el | set(nx.get_edge_attributes(G, edge_label).values()) +# return len(el) +# +# +# def _is_directed(self): +# return nx.is_directed(self._graphs[0]) +# +# +# def _get_all_node_degrees(self): +# return [np.mean(list(dict(G.degree()).values())) for G in self._graphs] +# +# +# def _get_ave_node_degree(self, all_node_degrees): +# return np.mean(all_node_degrees) +# +# +# def _get_max_node_degree(self, all_node_degrees): +# return np.amax(all_node_degrees) +# +# +# def _get_min_node_degree(self, all_node_degrees): +# return np.amin(all_node_degrees) +# +# +# def 
_get_all_fill_factors(self): +# """Get fill factor, the number of non-zero entries in the adjacency matrix. + +# Returns +# ------- +# list[float] +# List of fill factors for all graphs. +# """ +# return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs] +# + +# def _get_ave_fill_factor(self, all_fill_factors): +# return np.mean(all_fill_factors) +# +# +# def _get_max_fill_factor(self, all_fill_factors): +# return np.amax(all_fill_factors) +# +# +# def _get_min_fill_factor(self, all_fill_factors): +# return np.amin(all_fill_factors) +# +# +# def _get_substructures(self): +# subs = set() +# for G in self._graphs: +# degrees = list(dict(G.degree()).values()) +# if any(i == 2 for i in degrees): +# subs.add('linear') +# if np.amax(degrees) >= 3: +# subs.add('non linear') +# if 'linear' in subs and 'non linear' in subs: +# break + +# if self._directed: +# for G in self._graphs: +# if len(list(nx.find_cycle(G))) > 0: +# subs.add('cyclic') +# break +# # else: +# # # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way. +# # upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10 +# # for G in Gn: +# # if (nx.number_of_edges(G) < upper): +# # cyc = list(nx.simple_cycles(G.to_directed())) +# # if any(len(i) > 2 for i in cyc): +# # subs.add('cyclic') +# # break +# # if 'cyclic' not in subs: +# # for G in Gn: +# # cyc = list(nx.simple_cycles(G.to_directed())) +# # if any(len(i) > 2 for i in cyc): +# # subs.add('cyclic') +# # break +# +# return subs +# +# +# def _get_class_num(self): +# return len(set(self._targets)) +# +# +# def _get_node_attr_dim(self): +# return len(self._node_attrs) +# +# +# def _get_edge_attr_dim(self): +# return len(self._edge_attrs) + +# +# def _compute_all_degree_entropy(self, base=None): +# """Compute the entropy of degree distribution of each graph. + +# Parameters +# ---------- +# base : float, optional +# The logarithmic base to use. The default is ``e`` (natural logarithm). + +# Returns +# ------- +# degree_entropy : float +# The calculated entropy. 
+# """ +# from gklearn.utils.stats import entropy +# +# degree_entropy = [] +# for g in self._graphs: +# degrees = list(dict(g.degree()).values()) +# en = entropy(degrees, base=base) +# degree_entropy.append(en) +# return degree_entropy +# +# +# @property +# def graphs(self): +# return self._graphs + + +# @property +# def targets(self): +# return self._targets +# +# +# @property +# def node_labels(self): +# return self._node_labels + + +# @property +# def edge_labels(self): +# return self._edge_labels +# +# +# @property +# def node_attrs(self): +# return self._node_attrs +# +# +# @property +# def edge_attrs(self): +# return self._edge_attrs +# +# +# def split_dataset_by_target(dataset): +# from gklearn.preimage.utils import get_same_item_indices +# +# graphs = dataset.graphs +# targets = dataset.targets +# datasets = [] +# idx_targets = get_same_item_indices(targets) +# for key, val in idx_targets.items(): +# sub_graphs = [graphs[i] for i in val] +# sub_dataset = Dataset() +# sub_dataset.load_graphs(sub_graphs, [key] * len(val)) +# node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None +# node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None +# edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None +# edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None +# sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) +# datasets.append(sub_dataset) +# # @todo: clean_labels? +# return datasets \ No newline at end of file diff --git a/gklearn/dataset/dataset.py b/gklearn/dataset/dataset.py new file mode 100644 index 0000000..0343c0b --- /dev/null +++ b/gklearn/dataset/dataset.py @@ -0,0 +1,823 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Mar 26 18:48:27 2020 + +@author: ljia +""" +import numpy as np +import networkx as nx +from gklearn.utils.graph_files import load_dataset +import os + + +class Dataset(object): + + + def __init__(self, filename=None, filename_targets=None, **kwargs): + if filename is None: + self._graphs = None + self._targets = None + self._node_labels = None + self._edge_labels = None + self._node_attrs = None + self._edge_attrs = None + else: + self.load_dataset(filename, filename_targets=filename_targets, **kwargs) + + self._substructures = None + self._node_label_dim = None + self._edge_label_dim = None + self._directed = None + self._dataset_size = None + self._total_node_num = None + self._ave_node_num = None + self._min_node_num = None + self._max_node_num = None + self._total_edge_num = None + self._ave_edge_num = None + self._min_edge_num = None + self._max_edge_num = None + self._ave_node_degree = None + self._min_node_degree = None + self._max_node_degree = None + self._ave_fill_factor = None + self._min_fill_factor = None + self._max_fill_factor = None + self._node_label_nums = None + self._edge_label_nums = None + self._node_attr_dim = None + self._edge_attr_dim = None + self._class_number = None + + + def load_dataset(self, filename, filename_targets=None, **kwargs): + self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs) + self._node_labels = label_names['node_labels'] + self._node_attrs = label_names['node_attrs'] + self._edge_labels = label_names['edge_labels'] + self._edge_attrs = label_names['edge_attrs'] + self.clean_labels() + + + def load_graphs(self, graphs, targets=None): + # this has to be 
followed by set_labels(). + self._graphs = graphs + self._targets = targets +# self.set_labels_attrs() # @todo + + + def load_predefined_dataset(self, ds_name): + current_path = os.path.dirname(os.path.realpath(__file__)) + '/' + if ds_name == 'Acyclic': + ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'AIDS': + ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'Alkane': + ds_file = current_path + '../../datasets/Alkane/dataset.ds' + fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets) + elif ds_name == 'COIL-DEL': + ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'COIL-RAG': + ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'COLORS-3': + ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'Cuneiform': + ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'DD': + ds_file = current_path + '../../datasets/DD/DD_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'ENZYMES': + ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'Fingerprint': + ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'FRANKENSTEIN': + ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'Letter-high': # node non-symb + ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'Letter-low': # node non-symb + ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'Letter-med': # node non-symb + ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'MAO': + ds_file = current_path + '../../datasets/MAO/dataset.ds' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'Monoterpenoides': + ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'MUTAG': + ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'NCI1': + ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'NCI109': + ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'PAH': + ds_file = 
current_path + '../../datasets/PAH/dataset.ds' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'SYNTHETIC': + pass + elif ds_name == 'SYNTHETICnew': + ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' + self._graphs, self._targets, label_names = load_dataset(ds_file) + elif ds_name == 'Synthie': + pass + else: + raise Exception('The dataset name "', ds_name, '" is not pre-defined.') + + self._node_labels = label_names['node_labels'] + self._node_attrs = label_names['node_attrs'] + self._edge_labels = label_names['edge_labels'] + self._edge_attrs = label_names['edge_attrs'] + self.clean_labels() + + + def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]): + self._node_labels = node_labels + self._node_attrs = node_attrs + self._edge_labels = edge_labels + self._edge_attrs = edge_attrs + + + def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None): + # @todo: remove labels which have only one possible values. + if node_labels is None: + self._node_labels = self._graphs[0].graph['node_labels'] +# # graphs are considered node unlabeled if all nodes have the same label. +# infos.update({'node_labeled': is_nl if node_label_num > 1 else False}) + if node_attrs is None: + self._node_attrs = self._graphs[0].graph['node_attrs'] +# for G in Gn: +# for n in G.nodes(data=True): +# if 'attributes' in n[1]: +# return len(n[1]['attributes']) +# return 0 + if edge_labels is None: + self._edge_labels = self._graphs[0].graph['edge_labels'] +# # graphs are considered edge unlabeled if all edges have the same label. +# infos.update({'edge_labeled': is_el if edge_label_num > 1 else False}) + if edge_attrs is None: + self._edge_attrs = self._graphs[0].graph['edge_attrs'] +# for G in Gn: +# if nx.number_of_edges(G) > 0: +# for e in G.edges(data=True): +# if 'attributes' in e[2]: +# return len(e[2]['attributes']) +# return 0 + + + def get_dataset_infos(self, keys=None, params=None): + """Computes and returns the structure and property information of the graph dataset. + + Parameters + ---------- + keys : list, optional + A list of strings which indicate which informations will be returned. The + possible choices includes: + + 'substructures': sub-structures graphs contains, including 'linear', 'non + linear' and 'cyclic'. + + 'node_label_dim': whether vertices have symbolic labels. + + 'edge_label_dim': whether egdes have symbolic labels. + + 'directed': whether graphs in dataset are directed. + + 'dataset_size': number of graphs in dataset. + + 'total_node_num': total number of vertices of all graphs in dataset. + + 'ave_node_num': average number of vertices of graphs in dataset. + + 'min_node_num': minimum number of vertices of graphs in dataset. + + 'max_node_num': maximum number of vertices of graphs in dataset. + + 'total_edge_num': total number of edges of all graphs in dataset. + + 'ave_edge_num': average number of edges of graphs in dataset. + + 'min_edge_num': minimum number of edges of graphs in dataset. + + 'max_edge_num': maximum number of edges of graphs in dataset. + + 'ave_node_degree': average vertex degree of graphs in dataset. + + 'min_node_degree': minimum vertex degree of graphs in dataset. + + 'max_node_degree': maximum vertex degree of graphs in dataset. + + 'ave_fill_factor': average fill factor (number_of_edges / + (number_of_nodes ** 2)) of graphs in dataset. + + 'min_fill_factor': minimum fill factor of graphs in dataset. 
+ + 'max_fill_factor': maximum fill factor of graphs in dataset. + + 'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset. + + 'edge_label_nums': list number of symbolic edge labels of graphs in dataset. + + 'node_attr_dim': number of dimensions of non-symbolic vertex labels. + Extracted from the 'attributes' attribute of graph nodes. + + 'edge_attr_dim': number of dimensions of non-symbolic edge labels. + Extracted from the 'attributes' attribute of graph edges. + + 'class_number': number of classes. Only available for classification problems. + + 'all_degree_entropy': the entropy of degree distribution of each graph. + + 'ave_degree_entropy': the average entropy of degree distribution of all graphs. + + All informations above will be returned if `keys` is not given. + + params: dict of dict, optional + A dictinary which contains extra parameters for each possible + element in ``keys``. + + Return + ------ + dict + Information of the graph dataset keyed by `keys`. + """ + infos = {} + + if keys == None: + keys = [ + 'substructures', + 'node_label_dim', + 'edge_label_dim', + 'directed', + 'dataset_size', + 'total_node_num', + 'ave_node_num', + 'min_node_num', + 'max_node_num', + 'total_edge_num', + 'ave_edge_num', + 'min_edge_num', + 'max_edge_num', + 'ave_node_degree', + 'min_node_degree', + 'max_node_degree', + 'ave_fill_factor', + 'min_fill_factor', + 'max_fill_factor', + 'node_label_nums', + 'edge_label_nums', + 'node_attr_dim', + 'edge_attr_dim', + 'class_number', + 'all_degree_entropy', + 'ave_degree_entropy' + ] + + # dataset size + if 'dataset_size' in keys: + if self._dataset_size is None: + self._dataset_size = self._get_dataset_size() + infos['dataset_size'] = self._dataset_size + + # graph node number + if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']): + all_node_nums = self._get_all_node_nums() + + if 'total_node_num' in keys: + if self._total_node_num is None: + self._total_node_num = self._get_total_node_num(all_node_nums) + infos['total_node_num'] = self._total_node_num + + if 'ave_node_num' in keys: + if self._ave_node_num is None: + self._ave_node_num = self._get_ave_node_num(all_node_nums) + infos['ave_node_num'] = self._ave_node_num + + if 'min_node_num' in keys: + if self._min_node_num is None: + self._min_node_num = self._get_min_node_num(all_node_nums) + infos['min_node_num'] = self._min_node_num + + if 'max_node_num' in keys: + if self._max_node_num is None: + self._max_node_num = self._get_max_node_num(all_node_nums) + infos['max_node_num'] = self._max_node_num + + # graph edge number + if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']): + all_edge_nums = self._get_all_edge_nums() + + if 'total_edge_num' in keys: + if self._total_edge_num is None: + self._total_edge_num = self._get_total_edge_num(all_edge_nums) + infos['total_edge_num'] = self._total_edge_num + + if 'ave_edge_num' in keys: + if self._ave_edge_num is None: + self._ave_edge_num = self._get_ave_edge_num(all_edge_nums) + infos['ave_edge_num'] = self._ave_edge_num + + if 'max_edge_num' in keys: + if self._max_edge_num is None: + self._max_edge_num = self._get_max_edge_num(all_edge_nums) + infos['max_edge_num'] = self._max_edge_num + + if 'min_edge_num' in keys: + if self._min_edge_num is None: + self._min_edge_num = self._get_min_edge_num(all_edge_nums) + infos['min_edge_num'] = self._min_edge_num + + # label number + if 'node_label_dim' in keys: + if self._node_label_dim is None: + 
self._node_label_dim = self._get_node_label_dim() + infos['node_label_dim'] = self._node_label_dim + + if 'node_label_nums' in keys: + if self._node_label_nums is None: + self._node_label_nums = {} + for node_label in self._node_labels: + self._node_label_nums[node_label] = self._get_node_label_num(node_label) + infos['node_label_nums'] = self._node_label_nums + + if 'edge_label_dim' in keys: + if self._edge_label_dim is None: + self._edge_label_dim = self._get_edge_label_dim() + infos['edge_label_dim'] = self._edge_label_dim + + if 'edge_label_nums' in keys: + if self._edge_label_nums is None: + self._edge_label_nums = {} + for edge_label in self._edge_labels: + self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label) + infos['edge_label_nums'] = self._edge_label_nums + + if 'directed' in keys or 'substructures' in keys: + if self._directed is None: + self._directed = self._is_directed() + infos['directed'] = self._directed + + # node degree + if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']): + all_node_degrees = self._get_all_node_degrees() + + if 'ave_node_degree' in keys: + if self._ave_node_degree is None: + self._ave_node_degree = self._get_ave_node_degree(all_node_degrees) + infos['ave_node_degree'] = self._ave_node_degree + + if 'max_node_degree' in keys: + if self._max_node_degree is None: + self._max_node_degree = self._get_max_node_degree(all_node_degrees) + infos['max_node_degree'] = self._max_node_degree + + if 'min_node_degree' in keys: + if self._min_node_degree is None: + self._min_node_degree = self._get_min_node_degree(all_node_degrees) + infos['min_node_degree'] = self._min_node_degree + + # fill factor + if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']): + all_fill_factors = self._get_all_fill_factors() + + if 'ave_fill_factor' in keys: + if self._ave_fill_factor is None: + self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors) + infos['ave_fill_factor'] = self._ave_fill_factor + + if 'max_fill_factor' in keys: + if self._max_fill_factor is None: + self._max_fill_factor = self._get_max_fill_factor(all_fill_factors) + infos['max_fill_factor'] = self._max_fill_factor + + if 'min_fill_factor' in keys: + if self._min_fill_factor is None: + self._min_fill_factor = self._get_min_fill_factor(all_fill_factors) + infos['min_fill_factor'] = self._min_fill_factor + + if 'substructures' in keys: + if self._substructures is None: + self._substructures = self._get_substructures() + infos['substructures'] = self._substructures + + if 'class_number' in keys: + if self._class_number is None: + self._class_number = self._get_class_number() + infos['class_number'] = self._class_number + + if 'node_attr_dim' in keys: + if self._node_attr_dim is None: + self._node_attr_dim = self._get_node_attr_dim() + infos['node_attr_dim'] = self._node_attr_dim + + if 'edge_attr_dim' in keys: + if self._edge_attr_dim is None: + self._edge_attr_dim = self._get_edge_attr_dim() + infos['edge_attr_dim'] = self._edge_attr_dim + + # entropy of degree distribution. 
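As a usage sketch of the selective-computation pattern above (the dataset name is hypothetical and assumes the bundled `datasets` directory is present), the `params` argument supplies per-key options such as the logarithm base consumed by the degree-entropy block that follows:

```python
from gklearn.dataset.dataset import Dataset

ds = Dataset()
ds.load_predefined_dataset('MUTAG')  # any of the pre-defined names above works the same way

# Only the requested keys are computed; 'all_degree_entropy' reads its log base from `params`.
infos = ds.get_dataset_infos(
    keys=['dataset_size', 'ave_node_num', 'ave_edge_num', 'node_label_dim', 'all_degree_entropy'],
    params={'all_degree_entropy': {'base': 2}})
ds.print_graph_infos(infos)
```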
+ + if 'all_degree_entropy' in keys: + if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']): + base = params['all_degree_entropy']['base'] + else: + base = None + infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base) + + if 'ave_degree_entropy' in keys: + if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']): + base = params['ave_degree_entropy']['base'] + else: + base = None + infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) + + return infos + + + def print_graph_infos(self, infos): + from collections import OrderedDict + keys = list(infos.keys()) + print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0])))) + + + def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): + node_labels = [item for item in node_labels if item in self._node_labels] + edge_labels = [item for item in edge_labels if item in self._edge_labels] + node_attrs = [item for item in node_attrs if item in self._node_attrs] + edge_attrs = [item for item in edge_attrs if item in self._edge_attrs] + + for g in self._graphs: + for nd in g.nodes(): + for nl in node_labels: + del g.nodes[nd][nl] + for na in node_attrs: + del g.nodes[nd][na] + for ed in g.edges(): + for el in edge_labels: + del g.edges[ed][el] + for ea in edge_attrs: + del g.edges[ed][ea] + if len(node_labels) > 0: + self._node_labels = [nl for nl in self._node_labels if nl not in node_labels] + if len(edge_labels) > 0: + self._edge_labels = [el for el in self._edge_labels if el not in edge_labels] + if len(node_attrs) > 0: + self._node_attrs = [na for na in self._node_attrs if na not in node_attrs] + if len(edge_attrs) > 0: + self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs] + + + def clean_labels(self): + labels = [] + for name in self._node_labels: + label = set() + for G in self._graphs: + label = label | set(nx.get_node_attributes(G, name).values()) + if len(label) > 1: + labels.append(name) + break + if len(label) < 2: + for G in self._graphs: + for nd in G.nodes(): + del G.nodes[nd][name] + self._node_labels = labels + + labels = [] + for name in self._edge_labels: + label = set() + for G in self._graphs: + label = label | set(nx.get_edge_attributes(G, name).values()) + if len(label) > 1: + labels.append(name) + break + if len(label) < 2: + for G in self._graphs: + for ed in G.edges(): + del G.edges[ed][name] + self._edge_labels = labels + + labels = [] + for name in self._node_attrs: + label = set() + for G in self._graphs: + label = label | set(nx.get_node_attributes(G, name).values()) + if len(label) > 1: + labels.append(name) + break + if len(label) < 2: + for G in self._graphs: + for nd in G.nodes(): + del G.nodes[nd][name] + self._node_attrs = labels + + labels = [] + for name in self._edge_attrs: + label = set() + for G in self._graphs: + label = label | set(nx.get_edge_attributes(G, name).values()) + if len(label) > 1: + labels.append(name) + break + if len(label) < 2: + for G in self._graphs: + for ed in G.edges(): + del G.edges[ed][name] + self._edge_attrs = labels + + + def cut_graphs(self, range_): + self._graphs = [self._graphs[i] for i in range_] + if self._targets is not None: + self._targets = [self._targets[i] for i in range_] + self.clean_labels() + + + def trim_dataset(self, edge_required=False): + if edge_required: + trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and 
nx.number_of_edges(g) != 0)] + else: + trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0] + idx = [p[0] for p in trimed_pairs] + self._graphs = [p[1] for p in trimed_pairs] + self._targets = [self._targets[i] for i in idx] + self.clean_labels() + + + def copy(self): + dataset = Dataset() + graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None + target = self._targets.copy() if self._targets is not None else None + node_labels = self._node_labels.copy() if self._node_labels is not None else None + node_attrs = self._node_attrs.copy() if self._node_attrs is not None else None + edge_labels = self._edge_labels.copy() if self._edge_labels is not None else None + edge_attrs = self._edge_attrs.copy() if self._edge_attrs is not None else None + dataset.load_graphs(graphs, target) + dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) + # @todo: clean_labels and add other class members? + return dataset + + + def get_all_node_labels(self): + node_labels = [] + for g in self._graphs: + for n in g.nodes(): + nl = tuple(g.nodes[n].items()) + if nl not in node_labels: + node_labels.append(nl) + return node_labels + + + def get_all_edge_labels(self): + edge_labels = [] + for g in self._graphs: + for e in g.edges(): + el = tuple(g.edges[e].items()) + if el not in edge_labels: + edge_labels.append(el) + return edge_labels + + + def _get_dataset_size(self): + return len(self._graphs) + + + def _get_all_node_nums(self): + return [nx.number_of_nodes(G) for G in self._graphs] + + + def _get_total_node_nums(self, all_node_nums): + return np.sum(all_node_nums) + + + def _get_ave_node_num(self, all_node_nums): + return np.mean(all_node_nums) + + + def _get_min_node_num(self, all_node_nums): + return np.amin(all_node_nums) + + + def _get_max_node_num(self, all_node_nums): + return np.amax(all_node_nums) + + + def _get_all_edge_nums(self): + return [nx.number_of_edges(G) for G in self._graphs] + + + def _get_total_edge_nums(self, all_edge_nums): + return np.sum(all_edge_nums) + + + def _get_ave_edge_num(self, all_edge_nums): + return np.mean(all_edge_nums) + + + def _get_min_edge_num(self, all_edge_nums): + return np.amin(all_edge_nums) + + + def _get_max_edge_num(self, all_edge_nums): + return np.amax(all_edge_nums) + + + def _get_node_label_dim(self): + return len(self._node_labels) + + + def _get_node_label_num(self, node_label): + nl = set() + for G in self._graphs: + nl = nl | set(nx.get_node_attributes(G, node_label).values()) + return len(nl) + + + def _get_edge_label_dim(self): + return len(self._edge_labels) + + + def _get_edge_label_num(self, edge_label): + el = set() + for G in self._graphs: + el = el | set(nx.get_edge_attributes(G, edge_label).values()) + return len(el) + + + def _is_directed(self): + return nx.is_directed(self._graphs[0]) + + + def _get_all_node_degrees(self): + return [np.mean(list(dict(G.degree()).values())) for G in self._graphs] + + + def _get_ave_node_degree(self, all_node_degrees): + return np.mean(all_node_degrees) + + + def _get_max_node_degree(self, all_node_degrees): + return np.amax(all_node_degrees) + + + def _get_min_node_degree(self, all_node_degrees): + return np.amin(all_node_degrees) + + + def _get_all_fill_factors(self): + """Get fill factor, the number of non-zero entries in the adjacency matrix. + + Returns + ------- + list[float] + List of fill factors for all graphs. 
+ """ + return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs] + + + def _get_ave_fill_factor(self, all_fill_factors): + return np.mean(all_fill_factors) + + + def _get_max_fill_factor(self, all_fill_factors): + return np.amax(all_fill_factors) + + + def _get_min_fill_factor(self, all_fill_factors): + return np.amin(all_fill_factors) + + + def _get_substructures(self): + subs = set() + for G in self._graphs: + degrees = list(dict(G.degree()).values()) + if any(i == 2 for i in degrees): + subs.add('linear') + if np.amax(degrees) >= 3: + subs.add('non linear') + if 'linear' in subs and 'non linear' in subs: + break + + if self._directed: + for G in self._graphs: + if len(list(nx.find_cycle(G))) > 0: + subs.add('cyclic') + break + # else: + # # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way. + # upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10 + # for G in Gn: + # if (nx.number_of_edges(G) < upper): + # cyc = list(nx.simple_cycles(G.to_directed())) + # if any(len(i) > 2 for i in cyc): + # subs.add('cyclic') + # break + # if 'cyclic' not in subs: + # for G in Gn: + # cyc = list(nx.simple_cycles(G.to_directed())) + # if any(len(i) > 2 for i in cyc): + # subs.add('cyclic') + # break + + return subs + + + def _get_class_num(self): + return len(set(self._targets)) + + + def _get_node_attr_dim(self): + return len(self._node_attrs) + + + def _get_edge_attr_dim(self): + return len(self._edge_attrs) + + + def _compute_all_degree_entropy(self, base=None): + """Compute the entropy of degree distribution of each graph. + + Parameters + ---------- + base : float, optional + The logarithmic base to use. The default is ``e`` (natural logarithm). + + Returns + ------- + degree_entropy : float + The calculated entropy. + """ + from gklearn.utils.stats import entropy + + degree_entropy = [] + for g in self._graphs: + degrees = list(dict(g.degree()).values()) + en = entropy(degrees, base=base) + degree_entropy.append(en) + return degree_entropy + + + @property + def graphs(self): + return self._graphs + + + @property + def targets(self): + return self._targets + + + @property + def node_labels(self): + return self._node_labels + + + @property + def edge_labels(self): + return self._edge_labels + + + @property + def node_attrs(self): + return self._node_attrs + + + @property + def edge_attrs(self): + return self._edge_attrs + + +def split_dataset_by_target(dataset): + from gklearn.preimage.utils import get_same_item_indices + + graphs = dataset.graphs + targets = dataset.targets + datasets = [] + idx_targets = get_same_item_indices(targets) + for key, val in idx_targets.items(): + sub_graphs = [graphs[i] for i in val] + sub_dataset = Dataset() + sub_dataset.load_graphs(sub_graphs, [key] * len(val)) + node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None + node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None + edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None + edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None + sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) + datasets.append(sub_dataset) + # @todo: clean_labels? 
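# A usage sketch for split_dataset_by_target defined above (the dataset name is hypothetical;
# any classification dataset loaded into a Dataset instance works the same way):
#
#     from gklearn.dataset.dataset import Dataset, split_dataset_by_target
#     ds = Dataset()
#     ds.load_predefined_dataset('MAO')
#     sub_datasets = split_dataset_by_target(ds)  # one sub-dataset per distinct target value
#     for sub in sub_datasets:
#         print(len(sub.graphs), set(sub.targets))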
+ return datasets \ No newline at end of file diff --git a/gklearn/dataset/file_managers.py b/gklearn/dataset/file_managers.py new file mode 100644 index 0000000..f2e539e --- /dev/null +++ b/gklearn/dataset/file_managers.py @@ -0,0 +1,824 @@ +""" Utilities function to manage graph files +""" +from os.path import dirname, splitext + + +class DataLoader(): + + + def __init__(self, filename, filename_targets=None, gformat=None, **kwargs): + """Read graph data from filename and load them as NetworkX graphs. + + Parameters + ---------- + filename : string + The name of the file from where the dataset is read. + filename_targets : string + The name of file of the targets corresponding to graphs. + + Notes + ----- + This function supports following graph dataset formats: + + 'ds': load data from .ds file. See comments of function loadFromDS for a example. + + 'cxl': load data from Graph eXchange Language file (.cxl file). See + `here `__ for detail. + + 'sdf': load data from structured data file (.sdf file). See + `here `__ + for details. + + 'mat': Load graph data from a MATLAB (up to version 7.1) .mat file. See + README in `downloadable file `__ + for details. + + 'txt': Load graph data from the TUDataset. See + `here `__ + for details. Note here filename is the name of either .txt file in + the dataset directory. + """ + extension = splitext(filename)[1][1:] + if extension == "ds": + self._graphs, self._targets, self._label_names = self.load_from_ds(filename, filename_targets) + elif extension == "cxl": + dir_dataset = kwargs.get('dirname_dataset', None) + self._graphs, self._targets, self._label_names = self.load_from_xml(filename, dir_dataset) + elif extension == 'xml': + dir_dataset = kwargs.get('dirname_dataset', None) + self._graphs, self._targets, self._label_names = self.load_from_xml(filename, dir_dataset) + elif extension == "mat": + order = kwargs.get('order') + self._graphs, self._targets, self._label_names = self.load_mat(filename, order) + elif extension == 'txt': + self._graphs, self._targets, self._label_names = self.load_tud(filename) + else: + raise ValueError('The input file with the extension ".', extension, '" is not supported. The supported extensions includes: ".ds", ".cxl", ".xml", ".mat", ".txt".') + + + def load_from_ds(self, filename, filename_targets): + """Load data from .ds file. + + Possible graph formats include: + + '.ct': see function load_ct for detail. + + '.gxl': see dunction load_gxl for detail. + + Note these graph formats are checked automatically by the extensions of + graph files. + """ + dirname_dataset = dirname(filename) + data = [] + y = [] + label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} + with open(filename) as fn: + content = fn.read().splitlines() + extension = splitext(content[0].split(' ')[0])[1][1:] + if extension == 'ct': + load_file_fun = self.load_ct + elif extension == 'gxl' or extension == 'sdf': # @todo: .sdf not tested yet. 
+ load_file_fun = self.load_gxl + + if filename_targets is None or filename_targets == '': + for i in range(0, len(content)): + tmp = content[i].split(' ') + # remove the '#'s in file names + g, l_names = load_file_fun(dirname_dataset + '/' + tmp[0].replace('#', '', 1)) + data.append(g) + self._append_label_names(label_names, l_names) + y.append(float(tmp[1])) + else: # targets in a seperate file + for i in range(0, len(content)): + tmp = content[i] + # remove the '#'s in file names + g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1)) + data.append(g) + self._append_label_names(label_names, l_names) + + with open(filename_targets) as fnt: + content_y = fnt.read().splitlines() + # assume entries in filename and filename_targets have the same order. + for item in content_y: + tmp = item.split(' ') + # assume the 3rd entry in a line is y (for Alkane dataset) + y.append(float(tmp[2])) + + return data, y, label_names + + + def load_from_xml(self, filename, dir_dataset=None): + import xml.etree.ElementTree as ET + + if dir_dataset is not None: + dir_dataset = dir_dataset + else: + dir_dataset = dirname(filename) + tree = ET.parse(filename) + root = tree.getroot() + data = [] + y = [] + label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} + for graph in root.iter('graph'): + mol_filename = graph.attrib['file'] + mol_class = graph.attrib['class'] + g, l_names = self.load_gxl(dir_dataset + '/' + mol_filename) + data.append(g) + self._append_label_names(label_names, l_names) + y.append(mol_class) + + return data, y, label_names + + + def load_mat(self, filename, order): # @todo: need to be updated (auto order) or deprecated. + """Load graph data from a MATLAB (up to version 7.1) .mat file. + + Notes + ------ + A MAT file contains a struct array containing graphs, and a column vector lx containing a class label for each graph. + Check README in `downloadable file `__ for detailed structure. + """ + from scipy.io import loadmat + import numpy as np + import networkx as nx + data = [] + content = loadmat(filename) + for key, value in content.items(): + if key[0] == 'l': # class label + y = np.transpose(value)[0].tolist() + elif key[0] != '_': + # if adjacency matrix is not compressed / edge label exists + if order[1] == 0: + for i, item in enumerate(value[0]): + g = nx.Graph(name=i) # set name of the graph + nl = np.transpose(item[order[3]][0][0][0]) # node label + for index, label in enumerate(nl[0]): + g.add_node(index, label_1=str(label)) + el = item[order[4]][0][0][0] # edge label + for edge in el: + g.add_edge(edge[0] - 1, edge[1] - 1, label_1=str(edge[2])) + data.append(g) + else: + for i, item in enumerate(value[0]): + g = nx.Graph(name=i) # set name of the graph + nl = np.transpose(item[order[3]][0][0][0]) # node label + for index, label in enumerate(nl[0]): + g.add_node(index, label_1=str(label)) + sam = item[order[0]] # sparse adjacency matrix + index_no0 = sam.nonzero() + for col, row in zip(index_no0[0], index_no0[1]): + g.add_edge(col, row) + data.append(g) + + label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} + if order[1] == 0: + label_names['edge_labels'].append('label_1') + + return data, y, label_names + + + def load_tud(self, filename): + """Load graph data from TUD dataset files. + + Notes + ------ + The graph data is loaded from separate files. + Check README in `downloadable file `__, 2018 for detailed structure. 
+ """ + import networkx as nx + from os import listdir + from os.path import dirname, basename + + + def get_infos_from_readme(frm): # @todo: add README (cuniform), maybe node/edge label maps. + """Get information from DS_label_readme.txt file. + """ + + def get_label_names_from_line(line): + """Get names of labels/attributes from a line. + """ + str_names = line.split('[')[1].split(']')[0] + names = str_names.split(',') + names = [attr.strip() for attr in names] + return names + + + def get_class_label_map(label_map_strings): + label_map = {} + for string in label_map_strings: + integer, label = string.split('\t') + label_map[int(integer.strip())] = label.strip() + return label_map + + + label_names = {'node_labels': [], 'node_attrs': [], + 'edge_labels': [], 'edge_attrs': []} + class_label_map = None + class_label_map_strings = [] + with open(frm) as rm: + content_rm = rm.read().splitlines() + i = 0 + while i < len(content_rm): + line = content_rm[i].strip() + # get node/edge labels and attributes. + if line.startswith('Node labels:'): + label_names['node_labels'] = get_label_names_from_line(line) + elif line.startswith('Node attributes:'): + label_names['node_attrs'] = get_label_names_from_line(line) + elif line.startswith('Edge labels:'): + label_names['edge_labels'] = get_label_names_from_line(line) + elif line.startswith('Edge attributes:'): + label_names['edge_attrs'] = get_label_names_from_line(line) + # get class label map. + elif line.startswith('Class labels were converted to integer values using this map:'): + i += 2 + line = content_rm[i].strip() + while line != '' and i < len(content_rm): + class_label_map_strings.append(line) + i += 1 + line = content_rm[i].strip() + class_label_map = get_class_label_map(class_label_map_strings) + i += 1 + + return label_names, class_label_map + + + # get dataset name. + dirname_dataset = dirname(filename) + filename = basename(filename) + fn_split = filename.split('_A') + ds_name = fn_split[0].strip() + + # load data file names + for name in listdir(dirname_dataset): + if ds_name + '_A' in name: + fam = dirname_dataset + '/' + name + elif ds_name + '_graph_indicator' in name: + fgi = dirname_dataset + '/' + name + elif ds_name + '_graph_labels' in name: + fgl = dirname_dataset + '/' + name + elif ds_name + '_node_labels' in name: + fnl = dirname_dataset + '/' + name + elif ds_name + '_edge_labels' in name: + fel = dirname_dataset + '/' + name + elif ds_name + '_edge_attributes' in name: + fea = dirname_dataset + '/' + name + elif ds_name + '_node_attributes' in name: + fna = dirname_dataset + '/' + name + elif ds_name + '_graph_attributes' in name: + fga = dirname_dataset + '/' + name + elif ds_name + '_label_readme' in name: + frm = dirname_dataset + '/' + name + # this is supposed to be the node attrs, make sure to put this as the last 'elif' + elif ds_name + '_attributes' in name: + fna = dirname_dataset + '/' + name + + # get labels and attributes names. + if 'frm' in locals(): + label_names, class_label_map = get_infos_from_readme(frm) + else: + label_names = {'node_labels': [], 'node_attrs': [], + 'edge_labels': [], 'edge_attrs': []} + class_label_map = None + + with open(fgi) as gi: + content_gi = gi.read().splitlines() # graph indicator + with open(fam) as am: + content_am = am.read().splitlines() # adjacency matrix + + # load targets. 
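The file-discovery loop above locates the TUDataset member files purely by name. A typical layout and the corresponding loader call look roughly like this (dataset name and path are illustrative):

```python
from gklearn.dataset.file_managers import DataLoader

# MUTAG/
#   MUTAG_A.txt                 # one "row, col" edge per line, 1-based node ids
#   MUTAG_graph_indicator.txt   # line i: id of the graph that node i belongs to
#   MUTAG_graph_labels.txt      # one class label per graph (or *_graph_attributes.txt for regression)
#   MUTAG_node_labels.txt       # optional symbolic node labels
#   MUTAG_edge_labels.txt       # optional symbolic edge labels
loader = DataLoader('datasets/MUTAG/MUTAG_A.txt')
graphs, targets, label_names = loader.data
```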
+ if 'fgl' in locals(): + with open(fgl) as gl: + content_targets = gl.read().splitlines() # targets (classification) + targets = [float(i) for i in content_targets] + elif 'fga' in locals(): + with open(fga) as ga: + content_targets = ga.read().splitlines() # targets (regression) + targets = [int(i) for i in content_targets] + else: + raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.') + if class_label_map is not None: + targets = [class_label_map[t] for t in targets] + + # create graphs and add nodes + data = [nx.Graph(name=str(i)) for i in range(0, len(content_targets))] + if 'fnl' in locals(): + with open(fnl) as nl: + content_nl = nl.read().splitlines() # node labels + for idx, line in enumerate(content_gi): + # transfer to int first in case of unexpected blanks + data[int(line) - 1].add_node(idx) + labels = [l.strip() for l in content_nl[idx].split(',')] + if label_names['node_labels'] == []: # @todo: need fix bug. + for i, label in enumerate(labels): + l_name = 'label_' + str(i) + data[int(line) - 1].nodes[idx][l_name] = label + label_names['node_labels'].append(l_name) + else: + for i, l_name in enumerate(label_names['node_labels']): + data[int(line) - 1].nodes[idx][l_name] = labels[i] + else: + for i, line in enumerate(content_gi): + data[int(line) - 1].add_node(i) + + # add edges + for line in content_am: + tmp = line.split(',') + n1 = int(tmp[0]) - 1 + n2 = int(tmp[1]) - 1 + # ignore edge weight here. + g = int(content_gi[n1]) - 1 + data[g].add_edge(n1, n2) + + # add edge labels + if 'fel' in locals(): + with open(fel) as el: + content_el = el.read().splitlines() + for idx, line in enumerate(content_el): + labels = [l.strip() for l in line.split(',')] + n = [int(i) - 1 for i in content_am[idx].split(',')] + g = int(content_gi[n[0]]) - 1 + if label_names['edge_labels'] == []: + for i, label in enumerate(labels): + l_name = 'label_' + str(i) + data[g].edges[n[0], n[1]][l_name] = label + label_names['edge_labels'].append(l_name) + else: + for i, l_name in enumerate(label_names['edge_labels']): + data[g].edges[n[0], n[1]][l_name] = labels[i] + + # add node attributes + if 'fna' in locals(): + with open(fna) as na: + content_na = na.read().splitlines() + for idx, line in enumerate(content_na): + attrs = [a.strip() for a in line.split(',')] + g = int(content_gi[idx]) - 1 + if label_names['node_attrs'] == []: + for i, attr in enumerate(attrs): + a_name = 'attr_' + str(i) + data[g].nodes[idx][a_name] = attr + label_names['node_attrs'].append(a_name) + else: + for i, a_name in enumerate(label_names['node_attrs']): + data[g].nodes[idx][a_name] = attrs[i] + + # add edge attributes + if 'fea' in locals(): + with open(fea) as ea: + content_ea = ea.read().splitlines() + for idx, line in enumerate(content_ea): + attrs = [a.strip() for a in line.split(',')] + n = [int(i) - 1 for i in content_am[idx].split(',')] + g = int(content_gi[n[0]]) - 1 + if label_names['edge_attrs'] == []: + for i, attr in enumerate(attrs): + a_name = 'attr_' + str(i) + data[g].edges[n[0], n[1]][a_name] = attr + label_names['edge_attrs'].append(a_name) + else: + for i, a_name in enumerate(label_names['edge_attrs']): + data[g].edges[n[0], n[1]][a_name] = attrs[i] + + return data, targets, label_names + + + def load_ct(self, filename): # @todo: this function is only tested on CTFile V2000; header not considered; only simple cases (atoms and bonds are considered.) 
+ """load data from a Chemical Table (.ct) file. + + Notes + ------ + a typical example of data in .ct is like this: + + 3 2 <- number of nodes and edges + + 0.0000 0.0000 0.0000 C <- each line describes a node (x,y,z + label) + + 0.0000 0.0000 0.0000 C + + 0.0000 0.0000 0.0000 O + + 1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo + + 2 3 1 1 + + Check `CTFile Formats file `__ + for detailed format discription. + """ + import networkx as nx + from os.path import basename + g = nx.Graph() + with open(filename) as f: + content = f.read().splitlines() + g = nx.Graph(name=str(content[0]), filename=basename(filename)) # set name of the graph + + # read the counts line. + tmp = content[1].split(' ') + tmp = [x for x in tmp if x != ''] + nb_atoms = int(tmp[0].strip()) # number of atoms + nb_bonds = int(tmp[1].strip()) # number of bonds + count_line_tags = ['number_of_atoms', 'number_of_bonds', 'number_of_atom_lists', '', 'chiral_flag', 'number_of_stext_entries', '', '', '', '', 'number_of_properties', 'CT_version'] + i = 0 + while i < len(tmp): + if count_line_tags[i] != '': # if not obsoleted + g.graph[count_line_tags[i]] = tmp[i].strip() + i += 1 + + # read the atom block. + atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag'] + for i in range(0, nb_atoms): + tmp = content[i + 2].split(' ') + tmp = [x for x in tmp if x != ''] + g.add_node(i) + j = 0 + while j < len(tmp): + if atom_tags[j] != '': + g.nodes[i][atom_tags[j]] = tmp[j].strip() + j += 1 + + # read the bond block. + bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status'] + for i in range(0, nb_bonds): + tmp = content[i + g.number_of_nodes() + 2].split(' ') + tmp = [x for x in tmp if x != ''] + n1, n2 = int(tmp[0].strip()) - 1, int(tmp[1].strip()) - 1 + g.add_edge(n1, n2) + j = 2 + while j < len(tmp): + if bond_tags[j] != '': + g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip() + j += 1 + + # get label names. + label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} + atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1] + for nd in g.nodes(): + for key in g.nodes[nd]: + if atom_symbolic[atom_tags.index(key)] == 1: + label_names['node_labels'].append(key) + else: + label_names['node_attrs'].append(key) + break + bond_symbolic = [None, None, 1, 1, None, 1, 1] + for ed in g.edges(): + for key in g.edges[ed]: + if bond_symbolic[bond_tags.index(key)] == 1: + label_names['edge_labels'].append(key) + else: + label_names['edge_attrs'].append(key) + break + + return g, label_names + + + def load_gxl(self, filename): # @todo: directed graphs. 
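# A minimal GXL file of the kind this parser expects (hypothetical example; attribute names
# vary per dataset -- values tagged <int> are collected as symbolic labels below, anything
# else as non-symbolic attributes):
#
#   <gxl>
#     <graph id="molecule_0" edgeids="false" edgemode="undirected">
#       <node id="_0"><attr name="chem"><int>6</int></attr></node>
#       <node id="_1"><attr name="chem"><int>8</int></attr></node>
#       <edge from="_0" to="_1"><attr name="valence"><int>1</int></attr></edge>
#     </graph>
#   </gxl>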
+ from os.path import basename + import networkx as nx + import xml.etree.ElementTree as ET + + tree = ET.parse(filename) + root = tree.getroot() + index = 0 + g = nx.Graph(filename=basename(filename), name=root[0].attrib['id']) + dic = {} # used to retrieve incident nodes of edges + for node in root.iter('node'): + dic[node.attrib['id']] = index + labels = {} + for attr in node.iter('attr'): + labels[attr.attrib['name']] = attr[0].text + g.add_node(index, **labels) + index += 1 + + for edge in root.iter('edge'): + labels = {} + for attr in edge.iter('attr'): + labels[attr.attrib['name']] = attr[0].text + g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) + + # get label names. + label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} + for node in root.iter('node'): + for attr in node.iter('attr'): + if attr[0].tag == 'int': # @todo: this maybe wrong, and slow. + label_names['node_labels'].append(attr.attrib['name']) + else: + label_names['node_attrs'].append(attr.attrib['name']) + break + for edge in root.iter('edge'): + for attr in edge.iter('attr'): + if attr[0].tag == 'int': # @todo: this maybe wrong, and slow. + label_names['edge_labels'].append(attr.attrib['name']) + else: + label_names['edge_attrs'].append(attr.attrib['name']) + break + + return g, label_names + + + def _append_label_names(self, label_names, new_names): + for key, val in label_names.items(): + label_names[key] += [name for name in new_names[key] if name not in val] + + + @property + def data(self): + return self._graphs, self._targets, self._label_names + + + @property + def graphs(self): + return self._graphs + + + @property + def targets(self): + return self._targets + + + @property + def label_names(self): + return self._label_names + + +class DataSaver(): + + + def __init__(self, graphs, targets=None, filename='gfile', gformat='gxl', group=None, **kwargs): + """Save list of graphs. 
+ """ + import os + dirname_ds = os.path.dirname(filename) + if dirname_ds != '': + dirname_ds += '/' + os.makedirs(dirname_ds, exist_ok=True) + + if 'graph_dir' in kwargs: + graph_dir = kwargs['graph_dir'] + '/' + os.makedirs(graph_dir, exist_ok=True) + del kwargs['graph_dir'] + else: + graph_dir = dirname_ds + + if group == 'xml' and gformat == 'gxl': + with open(filename + '.xml', 'w') as fgroup: + fgroup.write("") + fgroup.write("\n") + fgroup.write("\n") + for idx, g in enumerate(graphs): + fname_tmp = "graph" + str(idx) + ".gxl" + self.save_gxl(g, graph_dir + fname_tmp, **kwargs) + fgroup.write("\n\t") + fgroup.write("\n") + fgroup.close() + + + def save_gxl(self, graph, filename, method='default', node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): + if method == 'default': + gxl_file = open(filename, 'w') + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + if 'name' in graph.graph: + name = str(graph.graph['name']) + else: + name = 'dummy' + gxl_file.write("\n") + for v, attrs in graph.nodes(data=True): + gxl_file.write("") + for l_name in node_labels: + gxl_file.write("" + + str(attrs[l_name]) + "") + for a_name in node_attrs: + gxl_file.write("" + + str(attrs[a_name]) + "") + gxl_file.write("\n") + for v1, v2, attrs in graph.edges(data=True): + gxl_file.write("") + for l_name in edge_labels: + gxl_file.write("" + + str(attrs[l_name]) + "") + for a_name in edge_attrs: + gxl_file.write("" + + str(attrs[a_name]) + "") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("") + gxl_file.close() + elif method == 'benoit': + import xml.etree.ElementTree as ET + root_node = ET.Element('gxl') + attr = dict() + attr['id'] = str(graph.graph['name']) + attr['edgeids'] = 'true' + attr['edgemode'] = 'undirected' + graph_node = ET.SubElement(root_node, 'graph', attrib=attr) + + for v in graph: + current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) + for attr in graph.nodes[v].keys(): + cur_attr = ET.SubElement( + current_node, 'attr', attrib={'name': attr}) + cur_value = ET.SubElement(cur_attr, + graph.nodes[v][attr].__class__.__name__) + cur_value.text = graph.nodes[v][attr] + + for v1 in graph: + for v2 in graph[v1]: + if (v1 < v2): # Non oriented graphs + cur_edge = ET.SubElement( + graph_node, + 'edge', + attrib={ + 'from': str(v1), + 'to': str(v2) + }) + for attr in graph[v1][v2].keys(): + cur_attr = ET.SubElement( + cur_edge, 'attr', attrib={'name': attr}) + cur_value = ET.SubElement( + cur_attr, graph[v1][v2][attr].__class__.__name__) + cur_value.text = str(graph[v1][v2][attr]) + + tree = ET.ElementTree(root_node) + tree.write(filename) + elif method == 'gedlib': + # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 + # pass + gxl_file = open(filename, 'w') + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + for v, attrs in graph.nodes(data=True): + gxl_file.write("") + gxl_file.write("" + str(attrs['chem']) + "") + gxl_file.write("\n") + for v1, v2, attrs in graph.edges(data=True): + gxl_file.write("") + gxl_file.write("" + str(attrs['valence']) + "") + # gxl_file.write("" + "1" + "") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("") + gxl_file.close() + elif method == 'gedlib-letter': + # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 + # and https://github.com/dbblumenthal/gedlib/blob/master/data/datasets/Letter/HIGH/AP1_0000.gxl + gxl_file = open(filename, 'w') + 
gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("\n") + for v, attrs in graph.nodes(data=True): + gxl_file.write("") + gxl_file.write("" + str(attrs['attributes'][0]) + "") + gxl_file.write("" + str(attrs['attributes'][1]) + "") + gxl_file.write("\n") + for v1, v2, attrs in graph.edges(data=True): + gxl_file.write("\n") + gxl_file.write("\n") + gxl_file.write("") + gxl_file.close() + + +# def loadSDF(filename): +# """load data from structured data file (.sdf file). + +# Notes +# ------ +# A SDF file contains a group of molecules, represented in the similar way as in MOL format. +# Check `here `__ for detailed structure. +# """ +# import networkx as nx +# from os.path import basename +# from tqdm import tqdm +# import sys +# data = [] +# with open(filename) as f: +# content = f.read().splitlines() +# index = 0 +# pbar = tqdm(total=len(content) + 1, desc='load SDF', file=sys.stdout) +# while index < len(content): +# index_old = index + +# g = nx.Graph(name=content[index].strip()) # set name of the graph + +# tmp = content[index + 3] +# nb_nodes = int(tmp[:3]) # number of the nodes +# nb_edges = int(tmp[3:6]) # number of the edges + +# for i in range(0, nb_nodes): +# tmp = content[i + index + 4] +# g.add_node(i, atom=tmp[31:34].strip()) + +# for i in range(0, nb_edges): +# tmp = content[i + index + g.number_of_nodes() + 4] +# tmp = [tmp[i:i + 3] for i in range(0, len(tmp), 3)] +# g.add_edge( +# int(tmp[0]) - 1, int(tmp[1]) - 1, bond_type=tmp[2].strip()) + +# data.append(g) + +# index += 4 + g.number_of_nodes() + g.number_of_edges() +# while content[index].strip() != '$$$$': # seperator +# index += 1 +# index += 1 + +# pbar.update(index - index_old) +# pbar.update(1) +# pbar.close() + +# return data + + +# def load_from_cxl(filename): +# import xml.etree.ElementTree as ET +# +# dirname_dataset = dirname(filename) +# tree = ET.parse(filename) +# root = tree.getroot() +# data = [] +# y = [] +# for graph in root.iter('graph'): +# mol_filename = graph.attrib['file'] +# mol_class = graph.attrib['class'] +# data.append(load_gxl(dirname_dataset + '/' + mol_filename)) +# y.append(mol_class) + + +if __name__ == '__main__': +# ### Load dataset from .ds file. +# # .ct files. +# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', +# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'} +# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y']) +# ds_file = '../../datasets/Acyclic/dataset_bps.ds' # node symb +# Gn, targets, label_names = load_dataset(ds_file) +# ds_file = '../../datasets/MAO/dataset.ds' # node/edge symb +# Gn, targets, label_names = load_dataset(ds_file) +## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled +## Gn, y = loadDataset(ds['dataset']) +# print(Gn[1].graph) +# print(Gn[1].nodes(data=True)) +# print(Gn[1].edges(data=True)) +# print(targets[1]) + +# # .gxl file. +# ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb +# Gn, y, label_names = load_dataset(ds_file) +# print(Gn[1].graph) +# print(Gn[1].nodes(data=True)) +# print(Gn[1].edges(data=True)) +# print(y[1]) + + # .mat file. + ds_file = '../../datasets/MUTAG_mat/MUTAG.mat' + order = [0, 0, 3, 1, 2] + gloader = DataLoader(ds_file, order=order) + Gn, targets, label_names = gloader.data + print(Gn[1].graph) + print(Gn[1].nodes(data=True)) + print(Gn[1].edges(data=True)) + print(targets[1]) + +# ### Convert graph from one format to another. +# # .gxl file. 
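A usage sketch for the DataSaver class above (paths are hypothetical; the label names follow the keys that load_ct assigns, and as written only the gformat='gxl' with group='xml' combination writes files):

```python
from gklearn.dataset.file_managers import DataLoader, DataSaver

loader = DataLoader('../../datasets/MAO/dataset.ds')  # hypothetical path to a .ds dataset
graphs, targets, _ = loader.data
# Writes graph0.gxl, graph1.gxl, ... plus a grouping .xml file next to `filename`.
DataSaver(graphs, targets, filename='temp/mao_out', gformat='gxl', group='xml',
          method='default', node_labels=['atom_symbol'], edge_labels=['bond_type'])
```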
+# import networkx as nx +# ds = {'name': 'monoterpenoides', +# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb +# Gn, y = loadDataset(ds['dataset']) +# y = [int(i) for i in y] +# print(Gn[1].nodes(data=True)) +# print(Gn[1].edges(data=True)) +# print(y[1]) +# # Convert a graph to the proper NetworkX format that can be recognized by library gedlib. +# Gn_new = [] +# for G in Gn: +# G_new = nx.Graph() +# for nd, attrs in G.nodes(data=True): +# G_new.add_node(str(nd), chem=attrs['atom']) +# for nd1, nd2, attrs in G.edges(data=True): +# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) +## G_new.add_edge(str(nd1), str(nd2)) +# Gn_new.append(G_new) +# print(Gn_new[1].nodes(data=True)) +# print(Gn_new[1].edges(data=True)) +# print(Gn_new[1]) +# filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' +# xparams = {'method': 'gedlib'} +# saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) + + # save dataset. +# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', +# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb +# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) +# saveDataset(Gn, y, group='xml', filename='temp/temp') + + # test - new way to add labels and attributes. +# dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' +# filename = '../../datasets/Fingerprint/Fingerprint_A.txt' +# dataset = '../../datasets/Letter-med/Letter-med_A.txt' +# dataset = '../../datasets/AIDS/AIDS_A.txt' +# dataset = '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' +# Gn, targets, label_names = load_dataset(filename) + pass \ No newline at end of file diff --git a/gklearn/dataset/graph_synthesizer.py b/gklearn/dataset/graph_synthesizer.py new file mode 100644 index 0000000..73c5e6e --- /dev/null +++ b/gklearn/dataset/graph_synthesizer.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Sep 11 18:10:06 2020 + +@author: ljia +""" +import numpy as np +import networkx as nx +import random + + +class GraphSynthesizer(object): + + + def __init__(self, g_type=None, *args, **kwargs): + if g_type == 'unified': + self._graphs = self.unified_graphs(*args, *kwargs) + else: + self._graphs = None + + + def random_graph(self, num_nodes, num_edges, num_node_labels=0, num_edge_labels=0, seed=None, directed=False, max_num_edges=None, all_edges=None): + g = nx.Graph() + if num_node_labels > 0: + node_labels = np.random.randint(0, high=num_node_labels, size=num_nodes) + for i in range(0, num_nodes): + g.add_node(str(i), atom=node_labels[i]) # @todo: update "atom". + else: + for i in range(0, num_nodes): + g.add_node(str(i)) + + if num_edge_labels > 0: + edge_labels = np.random.randint(0, high=num_edge_labels, size=num_edges) + for idx, i in enumerate(random.sample(range(0, max_num_edges), num_edges)): + node1, node2 = all_edges[i] + g.add_edge(str(node1), str(node2), bond_type=edge_labels[idx]) # @todo: update "bond_type". 
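# A usage sketch for this synthesizer: the generated graphs can also be obtained by calling
# unified_graphs (defined just below) directly on a default instance:
#
#     gs = GraphSynthesizer()
#     graphs = gs.unified_graphs(num_graphs=100, num_nodes=20, num_edges=40,
#                                num_node_labels=3, num_edge_labels=2)
#     print(len(graphs), graphs[0].number_of_nodes(), graphs[0].number_of_edges())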
+ else: + for i in random.sample(range(0, max_num_edges), num_edges): + node1, node2 = all_edges[i] + g.add_edge(str(node1), str(node2)) + + return g + + + def unified_graphs(self, num_graphs=1000, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=0, seed=None, directed=False): + max_num_edges = int((num_nodes - 1) * num_nodes / 2) + if num_edges > max_num_edges: + raise Exception('Too many edges.') + all_edges = [(i, j) for i in range(0, num_nodes) for j in range(i + 1, num_nodes)] # @todo: optimize. No directed graphs. + + graphs = [] + for idx in range(0, num_graphs): + graphs.append(self.random_graph(num_nodes, num_edges, num_node_labels=num_node_labels, num_edge_labels=num_edge_labels, seed=seed, directed=directed, max_num_edges=max_num_edges, all_edges=all_edges)) + + return graphs + + + @property + def graphs(self): + return self._graphs \ No newline at end of file diff --git a/gklearn/dataset/metadata.py b/gklearn/dataset/metadata.py new file mode 100644 index 0000000..4fa48d9 --- /dev/null +++ b/gklearn/dataset/metadata.py @@ -0,0 +1,2485 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Oct 20 14:25:49 2020 + +@author: ljia +""" + +DATABASES = { + 'greyc': 'https://brunl01.users.greyc.fr/CHEMISTRY/', + 'iam': 'https://iapr-tc15.greyc.fr/IAM/', + 'tudataset': 'http://graphlearning.io/docs/datasets/', + } + + +### -------- database greyc -------- ### +GREYC_META = { + 'ACE': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 32, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 52, + 'ave_edge_num': None, + 'node_labeled': None, + 'edge_labeled': None, + 'node_attr_dim': None, + 'geometry': None, + 'edge_attr_dim': None, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/ACEDataset.tar', + 'domain': 'small molecules', + 'train_valid_test': [], + 'stereoisomerism': True, + 'load_files': [], + }, + 'Acyclic': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 183, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 8.15, + 'ave_edge_num': 7.15, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/Acyclic.tar.gz', + 'domain': 'small molecules', + 'train_valid_test': None, + 'stereoisomerism': False, + 'load_files': ['dataset_bps.ds'], + }, + 'AIDS_greyc': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 2000, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 15.69, + 'ave_edge_num': 16.20, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': None, + 'geometry': None, + 'edge_attr_dim': None, + 'url': 'https://iapr-tc15.greyc.fr/IAM/AIDS.zip', + 'domain': 'small molecules', + 'train_valid_test': ['data/train.cxl', 'data/valid.cxl', 'data/test.cxl',], + 'stereoisomerism': False, + 'load_files': ['data/'], + }, + 'Alkane': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 150, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 8.87, + 'ave_edge_num': 7.87, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/alkane_dataset.tar.gz', + 'domain': 'small molecules', + 'train_valid_test': None, + 'stereoisomerism': False, + 'load_files': ['dataset.ds', 'dataset_boiling_point_names.txt'], + }, + 'Chiral': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 35, + 'class_number': 
None, + 'task_type': 'regression', + 'ave_node_num': 21.29, + 'ave_edge_num': None, + 'node_labeled': None, + 'edge_labeled': None, + 'node_attr_dim': None, + 'geometry': None, + 'edge_attr_dim': None, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/DatasetAcyclicChiral.tar', + 'domain': 'small molecules', + 'train_valid_test': [], + 'stereoisomerism': True, + 'load_files': [], + }, + 'MAO': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 68, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 18.38, + 'ave_edge_num': 19.63, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/mao.tgz', + 'domain': 'small molecules', + 'train_valid_test': None, + 'stereoisomerism': False, + 'load_files': ['dataset.ds'], + }, + 'Monoterpenoides': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 382, + 'class_number': 10, + 'task_type': 'classification', + 'ave_node_num': 10, + 'ave_edge_num': None, + 'node_labeled': None, + 'edge_labeled': None, + 'node_attr_dim': None, + 'geometry': None, + 'edge_attr_dim': None, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/monoterpenoides.tar.gz', + 'domain': 'small molecules', + 'train_valid_test': None, + 'stereoisomerism': False, + 'load_files': ['dataset_10+.ds'], + }, + 'PAH': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 94, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 20.7, + 'ave_edge_num': 24.43, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/PAH.tar.gz', + 'domain': 'small molecules', + 'train_valid_test': ['trainset_0.ds', None, 'testset_0.ds'], + 'stereoisomerism': False, + 'load_files': [], + }, + 'PTC': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 416, + 'class_number': None, + 'task_type': 'classification', + 'ave_node_num': 14.4, + 'ave_edge_num': None, + 'node_labeled': None, + 'edge_labeled': None, + 'node_attr_dim': None, + 'geometry': None, + 'edge_attr_dim': None, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/ptc.tgz', + 'domain': 'small molecules', + 'train_valid_test': None, + 'stereoisomerism': False, + 'load_files': [], + 'extra_info': 'This dataset has test and train datasets. Select gender between mm, fm, mr, fr. 
\ndataloader = DataLoader(\'Ptc\',root = ..., option = \'mm\') \ntest,train = dataloader.dataset \nGs_test, y_test = test \nGs_train_, y_train = train', + }, + 'Steroid': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 64, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 75.11, + 'ave_edge_num': None, + 'node_labeled': None, + 'edge_labeled': None, + 'node_attr_dim': None, + 'geometry': None, + 'edge_attr_dim': None, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/SteroidDataset.tar', + 'domain': 'small molecules', + 'train_valid_test': ['trainset_0.ds', None, 'testset_0.ds'], + 'stereoisomerism': False, + 'load_files': [], + }, + 'Vitamin_D': { + 'database': 'greyc', + 'reference': None, + 'dataset_size': 69, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 76.91, + 'ave_edge_num': None, + 'node_labeled': None, + 'edge_labeled': None, + 'node_attr_dim': None, + 'geometry': None, + 'edge_attr_dim': None, + 'url': 'https://brunl01.users.greyc.fr/CHEMISTRY/DatasetVitamin.tar', + 'domain': 'small molecules', + 'train_valid_test': [], + 'stereoisomerism': True, + 'load_files': [], + }, +} + + +### -------- database iam -------- ### +# @todo: several datasets in this database are included in TUDataset. However they do not include train/valid/test sets. +IAM_META = { + 'GREC': { + 'database': 'iam', + 'reference': None, + 'dataset_size': None, + 'class_number': None, + 'task_type': None, + 'ave_node_num': None, + 'ave_edge_num': None, + 'node_labeled': None, + 'edge_labeled': None, + 'node_attr_dim': None, + 'geometry': None, + 'edge_attr_dim': None, + 'url': 'https://iapr-tc15.greyc.fr/IAM/GREC.zip', + 'domain': None, + 'train_valid_test': ['data/test.cxl','data/train.cxl', 'data/valid.cxl'], + 'load_files': [], + }, + 'Web': { + 'database': 'iam', + 'reference': None, + 'dataset_size': None, + 'class_number': None, + 'task_type': None, + 'ave_node_num': None, + 'ave_edge_num': None, + 'node_labeled': None, + 'edge_labeled': None, + 'node_attr_dim': None, + 'geometry': None, + 'edge_attr_dim': None, + 'url': 'https://iapr-tc15.greyc.fr/IAM/Web.zip', + 'domain': None, + 'train_valid_test': ['data/test.cxl', 'data/train.cxl', 'data/valid.cxl'], + 'load_files': [], + }, +} + + +### -------- database tudataset -------- ### +TUDataset_META = { + ### small molecules + 'AIDS': { + 'database': 'tudataset', + 'reference': '[16,17]', + 'dataset_size': 2000, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 15.69, + 'ave_edge_num': 16.2, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 4, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/AIDS.zip', + 'domain': 'small molecules', + }, + 'alchemy_full': { + 'database': 'tudataset', + 'reference': '[29]', + 'dataset_size': 202579, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 10.1, + 'ave_edge_num': 10.44, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 3, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/alchemy_full.zip', + 'domain': 'small molecules', + }, + 'aspirin': { + 'database': 'tudataset', + 'reference': '[36]', + 'dataset_size': 111763, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 21.0, + 'ave_edge_num': 151.52, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 6, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 
'https://www.chrsmrrs.com/graphkerneldatasets/aspirin.zip', + 'domain': 'small molecules', + }, + 'benzene': { + 'database': 'tudataset', + 'reference': '[36]', + 'dataset_size': 527984, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 12.0, + 'ave_edge_num': 64.94, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 6, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/benzene.zip', + 'domain': 'small molecules', + }, + 'BZR': { + 'database': 'tudataset', + 'reference': '[7]', + 'dataset_size': 405, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 35.75, + 'ave_edge_num': 38.36, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 3, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/BZR.zip', + 'domain': 'small molecules', + }, + 'BZR_MD': { + 'database': 'tudataset', + 'reference': '[7,23]', + 'dataset_size': 306, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 21.3, + 'ave_edge_num': 225.06, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 1, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/BZR_MD.zip', + 'domain': 'small molecules', + }, + 'COX2': { + 'database': 'tudataset', + 'reference': '[7]', + 'dataset_size': 467, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 41.22, + 'ave_edge_num': 43.45, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 3, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/COX2.zip', + 'domain': 'small molecules', + }, + 'COX2_MD': { + 'database': 'tudataset', + 'reference': '[7,23]', + 'dataset_size': 303, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.28, + 'ave_edge_num': 335.12, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 1, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/COX2_MD.zip', + 'domain': 'small molecules', + }, + 'DHFR': { + 'database': 'tudataset', + 'reference': '[7]', + 'dataset_size': 467, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 42.43, + 'ave_edge_num': 44.54, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 3, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/DHFR.zip', + 'domain': 'small molecules', + }, + 'DHFR_MD': { + 'database': 'tudataset', + 'reference': '[7,23]', + 'dataset_size': 393, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 23.87, + 'ave_edge_num': 283.01, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 1, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/DHFR_MD.zip', + 'domain': 'small molecules', + }, + 'ER_MD': { + 'database': 'tudataset', + 'reference': '[7,23]', + 'dataset_size': 446, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 21.33, + 'ave_edge_num': 234.85, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 1, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/ER_MD.zip', + 'domain': 'small molecules', + }, + 'ethanol': { + 'database': 'tudataset', + 'reference': '[36]', + 'dataset_size': 455093, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 9.0, + 
'ave_edge_num': 36.0, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 6, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/ethanol.zip', + 'domain': 'small molecules', + }, + 'FRANKENSTEIN': { + 'database': 'tudataset', + 'reference': '[15]', + 'dataset_size': 4337, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 16.9, + 'ave_edge_num': 17.88, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 780, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/FRANKENSTEIN.zip', + 'domain': 'small molecules', + }, + 'malonaldehyde': { + 'database': 'tudataset', + 'reference': '[36]', + 'dataset_size': 893238, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 9.0, + 'ave_edge_num': 36.0, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 6, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/malonaldehyde.zip', + 'domain': 'small molecules', + }, + 'MCF-7': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 27770, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.39, + 'ave_edge_num': 28.52, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/MCF-7.zip', + 'domain': 'small molecules', + }, + 'MCF-7H': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 27770, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 47.3, + 'ave_edge_num': 49.43, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/MCF-7H.zip', + 'domain': 'small molecules', + }, + 'MOLT-4': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 39765, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.09, + 'ave_edge_num': 28.13, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/MOLT-4.zip', + 'domain': 'small molecules', + }, + 'MOLT-4H': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 39765, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 46.7, + 'ave_edge_num': 48.73, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/MOLT-4H.zip', + 'domain': 'small molecules', + }, + 'Mutagenicity': { + 'database': 'tudataset', + 'reference': '[16,20]', + 'dataset_size': 4337, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 30.32, + 'ave_edge_num': 30.77, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Mutagenicity.zip', + 'domain': 'small molecules', + }, + 'MUTAG': { + 'database': 'tudataset', + 'reference': '[1,23]', + 'dataset_size': 188, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.93, + 'ave_edge_num': 19.79, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip', + 'domain': 'small molecules', + 
}, + 'naphthalene': { + 'database': 'tudataset', + 'reference': '[36]', + 'dataset_size': 226256, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 18.0, + 'ave_edge_num': 127.37, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 6, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/naphthalene.zip', + 'domain': 'small molecules', + }, + 'NCI1': { + 'database': 'tudataset', + 'reference': '[8,9,22]', + 'dataset_size': 4110, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 29.87, + 'ave_edge_num': 32.3, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI1.zip', + 'domain': 'small molecules', + }, + 'NCI109': { + 'database': 'tudataset', + 'reference': '[8,9,22]', + 'dataset_size': 4127, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 29.68, + 'ave_edge_num': 32.13, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI109.zip', + 'domain': 'small molecules', + }, + 'NCI-H23': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40353, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.07, + 'ave_edge_num': 28.1, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23.zip', + 'domain': 'small molecules', + }, + 'NCI-H23H': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40353, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 46.67, + 'ave_edge_num': 48.69, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23H.zip', + 'domain': 'small molecules', + }, + 'OVCAR-8': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40516, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.07, + 'ave_edge_num': 28.1, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8.zip', + 'domain': 'small molecules', + }, + 'OVCAR-8H': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40516, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 46.67, + 'ave_edge_num': 48.7, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8H.zip', + 'domain': 'small molecules', + }, + 'P388': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 41472, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 22.11, + 'ave_edge_num': 23.55, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/P388.zip', + 'domain': 'small molecules', + }, + 'P388H': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 41472, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 40.44, + 'ave_edge_num': 41.88, + 'node_labeled': True, + 'edge_labeled': True, +
'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/P388H.zip', + 'domain': 'small molecules', + }, + 'PC-3': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 27509, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.35, + 'ave_edge_num': 28.49, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/PC-3.zip', + 'domain': 'small molecules', + }, + 'PC-3H': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 27509, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 47.19, + 'ave_edge_num': 49.32, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/PC-3H.zip', + 'domain': 'small molecules', + }, + 'PTC_FM': { + 'database': 'tudataset', + 'reference': '[2,23]', + 'dataset_size': 349, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 14.11, + 'ave_edge_num': 14.48, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/PTC_FM.zip', + 'domain': 'small molecules', + }, + 'PTC_FR': { + 'database': 'tudataset', + 'reference': '[2,23]', + 'dataset_size': 351, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 14.56, + 'ave_edge_num': 15.0, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/PTC_FR.zip', + 'domain': 'small molecules', + }, + 'PTC_MM': { + 'database': 'tudataset', + 'reference': '[2,23]', + 'dataset_size': 336, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 13.97, + 'ave_edge_num': 14.32, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/PTC_MM.zip', + 'domain': 'small molecules', + }, + 'PTC_MR': { + 'database': 'tudataset', + 'reference': '[2,23]', + 'dataset_size': 344, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 14.29, + 'ave_edge_num': 14.69, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/PTC_MR.zip', + 'domain': 'small molecules', + }, + 'QM9': { + 'database': 'tudataset', + 'reference': '[33,34,35]', + 'dataset_size': 129433, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 18.03, + 'ave_edge_num': 18.63, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 16, + 'geometry': '3D, RI', + 'edge_attr_dim': 4, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/QM9.zip', + 'domain': 'small molecules', + }, + 'salicylic_acid': { + 'database': 'tudataset', + 'reference': '[36]', + 'dataset_size': 220232, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 16.0, + 'ave_edge_num': 104.13, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 6, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/salicylic_acid.zip', + 'domain': 'small molecules', + }, + 'SF-295': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40271, + 
'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.06, + 'ave_edge_num': 28.08, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/SF-295.zip', + 'domain': 'small molecules', + }, + 'SF-295H': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40271, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 46.65, + 'ave_edge_num': 48.68, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/SF-295H.zip', + 'domain': 'small molecules', + }, + 'SN12C': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40004, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.08, + 'ave_edge_num': 28.11, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/SN12C.zip', + 'domain': 'small molecules', + }, + 'SN12CH': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40004, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 46.69, + 'ave_edge_num': 48.71, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/SN12CH.zip', + 'domain': 'small molecules', + }, + 'SW-620': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40532, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.05, + 'ave_edge_num': 28.08, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/SW-620.zip', + 'domain': 'small molecules', + }, + 'SW-620H': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 40532, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 46.62, + 'ave_edge_num': 48.65, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/SW-620H.zip', + 'domain': 'small molecules', + }, + 'toluene': { + 'database': 'tudataset', + 'reference': '[36]', + 'dataset_size': 342791, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 15.0, + 'ave_edge_num': 96.15, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 6, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/toluene.zip', + 'domain': 'small molecules', + }, + 'Tox21_AhR_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 8169, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 18.09, + 'ave_edge_num': 18.5, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_AhR_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_AhR_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 272, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 22.13, + 'ave_edge_num': 23.05, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 
'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_AhR_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_AhR_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 607, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.64, + 'ave_edge_num': 18.06, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_AhR_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_AR_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 9362, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 18.39, + 'ave_edge_num': 18.84, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_AR_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_AR_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 292, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 22.35, + 'ave_edge_num': 23.32, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_AR_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_AR_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 585, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.99, + 'ave_edge_num': 18.45, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_AR_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_AR-LBD_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 8599, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.77, + 'ave_edge_num': 18.16, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_AR-LBD_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_AR-LBD_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 253, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 21.85, + 'ave_edge_num': 22.73, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_AR-LBD_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_AR-LBD_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 580, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.09, + 'ave_edge_num': 17.42, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_AR-LBD_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_ARE_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 7167, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 16.28, + 'ave_edge_num': 16.52, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ARE_training.zip', + 'domain': 
'small molecules', + }, + 'Tox21_ARE_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 234, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 21.99, + 'ave_edge_num': 22.91, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ARE_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_ARE_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 552, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.01, + 'ave_edge_num': 17.33, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ARE_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_aromatase_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 7226, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.5, + 'ave_edge_num': 17.79, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_aromatase_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_aromatase_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 214, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 21.65, + 'ave_edge_num': 22.36, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_aromatase_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_aromatase_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 528, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 16.74, + 'ave_edge_num': 16.99, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_aromatase_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_ATAD5_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 9091, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.89, + 'ave_edge_num': 18.3, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ATAD5_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_ATAD5_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 272, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 21.99, + 'ave_edge_num': 22.89, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ATAD5_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_ATAD5_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 619, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.68, + 'ave_edge_num': 18.11, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ATAD5_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_ER_training': { + 
'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 7697, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.58, + 'ave_edge_num': 17.94, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ER_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_ER_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 265, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 22.16, + 'ave_edge_num': 23.13, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ER_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_ER_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 515, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.66, + 'ave_edge_num': 18.1, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ER_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_ER-LBD_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 8753, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 18.06, + 'ave_edge_num': 18.47, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ER-LBD_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_ER-LBD_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 287, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 22.28, + 'ave_edge_num': 23.23, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ER-LBD_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_ER-LBD_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 599, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.75, + 'ave_edge_num': 18.17, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_ER-LBD_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_HSE_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 8150, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 16.72, + 'ave_edge_num': 17.04, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_HSE_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_HSE_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 267, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 22.07, + 'ave_edge_num': 23.0, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_HSE_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_HSE_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 607, + 'class_number': 
2, + 'task_type': 'classification', + 'ave_node_num': 17.61, + 'ave_edge_num': 18.01, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_HSE_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_MMP_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 7320, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.49, + 'ave_edge_num': 17.83, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_MMP_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_MMP_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 238, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 21.68, + 'ave_edge_num': 22.55, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_MMP_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_MMP_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 541, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 16.67, + 'ave_edge_num': 16.88, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_MMP_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_p53_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 8634, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.79, + 'ave_edge_num': 18.19, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_p53_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_p53_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 269, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 22.14, + 'ave_edge_num': 23.04, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_p53_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_p53_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 613, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.34, + 'ave_edge_num': 17.72, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_p53_evaluation.zip', + 'domain': 'small molecules', + }, + 'Tox21_PPAR-gamma_training': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 8184, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.23, + 'ave_edge_num': 17.55, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_PPAR-gamma_training.zip', + 'domain': 'small molecules', + }, + 'Tox21_PPAR-gamma_testing': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 267, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 22.04, + 
'ave_edge_num': 22.93, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_PPAR-gamma_testing.zip', + 'domain': 'small molecules', + }, + 'Tox21_PPAR-gamma_evaluation': { + 'database': 'tudataset', + 'reference': '[24]', + 'dataset_size': 602, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 17.38, + 'ave_edge_num': 17.77, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Tox21_PPAR-gamma_evaluation.zip', + 'domain': 'small molecules', + }, + 'UACC257': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 39988, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.09, + 'ave_edge_num': 28.12, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/UACC257.zip', + 'domain': 'small molecules', + }, + 'UACC257H': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 39988, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 46.68, + 'ave_edge_num': 48.71, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/UACC257H.zip', + 'domain': 'small molecules', + }, + 'uracil': { + 'database': 'tudataset', + 'reference': '[36]', + 'dataset_size': 133770, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 12.0, + 'ave_edge_num': 64.44, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 6, + 'geometry': '3D, RI', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/uracil.zip', + 'domain': 'small molecules', + }, + 'Yeast': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 79601, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 21.54, + 'ave_edge_num': 22.84, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Yeast.zip', + 'domain': 'small molecules', + }, + 'YeastH': { + 'database': 'tudataset', + 'reference': '[28]', + 'dataset_size': 79601, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 39.44, + 'ave_edge_num': 40.74, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/YeastH.zip', + 'domain': 'small molecules', + }, + 'ZINC_full': { + 'database': 'tudataset', + 'reference': '[31]', + 'dataset_size': 249456, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 23.14, + 'ave_edge_num': 24.91, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/ZINC_full.zip', + 'domain': 'small molecules', + }, + 'ZINC_test': { + 'database': 'tudataset', + 'reference': '[31]', + 'dataset_size': 5000, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 23.1, + 'ave_edge_num': 24.83, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/ZINC_test.zip', + 
'domain': 'small molecules', + }, + 'ZINC_train': { + 'database': 'tudataset', + 'reference': '[31]', + 'dataset_size': 220011, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 23.15, + 'ave_edge_num': 24.91, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/ZINC_train.zip', + 'domain': 'small molecules', + }, + 'ZINC_val': { + 'database': 'tudataset', + 'reference': '[31]', + 'dataset_size': 24445, + 'class_number': None, + 'task_type': 'regression', + 'ave_node_num': 23.13, + 'ave_edge_num': 24.88, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/ZINC_val.zip', + 'domain': 'small molecules', + }, + + ### bioinformatics + 'DD': { + 'database': 'tudataset', + 'reference': '[6,22]', + 'dataset_size': 1178, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 284.32, + 'ave_edge_num': 715.66, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/DD.zip', + 'domain': 'bioinformatics', + }, + 'ENZYMES': { + 'database': 'tudataset', + 'reference': '[4,5]', + 'dataset_size': 600, + 'class_number': 6, + 'task_type': 'classification', + 'ave_node_num': 32.63, + 'ave_edge_num': 62.14, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 18, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'http://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip', + 'domain': 'bioinformatics', + }, + 'KKI': { + 'database': 'tudataset', + 'reference': '[26]', + 'dataset_size': 83, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 26.96, + 'ave_edge_num': 48.42, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/KKI.zip', + 'domain': 'bioinformatics', + }, + 'OHSU': { + 'database': 'tudataset', + 'reference': '[26]', + 'dataset_size': 79, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 82.01, + 'ave_edge_num': 199.66, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OHSU.zip', + 'domain': 'bioinformatics', + }, + 'Peking_1': { + 'database': 'tudataset', + 'reference': '[26]', + 'dataset_size': 85, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 39.31, + 'ave_edge_num': 77.35, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Peking_1.zip', + 'domain': 'bioinformatics', + }, + 'PROTEINS': { + 'database': 'tudataset', + 'reference': '[4,6]', + 'dataset_size': 1113, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 39.06, + 'ave_edge_num': 72.82, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 1, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip', + 'domain': 'bioinformatics', + }, + 'PROTEINS_full': { + 'database': 'tudataset', + 'reference': '[4,6]', + 'dataset_size': 1113, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 39.06, + 'ave_edge_num': 72.82, + 'node_labeled': True, + 
'edge_labeled': False, + 'node_attr_dim': 29, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS_full.zip', + 'domain': 'bioinformatics', + }, + + ### computer vision + 'COIL-DEL': { + 'database': 'tudataset', + 'reference': '[16,18]', + 'dataset_size': 3900, + 'class_number': 100, + 'task_type': 'classification', + 'ave_node_num': 21.54, + 'ave_edge_num': 54.24, + 'node_labeled': False, + 'edge_labeled': True, + 'node_attr_dim': 2, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/COIL-DEL.zip', + 'domain': 'computer vision', + }, + 'COIL-RAG': { + 'database': 'tudataset', + 'reference': '[16,18]', + 'dataset_size': 3900, + 'class_number': 100, + 'task_type': 'classification', + 'ave_node_num': 3.01, + 'ave_edge_num': 3.02, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 64, + 'geometry': None, + 'edge_attr_dim': 1, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/COIL-RAG.zip', + 'domain': 'computer vision', + }, + 'Cuneiform': { + 'database': 'tudataset', + 'reference': '[25]', + 'dataset_size': 267, + 'class_number': 30, + 'task_type': 'classification', + 'ave_node_num': 21.27, + 'ave_edge_num': 44.8, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 3, + 'geometry': '3D', + 'edge_attr_dim': 2, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Cuneiform.zip', + 'domain': 'computer vision', + }, + 'Fingerprint': { + 'database': 'tudataset', + 'reference': '[16,19]', + 'dataset_size': 2800, + 'class_number': 4, + 'task_type': 'classification', + 'ave_node_num': 5.42, + 'ave_edge_num': 4.42, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 2, + 'geometry': '2D', + 'edge_attr_dim': 2, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Fingerprint.zip', + 'domain': 'computer vision', + }, + 'FIRSTMM_DB': { + 'database': 'tudataset', + 'reference': '[11,12,13]', + 'dataset_size': 41, + 'class_number': 11, + 'task_type': 'classification', + 'ave_node_num': 1377.27, + 'ave_edge_num': 3074.1, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 1, + 'geometry': None, + 'edge_attr_dim': 2, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/FIRSTMM_DB.zip', + 'domain': 'computer vision', + }, + 'Letter-high': { + 'database': 'tudataset', + 'reference': '[16]', + 'dataset_size': 2250, + 'class_number': 15, + 'task_type': 'classification', + 'ave_node_num': 4.67, + 'ave_edge_num': 4.5, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 2, + 'geometry': '2D', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Letter-high.zip', + 'domain': 'computer vision', + }, + 'Letter-low': { + 'database': 'tudataset', + 'reference': '[16]', + 'dataset_size': 2250, + 'class_number': 15, + 'task_type': 'classification', + 'ave_node_num': 4.68, + 'ave_edge_num': 3.13, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 2, + 'geometry': '2D', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Letter-low.zip', + 'domain': 'computer vision', + }, + 'Letter-med': { + 'database': 'tudataset', + 'reference': '[16]', + 'dataset_size': 2250, + 'class_number': 15, + 'task_type': 'classification', + 'ave_node_num': 4.67, + 'ave_edge_num': 4.5, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 2, + 'geometry': '2D', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Letter-med.zip', + 'domain': 'computer 
vision', + }, + 'MSRC_9': { + 'database': 'tudataset', + 'reference': '[13]', + 'dataset_size': 221, + 'class_number': 8, + 'task_type': 'classification', + 'ave_node_num': 40.58, + 'ave_edge_num': 97.94, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/MSRC_9.zip', + 'domain': 'computer vision', + }, + 'MSRC_21': { + 'database': 'tudataset', + 'reference': '[13]', + 'dataset_size': 563, + 'class_number': 20, + 'task_type': 'classification', + 'ave_node_num': 77.52, + 'ave_edge_num': 198.32, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/MSRC_21.zip', + 'domain': 'computer vision', + }, + 'MSRC_21C': { + 'database': 'tudataset', + 'reference': '[13]', + 'dataset_size': 209, + 'class_number': 20, + 'task_type': 'classification', + 'ave_node_num': 40.28, + 'ave_edge_num': 96.6, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/MSRC_21C.zip', + 'domain': 'computer vision', + }, + + ### social networks + 'COLLAB': { + 'database': 'tudataset', + 'reference': '[14]', + 'dataset_size': 5000, + 'class_number': 3, + 'task_type': 'classification', + 'ave_node_num': 74.49, + 'ave_edge_num': 2457.78, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/COLLAB.zip', + 'domain': 'social networks', + }, + 'dblp_ct1': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 755, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 52.87, + 'ave_edge_num': 320.09, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/dblp_ct1.zip', + 'domain': 'social networks', + }, + 'dblp_ct2': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 755, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 52.87, + 'ave_edge_num': 320.09, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/dblp_ct2.zip', + 'domain': 'social networks', + }, + 'DBLP_v1': { + 'database': 'tudataset', + 'reference': '[26]', + 'dataset_size': 19456, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 10.48, + 'ave_edge_num': 19.65, + 'node_labeled': True, + 'edge_labeled': True, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/DBLP_v1.zip', + 'domain': 'social networks', + }, + 'deezer_ego_nets': { + 'database': 'tudataset', + 'reference': '[30]', + 'dataset_size': 9629, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 23.49, + 'ave_edge_num': 65.25, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/deezer_ego_nets.zip', + 'domain': 'social networks', + }, + 'facebook_ct1': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 995, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 95.72, + 
'ave_edge_num': 269.01, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/facebook_ct1.zip', + 'domain': 'social networks', + }, + 'facebook_ct2': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 995, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 95.72, + 'ave_edge_num': 269.01, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/facebook_ct2.zip', + 'domain': 'social networks', + }, + 'github_stargazers': { + 'database': 'tudataset', + 'reference': '[30]', + 'dataset_size': 12725, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 113.79, + 'ave_edge_num': 234.64, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/github_stargazers.zip', + 'domain': 'social networks', + }, + 'highschool_ct1': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 180, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 52.32, + 'ave_edge_num': 544.81, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/highschool_ct1.zip', + 'domain': 'social networks', + }, + 'highschool_ct2': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 180, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 52.32, + 'ave_edge_num': 544.81, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/highschool_ct2.zip', + 'domain': 'social networks', + }, + 'IMDB-BINARY': { + 'database': 'tudataset', + 'reference': '[14]', + 'dataset_size': 1000, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 19.77, + 'ave_edge_num': 96.53, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': '', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/IMDB-BINARY.zip', + 'domain': 'social networks', + }, + 'IMDB-MULTI': { + 'database': 'tudataset', + 'reference': '[14]', + 'dataset_size': 1500, + 'class_number': 3, + 'task_type': 'classification', + 'ave_node_num': 13.0, + 'ave_edge_num': 65.94, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': '', + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/IMDB-MULTI.zip', + 'domain': 'social networks', + }, + 'infectious_ct1': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 200, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 50.0, + 'ave_edge_num': 459.72, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/infectious_ct1.zip', + 'domain': 'social networks', + }, + 'infectious_ct2': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 200, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 50.0, + 'ave_edge_num': 459.72, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 
'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/infectious_ct2.zip', + 'domain': 'social networks', + }, + 'mit_ct1': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 97, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 20.0, + 'ave_edge_num': 1469.15, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/mit_ct1.zip', + 'domain': 'social networks', + }, + 'mit_ct2': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 97, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 20.0, + 'ave_edge_num': 1469.15, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/mit_ct2.zip', + 'domain': 'social networks', + }, + 'REDDIT-BINARY': { + 'database': 'tudataset', + 'reference': '[14]', + 'dataset_size': 2000, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 429.63, + 'ave_edge_num': 497.75, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/REDDIT-BINARY.zip', + 'domain': 'social networks', + }, + 'REDDIT-MULTI-5K': { + 'database': 'tudataset', + 'reference': '[14]', + 'dataset_size': 4999, + 'class_number': 5, + 'task_type': 'classification', + 'ave_node_num': 508.52, + 'ave_edge_num': 594.87, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/REDDIT-MULTI-5K.zip', + 'domain': 'social networks', + }, + 'REDDIT-MULTI-12K': { + 'database': 'tudataset', + 'reference': '[14]', + 'dataset_size': 11929, + 'class_number': 11, + 'task_type': 'classification', + 'ave_node_num': 391.41, + 'ave_edge_num': 456.89, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/REDDIT-MULTI-12K.zip', + 'domain': 'social networks', + }, + 'reddit_threads': { + 'database': 'tudataset', + 'reference': '[30]', + 'dataset_size': 203088, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 23.93, + 'ave_edge_num': 24.99, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/reddit_threads.zip', + 'domain': 'social networks', + }, + 'tumblr_ct1': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 373, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 53.11, + 'ave_edge_num': 199.78, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/tumblr_ct1.zip', + 'domain': 'social networks', + }, + 'tumblr_ct2': { + 'database': 'tudataset', + 'reference': '[32]', + 'dataset_size': 373, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 53.11, + 'ave_edge_num': 199.78, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 'temporal', + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/tumblr_ct2.zip', + 'domain': 'social networks', + }, + 
'twitch_egos': { + 'database': 'tudataset', + 'reference': '[30]', + 'dataset_size': 127094, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 29.67, + 'ave_edge_num': 86.59, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/twitch_egos.zip', + 'domain': 'social networks', + }, + 'TWITTER-Real-Graph-Partial': { + 'database': 'tudataset', + 'reference': '[26]', + 'dataset_size': 144033, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 4.03, + 'ave_edge_num': 4.98, + 'node_labeled': True, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 1, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/TWITTER-Real-Graph-Partial.zip', + 'domain': 'social networks', + }, + + ### synthetic + 'COLORS-3': { + 'database': 'tudataset', + 'reference': '[27]', + 'dataset_size': 10500, + 'class_number': 11, + 'task_type': 'classification', + 'ave_node_num': 61.31, + 'ave_edge_num': 91.03, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 4, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/COLORS-3.zip', + 'domain': 'synthetic', + }, + 'SYNTHETIC': { + 'database': 'tudataset', + 'reference': '[3]', + 'dataset_size': 300, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 100.0, + 'ave_edge_num': 196.0, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 1, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/SYNTHETIC.zip', + 'domain': 'synthetic', + }, + 'SYNTHETICnew': { + 'database': 'tudataset', + 'reference': '[3,10]', + 'dataset_size': 300, + 'class_number': 2, + 'task_type': 'classification', + 'ave_node_num': 100.0, + 'ave_edge_num': 196.25, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 1, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/SYNTHETICnew.zip', + 'domain': 'synthetic', + }, + 'Synthie': { + 'database': 'tudataset', + 'reference': '[21]', + 'dataset_size': 400, + 'class_number': 4, + 'task_type': 'classification', + 'ave_node_num': 95.0, + 'ave_edge_num': 172.93, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 15, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/Synthie.zip', + 'domain': 'synthetic', + }, + 'TRIANGLES': { + 'database': 'tudataset', + 'reference': '[27]', + 'dataset_size': 45000, + 'class_number': 10, + 'task_type': 'classification', + 'ave_node_num': 20.85, + 'ave_edge_num': 32.74, + 'node_labeled': False, + 'edge_labeled': False, + 'node_attr_dim': 0, + 'geometry': None, + 'edge_attr_dim': 0, + 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/TRIANGLES.zip', + 'domain': 'synthetic', + }, +} + + +DATASET_META = {**GREYC_META, **IAM_META, **TUDataset_META} + + +def list_of_databases(): + """List names of all databases. + + Returns + ------- + list + The list of all databases. + """ + return [i for i in DATABASES] + + +def list_of_datasets(): + """List names of all datasets. + + Returns + ------- + list + The list of all datasets. 
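+ + Examples + -------- + A minimal usage sketch (assuming the gklearn package is installed): + + >>> from gklearn.dataset import list_of_datasets + >>> 'MUTAG' in list_of_datasets() + True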
+ """ + return [i for i in DATASET_META] \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/edit_costs.max_num_sols.N.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.max_num_sols.N.bipartite.py new file mode 100644 index 0000000..fd9e49e --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.max_num_sols.N.bipartite.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Oct 20 11:48:02 2020 + +@author: ljia +""" +# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. + +import os +import multiprocessing +import pickle +import logging +from gklearn.ged.util import compute_geds +import time +import sys +from group_results import group_trials + + +def generate_graphs(): + from gklearn.utils.graph_synthesizer import GraphSynthesizer + gsyzer = GraphSynthesizer() + graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) + return graphs + + +def xp_compute_ged_matrix(graphs, N, max_num_solutions, ratio, trial): + + save_file_suffix = '.' + str(N) + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + # Return if the file exists. + if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): + return None, None + + """**2. Set parameters.**""" + + # Parameters for GED computation. + ged_options = {'method': 'BIPARTITE', # use BIPARTITE huristic. + # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) + 'lsape_model': 'ECBP', # + # ??when bigger than 1, then the method is considered mIPFP. + # the actual number of computed solutions might be smaller than the specified value + 'max_num_solutions': max_num_solutions, + 'edit_cost': 'CONSTANT', # use CONSTANT cost. + 'greedy_method': 'BASIC', # + # the distance between non-symbolic node/edge labels is computed by euclidean distance. + 'attr_distance': 'euclidean', + 'optimal': True, # if TRUE, the option --greedy-method has no effect + # parallel threads. Do not work if mpg_options['parallel'] = False. + 'threads': multiprocessing.cpu_count(), + 'centrality_method': 'NONE', + 'centrality_weight': 0.7, + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' + } + + edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) + + options = ged_options.copy() + options['edit_cost_constants'] = edit_cost_constants + options['node_labels'] = [] + options['edge_labels'] = [] + options['node_attrs'] = [] + options['edge_attrs'] = [] + parallel = True # if num_solutions == 1 else False + + """**5. Compute GED matrix.**""" + ged_mat = 'error' + runtime = 0 + try: + time0 = time.time() + ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=1, parallel=parallel, verbose=True) + runtime = time.time() - time0 + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception(save_file_suffix) + print(repr(exp)) + + """**6. 
Get results.**""" + + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(ged_mat, f) + with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + return ged_mat, runtime + + +def save_trials_as_group(graphs, N, max_num_solutions, ratio): + # Return if the group file exists. + name_middle = '.' + str(N) + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' + name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' + if os.path.isfile(name_group): + return + + ged_mats = [] + runtimes = [] + for trial in range(1, 101): + print() + print('Trial:', trial) + ged_mat, runtime = xp_compute_ged_matrix(graphs, N, max_num_solutions, ratio, trial) + ged_mats.append(ged_mat) + runtimes.append(runtime) + + # Group trials and Remove single files. + name_prefix = 'ged_matrix' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + name_prefix = 'runtime' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + + +def results_for_a_ratio(ratio): + + for N in N_list: + print() + print('# of graphs:', N) + for max_num_solutions in [1, 20, 40, 60, 80, 100]: + print() + print('Max # of solutions:', max_num_solutions) + save_trials_as_group(graphs[:N], N, max_num_solutions, ratio) + + +if __name__ == '__main__': + if len(sys.argv) > 1: + N_list = [int(i) for i in sys.argv[1:]] + else: + N_list = [10, 50, 100] + + # Generate graphs. + graphs = generate_graphs() + + save_dir = 'outputs/edit_costs.max_num_sols.N.bipartite/' + os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir + 'groups/', exist_ok=True) + + for ratio in [10, 1, 0.1]: + print() + print('Ratio:', ratio) + results_for_a_ratio(ratio) diff --git a/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py index d05558a..1f01fd5 100644 --- a/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py +++ b/gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py @@ -12,18 +12,19 @@ import multiprocessing import pickle import logging from gklearn.ged.util import compute_geds -import numpy as np import time from utils import get_dataset import sys +from group_results import group_trials def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) - """**1. Get dataset.**""" - dataset = get_dataset(ds_name) + # Return if the file exists. + if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): + return None, None """**2. Set parameters.**""" @@ -83,6 +84,12 @@ def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial): def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): + # Return if the group file exists. + name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' + name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' + if os.path.isfile(name_group): + return + ged_mats = [] runtimes = [] for trial in range(1, 101): @@ -92,25 +99,36 @@ def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio): ged_mats.append(ged_mat) runtimes.append(runtime) - save_file_suffix = '.' 
+ ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) - with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: - np.save(f, np.array(ged_mats)) - with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: - pickle.dump(runtime, f) - - + # Group trials and Remove single files. + name_prefix = 'ged_matrix' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + name_prefix = 'runtime' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + + def results_for_a_dataset(ds_name): """**1. Get dataset.**""" dataset = get_dataset(ds_name) - for max_num_solutions in [1, 20, 40, 60, 80, 100]: + for max_num_solutions in mnum_solutions_list: print() print('Max # of solutions:', max_num_solutions) - for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: + for ratio in ratio_list: print() print('Ratio:', ratio) save_trials_as_group(dataset, ds_name, max_num_solutions, ratio) + + +def get_param_lists(ds_name): + if ds_name == 'AIDS_symb': + mnum_solutions_list = [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] + else: + mnum_solutions_list = [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] + return mnum_solutions_list, ratio_list + if __name__ == '__main__': if len(sys.argv) > 1: @@ -119,12 +137,11 @@ if __name__ == '__main__': ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/' - if not os.path.exists(save_dir): - os.makedirs(save_dir) - if not os.path.exists(save_dir + 'groups/'): - os.makedirs(save_dir + 'groups/') + os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir + 'groups/', exist_ok=True) for ds_name in ds_name_list: print() print('Dataset:', ds_name) + mnum_solutions_list, ratio_list = get_param_lists(ds_name) results_for_a_dataset(ds_name) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/edit_costs.nums_sols.N.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.nums_sols.N.IPFP.py new file mode 100644 index 0000000..d65358a --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.nums_sols.N.IPFP.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Oct 20 11:48:02 2020 + +@author: ljia +""" +# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. + +import os +import multiprocessing +import pickle +import logging +from gklearn.ged.util import compute_geds +import time +import sys +from group_results import group_trials + + +def generate_graphs(): + from gklearn.utils.graph_synthesizer import GraphSynthesizer + gsyzer = GraphSynthesizer() + graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) + return graphs + + +def xp_compute_ged_matrix(graphs, N, num_solutions, ratio, trial): + + save_file_suffix = '.' + str(N) + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + # Return if the file exists. + if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): + return None, None + + """**2. Set parameters.**""" + + # Parameters for GED computation. + ged_options = {'method': 'IPFP', # use IPFP huristic. + 'initialization_method': 'RANDOM', # or 'NODE', etc. 
+ # when bigger than 1, then the method is considered mIPFP. + 'initial_solutions': int(num_solutions * 4), + 'edit_cost': 'CONSTANT', # use CONSTANT cost. + # the distance between non-symbolic node/edge labels is computed by euclidean distance. + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 0.25, + # parallel threads. Do not work if mpg_options['parallel'] = False. + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' + } + + edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) + + options = ged_options.copy() + options['edit_cost_constants'] = edit_cost_constants + options['node_labels'] = [] + options['edge_labels'] = [] + options['node_attrs'] = [] + options['edge_attrs'] = [] + parallel = True # if num_solutions == 1 else False + + """**5. Compute GED matrix.**""" + ged_mat = 'error' + runtime = 0 + try: + time0 = time.time() + ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=1, parallel=parallel, verbose=True) + runtime = time.time() - time0 + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception(save_file_suffix) + print(repr(exp)) + + """**6. Get results.**""" + + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(ged_mat, f) + with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + return ged_mat, runtime + + +def save_trials_as_group(graphs, N, num_solutions, ratio): + # Return if the group file exists. + name_middle = '.' + str(N) + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' + name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' + if os.path.isfile(name_group): + return + + ged_mats = [] + runtimes = [] + for trial in range(1, 101): + print() + print('Trial:', trial) + ged_mat, runtime = xp_compute_ged_matrix(graphs, N, num_solutions, ratio, trial) + ged_mats.append(ged_mat) + runtimes.append(runtime) + + # Group trials and Remove single files. + name_prefix = 'ged_matrix' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + name_prefix = 'runtime' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + + +def results_for_a_ratio(ratio): + + for N in N_list: + print() + print('# of graphs:', N) + for num_solutions in [1, 20, 40, 60, 80, 100]: + print() + print('# of solutions:', num_solutions) + save_trials_as_group(graphs[:N], N, num_solutions, ratio) + + +if __name__ == '__main__': + if len(sys.argv) > 1: + N_list = [int(i) for i in sys.argv[1:]] + else: + N_list = [10, 50, 100] + + # Generate graphs. 
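# Note: the 100 synthetic graphs are created once here and reused for every setting;
# results_for_a_ratio() below passes graphs[:N], so runs with different N operate on
# nested subsets of the same graphs rather than on freshly generated ones.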
+ graphs = generate_graphs() + + save_dir = 'outputs/edit_costs.num_sols.N.IPFP/' + os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir + 'groups/', exist_ok=True) + + for ratio in [10, 1, 0.1]: + print() + print('Ratio:', ratio) + results_for_a_ratio(ratio) diff --git a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py index 4a3c0da..710213a 100644 --- a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py @@ -12,15 +12,19 @@ import multiprocessing import pickle import logging from gklearn.ged.util import compute_geds -import numpy as np import time from utils import get_dataset import sys +from group_results import group_trials def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + # Return if the file exists. + if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): + return None, None """**2. Set parameters.**""" @@ -39,8 +43,8 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): } edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] -# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] -# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) options = ged_options.copy() options['edit_cost_constants'] = edit_cost_constants @@ -55,7 +59,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): runtime = 0 try: time0 = time.time() - ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True) + ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True) runtime = time.time() - time0 except Exception as exp: print('An exception occured when running this experiment:') @@ -70,11 +74,17 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): pickle.dump(ged_mat, f) with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: pickle.dump(runtime, f) - + return ged_mat, runtime - + def save_trials_as_group(dataset, ds_name, num_solutions, ratio): + # Return if the group file exists. + name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' + name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' + if os.path.isfile(name_group): + return + ged_mats = [] runtimes = [] for trial in range(1, 101): @@ -84,24 +94,35 @@ def save_trials_as_group(dataset, ds_name, num_solutions, ratio): ged_mats.append(ged_mat) runtimes.append(runtime) - save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) - with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: - np.save(f, np.array(ged_mats)) - with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: - pickle.dump(runtime, f) - - + # Group trials and Remove single files. 
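# The positional flags follow the signature in group_results.py,
# group_trials(dir_folder, name_prefix, override, clear, backup): the calls below
# overwrite any existing group file, delete the per-trial pickles once they are grouped,
# and skip making a backup copy.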
+ name_prefix = 'ged_matrix' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + name_prefix = 'runtime' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + + def results_for_a_dataset(ds_name): """**1. Get dataset.**""" dataset = get_dataset(ds_name) - for num_solutions in [1, 20, 40, 60, 80, 100]: + for num_solutions in num_solutions_list: print() print('# of solutions:', num_solutions) - for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: + for ratio in ratio_list: print() print('Ratio:', ratio) save_trials_as_group(dataset, ds_name, num_solutions, ratio) + + +def get_param_lists(ds_name): + if ds_name == 'AIDS_symb': + num_solutions_list = [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] + else: + num_solutions_list = [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] + + return num_solutions_list, ratio_list if __name__ == '__main__': @@ -111,12 +132,11 @@ if __name__ == '__main__': ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' - if not os.path.exists(save_dir): - os.makedirs(save_dir) - if not os.path.exists(save_dir + 'groups/'): - os.makedirs(save_dir + 'groups/') + os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir + 'groups/', exist_ok=True) for ds_name in ds_name_list: print() print('Dataset:', ds_name) + num_solutions_list, ratio_list = get_param_lists(ds_name) results_for_a_dataset(ds_name) diff --git a/gklearn/experiments/ged/stability/edit_costs.repeats.N.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.repeats.N.IPFP.py new file mode 100644 index 0000000..6f6215e --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.repeats.N.IPFP.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Oct 20 11:48:02 2020 + +@author: ljia +""" +# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. + +import os +import multiprocessing +import pickle +import logging +from gklearn.ged.util import compute_geds +import time +import sys +from group_results import group_trials + + +def generate_graphs(): + from gklearn.utils.graph_synthesizer import GraphSynthesizer + gsyzer = GraphSynthesizer() + graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) + return graphs + + +def xp_compute_ged_matrix(graphs, N, repeats, ratio, trial): + + save_file_suffix = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + # Return if the file exists. + if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): + return None, None + + """**2. Set parameters.**""" + + # Parameters for GED computation. + ged_options = {'method': 'IPFP', # use IPFP huristic. + 'initialization_method': 'RANDOM', # or 'NODE', etc. + # when bigger than 1, then the method is considered mIPFP. + 'initial_solutions': 1, + 'edit_cost': 'CONSTANT', # use CONSTANT cost. + # the distance between non-symbolic node/edge labels is computed by euclidean distance. + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + # parallel threads. Do not work if mpg_options['parallel'] = False. 
+ 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' + } + + edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) + + options = ged_options.copy() + options['edit_cost_constants'] = edit_cost_constants + options['node_labels'] = [] + options['edge_labels'] = [] + options['node_attrs'] = [] + options['edge_attrs'] = [] + parallel = True # if num_solutions == 1 else False + + """**5. Compute GED matrix.**""" + ged_mat = 'error' + runtime = 0 + try: + time0 = time.time() + ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) + runtime = time.time() - time0 + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception(save_file_suffix) + print(repr(exp)) + + """**6. Get results.**""" + + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(ged_mat, f) + with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + return ged_mat, runtime + + +def save_trials_as_group(graphs, N, repeats, ratio): + # Return if the group file exists. + name_middle = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' + name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' + if os.path.isfile(name_group): + return + + ged_mats = [] + runtimes = [] + for trial in range(1, 101): + print() + print('Trial:', trial) + ged_mat, runtime = xp_compute_ged_matrix(graphs, N, repeats, ratio, trial) + ged_mats.append(ged_mat) + runtimes.append(runtime) + + # Group trials and Remove single files. + name_prefix = 'ged_matrix' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + name_prefix = 'runtime' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + + +def results_for_a_ratio(ratio): + + for N in N_list: + print() + print('# of graphs:', N) + for repeats in [1, 20, 40, 60, 80, 100]: + print() + print('Repeats:', repeats) + save_trials_as_group(graphs[:N], N, repeats, ratio) + + +if __name__ == '__main__': + if len(sys.argv) > 1: + N_list = [int(i) for i in sys.argv[1:]] + else: + N_list = [10, 50, 100] + + # Generate graphs. + graphs = generate_graphs() + + save_dir = 'outputs/edit_costs.repeats.N.IPFP/' + os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir + 'groups/', exist_ok=True) + + for ratio in [10, 1, 0.1]: + print() + print('Ratio:', ratio) + results_for_a_ratio(ratio) diff --git a/gklearn/experiments/ged/stability/edit_costs.repeats.N.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.repeats.N.bipartite.py new file mode 100644 index 0000000..64984de --- /dev/null +++ b/gklearn/experiments/ged/stability/edit_costs.repeats.N.bipartite.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Oct 20 11:48:02 2020 + +@author: ljia +""" +# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. 
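# Concretely, the cost vector used further down is built as
#     edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
# so with ratio = 0.1 the three node edit costs become [0.1, 0.1, 0.1] while the three
# edge edit costs stay at [1, 1, 1]; with ratio = 10 the node costs are ten times the
# edge costs.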
+ +import os +import multiprocessing +import pickle +import logging +from gklearn.ged.util import compute_geds +import time +import sys +from group_results import group_trials + + +def generate_graphs(): + from gklearn.utils.graph_synthesizer import GraphSynthesizer + gsyzer = GraphSynthesizer() + graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=20, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) + return graphs + + +def xp_compute_ged_matrix(graphs, N, repeats, ratio, trial): + + save_file_suffix = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) + + # Return if the file exists. + if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): + return None, None + + """**2. Set parameters.**""" + + # Parameters for GED computation. + ged_options = {'method': 'BIPARTITE', # use BIPARTITE huristic. + # 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv) + 'lsape_model': 'ECBP', # + # ??when bigger than 1, then the method is considered mIPFP. + # the actual number of computed solutions might be smaller than the specified value + 'max_num_solutions': 1, + 'edit_cost': 'CONSTANT', # use CONSTANT cost. + 'greedy_method': 'BASIC', # + # the distance between non-symbolic node/edge labels is computed by euclidean distance. + 'attr_distance': 'euclidean', + 'optimal': True, # if TRUE, the option --greedy-method has no effect + # parallel threads. Do not work if mpg_options['parallel'] = False. + 'threads': multiprocessing.cpu_count(), + 'centrality_method': 'NONE', + 'centrality_weight': 0.7, + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' + } + + edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] +# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] +# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) + + options = ged_options.copy() + options['edit_cost_constants'] = edit_cost_constants + options['node_labels'] = [] + options['edge_labels'] = [] + options['node_attrs'] = [] + options['edge_attrs'] = [] + parallel = True # if num_solutions == 1 else False + + """**5. Compute GED matrix.**""" + ged_mat = 'error' + runtime = 0 + try: + time0 = time.time() + ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, repeats=repeats, parallel=parallel, verbose=True) + runtime = time.time() - time0 + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception(save_file_suffix) + print(repr(exp)) + + """**6. Get results.**""" + + with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(ged_mat, f) + with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: + pickle.dump(runtime, f) + + return ged_mat, runtime + + +def save_trials_as_group(graphs, N, repeats, ratio): + # Return if the group file exists. + name_middle = '.' + str(N) + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' + name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' + if os.path.isfile(name_group): + return + + ged_mats = [] + runtimes = [] + for trial in range(1, 101): + print() + print('Trial:', trial) + ged_mat, runtime = xp_compute_ged_matrix(graphs, N, repeats, ratio, trial) + ged_mats.append(ged_mat) + runtimes.append(runtime) + + # Group trials and Remove single files. 
+ name_prefix = 'ged_matrix' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + name_prefix = 'runtime' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + + +def results_for_a_ratio(ratio): + + for N in N_list: + print() + print('# of graphs:', N) + for repeats in [1, 20, 40, 60, 80, 100]: + print() + print('Repeats:', repeats) + save_trials_as_group(graphs[:N], N, repeats, ratio) + + +if __name__ == '__main__': + if len(sys.argv) > 1: + N_list = [int(i) for i in sys.argv[1:]] + else: + N_list = [10, 50, 100] + + # Generate graphs. + graphs = generate_graphs() + + save_dir = 'outputs/edit_costs.repeats.N.bipartite/' + os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir + 'groups/', exist_ok=True) + + for ratio in [10, 1, 0.1]: + print() + print('Ratio:', ratio) + results_for_a_ratio(ratio) diff --git a/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py index 5b4576b..bdb7a30 100644 --- a/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py +++ b/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.IPFP.py @@ -12,18 +12,19 @@ import multiprocessing import pickle import logging from gklearn.ged.util import compute_geds -import numpy as np import time from utils import get_dataset import sys +from group_results import group_trials def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): - + save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) - """**1. Get dataset.**""" - dataset = get_dataset(ds_name) + # Return if the file exists. + if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): + return None, None """**2. Set parameters.**""" @@ -78,6 +79,12 @@ def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): def save_trials_as_group(dataset, ds_name, repeats, ratio): + # Return if the group file exists. + name_middle = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' + name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' + if os.path.isfile(name_group): + return + ged_mats = [] runtimes = [] for trial in range(1, 101): @@ -87,25 +94,36 @@ def save_trials_as_group(dataset, ds_name, repeats, ratio): ged_mats.append(ged_mat) runtimes.append(runtime) - save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) - with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: - np.save(f, np.array(ged_mats)) - with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: - pickle.dump(runtime, f) - - + # Group trials and Remove single files. + name_prefix = 'ged_matrix' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + name_prefix = 'runtime' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + + def results_for_a_dataset(ds_name): """**1. 
Get dataset.**""" dataset = get_dataset(ds_name) - for repeats in [1, 20, 40, 60, 80, 100]: + for repeats in repeats_list: print() print('Repeats:', repeats) - for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: + for ratio in ratio_list: print() print('Ratio:', ratio) save_trials_as_group(dataset, ds_name, repeats, ratio) + + +def get_param_lists(ds_name): + if ds_name == 'AIDS_symb': + repeats_list = [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] + else: + repeats_list = [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] + return repeats_list, ratio_list + if __name__ == '__main__': if len(sys.argv) > 1: @@ -114,12 +132,11 @@ if __name__ == '__main__': ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] save_dir = 'outputs/edit_costs.repeats.ratios.IPFP/' - if not os.path.exists(save_dir): - os.makedirs(save_dir) - if not os.path.exists(save_dir + 'groups/'): - os.makedirs(save_dir + 'groups/') + os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir + 'groups/', exist_ok=True) for ds_name in ds_name_list: print() print('Dataset:', ds_name) + repeats_list, ratio_list = get_param_lists(ds_name) results_for_a_dataset(ds_name) diff --git a/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py b/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py index f6ecd99..b6863e2 100644 --- a/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py +++ b/gklearn/experiments/ged/stability/edit_costs.repeats.ratios.bipartite.py @@ -12,18 +12,19 @@ import multiprocessing import pickle import logging from gklearn.ged.util import compute_geds -import numpy as np import time from utils import get_dataset import sys +from group_results import group_trials def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): - + save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) - - """**1. Get dataset.**""" - dataset = get_dataset(ds_name) + + # Return if the file exists. + if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): + return None, None """**2. Set parameters.**""" @@ -83,6 +84,12 @@ def xp_compute_ged_matrix(dataset, ds_name, repeats, ratio, trial): def save_trials_as_group(dataset, ds_name, repeats, ratio): + # Return if the group file exists. + name_middle = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) + '.' + name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' + if os.path.isfile(name_group): + return + ged_mats = [] runtimes = [] for trial in range(1, 101): @@ -92,25 +99,36 @@ def save_trials_as_group(dataset, ds_name, repeats, ratio): ged_mats.append(ged_mat) runtimes.append(runtime) - save_file_suffix = '.' + ds_name + '.repeats_' + str(repeats) + '.ratio_' + "{:.2f}".format(ratio) - with open(save_dir + 'groups/ged_mats' + save_file_suffix + '.npy', 'wb') as f: - np.save(f, np.array(ged_mats)) - with open(save_dir + 'groups/runtimes' + save_file_suffix + '.pkl', 'wb') as f: - pickle.dump(runtime, f) - - + # Group trials and Remove single files. + name_prefix = 'ged_matrix' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + name_prefix = 'runtime' + name_middle + group_trials(save_dir, name_prefix, True, True, False) + + def results_for_a_dataset(ds_name): """**1. 
Get dataset.**""" dataset = get_dataset(ds_name) - for repeats in [1, 20, 40, 60, 80, 100]: + for repeats in repeats_list: print() print('Repeats:', repeats) - for ratio in [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]: + for ratio in ratio_list: print() print('Ratio:', ratio) save_trials_as_group(dataset, ds_name, repeats, ratio) + + +def get_param_lists(ds_name): + if ds_name == 'AIDS_symb': + repeats_list = [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] + else: + repeats_list = [1, 20, 40, 60, 80, 100] + ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] + return repeats_list, ratio_list + if __name__ == '__main__': if len(sys.argv) > 1: @@ -119,12 +137,11 @@ if __name__ == '__main__': ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] save_dir = 'outputs/edit_costs.repeats.ratios.bipartite/' - if not os.path.exists(save_dir): - os.makedirs(save_dir) - if not os.path.exists(save_dir + 'groups/'): - os.makedirs(save_dir + 'groups/') + os.makedirs(save_dir, exist_ok=True) + os.makedirs(save_dir + 'groups/', exist_ok=True) for ds_name in ds_name_list: print() print('Dataset:', ds_name) + repeats_list, ratio_list = get_param_lists(ds_name) results_for_a_dataset(ds_name) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/group_results.py b/gklearn/experiments/ged/stability/group_results.py index 48ea68d..e1f999e 100644 --- a/gklearn/experiments/ged/stability/group_results.py +++ b/gklearn/experiments/ged/stability/group_results.py @@ -16,6 +16,7 @@ from tqdm import tqdm import sys +# This function is used by other scripts. Modify it carefully. def group_trials(dir_folder, name_prefix, override, clear, backup): # Get group name. @@ -47,8 +48,20 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' if os.path.isfile(file_name): with open(file_name, 'rb') as f: - data = pickle.load(f) + try: + data = pickle.load(f) + except EOFError: + print('EOF Error occurred.') + return data_group.append(data) + +# unpickler = pickle.Unpickler(f) +# data = unpickler.load() +# if not isinstance(data, np.array): +# return +# else: +# data_group.append(data) + else: # Not all trials are completed. return @@ -81,11 +94,9 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): # Create folders. - if not os.path.exists(dir_folder + 'groups/'): - os.makedirs(dir_folder + 'groups/') + os.makedirs(dir_folder + 'groups/', exist_ok=True) if backup: - if not os.path.exists(dir_folder + 'backups'): - os.makedirs(dir_folder + 'backups') + os.makedirs(dir_folder + 'backups', exist_ok=True) # Iterate all files. 
cur_file_prefix = '' @@ -105,4 +116,10 @@ if __name__ == '__main__': group_all_in_folder(dir_folder) dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' + group_all_in_folder(dir_folder) + + dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' + group_all_in_folder(dir_folder) + + dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' group_all_in_folder(dir_folder) \ No newline at end of file diff --git a/gklearn/experiments/ged/stability/run_job_edit_costs.N.py b/gklearn/experiments/ged/stability/run_job_edit_costs.N.py new file mode 100644 index 0000000..43da338 --- /dev/null +++ b/gklearn/experiments/ged/stability/run_job_edit_costs.N.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Nov 3 20:23:25 2020 + +@author: ljia +""" +import os +import re + + +def get_job_script(arg, params): + ged_method = params[0] + multi_method = params[1] + job_name_label = r"rep." if multi_method == 'repeats' else r"" + script = r""" +#!/bin/bash + +#SBATCH --exclusive +#SBATCH --job-name="st.""" + job_name_label + r"N" + arg + r"." + ged_method + r"""" +#SBATCH --partition=tlong +#SBATCH --mail-type=ALL +#SBATCH --mail-user=jajupmochi@gmail.com +#SBATCH --output="outputs/output_edit_costs.""" + multi_method + r".N." + ged_method + r"." + arg + r""".txt" +#SBATCH --error="errors/error_edit_costs.""" + multi_method + r".N." + ged_method + r"." + arg + r""".txt" +# +#SBATCH --ntasks=1 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=1 +#SBATCH --time=300:00:00 +#SBATCH --mem-per-cpu=4000 + +srun hostname +srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability +srun python3 edit_costs.""" + multi_method + r".N." + ged_method + r".py " + arg + script = script.strip() + script = re.sub('\n\t+', '\n', script) + script = re.sub('\n +', '\n', script) + + return script + +if __name__ == '__main__': + + params_list = [('IPFP', 'nums_sols'), + ('IPFP', 'repeats'), + ('bipartite', 'max_num_sols'), + ('bipartite', 'repeats')] + N_list = [10, 50, 100] + for params in params_list[1:]: + for N in [N_list[i] for i in [0, 1, 2]]: + job_script = get_job_script(str(N), params) + command = 'sbatch < 1) - self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1)) - self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1)) - self.__labeled_edges = (ged_env.get_num_edge_labels() > 1) - self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1)) - self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1)) - self.__init_type = 'RANDOM' - self.__num_random_inits = 10 - self.__desired_num_random_inits = 10 - self.__use_real_randomness = True - self.__seed = 0 - self.__parallel = True - self.__update_order = True - self.__sort_graphs = True # sort graphs by size when computing GEDs. - self.__refine = True - self.__time_limit_in_sec = 0 - self.__epsilon = 0.0001 - self.__max_itrs = 100 - self.__max_itrs_without_update = 3 - self.__num_inits_increase_order = 10 - self.__init_type_increase_order = 'K-MEANS++' - self.__max_itrs_increase_order = 10 - self.__print_to_stdout = 2 - self.__median_id = np.inf # @todo: check - self.__node_maps_from_median = {} - self.__sum_of_distances = 0 - self.__best_init_sum_of_distances = np.inf - self.__converged_sum_of_distances = np.inf - self.__runtime = None - self.__runtime_initialized = None - self.__runtime_converged = None - self.__itrs = [] # @todo: check: {} ? 
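# Background for the rename in this hunk (a minimal sketch of standard Python name
# mangling, not code from this patch):
#     class A:
#         def __init__(self):
#             self.__x = 1   # stored as _A__x: hidden from subclasses and helpers
#             self._y = 2    # stored as _y: "internal" by convention, still accessible
# Attributes with two leading underscores such as self.__itrs are mangled to
# _MedianGraphEstimator__itrs; the single-underscore replacements avoid this mangling
# while keeping the attributes conventionally private.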
- self.__num_decrease_order = 0 - self.__num_increase_order = 0 - self.__num_converged_descents = 0 - self.__state = AlgorithmState.TERMINATED - self.__label_names = {} + self._ged_env = ged_env + self._init_method = 'BRANCH_FAST' + self._init_options = '' + self._descent_method = 'BRANCH_FAST' + self._descent_options = '' + self._refine_method = 'IPFP' + self._refine_options = '' + self._constant_node_costs = constant_node_costs + self._labeled_nodes = (ged_env.get_num_node_labels() > 1) + self._node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1)) + self._node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1)) + self._labeled_edges = (ged_env.get_num_edge_labels() > 1) + self._edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1)) + self._edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1)) + self._init_type = 'RANDOM' + self._num_random_inits = 10 + self._desired_num_random_inits = 10 + self._use_real_randomness = True + self._seed = 0 + self._parallel = True + self._update_order = True + self._sort_graphs = True # sort graphs by size when computing GEDs. + self._refine = True + self._time_limit_in_sec = 0 + self._epsilon = 0.0001 + self._max_itrs = 100 + self._max_itrs_without_update = 3 + self._num_inits_increase_order = 10 + self._init_type_increase_order = 'K-MEANS++' + self._max_itrs_increase_order = 10 + self._print_to_stdout = 2 + self._median_id = np.inf # @todo: check + self._node_maps_from_median = {} + self._sum_of_distances = 0 + self._best_init_sum_of_distances = np.inf + self._converged_sum_of_distances = np.inf + self._runtime = None + self._runtime_initialized = None + self._runtime_converged = None + self._itrs = [] # @todo: check: {} ? + self._num_decrease_order = 0 + self._num_increase_order = 0 + self._num_converged_descents = 0 + self._state = AlgorithmState.TERMINATED + self._label_names = {} if ged_env is None: raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') @@ -91,142 +91,142 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no options : string String that specifies with which options to run the estimator. """ - self.__set_default_options() + self._set_default_options() options_map = misc.options_string_to_options_map(options) for opt_name, opt_val in options_map.items(): if opt_name == 'init-type': - self.__init_type = opt_val + self._init_type = opt_val if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN': raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"') elif opt_name == 'random-inits': try: - self.__num_random_inits = int(opt_val) - self.__desired_num_random_inits = self.__num_random_inits + self._num_random_inits = int(opt_val) + self._desired_num_random_inits = self._num_random_inits except: raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') - if self.__num_random_inits <= 0: + if self._num_random_inits <= 0: raise Exception('Invalid argument "' + opt_val + '" for option random-inits. 
Usage: options = "[--random-inits ]"') elif opt_name == 'randomness': if opt_val == 'PSEUDO': - self.__use_real_randomness = False + self._use_real_randomness = False elif opt_val == 'REAL': - self.__use_real_randomness = True + self._use_real_randomness = True else: raise Exception('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"') elif opt_name == 'stdout': if opt_val == '0': - self.__print_to_stdout = 0 + self._print_to_stdout = 0 elif opt_val == '1': - self.__print_to_stdout = 1 + self._print_to_stdout = 1 elif opt_val == '2': - self.__print_to_stdout = 2 + self._print_to_stdout = 2 else: raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') elif opt_name == 'parallel': if opt_val == 'TRUE': - self.__parallel = True + self._parallel = True elif opt_val == 'FALSE': - self.__parallel = False + self._parallel = False else: raise Exception('Invalid argument "' + opt_val + '" for option parallel. Usage: options = "[--parallel TRUE|FALSE] [...]"') elif opt_name == 'update-order': if opt_val == 'TRUE': - self.__update_order = True + self._update_order = True elif opt_val == 'FALSE': - self.__update_order = False + self._update_order = False else: raise Exception('Invalid argument "' + opt_val + '" for option update-order. Usage: options = "[--update-order TRUE|FALSE] [...]"') elif opt_name == 'sort-graphs': if opt_val == 'TRUE': - self.__sort_graphs = True + self._sort_graphs = True elif opt_val == 'FALSE': - self.__sort_graphs = False + self._sort_graphs = False else: raise Exception('Invalid argument "' + opt_val + '" for option sort-graphs. Usage: options = "[--sort-graphs TRUE|FALSE] [...]"') elif opt_name == 'refine': if opt_val == 'TRUE': - self.__refine = True + self._refine = True elif opt_val == 'FALSE': - self.__refine = False + self._refine = False else: raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"') elif opt_name == 'time-limit': try: - self.__time_limit_in_sec = float(opt_val) + self._time_limit_in_sec = float(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit ] [...]') elif opt_name == 'max-itrs': try: - self.__max_itrs = int(opt_val) + self._max_itrs = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs ] [...]') elif opt_name == 'max-itrs-without-update': try: - self.__max_itrs_without_update = int(opt_val) + self._max_itrs_without_update = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update ] [...]') elif opt_name == 'seed': try: - self.__seed = int(opt_val) + self._seed = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed ] [...]') elif opt_name == 'epsilon': try: - self.__epsilon = float(opt_val) + self._epsilon = float(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') - if self.__epsilon <= 0: + if self._epsilon <= 0: raise Exception('Invalid argument "' + opt_val + '" for option epsilon. 
Usage: options = "[--epsilon ] [...]') elif opt_name == 'inits-increase-order': try: - self.__num_inits_increase_order = int(opt_val) + self._num_inits_increase_order = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') - if self.__num_inits_increase_order <= 0: + if self._num_inits_increase_order <= 0: raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') elif opt_name == 'init-type-increase-order': - self.__init_type_increase_order = opt_val + self._init_type_increase_order = opt_val if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') elif opt_name == 'max-itrs-increase-order': try: - self.__max_itrs_increase_order = int(opt_val) + self._max_itrs_increase_order = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order ] [...]') @@ -253,8 +253,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no ----- Has no effect unless "--init-type MEDOID" is passed to set_options(). """ - self.__init_method = init_method; - self.__init_options = init_options; + self._init_method = init_method; + self._init_options = init_options; def set_descent_method(self, descent_method, descent_options=''): @@ -272,8 +272,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no ----- Has no effect unless "--init-type MEDOID" is passed to set_options(). """ - self.__descent_method = descent_method; - self.__descent_options = descent_options; + self._descent_method = descent_method; + self._descent_options = descent_options; def set_refine_method(self, refine_method, refine_options): @@ -291,8 +291,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no ----- Has no effect if "--refine FALSE" is passed to set_options(). """ - self.__refine_method = refine_method - self.__refine_options = refine_options + self._refine_method = refine_method + self._refine_options = refine_options def run(self, graph_ids, set_median_id, gen_median_id): @@ -315,7 +315,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no raise Exception('Empty vector of graph IDs, unable to compute median.') all_graphs_empty = True for graph_id in graph_ids: - if self.__ged_env.get_graph_num_nodes(graph_id) > 0: + if self._ged_env.get_graph_num_nodes(graph_id) > 0: all_graphs_empty = False break if all_graphs_empty: @@ -323,16 +323,16 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # Start timer and record start time. start = time.time() - timer = Timer(self.__time_limit_in_sec) - self.__median_id = gen_median_id - self.__state = AlgorithmState.TERMINATED + timer = Timer(self._time_limit_in_sec) + self._median_id = gen_median_id + self._state = AlgorithmState.TERMINATED # Get NetworkX graph representations of the input graphs. graphs = {} for graph_id in graph_ids: # @todo: get_nx_graph() function may need to be modified according to the coming code. 
- graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id, True, True, False) -# print(self.__ged_env.get_graph_internal_id(0)) + graphs[graph_id] = self._ged_env.get_nx_graph(graph_id, True, True, False) +# print(self._ged_env.get_graph_internal_id(0)) # print(graphs[0].graph) # print(graphs[0].nodes(data=True)) # print(graphs[0].edges(data=True)) @@ -340,27 +340,27 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # Construct initial medians. medians = [] - self.__construct_initial_medians(graph_ids, timer, medians) + self._construct_initial_medians(graph_ids, timer, medians) end_init = time.time() - self.__runtime_initialized = end_init - start -# print(medians[0].graph) -# print(medians[0].nodes(data=True)) -# print(medians[0].edges(data=True)) -# print(nx.adjacency_matrix(medians[0])) + self._runtime_initialized = end_init - start + print(medians[0].graph) + print(medians[0].nodes(data=True)) + print(medians[0].edges(data=True)) + print(nx.adjacency_matrix(medians[0])) # Reset information about iterations and number of times the median decreases and increases. - self.__itrs = [0] * len(medians) - self.__num_decrease_order = 0 - self.__num_increase_order = 0 - self.__num_converged_descents = 0 + self._itrs = [0] * len(medians) + self._num_decrease_order = 0 + self._num_increase_order = 0 + self._num_converged_descents = 0 # Initialize the best median. best_sum_of_distances = np.inf - self.__best_init_sum_of_distances = np.inf + self._best_init_sum_of_distances = np.inf node_maps_from_best_median = {} # Run block gradient descent from all initial medians. - self.__ged_env.set_method(self.__descent_method, self.__descent_options) + self._ged_env.set_method(self._descent_method, self._descent_options) for median_pos in range(0, len(medians)): # Terminate if the timer has expired and at least one SOD has been computed. @@ -368,7 +368,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no break # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n===========================================================') print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') print('-----------------------------------------------------------') @@ -377,27 +377,27 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no median = medians[median_pos] # Load initial median into the environment. - self.__ged_env.load_nx_graph(median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) + self._ged_env.load_nx_graph(median, gen_median_id) + self._ged_env.init(self._ged_env.get_init_type()) # Compute node maps and sum of distances for initial median. -# xxx = self.__node_maps_from_median - self.__compute_init_node_maps(graph_ids, gen_median_id) -# yyy = self.__node_maps_from_median + xxx = self._node_maps_from_median + self._compute_init_node_maps(graph_ids, gen_median_id) + yyy = self._node_maps_from_median - self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) - self.__ged_env.load_nx_graph(median, set_median_id) -# print(self.__best_init_sum_of_distances) + self._best_init_sum_of_distances = min(self._best_init_sum_of_distances, self._sum_of_distances) + self._ged_env.load_nx_graph(median, set_median_id) + print(self._best_init_sum_of_distances) # Run block gradient descent from initial median. 
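# Each iteration below records whether the median itself, its order, or the node maps
# changed; converged is later set as
#     converged = not (median_modified or node_maps_modified or decreased_order or increased_order)
# and itrs_without_update counts consecutive iterations without any change, both of which
# are passed to _termination_criterion_met() together with the timer and iteration count.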
converged = False itrs_without_update = 0 - while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update): + while not self._termination_criterion_met(converged, timer, self._itrs[median_pos], itrs_without_update): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n===========================================================') - print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('Iteration', str(self._itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') print('-----------------------------------------------------------') # Initialize flags that tell us what happened in the iteration. @@ -407,12 +407,12 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no increased_order = False # Update the median. - median_modified = self.__update_median(graphs, median) - if self.__update_order: - if not median_modified or self.__itrs[median_pos] == 0: - decreased_order = self.__decrease_order(graphs, median) - if not decreased_order or self.__itrs[median_pos] == 0: - increased_order = self.__increase_order(graphs, median) + median_modified = self._update_median(graphs, median) + if self._update_order: + if not median_modified or self._itrs[median_pos] == 0: + decreased_order = self._decrease_order(graphs, median) + if not decreased_order or self._itrs[median_pos] == 0: + increased_order = self._increase_order(graphs, median) # Update the number of iterations without update of the median. if median_modified or decreased_order or increased_order: @@ -421,51 +421,51 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no itrs_without_update += 1 # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Loading median to environment: ... ', end='') # Load the median into the environment. # @todo: should this function use the original node label? - self.__ged_env.load_nx_graph(median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) + self._ged_env.load_nx_graph(median, gen_median_id) + self._ged_env.init(self._ged_env.get_init_type()) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Updating induced costs: ... ', end='') # Compute induced costs of the old node maps w.r.t. the updated median. for graph_id in graph_ids: -# print(self.__node_maps_from_median[graph_id].induced_cost()) -# xxx = self.__node_maps_from_median[graph_id] - self.__ged_env.compute_induced_cost(gen_median_id, graph_id, self.__node_maps_from_median[graph_id]) +# print(self._node_maps_from_median[graph_id].induced_cost()) +# xxx = self._node_maps_from_median[graph_id] + self._ged_env.compute_induced_cost(gen_median_id, graph_id, self._node_maps_from_median[graph_id]) # print('---------------------------------------') -# print(self.__node_maps_from_median[graph_id].induced_cost()) +# print(self._node_maps_from_median[graph_id].induced_cost()) # @todo:!!!!!!!!!!!!!!!!!!!!!!!!!!!!This value is a slight different from the c++ program, which might be a bug! Use it very carefully! # Print information about current iteration. 
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Update the node maps. - node_maps_modified = self.__update_node_maps() + node_maps_modified = self._update_node_maps() # Update the order of the median if no improvement can be found with the current order. # Update the sum of distances. - old_sum_of_distances = self.__sum_of_distances - self.__sum_of_distances = 0 - for graph_id, node_map in self.__node_maps_from_median.items(): - self.__sum_of_distances += node_map.induced_cost() -# print(self.__sum_of_distances) + old_sum_of_distances = self._sum_of_distances + self._sum_of_distances = 0 + for graph_id, node_map in self._node_maps_from_median.items(): + self._sum_of_distances += node_map.induced_cost() +# print(self._sum_of_distances) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Old local SOD: ', old_sum_of_distances) - print('New local SOD: ', self.__sum_of_distances) + print('New local SOD: ', self._sum_of_distances) print('Best converged SOD: ', best_sum_of_distances) print('Modified median: ', median_modified) print('Modified node maps: ', node_maps_modified) @@ -475,121 +475,121 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no converged = not (median_modified or node_maps_modified or decreased_order or increased_order) - self.__itrs[median_pos] += 1 + self._itrs[median_pos] += 1 # Update the best median. - if self.__sum_of_distances < best_sum_of_distances: - best_sum_of_distances = self.__sum_of_distances - node_maps_from_best_median = self.__node_maps_from_median.copy() # @todo: this is a shallow copy, not sure if it is enough. + if self._sum_of_distances < best_sum_of_distances: + best_sum_of_distances = self._sum_of_distances + node_maps_from_best_median = self._node_maps_from_median.copy() # @todo: this is a shallow copy, not sure if it is enough. best_median = median # Update the number of converged descents. if converged: - self.__num_converged_descents += 1 + self._num_converged_descents += 1 # Store the best encountered median. - self.__sum_of_distances = best_sum_of_distances - self.__node_maps_from_median = node_maps_from_best_median - self.__ged_env.load_nx_graph(best_median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) + self._sum_of_distances = best_sum_of_distances + self._node_maps_from_median = node_maps_from_best_median + self._ged_env.load_nx_graph(best_median, gen_median_id) + self._ged_env.init(self._ged_env.get_init_type()) end_descent = time.time() - self.__runtime_converged = end_descent - start + self._runtime_converged = end_descent - start # Refine the sum of distances and the node maps for the converged median. - self.__converged_sum_of_distances = self.__sum_of_distances - if self.__refine: - self.__improve_sum_of_distances(timer) + self._converged_sum_of_distances = self._sum_of_distances + if self._refine: + self._improve_sum_of_distances(timer) # Record end time, set runtime and reset the number of initial medians. end = time.time() - self.__runtime = end - start - self.__num_random_inits = self.__desired_num_random_inits + self._runtime = end - start + self._num_random_inits = self._desired_num_random_inits # Print global information. 
- if self.__print_to_stdout != 0: + if self._print_to_stdout != 0: print('\n===========================================================') print('Finished computation of generalized median graph.') print('-----------------------------------------------------------') - print('Best SOD after initialization: ', self.__best_init_sum_of_distances) - print('Converged SOD: ', self.__converged_sum_of_distances) - if self.__refine: - print('Refined SOD: ', self.__sum_of_distances) - print('Overall runtime: ', self.__runtime) - print('Runtime of initialization: ', self.__runtime_initialized) - print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized) - if self.__refine: - print('Runtime of refinement: ', self.__runtime - self.__runtime_converged) + print('Best SOD after initialization: ', self._best_init_sum_of_distances) + print('Converged SOD: ', self._converged_sum_of_distances) + if self._refine: + print('Refined SOD: ', self._sum_of_distances) + print('Overall runtime: ', self._runtime) + print('Runtime of initialization: ', self._runtime_initialized) + print('Runtime of block gradient descent: ', self._runtime_converged - self._runtime_initialized) + if self._refine: + print('Runtime of refinement: ', self._runtime - self._runtime_converged) print('Number of initial medians: ', len(medians)) total_itr = 0 num_started_descents = 0 - for itr in self.__itrs: + for itr in self._itrs: total_itr += itr if itr > 0: num_started_descents += 1 print('Size of graph collection: ', len(graph_ids)) print('Number of started descents: ', num_started_descents) - print('Number of converged descents: ', self.__num_converged_descents) + print('Number of converged descents: ', self._num_converged_descents) print('Overall number of iterations: ', total_itr) - print('Overall number of times the order decreased: ', self.__num_decrease_order) - print('Overall number of times the order increased: ', self.__num_increase_order) + print('Overall number of times the order decreased: ', self._num_decrease_order) + print('Overall number of times the order increased: ', self._num_increase_order) print('===========================================================\n') - def __improve_sum_of_distances(self, timer): # @todo: go through and test + def _improve_sum_of_distances(self, timer): # @todo: go through and test # Use method selected for refinement phase. - self.__ged_env.set_method(self.__refine_method, self.__refine_options) + self._ged_env.set_method(self._refine_method, self._refine_options) # Print information about current iteration. - if self.__print_to_stdout == 2: - progress = tqdm(desc='Improving node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + if self._print_to_stdout == 2: + progress = tqdm(desc='Improving node maps', total=len(self._node_maps_from_median), file=sys.stdout) print('\n===========================================================') print('Improving node maps and SOD for converged median.') print('-----------------------------------------------------------') progress.update(1) # Improving the node maps. 
- nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__gen_median_id) - for graph_id, node_map in self.__node_maps_from_median.items(): + nb_nodes_median = self._ged_env.get_graph_num_nodes(self._gen_median_id) + for graph_id, node_map in self._node_maps_from_median.items(): if time.expired(): - if self.__state == AlgorithmState.TERMINATED: - self.__state = AlgorithmState.CONVERGED + if self._state == AlgorithmState.TERMINATED: + self._state = AlgorithmState.CONVERGED break - nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) - if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: - self.__ged_env.run_method(self.__gen_median_id, graph_id) - if self.__ged_env.get_upper_bound(self.__gen_median_id, graph_id) < node_map.induced_cost(): - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__gen_median_id, graph_id) + nb_nodes_g = self._ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self._sort_graphs: + self._ged_env.run_method(self._gen_median_id, graph_id) + if self._ged_env.get_upper_bound(self._gen_median_id, graph_id) < node_map.induced_cost(): + self._node_maps_from_median[graph_id] = self._ged_env.get_node_map(self._gen_median_id, graph_id) else: - self.__ged_env.run_method(graph_id, self.__gen_median_id) - if self.__ged_env.get_upper_bound(graph_id, self.__gen_median_id) < node_map.induced_cost(): - node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__gen_median_id) + self._ged_env.run_method(graph_id, self._gen_median_id) + if self._ged_env.get_upper_bound(graph_id, self._gen_median_id) < node_map.induced_cost(): + node_map_tmp = self._ged_env.get_node_map(graph_id, self._gen_median_id) node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map - self.__node_maps_from_median[graph_id] = node_map_tmp + self._node_maps_from_median[graph_id] = node_map_tmp - self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() + self._sum_of_distances += self._node_maps_from_median[graph_id].induced_cost() # Print information. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) - self.__sum_of_distances = 0.0 - for key, val in self.__node_maps_from_median.items(): - self.__sum_of_distances += val.induced_cost() + self._sum_of_distances = 0.0 + for key, val in self._node_maps_from_median.items(): + self._sum_of_distances += val.induced_cost() # Print information. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('===========================================================\n') - def __median_available(self): - return self.__median_id != np.inf + def _median_available(self): + return self._median_id != np.inf def get_state(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_state().') - return self.__state + return self._state def get_sum_of_distances(self, state=''): @@ -605,92 +605,92 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no float The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned. """ - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. 
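The refinement loop above follows the same sort-by-size convention as the rest of the estimator: the GED method always runs with the smaller graph as the source, and the resulting node map is reversed when the roles were swapped. (This hunk tests time.expired() where the surrounding code uses the timer object; that slip predates the rename.) A sketch of the pattern, with ged_env standing in for gklearn's GEDEnv and the method names taken from the patch:

def refined_node_map(ged_env, median_id, graph_id, current_map, sort_graphs=True):
    nb_nodes_median = ged_env.get_graph_num_nodes(median_id)
    nb_nodes_g = ged_env.get_graph_num_nodes(graph_id)
    if nb_nodes_median <= nb_nodes_g or not sort_graphs:
        ged_env.run_method(median_id, graph_id)
        if ged_env.get_upper_bound(median_id, graph_id) < current_map.induced_cost():
            return ged_env.get_node_map(median_id, graph_id)
    else:
        ged_env.run_method(graph_id, median_id)
        if ged_env.get_upper_bound(graph_id, median_id) < current_map.induced_cost():
            node_map = ged_env.get_node_map(graph_id, median_id)
            node_map.forward_map, node_map.backward_map = \
                node_map.backward_map, node_map.forward_map
            return node_map
    return current_map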
Call run() before calling get_sum_of_distances().') if state == 'initialized': - return self.__best_init_sum_of_distances + return self._best_init_sum_of_distances if state == 'converged': - return self.__converged_sum_of_distances - return self.__sum_of_distances + return self._converged_sum_of_distances + return self._sum_of_distances def get_runtime(self, state): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_runtime().') if state == AlgorithmState.INITIALIZED: - return self.__runtime_initialized + return self._runtime_initialized if state == AlgorithmState.CONVERGED: - return self.__runtime_converged - return self.__runtime + return self._runtime_converged + return self._runtime def get_num_itrs(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_itrs().') - return self.__itrs + return self._itrs def get_num_times_order_decreased(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_times_order_decreased().') - return self.__num_decrease_order + return self._num_decrease_order def get_num_times_order_increased(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_times_order_increased().') - return self.__num_increase_order + return self._num_increase_order def get_num_converged_descents(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_converged_descents().') - return self.__num_converged_descents + return self._num_converged_descents def get_ged_env(self): - return self.__ged_env - - - def __set_default_options(self): - self.__init_type = 'RANDOM' - self.__num_random_inits = 10 - self.__desired_num_random_inits = 10 - self.__use_real_randomness = True - self.__seed = 0 - self.__parallel = True - self.__update_order = True - self.__sort_graphs = True - self.__refine = True - self.__time_limit_in_sec = 0 - self.__epsilon = 0.0001 - self.__max_itrs = 100 - self.__max_itrs_without_update = 3 - self.__num_inits_increase_order = 10 - self.__init_type_increase_order = 'K-MEANS++' - self.__max_itrs_increase_order = 10 - self.__print_to_stdout = 2 - self.__label_names = {} + return self._ged_env + + + def _set_default_options(self): + self._init_type = 'RANDOM' + self._num_random_inits = 10 + self._desired_num_random_inits = 10 + self._use_real_randomness = True + self._seed = 0 + self._parallel = True + self._update_order = True + self._sort_graphs = True + self._refine = True + self._time_limit_in_sec = 0 + self._epsilon = 0.0001 + self._max_itrs = 100 + self._max_itrs_without_update = 3 + self._num_inits_increase_order = 10 + self._init_type_increase_order = 'K-MEANS++' + self._max_itrs_increase_order = 10 + self._print_to_stdout = 2 + self._label_names = {} - def __construct_initial_medians(self, graph_ids, timer, initial_medians): + def _construct_initial_medians(self, graph_ids, timer, initial_medians): # Print information about current iteration. 
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n===========================================================') print('Constructing initial median(s).') print('-----------------------------------------------------------') # Compute or sample the initial median(s). initial_medians.clear() - if self.__init_type == 'MEDOID': - self.__compute_medoid(graph_ids, timer, initial_medians) - elif self.__init_type == 'MAX': + if self._init_type == 'MEDOID': + self._compute_medoid(graph_ids, timer, initial_medians) + elif self._init_type == 'MAX': pass # @todo # compute_max_order_graph_(graph_ids, initial_medians) - elif self.__init_type == 'MIN': + elif self._init_type == 'MIN': pass # @todo # compute_min_order_graph_(graph_ids, initial_medians) - elif self.__init_type == 'MEAN': + elif self._init_type == 'MEAN': pass # @todo # compute_mean_order_graph_(graph_ids, initial_medians) else: @@ -698,17 +698,17 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # sample_initial_medians_(graph_ids, initial_medians) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('===========================================================') - def __compute_medoid(self, graph_ids, timer, initial_medians): + def _compute_medoid(self, graph_ids, timer, initial_medians): # Use method selected for initialization phase. - self.__ged_env.set_method(self.__init_method, self.__init_options) + self._ged_env.set_method(self._init_method, self._init_options) # Compute the medoid. - if self.__parallel: - # @todo: notice when parallel self.__ged_env is not modified. + if self._parallel: + # @todo: notice when parallel self._ged_env is not modified. sum_of_distances_list = [np.inf] * len(graph_ids) len_itr = len(graph_ids) itr = zip(graph_ids, range(0, len(graph_ids))) @@ -720,9 +720,9 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - do_fun = partial(_compute_medoid_parallel, graph_ids, self.__sort_graphs) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) - if self.__print_to_stdout == 2: + do_fun = partial(_compute_medoid_parallel, graph_ids, self._sort_graphs) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self._ged_env,)) + if self._print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), desc='Computing medoid', file=sys.stdout) else: @@ -735,50 +735,55 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no medoid_id = np.argmin(sum_of_distances_list) best_sum_of_distances = sum_of_distances_list[medoid_id] - initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo + initial_medians.append(self._ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo else: # Print information about current iteration. 
- if self.__print_to_stdout == 2: + self.ged_matrix_set_median_tmp = np.ones((len(graph_ids), len(graph_ids))) * np.inf + if self._print_to_stdout == 2: progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout) medoid_id = graph_ids[0] best_sum_of_distances = np.inf for g_id in graph_ids: if timer.expired(): - self.__state = AlgorithmState.CALLED + self._state = AlgorithmState.CALLED break - nb_nodes_g = self.__ged_env.get_graph_num_nodes(g_id) + nb_nodes_g = self._ged_env.get_graph_num_nodes(g_id) sum_of_distances = 0 - for h_id in graph_ids: - nb_nodes_h = self.__ged_env.get_graph_num_nodes(h_id) - if nb_nodes_g <= nb_nodes_h or not self.__sort_graphs: - self.__ged_env.run_method(g_id, h_id) - sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) + for h_id in graph_ids: # @todo: can this be faster? + nb_nodes_h = self._ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not self._sort_graphs: + self._ged_env.run_method(g_id, h_id) + sum_of_distances += self._ged_env.get_upper_bound(g_id, h_id) + self.ged_matrix_set_median_tmp[g_id, h_id] = self._ged_env.get_upper_bound(g_id, h_id) else: - self.__ged_env.run_method(h_id, g_id) - sum_of_distances += self.__ged_env.get_upper_bound(h_id, g_id) + # @todo: is this correct? + self._ged_env.run_method(h_id, g_id) + sum_of_distances += self._ged_env.get_upper_bound(h_id, g_id) + self.ged_matrix_set_median_tmp[g_id, h_id] = self._ged_env.get_upper_bound(h_id, g_id) + print(sum_of_distances) if sum_of_distances < best_sum_of_distances: best_sum_of_distances = sum_of_distances medoid_id = g_id # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) - initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo + initial_medians.append(self._ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n') - def __compute_init_node_maps(self, graph_ids, gen_median_id): + def _compute_init_node_maps(self, graph_ids, gen_median_id): # Compute node maps and sum of distances for initial median. - if self.__parallel: - # @todo: notice when parallel self.__ged_env is not modified. - self.__sum_of_distances = 0 - self.__node_maps_from_median.clear() + if self._parallel: + # @todo: notice when parallel self._ged_env is not modified. 
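Besides the renames, the sequential branch above now also caches every pairwise upper bound in a new ged_matrix_set_median_tmp attribute and leaves a print(sum_of_distances) debug line behind. Stripped of the GED calls, the medoid selection itself is an argmin over row sums; a toy version with random numbers standing in for the GED upper bounds:

import numpy as np

rng = np.random.RandomState(0)
ged_matrix = rng.rand(5, 5)          # stand-in for ged_matrix_set_median_tmp
np.fill_diagonal(ged_matrix, 0.0)
sums = ged_matrix.sum(axis=1)        # sum of distances from each graph
medoid_id = int(np.argmin(sums))
print(medoid_id, sums)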
+ self._sum_of_distances = 0 + self._node_maps_from_median.clear() sum_of_distances_list = [0] * len(graph_ids) len_itr = len(graph_ids) @@ -791,88 +796,88 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) - do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self.__sort_graphs, nb_nodes_median) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) - if self.__print_to_stdout == 2: + nb_nodes_median = self._ged_env.get_graph_num_nodes(gen_median_id) + do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self._sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self._ged_env,)) + if self._print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), desc='Computing initial node maps', file=sys.stdout) else: iterator = pool.imap_unordered(do_fun, itr, chunksize) for g_id, sod, node_maps in iterator: sum_of_distances_list[g_id] = sod - self.__node_maps_from_median[g_id] = node_maps + self._node_maps_from_median[g_id] = node_maps pool.close() pool.join() - self.__sum_of_distances = np.sum(sum_of_distances_list) -# xxx = self.__node_maps_from_median + self._sum_of_distances = np.sum(sum_of_distances_list) +# xxx = self._node_maps_from_median else: # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout) - self.__sum_of_distances = 0 - self.__node_maps_from_median.clear() - nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + self._sum_of_distances = 0 + self._node_maps_from_median.clear() + nb_nodes_median = self._ged_env.get_graph_num_nodes(gen_median_id) for graph_id in graph_ids: - nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) - if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: - self.__ged_env.run_method(gen_median_id, graph_id) - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) + nb_nodes_g = self._ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self._sort_graphs: + self._ged_env.run_method(gen_median_id, graph_id) + self._node_maps_from_median[graph_id] = self._ged_env.get_node_map(gen_median_id, graph_id) else: - self.__ged_env.run_method(graph_id, gen_median_id) - node_map_tmp = self.__ged_env.get_node_map(graph_id, gen_median_id) + self._ged_env.run_method(graph_id, gen_median_id) + node_map_tmp = self._ged_env.get_node_map(graph_id, gen_median_id) node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map - self.__node_maps_from_median[graph_id] = node_map_tmp - # print(self.__node_maps_from_median[graph_id]) - self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() - # print(self.__sum_of_distances) + self._node_maps_from_median[graph_id] = node_map_tmp + # print(self._node_maps_from_median[graph_id]) + self._sum_of_distances += self._node_maps_from_median[graph_id].induced_cost() + # print(self._sum_of_distances) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) # Print information about current iteration. 
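All parallel branches in this file share the same multiprocessing skeleton: the GED environment is handed to each worker once through the Pool initializer, the per-task function is bound with functools.partial, and results stream back via imap_unordered with a size-dependent chunksize. A self-contained sketch of that skeleton (the names are illustrative; the real task functions are the module-level _compute_*_parallel helpers further down in this file):

import multiprocessing
from functools import partial

def _init_worker(shared_env):
    global G_env
    G_env = shared_env  # in the patch this is the shared GEDEnv

def _one_task(sort_graphs, item):
    graph_id, _ = item
    return graph_id, len(G_env)  # placeholder work instead of a GED call

if __name__ == '__main__':
    items = [(i, i) for i in range(8)]
    n_jobs = 2
    chunksize = max(1, len(items) // n_jobs)
    do_fun = partial(_one_task, True)
    with multiprocessing.Pool(processes=n_jobs, initializer=_init_worker,
                              initargs=('fake-env',)) as pool:
        results = sorted(pool.imap_unordered(do_fun, items, chunksize))
    print(results)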
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n') - def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): - if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): - if self.__state == AlgorithmState.TERMINATED: - self.__state = AlgorithmState.INITIALIZED + def _termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self._max_itrs if self._max_itrs >= 0 else False): + if self._state == AlgorithmState.TERMINATED: + self._state = AlgorithmState.INITIALIZED return True - return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) + return converged or (itrs_without_update > self._max_itrs_without_update if self._max_itrs_without_update >= 0 else False) - def __update_median(self, graphs, median): + def _update_median(self, graphs, median): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Updating median: ', end='') # Store copy of the old median. old_median = median.copy() # @todo: this is just a shallow copy. # Update the node labels. - if self.__labeled_nodes: - self.__update_node_labels(graphs, median) + if self._labeled_nodes: + self._update_node_labels(graphs, median) # Update the edges and their labels. - self.__update_edges(graphs, median) + self._update_edges(graphs, median) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') - return not self.__are_graphs_equal(median, old_median) + return not self._are_graphs_equal(median, old_median) - def __update_node_labels(self, graphs, median): + def _update_node_labels(self, graphs, median): # print('----------------------------') # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('nodes ... ', end='') # Iterate through all nodes of the median. @@ -882,24 +887,24 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no node_labels = [] for graph_id, graph in graphs.items(): # print('graph_id: ', graph_id) -# print(self.__node_maps_from_median[graph_id]) -# print(self.__node_maps_from_median[graph_id].forward_map, self.__node_maps_from_median[graph_id].backward_map) - k = self.__node_maps_from_median[graph_id].image(i) +# print(self._node_maps_from_median[graph_id]) +# print(self._node_maps_from_median[graph_id].forward_map, self._node_maps_from_median[graph_id].backward_map) + k = self._node_maps_from_median[graph_id].image(i) # print('k: ', k) if k != np.inf: node_labels.append(graph.nodes[k]) # Compute the median label and update the median. if len(node_labels) > 0: -# median_label = self.__ged_env.get_median_node_label(node_labels) - median_label = self.__get_median_node_label(node_labels) - if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: +# median_label = self._ged_env.get_median_node_label(node_labels) + median_label = self._get_median_node_label(node_labels) + if self._ged_env.get_node_rel_cost(median.nodes[i], median_label) > self._epsilon: nx.set_node_attributes(median, {i: median_label}) - def __update_edges(self, graphs, median): + def _update_edges(self, graphs, median): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('edges ... 
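_termination_criterion_met encodes the stopping rule of the block gradient descent; a negative max-itrs or max-itrs-without-update disables the corresponding bound. Restated as a standalone function (the real method additionally demotes the algorithm state when the time or iteration budget trips, which is omitted here):

def termination_criterion_met(converged, timer_expired, itr, itrs_without_update,
                              max_itrs=100, max_itrs_without_update=3):
    if timer_expired or (max_itrs >= 0 and itr >= max_itrs):
        return True
    return converged or (max_itrs_without_update >= 0
                         and itrs_without_update > max_itrs_without_update)

print(termination_criterion_met(False, False, 5, 4))  # True: 4 > 3 iterations without update
print(termination_criterion_met(False, False, 5, 4, max_itrs_without_update=-1))  # False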
', end='') # # Clear the adjacency lists of the median and reset number of edges to 0. @@ -915,43 +920,43 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # Collect the labels of the edges to which (i,j) is mapped by the node maps. edge_labels = [] for graph_id, graph in graphs.items(): - k = self.__node_maps_from_median[graph_id].image(i) - l = self.__node_maps_from_median[graph_id].image(j) + k = self._node_maps_from_median[graph_id].image(i) + l = self._node_maps_from_median[graph_id].image(j) if k != np.inf and l != np.inf: if graph.has_edge(k, l): edge_labels.append(graph.edges[(k, l)]) # Compute the median edge label and the overall edge relabeling cost. rel_cost = 0 - median_label = self.__ged_env.get_edge_label(1) + median_label = self._ged_env.get_edge_label(1) if median.has_edge(i, j): median_label = median.edges[(i, j)] - if self.__labeled_edges and len(edge_labels) > 0: - new_median_label = self.__get_median_edge_label(edge_labels) - if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: + if self._labeled_edges and len(edge_labels) > 0: + new_median_label = self._get_median_edge_label(edge_labels) + if self._ged_env.get_edge_rel_cost(median_label, new_median_label) > self._epsilon: median_label = new_median_label for edge_label in edge_labels: - rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label) + rel_cost += self._ged_env.get_edge_rel_cost(median_label, edge_label) # Update the median. if median.has_edge(i, j): median.remove_edge(i, j) - if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs): + if rel_cost < (self._edge_ins_cost + self._edge_del_cost) * len(edge_labels) - self._edge_del_cost * len(graphs): median.add_edge(i, j, **median_label) # else: # if median.has_edge(i, j): # median.remove_edge(i, j) - def __update_node_maps(self): + def _update_node_maps(self): # Update the node maps. - if self.__parallel: - # @todo: notice when parallel self.__ged_env is not modified. + if self._parallel: + # @todo: notice when parallel self._ged_env is not modified. 
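The decision rule in _update_edges compares two ways of treating a candidate median edge (i, j): keeping it costs the relabeling toward the graphs where it maps onto a real edge plus a deletion toward every other graph, while dropping it costs an insertion in each graph that has it. A worked toy instance with assumed unit costs:

edge_ins_cost = edge_del_cost = 1.0
num_graphs = 5         # size of the graph collection
num_mapped_edges = 3   # graphs in which (i, j) maps onto an existing edge
rel_cost = 1.2         # total relabeling cost toward those mapped edge labels
keep_edge = rel_cost < (edge_ins_cost + edge_del_cost) * num_mapped_edges \
    - edge_del_cost * num_graphs
print(keep_edge)  # False: 1.2 < 2*3 - 1*5 = 1 does not hold, so the edge is dropped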
node_maps_were_modified = False -# xxx = self.__node_maps_from_median.copy() +# xxx = self._node_maps_from_median.copy() - len_itr = len(self.__node_maps_from_median) - itr = [item for item in self.__node_maps_from_median.items()] + len_itr = len(self._node_maps_from_median) + itr = [item for item in self._node_maps_from_median.items()] n_jobs = multiprocessing.cpu_count() if len_itr < 100 * n_jobs: chunksize = int(len_itr / n_jobs) + 1 @@ -960,66 +965,66 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) - do_fun = partial(_update_node_maps_parallel, self.__median_id, self.__epsilon, self.__sort_graphs, nb_nodes_median) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) - if self.__print_to_stdout == 2: + nb_nodes_median = self._ged_env.get_graph_num_nodes(self._median_id) + do_fun = partial(_update_node_maps_parallel, self._median_id, self._epsilon, self._sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self._ged_env,)) + if self._print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), desc='Updating node maps', file=sys.stdout) else: iterator = pool.imap_unordered(do_fun, itr, chunksize) for g_id, node_map, nm_modified in iterator: - self.__node_maps_from_median[g_id] = node_map + self._node_maps_from_median[g_id] = node_map if nm_modified: node_maps_were_modified = True pool.close() pool.join() -# yyy = self.__node_maps_from_median.copy() +# yyy = self._node_maps_from_median.copy() else: # Print information about current iteration. - if self.__print_to_stdout == 2: - progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + if self._print_to_stdout == 2: + progress = tqdm(desc='Updating node maps', total=len(self._node_maps_from_median), file=sys.stdout) node_maps_were_modified = False - nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) - for graph_id, node_map in self.__node_maps_from_median.items(): - nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + nb_nodes_median = self._ged_env.get_graph_num_nodes(self._median_id) + for graph_id, node_map in self._node_maps_from_median.items(): + nb_nodes_g = self._ged_env.get_graph_num_nodes(graph_id) - if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: - self.__ged_env.run_method(self.__median_id, graph_id) - if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon: - # xxx = self.__node_maps_from_median[graph_id] - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) + if nb_nodes_median <= nb_nodes_g or not self._sort_graphs: + self._ged_env.run_method(self._median_id, graph_id) + if self._ged_env.get_upper_bound(self._median_id, graph_id) < node_map.induced_cost() - self._epsilon: + # xxx = self._node_maps_from_median[graph_id] + self._node_maps_from_median[graph_id] = self._ged_env.get_node_map(self._median_id, graph_id) node_maps_were_modified = True else: - self.__ged_env.run_method(graph_id, self.__median_id) - if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost() - self.__epsilon: - node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id) + self._ged_env.run_method(graph_id, self._median_id) + if 
self._ged_env.get_upper_bound(graph_id, self._median_id) < node_map.induced_cost() - self._epsilon: + node_map_tmp = self._ged_env.get_node_map(graph_id, self._median_id) node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map - self.__node_maps_from_median[graph_id] = node_map_tmp + self._node_maps_from_median[graph_id] = node_map_tmp node_maps_were_modified = True # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n') # Return true if the node maps were modified. return node_maps_were_modified - def __decrease_order(self, graphs, median): + def _decrease_order(self, graphs, median): # Print information about current iteration - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Trying to decrease order: ... ', end='') if nx.number_of_nodes(median) <= 1: - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('median graph has only 1 node, skip decrease.') return False @@ -1028,23 +1033,23 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no decreased_order = False # Decrease the order as long as the best deletion delta is negative. - while self.__compute_best_deletion_delta(graphs, median, id_deleted_node) < -self.__epsilon: + while self._compute_best_deletion_delta(graphs, median, id_deleted_node) < -self._epsilon: decreased_order = True - self.__delete_node_from_median(id_deleted_node[0], median) + self._delete_node_from_median(id_deleted_node[0], median) if nx.number_of_nodes(median) <= 1: - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('decrease stopped because median graph remains only 1 node. ', end='') break # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Return true iff the order was decreased. return decreased_order - def __compute_best_deletion_delta(self, graphs, median, id_deleted_node): + def _compute_best_deletion_delta(self, graphs, median, id_deleted_node): best_delta = 0.0 # Determine node that should be deleted (if any). @@ -1052,22 +1057,22 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # Compute cost delta. delta = 0.0 for graph_id, graph in graphs.items(): - k = self.__node_maps_from_median[graph_id].image(i) + k = self._node_maps_from_median[graph_id].image(i) if k == np.inf: - delta -= self.__node_del_cost + delta -= self._node_del_cost else: - delta += self.__node_ins_cost - self.__ged_env.get_node_rel_cost(median.nodes[i], graph.nodes[k]) + delta += self._node_ins_cost - self._ged_env.get_node_rel_cost(median.nodes[i], graph.nodes[k]) for j, j_label in median[i].items(): - l = self.__node_maps_from_median[graph_id].image(j) + l = self._node_maps_from_median[graph_id].image(j) if k == np.inf or l == np.inf: - delta -= self.__edge_del_cost + delta -= self._edge_del_cost elif not graph.has_edge(k, l): - delta -= self.__edge_del_cost + delta -= self._edge_del_cost else: - delta += self.__edge_ins_cost - self.__ged_env.get_edge_rel_cost(j_label, graph.edges[(k, l)]) + delta += self._edge_ins_cost - self._ged_env.get_edge_rel_cost(j_label, graph.edges[(k, l)]) # Update best deletion delta. 
- if delta < best_delta - self.__epsilon: + if delta < best_delta - self._epsilon: best_delta = delta id_deleted_node[0] = i # id_deleted_node[0] = 3 # @todo: @@ -1075,7 +1080,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no return best_delta - def __delete_node_from_median(self, id_deleted_node, median): + def _delete_node_from_median(self, id_deleted_node, median): # Update the median. mapping = {} for i in range(0, nx.number_of_nodes(median)): @@ -1086,8 +1091,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no nx.relabel_nodes(median, mapping, copy=False) # Update the node maps. -# xxx = self.__node_maps_from_median - for key, node_map in self.__node_maps_from_median.items(): +# xxx = self._node_maps_from_median + for key, node_map in self._node_maps_from_median.items(): new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) is_unassigned_target_node = [True] * node_map.num_target_nodes() for i in range(0, nx.number_of_nodes(median) + 1): @@ -1100,38 +1105,38 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no for k in range(0, node_map.num_target_nodes()): if is_unassigned_target_node[k]: new_node_map.add_assignment(np.inf, k) -# print(self.__node_maps_from_median[key].forward_map, self.__node_maps_from_median[key].backward_map) +# print(self._node_maps_from_median[key].forward_map, self._node_maps_from_median[key].backward_map) # print(new_node_map.forward_map, new_node_map.backward_map - self.__node_maps_from_median[key] = new_node_map + self._node_maps_from_median[key] = new_node_map # Increase overall number of decreases. - self.__num_decrease_order += 1 + self._num_decrease_order += 1 - def __increase_order(self, graphs, median): + def _increase_order(self, graphs, median): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Trying to increase order: ... ', end='') # Initialize the best configuration and the best label of the node that is to be inserted. best_config = {} - best_label = self.__ged_env.get_node_label(1) + best_label = self._ged_env.get_node_label(1) increased_order = False # Increase the order as long as the best insertion delta is negative. - while self.__compute_best_insertion_delta(graphs, best_config, best_label) < - self.__epsilon: + while self._compute_best_insertion_delta(graphs, best_config, best_label) < - self._epsilon: increased_order = True - self.__add_node_to_median(best_config, best_label, median) + self._add_node_to_median(best_config, best_label, median) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Return true iff the order was increased. return increased_order - def __compute_best_insertion_delta(self, graphs, best_config, best_label): + def _compute_best_insertion_delta(self, graphs, best_config, best_label): # Construct sets of inserted nodes. 
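_delete_node_from_median removes the chosen node, shifts all higher node ids down by one so the median stays contiguously numbered, and then rebuilds every NodeMap against the new numbering. A toy illustration of the renumbering step only:

import networkx as nx

median = nx.path_graph(5)  # nodes 0..4
id_deleted_node = 2
mapping = {i: (i if i < id_deleted_node else i - 1)
           for i in median.nodes if i != id_deleted_node}
median.remove_node(id_deleted_node)
nx.relabel_nodes(median, mapping, copy=False)
print(sorted(median.nodes))  # [0, 1, 2, 3]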
no_inserted_node = True inserted_nodes = {} @@ -1139,7 +1144,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no inserted_nodes[graph_id] = [] best_config[graph_id] = np.inf for k in range(nx.number_of_nodes(graph)): - if self.__node_maps_from_median[graph_id].pre_image(k) == np.inf: + if self._node_maps_from_median[graph_id].pre_image(k) == np.inf: no_inserted_node = False inserted_nodes[graph_id].append((k, tuple(item for item in graph.nodes[k].items()))) # @todo: can order of label names be garantteed? @@ -1149,34 +1154,34 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # Compute insertion configuration, label, and delta. best_delta = 0.0 # @todo - if len(self.__label_names['node_labels']) == 0 and len(self.__label_names['node_attrs']) == 0: # @todo - best_delta = self.__compute_insertion_delta_unlabeled(inserted_nodes, best_config, best_label) - elif len(self.__label_names['node_labels']) > 0: # self.__constant_node_costs: - best_delta = self.__compute_insertion_delta_constant(inserted_nodes, best_config, best_label) + if len(self._label_names['node_labels']) == 0 and len(self._label_names['node_attrs']) == 0: # @todo + best_delta = self._compute_insertion_delta_unlabeled(inserted_nodes, best_config, best_label) + elif len(self._label_names['node_labels']) > 0: # self._constant_node_costs: + best_delta = self._compute_insertion_delta_constant(inserted_nodes, best_config, best_label) else: - best_delta = self.__compute_insertion_delta_generic(inserted_nodes, best_config, best_label) + best_delta = self._compute_insertion_delta_generic(inserted_nodes, best_config, best_label) # Return the best delta. return best_delta - def __compute_insertion_delta_unlabeled(self, inserted_nodes, best_config, best_label): # @todo: go through and test. + def _compute_insertion_delta_unlabeled(self, inserted_nodes, best_config, best_label): # @todo: go through and test. # Construct the nest configuration and compute its insertion delta. best_delta = 0.0 best_config.clear() for graph_id, node_set in inserted_nodes.items(): if len(node_set) == 0: best_config[graph_id] = np.inf - best_delta += self.__node_del_cost + best_delta += self._node_del_cost else: best_config[graph_id] = node_set[0][0] - best_delta -= self.__node_ins_cost + best_delta -= self._node_ins_cost # Return the best insertion delta. return best_delta - def __compute_insertion_delta_constant(self, inserted_nodes, best_config, best_label): + def _compute_insertion_delta_constant(self, inserted_nodes, best_config, best_label): # Construct histogram and inverse label maps. hist = {} inverse_label_maps = {} @@ -1207,24 +1212,24 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # Construct the best configuration and compute its insertion delta. 
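For unlabeled nodes the insertion delta has a closed form, as _compute_insertion_delta_unlabeled shows: each graph that still has an unmapped node lets the new median node absorb one insertion, and every other graph forces a deletion. A tiny worked instance with assumed unit costs and made-up inserted_nodes data:

node_ins_cost = node_del_cost = 1.0
inserted_nodes = {0: [(4, ())], 1: [], 2: [(7, ()), (8, ())]}
best_delta = sum(-node_ins_cost if nodes else node_del_cost
                 for nodes in inserted_nodes.values())
print(best_delta)  # -1.0: two saved insertions, one extra deletion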
best_config.clear() best_delta = 0.0 - node_rel_cost = self.__ged_env.get_node_rel_cost(self.__ged_env.get_node_label(1), self.__ged_env.get_node_label(2)) - triangle_ineq_holds = (node_rel_cost <= self.__node_del_cost + self.__node_ins_cost) + node_rel_cost = self._ged_env.get_node_rel_cost(self._ged_env.get_node_label(1), self._ged_env.get_node_label(2)) + triangle_ineq_holds = (node_rel_cost <= self._node_del_cost + self._node_ins_cost) for graph_id, _ in inserted_nodes.items(): if best_label_tuple in inverse_label_maps[graph_id]: best_config[graph_id] = inverse_label_maps[graph_id][best_label_tuple] - best_delta -= self.__node_ins_cost + best_delta -= self._node_ins_cost elif triangle_ineq_holds and not len(inserted_nodes[graph_id]) == 0: best_config[graph_id] = inserted_nodes[graph_id][0][0] - best_delta += node_rel_cost - self.__node_ins_cost + best_delta += node_rel_cost - self._node_ins_cost else: best_config[graph_id] = np.inf - best_delta += self.__node_del_cost + best_delta += self._node_del_cost # Return the best insertion delta. return best_delta - def __compute_insertion_delta_generic(self, inserted_nodes, best_config, best_label): + def _compute_insertion_delta_generic(self, inserted_nodes, best_config, best_label): # Collect all node labels of inserted nodes. node_labels = [] for _, node_set in inserted_nodes.items(): @@ -1233,7 +1238,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # Compute node label medians that serve as initial solutions for block gradient descent. initial_node_labels = [] - self.__compute_initial_node_labels(node_labels, initial_node_labels) + self._compute_initial_node_labels(node_labels, initial_node_labels) # Determine best insertion configuration, label, and delta via parallel block gradient descent from all initial node labels. best_delta = 0.0 @@ -1241,15 +1246,15 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # Construct local configuration. config = {} for graph_id, _ in inserted_nodes.items(): - config[graph_id] = tuple((np.inf, tuple(item for item in self.__ged_env.get_node_label(1).items()))) + config[graph_id] = tuple((np.inf, tuple(item for item in self._ged_env.get_node_label(1).items()))) # Run block gradient descent. converged = False itr = 0 - while not self.__insertion_termination_criterion_met(converged, itr): - converged = not self.__update_config(node_label, inserted_nodes, config, node_labels) + while not self._insertion_termination_criterion_met(converged, itr): + converged = not self._update_config(node_label, inserted_nodes, config, node_labels) node_label_dict = dict(node_label) - converged = converged and (not self.__update_node_label([dict(item) for item in node_labels], node_label_dict)) # @todo: the dict is tupled again in the function, can be better. + converged = converged and (not self._update_node_label([dict(item) for item in node_labels], node_label_dict)) # @todo: the dict is tupled again in the function, can be better. node_label = tuple(item for item in node_label_dict.items()) # @todo: watch out: initial_node_labels[i] is not modified here. 
itr += 1 @@ -1258,12 +1263,12 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no delta = 0.0 for _, node in config.items(): if node[0] == np.inf: - delta += self.__node_del_cost + delta += self._node_del_cost else: - delta += self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost + delta += self._ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self._node_ins_cost # Update best delta and global configuration if improvement has been found. - if delta < best_delta - self.__epsilon: + if delta < best_delta - self._epsilon: best_delta = delta best_label.clear() for key, val in node_label: @@ -1276,16 +1281,16 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no return best_delta - def __compute_initial_node_labels(self, node_labels, median_labels): + def _compute_initial_node_labels(self, node_labels, median_labels): median_labels.clear() - if self.__use_real_randomness: # @todo: may not work if parallelized. + if self._use_real_randomness: # @todo: may not work if parallelized. rng = np.random.randint(0, high=2**32 - 1, size=1) urng = np.random.RandomState(seed=rng[0]) else: - urng = np.random.RandomState(seed=self.__seed) + urng = np.random.RandomState(seed=self._seed) # Generate the initial node label medians. - if self.__init_type_increase_order == 'K-MEANS++': + if self._init_type_increase_order == 'K-MEANS++': # Use k-means++ heuristic to generate the initial node label medians. already_selected = [False] * len(node_labels) selected_label_id = urng.randint(low=0, high=len(node_labels), size=1)[0] # c++ test: 23 @@ -1293,14 +1298,14 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no already_selected[selected_label_id] = True # xxx = [41, 0, 18, 9, 6, 14, 21, 25, 33] for c++ test # iii = 0 for c++ test - while len(median_labels) < self.__num_inits_increase_order: + while len(median_labels) < self._num_inits_increase_order: weights = [np.inf] * len(node_labels) for label_id in range(0, len(node_labels)): if already_selected[label_id]: weights[label_id] = 0 continue for label in median_labels: - weights[label_id] = min(weights[label_id], self.__ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id]))) + weights[label_id] = min(weights[label_id], self._ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id]))) # get non-zero weights. weights_p, idx_p = [], [] @@ -1315,26 +1320,26 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # iii += 1 for c++ test median_labels.append(node_labels[selected_label_id]) already_selected[selected_label_id] = True - else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self.__num_inits_increase_order. + else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self._num_inits_increase_order. break else: # Compute the initial node medians as the medians of randomly generated clusters of (roughly) equal size. # @todo: go through and test. shuffled_node_labels = [np.inf] * len(node_labels) #@todo: random? # @todo: std::shuffle(shuffled_node_labels.begin(), shuffled_node_labels.end(), urng);? 
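The K-MEANS++ branch of _compute_initial_node_labels seeds the initial label medians so that each new median is drawn with probability proportional to its distance from the closest one already chosen. A hedged, self-contained sketch with Euclidean distance on numeric vectors standing in for get_node_rel_cost, and k playing the role of num_inits_increase_order:

import numpy as np

def kmeanspp_init(points, k, urng):
    medians = [points[urng.randint(len(points))]]
    while len(medians) < k:
        weights = np.array([min(np.linalg.norm(p - m) for m in medians)
                            for p in points])
        if weights.sum() == 0:  # every remaining point coincides with a median
            break
        medians.append(points[urng.choice(len(points), p=weights / weights.sum())])
    return medians

urng = np.random.RandomState(0)
points = urng.rand(20, 2)
print(len(kmeanspp_init(points, 3, urng)))  # 3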
- cluster_size = len(node_labels) / self.__num_inits_increase_order + cluster_size = len(node_labels) / self._num_inits_increase_order pos = 0.0 cluster = [] - while len(median_labels) < self.__num_inits_increase_order - 1: + while len(median_labels) < self._num_inits_increase_order - 1: while pos < (len(median_labels) + 1) * cluster_size: cluster.append(shuffled_node_labels[pos]) pos += 1 - median_labels.append(self.__get_median_node_label(cluster)) + median_labels.append(self._get_median_node_label(cluster)) cluster.clear() while pos < len(shuffled_node_labels): pos += 1 cluster.append(shuffled_node_labels[pos]) - median_labels.append(self.__get_median_node_label(cluster)) + median_labels.append(self._get_median_node_label(cluster)) cluster.clear() # Run Lloyd's Algorithm. @@ -1342,8 +1347,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no closest_median_ids = [np.inf] * len(node_labels) clusters = [[] for _ in range(len(median_labels))] itr = 1 - while not self.__insertion_termination_criterion_met(converged, itr): - converged = not self.__update_clusters(node_labels, median_labels, closest_median_ids) + while not self._insertion_termination_criterion_met(converged, itr): + converged = not self._update_clusters(node_labels, median_labels, closest_median_ids) if not converged: for cluster in clusters: cluster.clear() @@ -1351,33 +1356,33 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no clusters[closest_median_ids[label_id]].append(node_labels[label_id]) for cluster_id in range(0, len(clusters)): node_label = dict(median_labels[cluster_id]) - self.__update_node_label([dict(item) for item in clusters[cluster_id]], node_label) # @todo: the dict is tupled again in the function, can be better. + self._update_node_label([dict(item) for item in clusters[cluster_id]], node_label) # @todo: the dict is tupled again in the function, can be better. median_labels[cluster_id] = tuple(item for item in node_label.items()) itr += 1 - def __insertion_termination_criterion_met(self, converged, itr): - return converged or (itr >= self.__max_itrs_increase_order if self.__max_itrs_increase_order > 0 else False) + def _insertion_termination_criterion_met(self, converged, itr): + return converged or (itr >= self._max_itrs_increase_order if self._max_itrs_increase_order > 0 else False) - def __update_config(self, node_label, inserted_nodes, config, node_labels): + def _update_config(self, node_label, inserted_nodes, config, node_labels): # Determine the best configuration. 
config_modified = False for graph_id, node_set in inserted_nodes.items(): best_assignment = config[graph_id] best_cost = 0.0 if best_assignment[0] == np.inf: - best_cost = self.__node_del_cost + best_cost = self._node_del_cost else: - best_cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(best_assignment[1])) - self.__node_ins_cost + best_cost = self._ged_env.get_node_rel_cost(dict(node_label), dict(best_assignment[1])) - self._node_ins_cost for node in node_set: - cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost - if cost < best_cost - self.__epsilon: + cost = self._ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self._node_ins_cost + if cost < best_cost - self._epsilon: best_cost = cost best_assignment = node config_modified = True - if self.__node_del_cost < best_cost - self.__epsilon: - best_cost = self.__node_del_cost + if self._node_del_cost < best_cost - self._epsilon: + best_cost = self._node_del_cost best_assignment = tuple((np.inf, best_assignment[1])) config_modified = True config[graph_id] = best_assignment @@ -1392,11 +1397,11 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no return config_modified - def __update_node_label(self, node_labels, node_label): - if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling __update_config(). + def _update_node_label(self, node_labels, node_label): + if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling _update_config(). return False - new_node_label = self.__get_median_node_label(node_labels) - if self.__ged_env.get_node_rel_cost(new_node_label, node_label) > self.__epsilon: + new_node_label = self._get_median_node_label(node_labels) + if self._ged_env.get_node_rel_cost(new_node_label, node_label) > self._epsilon: node_label.clear() for key, val in new_node_label.items(): node_label[key] = val @@ -1404,15 +1409,15 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no return False - def __update_clusters(self, node_labels, median_labels, closest_median_ids): + def _update_clusters(self, node_labels, median_labels, closest_median_ids): # Determine the closest median for each node label. clusters_modified = False for label_id in range(0, len(node_labels)): closest_median_id = np.inf dist_to_closest_median = np.inf for median_id in range(0, len(median_labels)): - dist_to_median = self.__ged_env.get_node_rel_cost(dict(median_labels[median_id]), dict(node_labels[label_id])) - if dist_to_median < dist_to_closest_median - self.__epsilon: + dist_to_median = self._ged_env.get_node_rel_cost(dict(median_labels[median_id]), dict(node_labels[label_id])) + if dist_to_median < dist_to_closest_median - self._epsilon: dist_to_closest_median = dist_to_median closest_median_id = median_id if closest_median_id != closest_median_ids[label_id]: @@ -1423,26 +1428,26 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no return clusters_modified - def __add_node_to_median(self, best_config, best_label, median): + def _add_node_to_median(self, best_config, best_label, median): # Update the median. nb_nodes_median = nx.number_of_nodes(median) median.add_node(nb_nodes_median, **best_label) # Update the node maps. 
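_update_clusters is the assignment step of the Lloyd iteration used by the CLUSTERS branch: every node label is re-assigned to its closest median label, and the return value signals whether anything moved. A minimal sketch under the same assumption as above (Euclidean distance instead of the node relabeling cost):

import numpy as np

def update_clusters(node_labels, median_labels, closest_median_ids):
    clusters_modified = False
    for label_id, label in enumerate(node_labels):
        dists = [np.linalg.norm(label - median) for median in median_labels]
        closest = int(np.argmin(dists))
        if closest != closest_median_ids[label_id]:
            closest_median_ids[label_id] = closest
            clusters_modified = True
    return clusters_modified

labels = np.array([[0.0], [0.1], [1.0], [1.1]])
medians = [np.array([0.0]), np.array([1.0])]
assignment = [np.inf] * len(labels)
print(update_clusters(labels, medians, assignment), assignment)  # True [0, 0, 1, 1]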
- for graph_id, node_map in self.__node_maps_from_median.items(): + for graph_id, node_map in self._node_maps_from_median.items(): node_map_as_rel = [] node_map.as_relation(node_map_as_rel) new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) for assignment in node_map_as_rel: new_node_map.add_assignment(assignment[0], assignment[1]) new_node_map.add_assignment(nx.number_of_nodes(median) - 1, best_config[graph_id]) - self.__node_maps_from_median[graph_id] = new_node_map + self._node_maps_from_median[graph_id] = new_node_map # Increase overall number of increases. - self.__num_increase_order += 1 + self._num_increase_order += 1 - def __are_graphs_equal(self, g1, g2): + def _are_graphs_equal(self, g1, g2): """ Check if the two graphs are equal. @@ -1487,29 +1492,29 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no def set_label_names(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): - self.__label_names = {'node_labels': node_labels, 'edge_labels': edge_labels, + self._label_names = {'node_labels': node_labels, 'edge_labels': edge_labels, 'node_attrs': node_attrs, 'edge_attrs': edge_attrs} - def __get_median_node_label(self, node_labels): - if len(self.__label_names['node_labels']) > 0: - return self.__get_median_label_symbolic(node_labels) - elif len(self.__label_names['node_attrs']) > 0: - return self.__get_median_label_nonsymbolic(node_labels) + def _get_median_node_label(self, node_labels): + if len(self._label_names['node_labels']) > 0: + return self._get_median_label_symbolic(node_labels) + elif len(self._label_names['node_attrs']) > 0: + return self._get_median_label_nonsymbolic(node_labels) else: raise Exception('Node label names are not given.') - def __get_median_edge_label(self, edge_labels): - if len(self.__label_names['edge_labels']) > 0: - return self.__get_median_label_symbolic(edge_labels) - elif len(self.__label_names['edge_attrs']) > 0: - return self.__get_median_label_nonsymbolic(edge_labels) + def _get_median_edge_label(self, edge_labels): + if len(self._label_names['edge_labels']) > 0: + return self._get_median_label_symbolic(edge_labels) + elif len(self._label_names['edge_attrs']) > 0: + return self._get_median_label_nonsymbolic(edge_labels) else: raise Exception('Edge label names are not given.') - def __get_median_label_symbolic(self, labels): + def _get_median_label_symbolic(self, labels): # Construct histogram. hist = {} for label in labels: @@ -1530,7 +1535,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no return median_label - def __get_median_label_nonsymbolic(self, labels): + def _get_median_label_nonsymbolic(self, labels): if len(labels) == 0: return {} # @todo else: @@ -1589,11 +1594,11 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no return median_label -# def __get_median_edge_label_symbolic(self, edge_labels): +# def _get_median_edge_label_symbolic(self, edge_labels): # pass -# def __get_median_edge_label_nonsymbolic(self, edge_labels): +# def _get_median_edge_label_nonsymbolic(self, edge_labels): # if len(edge_labels) == 0: # return {} # else: @@ -1657,7 +1662,7 @@ def _compute_medoid_parallel(graph_ids, sort, itr): i = itr[1] # @todo: timer not considered here. 
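_get_median_label_symbolic reduces to a frequency count: for discrete labels the median of a multiset is simply its most common element. A compact restatement with made-up labels (ties broken by first occurrence in this sketch):

from collections import Counter

def median_label_symbolic(labels):
    counts = Counter(tuple(sorted(label.items())) for label in labels)
    return dict(counts.most_common(1)[0][0])

print(median_label_symbolic([{'atom': 'C'}, {'atom': 'N'}, {'atom': 'C'}]))  # {'atom': 'C'}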
# if timer.expired(): -# self.__state = AlgorithmState.CALLED +# self._state = AlgorithmState.CALLED # break nb_nodes_g = G_ged_env.get_graph_num_nodes(g_id) sum_of_distances = 0 @@ -1678,13 +1683,13 @@ def _compute_init_node_maps_parallel(gen_median_id, sort, nb_nodes_median, itr): if nb_nodes_median <= nb_nodes_g or not sort: G_ged_env.run_method(gen_median_id, graph_id) node_map = G_ged_env.get_node_map(gen_median_id, graph_id) -# print(self.__node_maps_from_median[graph_id]) +# print(self._node_maps_from_median[graph_id]) else: G_ged_env.run_method(graph_id, gen_median_id) node_map = G_ged_env.get_node_map(graph_id, gen_median_id) node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map sum_of_distance = node_map.induced_cost() -# print(self.__sum_of_distances) +# print(self._sum_of_distances) return graph_id, sum_of_distance, node_map diff --git a/gklearn/ged/median/median_graph_estimator_cml.py b/gklearn/ged/median/median_graph_estimator_cml.py index 2d5b110..da74ad5 100644 --- a/gklearn/ged/median/median_graph_estimator_cml.py +++ b/gklearn/ged/median/median_graph_estimator_cml.py @@ -33,51 +33,51 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined constant_node_costs : Boolean Set to True if the node relabeling costs are constant. """ - self.__ged_env = ged_env - self.__init_method = 'BRANCH_FAST' - self.__init_options = '' - self.__descent_method = 'BRANCH_FAST' - self.__descent_options = '' - self.__refine_method = 'IPFP' - self.__refine_options = '' - self.__constant_node_costs = constant_node_costs - self.__labeled_nodes = (ged_env.get_num_node_labels() > 1) - self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1, to_dict=False)) - self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1, to_dict=False)) - self.__labeled_edges = (ged_env.get_num_edge_labels() > 1) - self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1, to_dict=False)) - self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1, to_dict=False)) - self.__init_type = 'RANDOM' - self.__num_random_inits = 10 - self.__desired_num_random_inits = 10 - self.__use_real_randomness = True - self.__seed = 0 - self.__parallel = True - self.__update_order = True - self.__sort_graphs = True # sort graphs by size when computing GEDs. - self.__refine = True - self.__time_limit_in_sec = 0 - self.__epsilon = 0.0001 - self.__max_itrs = 100 - self.__max_itrs_without_update = 3 - self.__num_inits_increase_order = 10 - self.__init_type_increase_order = 'K-MEANS++' - self.__max_itrs_increase_order = 10 - self.__print_to_stdout = 2 - self.__median_id = np.inf # @todo: check - self.__node_maps_from_median = {} - self.__sum_of_distances = 0 - self.__best_init_sum_of_distances = np.inf - self.__converged_sum_of_distances = np.inf - self.__runtime = None - self.__runtime_initialized = None - self.__runtime_converged = None - self.__itrs = [] # @todo: check: {} ? 
- self.__num_decrease_order = 0 - self.__num_increase_order = 0 - self.__num_converged_descents = 0 - self.__state = AlgorithmState.TERMINATED - self.__label_names = {} + self._ged_env = ged_env + self._init_method = 'BRANCH_FAST' + self._init_options = '' + self._descent_method = 'BRANCH_FAST' + self._descent_options = '' + self._refine_method = 'IPFP' + self._refine_options = '' + self._constant_node_costs = constant_node_costs + self._labeled_nodes = (ged_env.get_num_node_labels() > 1) + self._node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1, to_dict=False)) + self._node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1, to_dict=False)) + self._labeled_edges = (ged_env.get_num_edge_labels() > 1) + self._edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1, to_dict=False)) + self._edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1, to_dict=False)) + self._init_type = 'RANDOM' + self._num_random_inits = 10 + self._desired_num_random_inits = 10 + self._use_real_randomness = True + self._seed = 0 + self._parallel = True + self._update_order = True + self._sort_graphs = True # sort graphs by size when computing GEDs. + self._refine = True + self._time_limit_in_sec = 0 + self._epsilon = 0.0001 + self._max_itrs = 100 + self._max_itrs_without_update = 3 + self._num_inits_increase_order = 10 + self._init_type_increase_order = 'K-MEANS++' + self._max_itrs_increase_order = 10 + self._print_to_stdout = 2 + self._median_id = np.inf # @todo: check + self._node_maps_from_median = {} + self._sum_of_distances = 0 + self._best_init_sum_of_distances = np.inf + self._converged_sum_of_distances = np.inf + self._runtime = None + self._runtime_initialized = None + self._runtime_converged = None + self._itrs = [] # @todo: check: {} ? + self._num_decrease_order = 0 + self._num_increase_order = 0 + self._num_converged_descents = 0 + self._state = AlgorithmState.TERMINATED + self._label_names = {} if ged_env is None: raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') @@ -93,142 +93,142 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined options : string String that specifies with which options to run the estimator. """ - self.__set_default_options() + self._set_default_options() options_map = misc.options_string_to_options_map(options) for opt_name, opt_val in options_map.items(): if opt_name == 'init-type': - self.__init_type = opt_val + self._init_type = opt_val if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN': raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"') elif opt_name == 'random-inits': try: - self.__num_random_inits = int(opt_val) - self.__desired_num_random_inits = self.__num_random_inits + self._num_random_inits = int(opt_val) + self._desired_num_random_inits = self._num_random_inits except: raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') - if self.__num_random_inits <= 0: + if self._num_random_inits <= 0: raise Exception('Invalid argument "' + opt_val + '" for option random-inits. 
Usage: options = "[--random-inits ]"') elif opt_name == 'randomness': if opt_val == 'PSEUDO': - self.__use_real_randomness = False + self._use_real_randomness = False elif opt_val == 'REAL': - self.__use_real_randomness = True + self._use_real_randomness = True else: raise Exception('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"') elif opt_name == 'stdout': if opt_val == '0': - self.__print_to_stdout = 0 + self._print_to_stdout = 0 elif opt_val == '1': - self.__print_to_stdout = 1 + self._print_to_stdout = 1 elif opt_val == '2': - self.__print_to_stdout = 2 + self._print_to_stdout = 2 else: raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') elif opt_name == 'parallel': if opt_val == 'TRUE': - self.__parallel = True + self._parallel = True elif opt_val == 'FALSE': - self.__parallel = False + self._parallel = False else: raise Exception('Invalid argument "' + opt_val + '" for option parallel. Usage: options = "[--parallel TRUE|FALSE] [...]"') elif opt_name == 'update-order': if opt_val == 'TRUE': - self.__update_order = True + self._update_order = True elif opt_val == 'FALSE': - self.__update_order = False + self._update_order = False else: raise Exception('Invalid argument "' + opt_val + '" for option update-order. Usage: options = "[--update-order TRUE|FALSE] [...]"') elif opt_name == 'sort-graphs': if opt_val == 'TRUE': - self.__sort_graphs = True + self._sort_graphs = True elif opt_val == 'FALSE': - self.__sort_graphs = False + self._sort_graphs = False else: raise Exception('Invalid argument "' + opt_val + '" for option sort-graphs. Usage: options = "[--sort-graphs TRUE|FALSE] [...]"') elif opt_name == 'refine': if opt_val == 'TRUE': - self.__refine = True + self._refine = True elif opt_val == 'FALSE': - self.__refine = False + self._refine = False else: raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"') elif opt_name == 'time-limit': try: - self.__time_limit_in_sec = float(opt_val) + self._time_limit_in_sec = float(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit ] [...]') elif opt_name == 'max-itrs': try: - self.__max_itrs = int(opt_val) + self._max_itrs = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs ] [...]') elif opt_name == 'max-itrs-without-update': try: - self.__max_itrs_without_update = int(opt_val) + self._max_itrs_without_update = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update ] [...]') elif opt_name == 'seed': try: - self.__seed = int(opt_val) + self._seed = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed ] [...]') elif opt_name == 'epsilon': try: - self.__epsilon = float(opt_val) + self._epsilon = float(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') - if self.__epsilon <= 0: + if self._epsilon <= 0: raise Exception('Invalid argument "' + opt_val + '" for option epsilon. 
Usage: options = "[--epsilon ] [...]') elif opt_name == 'inits-increase-order': try: - self.__num_inits_increase_order = int(opt_val) + self._num_inits_increase_order = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') - if self.__num_inits_increase_order <= 0: + if self._num_inits_increase_order <= 0: raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') elif opt_name == 'init-type-increase-order': - self.__init_type_increase_order = opt_val + self._init_type_increase_order = opt_val if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') elif opt_name == 'max-itrs-increase-order': try: - self.__max_itrs_increase_order = int(opt_val) + self._max_itrs_increase_order = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order ] [...]') @@ -255,8 +255,8 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined ----- Has no effect unless "--init-type MEDOID" is passed to set_options(). """ - self.__init_method = init_method; - self.__init_options = init_options; + self._init_method = init_method; + self._init_options = init_options; def set_descent_method(self, descent_method, descent_options=''): @@ -274,8 +274,8 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined ----- Has no effect unless "--init-type MEDOID" is passed to set_options(). """ - self.__descent_method = descent_method; - self.__descent_options = descent_options; + self._descent_method = descent_method; + self._descent_options = descent_options; def set_refine_method(self, refine_method, refine_options): @@ -293,8 +293,8 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined ----- Has no effect if "--refine FALSE" is passed to set_options(). """ - self.__refine_method = refine_method - self.__refine_options = refine_options + self._refine_method = refine_method + self._refine_options = refine_options def run(self, graph_ids, set_median_id, gen_median_id): @@ -317,7 +317,7 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined raise Exception('Empty vector of graph IDs, unable to compute median.') all_graphs_empty = True for graph_id in graph_ids: - if self.__ged_env.get_graph_num_nodes(graph_id) > 0: + if self._ged_env.get_graph_num_nodes(graph_id) > 0: all_graphs_empty = False break if all_graphs_empty: @@ -325,16 +325,16 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # Start timer and record start time. start = time.time() - timer = Timer(self.__time_limit_in_sec) - self.__median_id = gen_median_id - self.__state = AlgorithmState.TERMINATED + timer = Timer(self._time_limit_in_sec) + self._median_id = gen_median_id + self._state = AlgorithmState.TERMINATED # Get NetworkX graph representations of the input graphs. graphs = {} for graph_id in graph_ids: # @todo: get_nx_graph() function may need to be modified according to the coming code. 
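For reference, a minimal usage sketch of the estimator configured by set_options() above. The construction of the GED environment, the graphs loaded into it, and the ids of the two median placeholders are not part of this patch, so `ged_env`, `graph_ids`, `set_median_id` and `gen_median_id` are assumed to be prepared elsewhere, and the import path is inferred from the file location:

    from gklearn.ged.median.median_graph_estimator_cml import MedianGraphEstimatorCML

    def estimate_median(ged_env, graph_ids, set_median_id, gen_median_id):
        # Constant node relabeling costs are assumed purely for illustration.
        mge = MedianGraphEstimatorCML(ged_env, constant_node_costs=True)
        # Option names below are exactly the ones parsed in set_options().
        mge.set_options('--init-type RANDOM --random-inits 5 --max-itrs 50 '
                        '--time-limit 600 --refine TRUE --stdout 1 --seed 0')
        mge.set_init_method('BRANCH_FAST')     # only used with --init-type MEDOID
        mge.set_descent_method('BRANCH_FAST')  # GED method for the descent phase
        mge.set_refine_method('IPFP', '')      # GED method for the refinement phase
        mge.run(graph_ids, set_median_id, gen_median_id)
        return mge.get_sum_of_distances()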
- graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id) -# print(self.__ged_env.get_graph_internal_id(0)) + graphs[graph_id] = self._ged_env.get_nx_graph(graph_id) +# print(self._ged_env.get_graph_internal_id(0)) # print(graphs[0].graph) # print(graphs[0].nodes(data=True)) # print(graphs[0].edges(data=True)) @@ -342,27 +342,27 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # Construct initial medians. medians = [] - self.__construct_initial_medians(graph_ids, timer, medians) + self._construct_initial_medians(graph_ids, timer, medians) end_init = time.time() - self.__runtime_initialized = end_init - start + self._runtime_initialized = end_init - start # print(medians[0].graph) # print(medians[0].nodes(data=True)) # print(medians[0].edges(data=True)) # print(nx.adjacency_matrix(medians[0])) # Reset information about iterations and number of times the median decreases and increases. - self.__itrs = [0] * len(medians) - self.__num_decrease_order = 0 - self.__num_increase_order = 0 - self.__num_converged_descents = 0 + self._itrs = [0] * len(medians) + self._num_decrease_order = 0 + self._num_increase_order = 0 + self._num_converged_descents = 0 # Initialize the best median. best_sum_of_distances = np.inf - self.__best_init_sum_of_distances = np.inf + self._best_init_sum_of_distances = np.inf node_maps_from_best_median = {} # Run block gradient descent from all initial medians. - self.__ged_env.set_method(self.__descent_method, self.__descent_options) + self._ged_env.set_method(self._descent_method, self._descent_options) for median_pos in range(0, len(medians)): # Terminate if the timer has expired and at least one SOD has been computed. @@ -370,7 +370,7 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined break # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n===========================================================') print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') print('-----------------------------------------------------------') @@ -379,27 +379,27 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined median = medians[median_pos] # Load initial median into the environment. - self.__ged_env.load_nx_graph(median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) + self._ged_env.load_nx_graph(median, gen_median_id) + self._ged_env.init(self._ged_env.get_init_type()) # Compute node maps and sum of distances for initial median. -# xxx = self.__node_maps_from_median - self.__compute_init_node_maps(graph_ids, gen_median_id) -# yyy = self.__node_maps_from_median +# xxx = self._node_maps_from_median + self._compute_init_node_maps(graph_ids, gen_median_id) +# yyy = self._node_maps_from_median - self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) - self.__ged_env.load_nx_graph(median, set_median_id) -# print(self.__best_init_sum_of_distances) + self._best_init_sum_of_distances = min(self._best_init_sum_of_distances, self._sum_of_distances) + self._ged_env.load_nx_graph(median, set_median_id) +# print(self._best_init_sum_of_distances) # Run block gradient descent from initial median. 
converged = False itrs_without_update = 0 - while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update): + while not self._termination_criterion_met(converged, timer, self._itrs[median_pos], itrs_without_update): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n===========================================================') - print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('Iteration', str(self._itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') print('-----------------------------------------------------------') # Initialize flags that tell us what happened in the iteration. @@ -409,13 +409,13 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined increased_order = False # Update the median. - median_modified = self.__update_median(graphs, median) - if self.__update_order: + median_modified = self._update_median(graphs, median) + if self._update_order: pass # @todo: -# if not median_modified or self.__itrs[median_pos] == 0: -# decreased_order = self.__decrease_order(graphs, median) -# if not decreased_order or self.__itrs[median_pos] == 0: -# increased_order = self.__increase_order(graphs, median) +# if not median_modified or self._itrs[median_pos] == 0: +# decreased_order = self._decrease_order(graphs, median) +# if not decreased_order or self._itrs[median_pos] == 0: +# increased_order = self._increase_order(graphs, median) # Update the number of iterations without update of the median. if median_modified or decreased_order or increased_order: @@ -424,51 +424,51 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined itrs_without_update += 1 # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Loading median to environment: ... ', end='') # Load the median into the environment. # @todo: should this function use the original node label? - self.__ged_env.load_nx_graph(median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) + self._ged_env.load_nx_graph(median, gen_median_id) + self._ged_env.init(self._ged_env.get_init_type()) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Updating induced costs: ... ', end='') # Compute induced costs of the old node maps w.r.t. the updated median. for graph_id in graph_ids: -# print(self.__node_maps_from_median[graph_id].induced_cost()) -# xxx = self.__node_maps_from_median[graph_id] - self.__ged_env.compute_induced_cost(gen_median_id, graph_id, self.__node_maps_from_median[graph_id]) +# print(self._node_maps_from_median[graph_id].induced_cost()) +# xxx = self._node_maps_from_median[graph_id] + self._ged_env.compute_induced_cost(gen_median_id, graph_id, self._node_maps_from_median[graph_id]) # print('---------------------------------------') -# print(self.__node_maps_from_median[graph_id].induced_cost()) +# print(self._node_maps_from_median[graph_id].induced_cost()) # @todo:!!!!!!!!!!!!!!!!!!!!!!!!!!!!This value is a slight different from the c++ program, which might be a bug! Use it very carefully! # Print information about current iteration. 
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Update the node maps. - node_maps_modified = self.__update_node_maps() + node_maps_modified = self._update_node_maps() # Update the order of the median if no improvement can be found with the current order. # Update the sum of distances. - old_sum_of_distances = self.__sum_of_distances - self.__sum_of_distances = 0 - for graph_id, node_map in self.__node_maps_from_median.items(): - self.__sum_of_distances += node_map.induced_cost() -# print(self.__sum_of_distances) + old_sum_of_distances = self._sum_of_distances + self._sum_of_distances = 0 + for graph_id, node_map in self._node_maps_from_median.items(): + self._sum_of_distances += node_map.induced_cost() +# print(self._sum_of_distances) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Old local SOD: ', old_sum_of_distances) - print('New local SOD: ', self.__sum_of_distances) + print('New local SOD: ', self._sum_of_distances) print('Best converged SOD: ', best_sum_of_distances) print('Modified median: ', median_modified) print('Modified node maps: ', node_maps_modified) @@ -478,121 +478,121 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined converged = not (median_modified or node_maps_modified or decreased_order or increased_order) - self.__itrs[median_pos] += 1 + self._itrs[median_pos] += 1 # Update the best median. - if self.__sum_of_distances < best_sum_of_distances: - best_sum_of_distances = self.__sum_of_distances - node_maps_from_best_median = self.__node_maps_from_median.copy() # @todo: this is a shallow copy, not sure if it is enough. + if self._sum_of_distances < best_sum_of_distances: + best_sum_of_distances = self._sum_of_distances + node_maps_from_best_median = self._node_maps_from_median.copy() # @todo: this is a shallow copy, not sure if it is enough. best_median = median # Update the number of converged descents. if converged: - self.__num_converged_descents += 1 + self._num_converged_descents += 1 # Store the best encountered median. - self.__sum_of_distances = best_sum_of_distances - self.__node_maps_from_median = node_maps_from_best_median - self.__ged_env.load_nx_graph(best_median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) + self._sum_of_distances = best_sum_of_distances + self._node_maps_from_median = node_maps_from_best_median + self._ged_env.load_nx_graph(best_median, gen_median_id) + self._ged_env.init(self._ged_env.get_init_type()) end_descent = time.time() - self.__runtime_converged = end_descent - start + self._runtime_converged = end_descent - start # Refine the sum of distances and the node maps for the converged median. - self.__converged_sum_of_distances = self.__sum_of_distances - if self.__refine: - self.__improve_sum_of_distances(timer) + self._converged_sum_of_distances = self._sum_of_distances + if self._refine: + self._improve_sum_of_distances(timer) # Record end time, set runtime and reset the number of initial medians. end = time.time() - self.__runtime = end - start - self.__num_random_inits = self.__desired_num_random_inits + self._runtime = end - start + self._num_random_inits = self._desired_num_random_inits # Print global information. 
- if self.__print_to_stdout != 0: + if self._print_to_stdout != 0: print('\n===========================================================') print('Finished computation of generalized median graph.') print('-----------------------------------------------------------') - print('Best SOD after initialization: ', self.__best_init_sum_of_distances) - print('Converged SOD: ', self.__converged_sum_of_distances) - if self.__refine: - print('Refined SOD: ', self.__sum_of_distances) - print('Overall runtime: ', self.__runtime) - print('Runtime of initialization: ', self.__runtime_initialized) - print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized) - if self.__refine: - print('Runtime of refinement: ', self.__runtime - self.__runtime_converged) + print('Best SOD after initialization: ', self._best_init_sum_of_distances) + print('Converged SOD: ', self._converged_sum_of_distances) + if self._refine: + print('Refined SOD: ', self._sum_of_distances) + print('Overall runtime: ', self._runtime) + print('Runtime of initialization: ', self._runtime_initialized) + print('Runtime of block gradient descent: ', self._runtime_converged - self._runtime_initialized) + if self._refine: + print('Runtime of refinement: ', self._runtime - self._runtime_converged) print('Number of initial medians: ', len(medians)) total_itr = 0 num_started_descents = 0 - for itr in self.__itrs: + for itr in self._itrs: total_itr += itr if itr > 0: num_started_descents += 1 print('Size of graph collection: ', len(graph_ids)) print('Number of started descents: ', num_started_descents) - print('Number of converged descents: ', self.__num_converged_descents) + print('Number of converged descents: ', self._num_converged_descents) print('Overall number of iterations: ', total_itr) - print('Overall number of times the order decreased: ', self.__num_decrease_order) - print('Overall number of times the order increased: ', self.__num_increase_order) + print('Overall number of times the order decreased: ', self._num_decrease_order) + print('Overall number of times the order increased: ', self._num_increase_order) print('===========================================================\n') - def __improve_sum_of_distances(self, timer): # @todo: go through and test + def _improve_sum_of_distances(self, timer): # @todo: go through and test # Use method selected for refinement phase. - self.__ged_env.set_method(self.__refine_method, self.__refine_options) + self._ged_env.set_method(self._refine_method, self._refine_options) # Print information about current iteration. - if self.__print_to_stdout == 2: - progress = tqdm(desc='Improving node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + if self._print_to_stdout == 2: + progress = tqdm(desc='Improving node maps', total=len(self._node_maps_from_median), file=sys.stdout) print('\n===========================================================') print('Improving node maps and SOD for converged median.') print('-----------------------------------------------------------') progress.update(1) # Improving the node maps. 
- nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__gen_median_id) - for graph_id, node_map in self.__node_maps_from_median.items(): + nb_nodes_median = self._ged_env.get_graph_num_nodes(self._gen_median_id) + for graph_id, node_map in self._node_maps_from_median.items(): if time.expired(): - if self.__state == AlgorithmState.TERMINATED: - self.__state = AlgorithmState.CONVERGED + if self._state == AlgorithmState.TERMINATED: + self._state = AlgorithmState.CONVERGED break - nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) - if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: - self.__ged_env.run_method(self.__gen_median_id, graph_id) - if self.__ged_env.get_upper_bound(self.__gen_median_id, graph_id) < node_map.induced_cost(): - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__gen_median_id, graph_id) + nb_nodes_g = self._ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self._sort_graphs: + self._ged_env.run_method(self._gen_median_id, graph_id) + if self._ged_env.get_upper_bound(self._gen_median_id, graph_id) < node_map.induced_cost(): + self._node_maps_from_median[graph_id] = self._ged_env.get_node_map(self._gen_median_id, graph_id) else: - self.__ged_env.run_method(graph_id, self.__gen_median_id) - if self.__ged_env.get_upper_bound(graph_id, self.__gen_median_id) < node_map.induced_cost(): - node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__gen_median_id) + self._ged_env.run_method(graph_id, self._gen_median_id) + if self._ged_env.get_upper_bound(graph_id, self._gen_median_id) < node_map.induced_cost(): + node_map_tmp = self._ged_env.get_node_map(graph_id, self._gen_median_id) node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map - self.__node_maps_from_median[graph_id] = node_map_tmp + self._node_maps_from_median[graph_id] = node_map_tmp - self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() + self._sum_of_distances += self._node_maps_from_median[graph_id].induced_cost() # Print information. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) - self.__sum_of_distances = 0.0 - for key, val in self.__node_maps_from_median.items(): - self.__sum_of_distances += val.induced_cost() + self._sum_of_distances = 0.0 + for key, val in self._node_maps_from_median.items(): + self._sum_of_distances += val.induced_cost() # Print information. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('===========================================================\n') - def __median_available(self): - return self.__median_id != np.inf + def _median_available(self): + return self._median_id != np.inf def get_state(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_state().') - return self.__state + return self._state def get_sum_of_distances(self, state=''): @@ -608,92 +608,92 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined float The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned. """ - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. 
Call run() before calling get_sum_of_distances().') if state == 'initialized': - return self.__best_init_sum_of_distances + return self._best_init_sum_of_distances if state == 'converged': - return self.__converged_sum_of_distances - return self.__sum_of_distances + return self._converged_sum_of_distances + return self._sum_of_distances def get_runtime(self, state): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_runtime().') if state == AlgorithmState.INITIALIZED: - return self.__runtime_initialized + return self._runtime_initialized if state == AlgorithmState.CONVERGED: - return self.__runtime_converged - return self.__runtime + return self._runtime_converged + return self._runtime def get_num_itrs(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_itrs().') - return self.__itrs + return self._itrs def get_num_times_order_decreased(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_times_order_decreased().') - return self.__num_decrease_order + return self._num_decrease_order def get_num_times_order_increased(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_times_order_increased().') - return self.__num_increase_order + return self._num_increase_order def get_num_converged_descents(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_converged_descents().') - return self.__num_converged_descents + return self._num_converged_descents def get_ged_env(self): - return self.__ged_env - - - def __set_default_options(self): - self.__init_type = 'RANDOM' - self.__num_random_inits = 10 - self.__desired_num_random_inits = 10 - self.__use_real_randomness = True - self.__seed = 0 - self.__parallel = True - self.__update_order = True - self.__sort_graphs = True - self.__refine = True - self.__time_limit_in_sec = 0 - self.__epsilon = 0.0001 - self.__max_itrs = 100 - self.__max_itrs_without_update = 3 - self.__num_inits_increase_order = 10 - self.__init_type_increase_order = 'K-MEANS++' - self.__max_itrs_increase_order = 10 - self.__print_to_stdout = 2 - self.__label_names = {} + return self._ged_env + + + def _set_default_options(self): + self._init_type = 'RANDOM' + self._num_random_inits = 10 + self._desired_num_random_inits = 10 + self._use_real_randomness = True + self._seed = 0 + self._parallel = True + self._update_order = True + self._sort_graphs = True + self._refine = True + self._time_limit_in_sec = 0 + self._epsilon = 0.0001 + self._max_itrs = 100 + self._max_itrs_without_update = 3 + self._num_inits_increase_order = 10 + self._init_type_increase_order = 'K-MEANS++' + self._max_itrs_increase_order = 10 + self._print_to_stdout = 2 + self._label_names = {} - def __construct_initial_medians(self, graph_ids, timer, initial_medians): + def _construct_initial_medians(self, graph_ids, timer, initial_medians): # Print information about current iteration. 
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n===========================================================') print('Constructing initial median(s).') print('-----------------------------------------------------------') # Compute or sample the initial median(s). initial_medians.clear() - if self.__init_type == 'MEDOID': - self.__compute_medoid(graph_ids, timer, initial_medians) - elif self.__init_type == 'MAX': + if self._init_type == 'MEDOID': + self._compute_medoid(graph_ids, timer, initial_medians) + elif self._init_type == 'MAX': pass # @todo # compute_max_order_graph_(graph_ids, initial_medians) - elif self.__init_type == 'MIN': + elif self._init_type == 'MIN': pass # @todo # compute_min_order_graph_(graph_ids, initial_medians) - elif self.__init_type == 'MEAN': + elif self._init_type == 'MEAN': pass # @todo # compute_mean_order_graph_(graph_ids, initial_medians) else: @@ -701,17 +701,17 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # sample_initial_medians_(graph_ids, initial_medians) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('===========================================================') - def __compute_medoid(self, graph_ids, timer, initial_medians): + def _compute_medoid(self, graph_ids, timer, initial_medians): # Use method selected for initialization phase. - self.__ged_env.set_method(self.__init_method, self.__init_options) + self._ged_env.set_method(self._init_method, self._init_options) # Compute the medoid. - if self.__parallel: - # @todo: notice when parallel self.__ged_env is not modified. + if self._parallel: + # @todo: notice when parallel self._ged_env is not modified. sum_of_distances_list = [np.inf] * len(graph_ids) len_itr = len(graph_ids) itr = zip(graph_ids, range(0, len(graph_ids))) @@ -723,9 +723,9 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - do_fun = partial(_compute_medoid_parallel, graph_ids, self.__sort_graphs) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) - if self.__print_to_stdout == 2: + do_fun = partial(_compute_medoid_parallel, graph_ids, self._sort_graphs) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self._ged_env,)) + if self._print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), desc='Computing medoid', file=sys.stdout) else: @@ -738,50 +738,50 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined medoid_id = np.argmin(sum_of_distances_list) best_sum_of_distances = sum_of_distances_list[medoid_id] - initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + initial_medians.append(self._ged_env.get_nx_graph(medoid_id)) # @todo else: # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout) medoid_id = graph_ids[0] best_sum_of_distances = np.inf for g_id in graph_ids: if timer.expired(): - self.__state = AlgorithmState.CALLED + self._state = AlgorithmState.CALLED break - nb_nodes_g = self.__ged_env.get_graph_num_nodes(g_id) + nb_nodes_g = self._ged_env.get_graph_num_nodes(g_id) sum_of_distances = 0 for h_id in graph_ids: # @todo: this can be faster, only a half is needed. 
- nb_nodes_h = self.__ged_env.get_graph_num_nodes(h_id) - if nb_nodes_g <= nb_nodes_h or not self.__sort_graphs: - self.__ged_env.run_method(g_id, h_id) # @todo - sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) + nb_nodes_h = self._ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not self._sort_graphs: + self._ged_env.run_method(g_id, h_id) # @todo + sum_of_distances += self._ged_env.get_upper_bound(g_id, h_id) else: - self.__ged_env.run_method(h_id, g_id) - sum_of_distances += self.__ged_env.get_upper_bound(h_id, g_id) + self._ged_env.run_method(h_id, g_id) + sum_of_distances += self._ged_env.get_upper_bound(h_id, g_id) if sum_of_distances < best_sum_of_distances: best_sum_of_distances = sum_of_distances medoid_id = g_id # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) - initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + initial_medians.append(self._ged_env.get_nx_graph(medoid_id)) # @todo # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n') - def __compute_init_node_maps(self, graph_ids, gen_median_id): + def _compute_init_node_maps(self, graph_ids, gen_median_id): # Compute node maps and sum of distances for initial median. - if self.__parallel: - # @todo: notice when parallel self.__ged_env is not modified. - self.__sum_of_distances = 0 - self.__node_maps_from_median.clear() + if self._parallel: + # @todo: notice when parallel self._ged_env is not modified. + self._sum_of_distances = 0 + self._node_maps_from_median.clear() sum_of_distances_list = [0] * len(graph_ids) len_itr = len(graph_ids) @@ -794,92 +794,92 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) - do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self.__sort_graphs, nb_nodes_median) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) - if self.__print_to_stdout == 2: + nb_nodes_median = self._ged_env.get_graph_num_nodes(gen_median_id) + do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self._sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self._ged_env,)) + if self._print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), desc='Computing initial node maps', file=sys.stdout) else: iterator = pool.imap_unordered(do_fun, itr, chunksize) for g_id, sod, node_maps in iterator: sum_of_distances_list[g_id] = sod - self.__node_maps_from_median[g_id] = node_maps + self._node_maps_from_median[g_id] = node_maps pool.close() pool.join() - self.__sum_of_distances = np.sum(sum_of_distances_list) -# xxx = self.__node_maps_from_median + self._sum_of_distances = np.sum(sum_of_distances_list) +# xxx = self._node_maps_from_median else: # Print information about current iteration. 
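The MEDOID initialization above selects the input graph whose summed GED upper bounds to all other graphs is smallest. A standalone sketch of that selection, assuming a `ged(g_id, h_id)` callable that returns an upper bound on the edit distance between two loaded graphs:

    def compute_medoid(graph_ids, ged):
        # Return the id of the graph minimizing the sum of distances to all others.
        best_id, best_sod = None, float('inf')
        for g_id in graph_ids:
            sod = sum(ged(g_id, h_id) for h_id in graph_ids)
            if sod < best_sod:
                best_sod, best_id = sod, g_id
        return best_id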
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout) - self.__sum_of_distances = 0 - self.__node_maps_from_median.clear() - nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + self._sum_of_distances = 0 + self._node_maps_from_median.clear() + nb_nodes_median = self._ged_env.get_graph_num_nodes(gen_median_id) for graph_id in graph_ids: - nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) - if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: - self.__ged_env.run_method(gen_median_id, graph_id) - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) + nb_nodes_g = self._ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self._sort_graphs: + self._ged_env.run_method(gen_median_id, graph_id) + self._node_maps_from_median[graph_id] = self._ged_env.get_node_map(gen_median_id, graph_id) else: - self.__ged_env.run_method(graph_id, gen_median_id) - node_map_tmp = self.__ged_env.get_node_map(graph_id, gen_median_id) + self._ged_env.run_method(graph_id, gen_median_id) + node_map_tmp = self._ged_env.get_node_map(graph_id, gen_median_id) node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map - self.__node_maps_from_median[graph_id] = node_map_tmp - # print(self.__node_maps_from_median[graph_id]) - self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() - # print(self.__sum_of_distances) + self._node_maps_from_median[graph_id] = node_map_tmp + # print(self._node_maps_from_median[graph_id]) + self._sum_of_distances += self._node_maps_from_median[graph_id].induced_cost() + # print(self._sum_of_distances) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n') - def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): - if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): - if self.__state == AlgorithmState.TERMINATED: - self.__state = AlgorithmState.INITIALIZED + def _termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self._max_itrs if self._max_itrs >= 0 else False): + if self._state == AlgorithmState.TERMINATED: + self._state = AlgorithmState.INITIALIZED return True - return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) + return converged or (itrs_without_update > self._max_itrs_without_update if self._max_itrs_without_update >= 0 else False) - def __update_median(self, graphs, median): + def _update_median(self, graphs, median): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Updating median: ', end='') # Store copy of the old median. old_median = median.copy() # @todo: this is just a shallow copy. # Update the node labels. - if self.__labeled_nodes: - self.__update_node_labels(graphs, median) + if self._labeled_nodes: + self._update_node_labels(graphs, median) # Update the edges and their labels. - self.__update_edges(graphs, median) + self._update_edges(graphs, median) # Print information about current iteration. 
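Restated outside the class, the descent loop stops as soon as the time limit or the iteration cap is hit, or once nothing changed (or nothing improved for too long); a negative limit disables the corresponding check. This is only a sketch of the logic in _termination_criterion_met(), not part of the patch:

    def termination_criterion_met(converged, timer_expired, itr, itrs_without_update,
                                  max_itrs=100, max_itrs_without_update=3):
        # Hard stops: time limit reached or maximum number of iterations reached.
        if timer_expired or (max_itrs >= 0 and itr >= max_itrs):
            return True
        # Soft stops: the last iteration changed nothing, or the median has not
        # improved for more than max_itrs_without_update consecutive iterations.
        return converged or (max_itrs_without_update >= 0
                             and itrs_without_update > max_itrs_without_update)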
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') - return not self.__are_graphs_equal(median, old_median) + return not self._are_graphs_equal(median, old_median) - def __update_node_labels(self, graphs, median): + def _update_node_labels(self, graphs, median): # print('----------------------------') # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('nodes ... ', end='') # Collect all possible node labels. - all_labels = self.__ged_env.get_all_node_labels() + all_labels = self._ged_env.get_all_node_labels() # Iterate through all nodes of the median. for i in range(0, nx.number_of_nodes(median)): @@ -888,7 +888,7 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # Collect the labels of the substituted nodes. node_labels = [] for graph_id, graph in graphs.items(): - k = self.__node_maps_from_median[graph_id].image(i) + k = self._node_maps_from_median[graph_id].image(i) if k != np.inf: node_labels.append(tuple(graph.nodes[k].items())) # @todo: sort else: @@ -902,7 +902,7 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined for label1 in all_labels: fi = 0 for label2 in node_labels: - fi += self.__ged_env.get_node_cost(label1, label2) # @todo: check inside, this might be slow + fi += self._ged_env.get_node_cost(label1, label2) # @todo: check inside, this might be slow if fi < fi_min: # @todo: fi is too easy to be zero. use <= or consider multiple optimal labels. fi_min = fi median_label = label1 @@ -910,18 +910,18 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined median_label = {kv[0]: kv[1] for kv in median_label} nx.set_node_attributes(median, {i: median_label}) -# median_label = self.__get_median_node_label(node_labels) -# if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: +# median_label = self._get_median_node_label(node_labels) +# if self._ged_env.get_node_rel_cost(median.nodes[i], median_label) > self._epsilon: # nx.set_node_attributes(median, {i: median_label}) - def __update_edges(self, graphs, median): + def _update_edges(self, graphs, median): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('edges ... ', end='') # Collect all possible edge labels. - all_labels = self.__ged_env.get_all_edge_labels() + all_labels = self._ged_env.get_all_edge_labels() # @todo: what if edge is not labeled? # Iterate through all possible edges (i,j) of the median. @@ -931,27 +931,27 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # Collect the labels of the edges to which (i,j) is mapped by the node maps. edge_labels = [] for graph_id, graph in graphs.items(): - k = self.__node_maps_from_median[graph_id].image(i) - l = self.__node_maps_from_median[graph_id].image(j) + k = self._node_maps_from_median[graph_id].image(i) + l = self._node_maps_from_median[graph_id].image(j) if k != np.inf and l != np.inf and graph.has_edge(k, l): edge_labels.append(tuple(graph.edges[(k, l)].items())) # @todo: sort else: edge_labels.append(SpecialLabel.DUMMY) # Compute the median edge label and the overall edge relabeling cost. - if self.__labeled_edges and len(edge_labels) > 0: + if self._labeled_edges and len(edge_labels) > 0: fij1_min = np.inf median_label = tuple() # Compute f_ij^0. 
fij0 = 0 for label2 in edge_labels: - fij0 += self.__ged_env.get_edge_cost(SpecialLabel.DUMMY, label2) + fij0 += self._ged_env.get_edge_cost(SpecialLabel.DUMMY, label2) for label1 in all_labels: fij1 = 0 for label2 in edge_labels: - fij1 += self.__ged_env.get_edge_cost(label1, label2) + fij1 += self._ged_env.get_edge_cost(label1, label2) if fij1 < fij1_min: fij1_min = fij1 @@ -964,19 +964,19 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined median_label = {kv[0]: kv[1] for kv in median_label} median.add_edge(i, j, **median_label) -# if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: +# if self._ged_env.get_edge_rel_cost(median_label, new_median_label) > self._epsilon: # median_label = new_median_label - def __update_node_maps(self): + def _update_node_maps(self): # Update the node maps. - if self.__parallel: - # @todo: notice when parallel self.__ged_env is not modified. + if self._parallel: + # @todo: notice when parallel self._ged_env is not modified. node_maps_were_modified = False -# xxx = self.__node_maps_from_median.copy() +# xxx = self._node_maps_from_median.copy() - len_itr = len(self.__node_maps_from_median) - itr = [item for item in self.__node_maps_from_median.items()] + len_itr = len(self._node_maps_from_median) + itr = [item for item in self._node_maps_from_median.items()] n_jobs = multiprocessing.cpu_count() if len_itr < 100 * n_jobs: chunksize = int(len_itr / n_jobs) + 1 @@ -985,66 +985,66 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) - do_fun = partial(_update_node_maps_parallel, self.__median_id, self.__epsilon, self.__sort_graphs, nb_nodes_median) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) - if self.__print_to_stdout == 2: + nb_nodes_median = self._ged_env.get_graph_num_nodes(self._median_id) + do_fun = partial(_update_node_maps_parallel, self._median_id, self._epsilon, self._sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self._ged_env,)) + if self._print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), desc='Updating node maps', file=sys.stdout) else: iterator = pool.imap_unordered(do_fun, itr, chunksize) for g_id, node_map, nm_modified in iterator: - self.__node_maps_from_median[g_id] = node_map + self._node_maps_from_median[g_id] = node_map if nm_modified: node_maps_were_modified = True pool.close() pool.join() -# yyy = self.__node_maps_from_median.copy() +# yyy = self._node_maps_from_median.copy() else: # Print information about current iteration. 
- if self.__print_to_stdout == 2: - progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + if self._print_to_stdout == 2: + progress = tqdm(desc='Updating node maps', total=len(self._node_maps_from_median), file=sys.stdout) node_maps_were_modified = False - nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) - for graph_id, node_map in self.__node_maps_from_median.items(): - nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + nb_nodes_median = self._ged_env.get_graph_num_nodes(self._median_id) + for graph_id, node_map in self._node_maps_from_median.items(): + nb_nodes_g = self._ged_env.get_graph_num_nodes(graph_id) - if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: - self.__ged_env.run_method(self.__median_id, graph_id) - if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon: - # xxx = self.__node_maps_from_median[graph_id] - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) + if nb_nodes_median <= nb_nodes_g or not self._sort_graphs: + self._ged_env.run_method(self._median_id, graph_id) + if self._ged_env.get_upper_bound(self._median_id, graph_id) < node_map.induced_cost() - self._epsilon: + # xxx = self._node_maps_from_median[graph_id] + self._node_maps_from_median[graph_id] = self._ged_env.get_node_map(self._median_id, graph_id) node_maps_were_modified = True else: - self.__ged_env.run_method(graph_id, self.__median_id) - if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost() - self.__epsilon: - node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id) + self._ged_env.run_method(graph_id, self._median_id) + if self._ged_env.get_upper_bound(graph_id, self._median_id) < node_map.induced_cost() - self._epsilon: + node_map_tmp = self._ged_env.get_node_map(graph_id, self._median_id) node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map - self.__node_maps_from_median[graph_id] = node_map_tmp + self._node_maps_from_median[graph_id] = node_map_tmp node_maps_were_modified = True # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n') # Return true if the node maps were modified. return node_maps_were_modified - def __decrease_order(self, graphs, median): + def _decrease_order(self, graphs, median): # Print information about current iteration - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Trying to decrease order: ... ', end='') if nx.number_of_nodes(median) <= 1: - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('median graph has only 1 node, skip decrease.') return False @@ -1053,23 +1053,23 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined decreased_order = False # Decrease the order as long as the best deletion delta is negative. 
- while self.__compute_best_deletion_delta(graphs, median, id_deleted_node) < -self.__epsilon: + while self._compute_best_deletion_delta(graphs, median, id_deleted_node) < -self._epsilon: decreased_order = True - self.__delete_node_from_median(id_deleted_node[0], median) + self._delete_node_from_median(id_deleted_node[0], median) if nx.number_of_nodes(median) <= 1: - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('decrease stopped because median graph remains only 1 node. ', end='') break # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Return true iff the order was decreased. return decreased_order - def __compute_best_deletion_delta(self, graphs, median, id_deleted_node): + def _compute_best_deletion_delta(self, graphs, median, id_deleted_node): best_delta = 0.0 # Determine node that should be deleted (if any). @@ -1077,22 +1077,22 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # Compute cost delta. delta = 0.0 for graph_id, graph in graphs.items(): - k = self.__node_maps_from_median[graph_id].image(i) + k = self._node_maps_from_median[graph_id].image(i) if k == np.inf: - delta -= self.__node_del_cost + delta -= self._node_del_cost else: - delta += self.__node_ins_cost - self.__ged_env.get_node_rel_cost(median.nodes[i], graph.nodes[k]) + delta += self._node_ins_cost - self._ged_env.get_node_rel_cost(median.nodes[i], graph.nodes[k]) for j, j_label in median[i].items(): - l = self.__node_maps_from_median[graph_id].image(j) + l = self._node_maps_from_median[graph_id].image(j) if k == np.inf or l == np.inf: - delta -= self.__edge_del_cost + delta -= self._edge_del_cost elif not graph.has_edge(k, l): - delta -= self.__edge_del_cost + delta -= self._edge_del_cost else: - delta += self.__edge_ins_cost - self.__ged_env.get_edge_rel_cost(j_label, graph.edges[(k, l)]) + delta += self._edge_ins_cost - self._ged_env.get_edge_rel_cost(j_label, graph.edges[(k, l)]) # Update best deletion delta. - if delta < best_delta - self.__epsilon: + if delta < best_delta - self._epsilon: best_delta = delta id_deleted_node[0] = i # id_deleted_node[0] = 3 # @todo: @@ -1100,7 +1100,7 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined return best_delta - def __delete_node_from_median(self, id_deleted_node, median): + def _delete_node_from_median(self, id_deleted_node, median): # Update the median. mapping = {} for i in range(0, nx.number_of_nodes(median)): @@ -1111,8 +1111,8 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined nx.relabel_nodes(median, mapping, copy=False) # Update the node maps. 
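The deletion delta computed in _compute_best_deletion_delta() above estimates, per candidate median node, how the SOD would change if that node were removed: an unmapped node saves one deletion in every graph, a mapped node trades its relabeling cost for an insertion, and incident median edges contribute analogously. A compact restatement of the node terms only, with `node_del_cost`, `node_ins_cost` and a `rel_cost(label1, label2)` callable assumed:

    def node_deletion_delta(images, median_label, graph_node_labels,
                            node_del_cost, node_ins_cost, rel_cost):
        # images[graph_id] is the node the median node maps to (None if unmapped);
        # graph_node_labels[graph_id][k] is the label of node k in that graph.
        delta = 0.0
        for graph_id, k in images.items():
            if k is None:
                delta -= node_del_cost   # the deletion is no longer needed
            else:
                # the substitution disappears, but node k must now be inserted
                delta += node_ins_cost - rel_cost(median_label,
                                                  graph_node_labels[graph_id][k])
        return delta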
-# xxx = self.__node_maps_from_median - for key, node_map in self.__node_maps_from_median.items(): +# xxx = self._node_maps_from_median + for key, node_map in self._node_maps_from_median.items(): new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) is_unassigned_target_node = [True] * node_map.num_target_nodes() for i in range(0, nx.number_of_nodes(median) + 1): @@ -1125,38 +1125,38 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined for k in range(0, node_map.num_target_nodes()): if is_unassigned_target_node[k]: new_node_map.add_assignment(np.inf, k) -# print(self.__node_maps_from_median[key].forward_map, self.__node_maps_from_median[key].backward_map) +# print(self._node_maps_from_median[key].forward_map, self._node_maps_from_median[key].backward_map) # print(new_node_map.forward_map, new_node_map.backward_map - self.__node_maps_from_median[key] = new_node_map + self._node_maps_from_median[key] = new_node_map # Increase overall number of decreases. - self.__num_decrease_order += 1 + self._num_decrease_order += 1 - def __increase_order(self, graphs, median): + def _increase_order(self, graphs, median): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Trying to increase order: ... ', end='') # Initialize the best configuration and the best label of the node that is to be inserted. best_config = {} - best_label = self.__ged_env.get_node_label(1, to_dict=True) + best_label = self._ged_env.get_node_label(1, to_dict=True) increased_order = False # Increase the order as long as the best insertion delta is negative. - while self.__compute_best_insertion_delta(graphs, best_config, best_label) < - self.__epsilon: + while self._compute_best_insertion_delta(graphs, best_config, best_label) < - self._epsilon: increased_order = True - self.__add_node_to_median(best_config, best_label, median) + self._add_node_to_median(best_config, best_label, median) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Return true iff the order was increased. return increased_order - def __compute_best_insertion_delta(self, graphs, best_config, best_label): + def _compute_best_insertion_delta(self, graphs, best_config, best_label): # Construct sets of inserted nodes. no_inserted_node = True inserted_nodes = {} @@ -1164,7 +1164,7 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined inserted_nodes[graph_id] = [] best_config[graph_id] = np.inf for k in range(nx.number_of_nodes(graph)): - if self.__node_maps_from_median[graph_id].pre_image(k) == np.inf: + if self._node_maps_from_median[graph_id].pre_image(k) == np.inf: no_inserted_node = False inserted_nodes[graph_id].append((k, tuple(item for item in graph.nodes[k].items()))) # @todo: can order of label names be garantteed? @@ -1174,34 +1174,34 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # Compute insertion configuration, label, and delta. 
best_delta = 0.0 # @todo - if len(self.__label_names['node_labels']) == 0 and len(self.__label_names['node_attrs']) == 0: # @todo - best_delta = self.__compute_insertion_delta_unlabeled(inserted_nodes, best_config, best_label) - elif len(self.__label_names['node_labels']) > 0: # self.__constant_node_costs: - best_delta = self.__compute_insertion_delta_constant(inserted_nodes, best_config, best_label) + if len(self._label_names['node_labels']) == 0 and len(self._label_names['node_attrs']) == 0: # @todo + best_delta = self._compute_insertion_delta_unlabeled(inserted_nodes, best_config, best_label) + elif len(self._label_names['node_labels']) > 0: # self._constant_node_costs: + best_delta = self._compute_insertion_delta_constant(inserted_nodes, best_config, best_label) else: - best_delta = self.__compute_insertion_delta_generic(inserted_nodes, best_config, best_label) + best_delta = self._compute_insertion_delta_generic(inserted_nodes, best_config, best_label) # Return the best delta. return best_delta - def __compute_insertion_delta_unlabeled(self, inserted_nodes, best_config, best_label): # @todo: go through and test. + def _compute_insertion_delta_unlabeled(self, inserted_nodes, best_config, best_label): # @todo: go through and test. # Construct the nest configuration and compute its insertion delta. best_delta = 0.0 best_config.clear() for graph_id, node_set in inserted_nodes.items(): if len(node_set) == 0: best_config[graph_id] = np.inf - best_delta += self.__node_del_cost + best_delta += self._node_del_cost else: best_config[graph_id] = node_set[0][0] - best_delta -= self.__node_ins_cost + best_delta -= self._node_ins_cost # Return the best insertion delta. return best_delta - def __compute_insertion_delta_constant(self, inserted_nodes, best_config, best_label): + def _compute_insertion_delta_constant(self, inserted_nodes, best_config, best_label): # Construct histogram and inverse label maps. hist = {} inverse_label_maps = {} @@ -1232,24 +1232,24 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # Construct the best configuration and compute its insertion delta. best_config.clear() best_delta = 0.0 - node_rel_cost = self.__ged_env.get_node_rel_cost(self.__ged_env.get_node_label(1, to_dict=False), self.__ged_env.get_node_label(2, to_dict=False)) - triangle_ineq_holds = (node_rel_cost <= self.__node_del_cost + self.__node_ins_cost) + node_rel_cost = self._ged_env.get_node_rel_cost(self._ged_env.get_node_label(1, to_dict=False), self._ged_env.get_node_label(2, to_dict=False)) + triangle_ineq_holds = (node_rel_cost <= self._node_del_cost + self._node_ins_cost) for graph_id, _ in inserted_nodes.items(): if best_label_tuple in inverse_label_maps[graph_id]: best_config[graph_id] = inverse_label_maps[graph_id][best_label_tuple] - best_delta -= self.__node_ins_cost + best_delta -= self._node_ins_cost elif triangle_ineq_holds and not len(inserted_nodes[graph_id]) == 0: best_config[graph_id] = inserted_nodes[graph_id][0][0] - best_delta += node_rel_cost - self.__node_ins_cost + best_delta += node_rel_cost - self._node_ins_cost else: best_config[graph_id] = np.inf - best_delta += self.__node_del_cost + best_delta += self._node_del_cost # Return the best insertion delta. return best_delta - def __compute_insertion_delta_generic(self, inserted_nodes, best_config, best_label): + def _compute_insertion_delta_generic(self, inserted_nodes, best_config, best_label): # Collect all node labels of inserted nodes. 
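In the unlabeled case above, the insertion delta only depends on whether each graph still has unmapped ("inserted") nodes: adding a node to the median absorbs one insertion in every graph that has a spare node, and costs one extra deletion in every graph that does not. A one-function restatement of _compute_insertion_delta_unlabeled():

    def insertion_delta_unlabeled(inserted_nodes, node_del_cost, node_ins_cost):
        # inserted_nodes[graph_id] lists the currently unmapped nodes of that graph.
        delta = 0.0
        for node_set in inserted_nodes.values():
            # No spare node: the new median node would have to be deleted in this
            # graph; otherwise it can absorb one previously inserted node.
            delta += node_del_cost if len(node_set) == 0 else -node_ins_cost
        return delta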
node_labels = [] for _, node_set in inserted_nodes.items(): @@ -1258,7 +1258,7 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # Compute node label medians that serve as initial solutions for block gradient descent. initial_node_labels = [] - self.__compute_initial_node_labels(node_labels, initial_node_labels) + self._compute_initial_node_labels(node_labels, initial_node_labels) # Determine best insertion configuration, label, and delta via parallel block gradient descent from all initial node labels. best_delta = 0.0 @@ -1266,15 +1266,15 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # Construct local configuration. config = {} for graph_id, _ in inserted_nodes.items(): - config[graph_id] = tuple((np.inf, self.__ged_env.get_node_label(1, to_dict=False))) + config[graph_id] = tuple((np.inf, self._ged_env.get_node_label(1, to_dict=False))) # Run block gradient descent. converged = False itr = 0 - while not self.__insertion_termination_criterion_met(converged, itr): - converged = not self.__update_config(node_label, inserted_nodes, config, node_labels) + while not self._insertion_termination_criterion_met(converged, itr): + converged = not self._update_config(node_label, inserted_nodes, config, node_labels) node_label_dict = dict(node_label) - converged = converged and (not self.__update_node_label([dict(item) for item in node_labels], node_label_dict)) # @todo: the dict is tupled again in the function, can be better. + converged = converged and (not self._update_node_label([dict(item) for item in node_labels], node_label_dict)) # @todo: the dict is tupled again in the function, can be better. node_label = tuple(item for item in node_label_dict.items()) # @todo: watch out: initial_node_labels[i] is not modified here. itr += 1 @@ -1283,12 +1283,12 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined delta = 0.0 for _, node in config.items(): if node[0] == np.inf: - delta += self.__node_del_cost + delta += self._node_del_cost else: - delta += self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost + delta += self._ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self._node_ins_cost # Update best delta and global configuration if improvement has been found. - if delta < best_delta - self.__epsilon: + if delta < best_delta - self._epsilon: best_delta = delta best_label.clear() for key, val in node_label: @@ -1301,16 +1301,16 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined return best_delta - def __compute_initial_node_labels(self, node_labels, median_labels): + def _compute_initial_node_labels(self, node_labels, median_labels): median_labels.clear() - if self.__use_real_randomness: # @todo: may not work if parallelized. + if self._use_real_randomness: # @todo: may not work if parallelized. rng = np.random.randint(0, high=2**32 - 1, size=1) urng = np.random.RandomState(seed=rng[0]) else: - urng = np.random.RandomState(seed=self.__seed) + urng = np.random.RandomState(seed=self._seed) # Generate the initial node label medians. - if self.__init_type_increase_order == 'K-MEANS++': + if self._init_type_increase_order == 'K-MEANS++': # Use k-means++ heuristic to generate the initial node label medians. 
already_selected = [False] * len(node_labels) selected_label_id = urng.randint(low=0, high=len(node_labels), size=1)[0] # c++ test: 23 @@ -1318,14 +1318,14 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined already_selected[selected_label_id] = True # xxx = [41, 0, 18, 9, 6, 14, 21, 25, 33] for c++ test # iii = 0 for c++ test - while len(median_labels) < self.__num_inits_increase_order: + while len(median_labels) < self._num_inits_increase_order: weights = [np.inf] * len(node_labels) for label_id in range(0, len(node_labels)): if already_selected[label_id]: weights[label_id] = 0 continue for label in median_labels: - weights[label_id] = min(weights[label_id], self.__ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id]))) + weights[label_id] = min(weights[label_id], self._ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id]))) # get non-zero weights. weights_p, idx_p = [], [] @@ -1340,26 +1340,26 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # iii += 1 for c++ test median_labels.append(node_labels[selected_label_id]) already_selected[selected_label_id] = True - else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self.__num_inits_increase_order. + else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self._num_inits_increase_order. break else: # Compute the initial node medians as the medians of randomly generated clusters of (roughly) equal size. # @todo: go through and test. shuffled_node_labels = [np.inf] * len(node_labels) #@todo: random? # @todo: std::shuffle(shuffled_node_labels.begin(), shuffled_node_labels.end(), urng);? - cluster_size = len(node_labels) / self.__num_inits_increase_order + cluster_size = len(node_labels) / self._num_inits_increase_order pos = 0.0 cluster = [] - while len(median_labels) < self.__num_inits_increase_order - 1: + while len(median_labels) < self._num_inits_increase_order - 1: while pos < (len(median_labels) + 1) * cluster_size: cluster.append(shuffled_node_labels[pos]) pos += 1 - median_labels.append(self.__get_median_node_label(cluster)) + median_labels.append(self._get_median_node_label(cluster)) cluster.clear() while pos < len(shuffled_node_labels): pos += 1 cluster.append(shuffled_node_labels[pos]) - median_labels.append(self.__get_median_node_label(cluster)) + median_labels.append(self._get_median_node_label(cluster)) cluster.clear() # Run Lloyd's Algorithm. @@ -1367,8 +1367,8 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined closest_median_ids = [np.inf] * len(node_labels) clusters = [[] for _ in range(len(median_labels))] itr = 1 - while not self.__insertion_termination_criterion_met(converged, itr): - converged = not self.__update_clusters(node_labels, median_labels, closest_median_ids) + while not self._insertion_termination_criterion_met(converged, itr): + converged = not self._update_clusters(node_labels, median_labels, closest_median_ids) if not converged: for cluster in clusters: cluster.clear() @@ -1376,33 +1376,33 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined clusters[closest_median_ids[label_id]].append(node_labels[label_id]) for cluster_id in range(0, len(clusters)): node_label = dict(median_labels[cluster_id]) - self.__update_node_label([dict(item) for item in clusters[cluster_id]], node_label) # @todo: the dict is tupled again in the function, can be better. 
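The 'K-MEANS++' branch above seeds the initial node label medians the way k-means++ seeds cluster centres: the first label is drawn uniformly at random, and every further one is drawn with probability proportional to its distance to the closest label already chosen. A standalone sketch of that seeding idea, with a plain callable dist standing in for the node relabeling cost (all names here are illustrative, not part of the estimator):

import numpy as np

def kmeanspp_seed(labels, k, dist, rng):
    # First centre: uniform choice among the candidate labels.
    centers = [labels[rng.randint(len(labels))]]
    while len(centers) < min(k, len(labels)):
        # Weight every label by its distance to the closest centre chosen so far.
        weights = np.array([min(dist(c, l) for c in centers) for l in labels], dtype=float)
        if weights.sum() <= 0:   # every remaining label coincides with a centre
            break
        centers.append(labels[rng.choice(len(labels), p=weights / weights.sum())])
    return centers

rng = np.random.RandomState(0)
print(kmeanspp_seed([0.0, 0.1, 5.0, 5.2, 9.9], 3, lambda a, b: abs(a - b), rng))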
+ self._update_node_label([dict(item) for item in clusters[cluster_id]], node_label) # @todo: the dict is tupled again in the function, can be better. median_labels[cluster_id] = tuple(item for item in node_label.items()) itr += 1 - def __insertion_termination_criterion_met(self, converged, itr): - return converged or (itr >= self.__max_itrs_increase_order if self.__max_itrs_increase_order > 0 else False) + def _insertion_termination_criterion_met(self, converged, itr): + return converged or (itr >= self._max_itrs_increase_order if self._max_itrs_increase_order > 0 else False) - def __update_config(self, node_label, inserted_nodes, config, node_labels): + def _update_config(self, node_label, inserted_nodes, config, node_labels): # Determine the best configuration. config_modified = False for graph_id, node_set in inserted_nodes.items(): best_assignment = config[graph_id] best_cost = 0.0 if best_assignment[0] == np.inf: - best_cost = self.__node_del_cost + best_cost = self._node_del_cost else: - best_cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(best_assignment[1])) - self.__node_ins_cost + best_cost = self._ged_env.get_node_rel_cost(dict(node_label), dict(best_assignment[1])) - self._node_ins_cost for node in node_set: - cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost - if cost < best_cost - self.__epsilon: + cost = self._ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self._node_ins_cost + if cost < best_cost - self._epsilon: best_cost = cost best_assignment = node config_modified = True - if self.__node_del_cost < best_cost - self.__epsilon: - best_cost = self.__node_del_cost + if self._node_del_cost < best_cost - self._epsilon: + best_cost = self._node_del_cost best_assignment = tuple((np.inf, best_assignment[1])) config_modified = True config[graph_id] = best_assignment @@ -1417,11 +1417,11 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined return config_modified - def __update_node_label(self, node_labels, node_label): - if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling __update_config(). + def _update_node_label(self, node_labels, node_label): + if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling _update_config(). return False - new_node_label = self.__get_median_node_label(node_labels) - if self.__ged_env.get_node_rel_cost(new_node_label, node_label) > self.__epsilon: + new_node_label = self._get_median_node_label(node_labels) + if self._ged_env.get_node_rel_cost(new_node_label, node_label) > self._epsilon: node_label.clear() for key, val in new_node_label.items(): node_label[key] = val @@ -1429,15 +1429,15 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined return False - def __update_clusters(self, node_labels, median_labels, closest_median_ids): + def _update_clusters(self, node_labels, median_labels, closest_median_ids): # Determine the closest median for each node label. 
clusters_modified = False for label_id in range(0, len(node_labels)): closest_median_id = np.inf dist_to_closest_median = np.inf for median_id in range(0, len(median_labels)): - dist_to_median = self.__ged_env.get_node_rel_cost(dict(median_labels[median_id]), dict(node_labels[label_id])) - if dist_to_median < dist_to_closest_median - self.__epsilon: + dist_to_median = self._ged_env.get_node_rel_cost(dict(median_labels[median_id]), dict(node_labels[label_id])) + if dist_to_median < dist_to_closest_median - self._epsilon: dist_to_closest_median = dist_to_median closest_median_id = median_id if closest_median_id != closest_median_ids[label_id]: @@ -1448,26 +1448,26 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined return clusters_modified - def __add_node_to_median(self, best_config, best_label, median): + def _add_node_to_median(self, best_config, best_label, median): # Update the median. nb_nodes_median = nx.number_of_nodes(median) median.add_node(nb_nodes_median, **best_label) # Update the node maps. - for graph_id, node_map in self.__node_maps_from_median.items(): + for graph_id, node_map in self._node_maps_from_median.items(): node_map_as_rel = [] node_map.as_relation(node_map_as_rel) new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) for assignment in node_map_as_rel: new_node_map.add_assignment(assignment[0], assignment[1]) new_node_map.add_assignment(nx.number_of_nodes(median) - 1, best_config[graph_id]) - self.__node_maps_from_median[graph_id] = new_node_map + self._node_maps_from_median[graph_id] = new_node_map # Increase overall number of increases. - self.__num_increase_order += 1 + self._num_increase_order += 1 - def __are_graphs_equal(self, g1, g2): + def _are_graphs_equal(self, g1, g2): """ Check if the two graphs are equal. 
@@ -1512,29 +1512,29 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined def set_label_names(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): - self.__label_names = {'node_labels': node_labels, 'edge_labels': edge_labels, + self._label_names = {'node_labels': node_labels, 'edge_labels': edge_labels, 'node_attrs': node_attrs, 'edge_attrs': edge_attrs} -# def __get_median_node_label(self, node_labels): -# if len(self.__label_names['node_labels']) > 0: -# return self.__get_median_label_symbolic(node_labels) -# elif len(self.__label_names['node_attrs']) > 0: -# return self.__get_median_label_nonsymbolic(node_labels) +# def _get_median_node_label(self, node_labels): +# if len(self._label_names['node_labels']) > 0: +# return self._get_median_label_symbolic(node_labels) +# elif len(self._label_names['node_attrs']) > 0: +# return self._get_median_label_nonsymbolic(node_labels) # else: # raise Exception('Node label names are not given.') # # -# def __get_median_edge_label(self, edge_labels): -# if len(self.__label_names['edge_labels']) > 0: -# return self.__get_median_label_symbolic(edge_labels) -# elif len(self.__label_names['edge_attrs']) > 0: -# return self.__get_median_label_nonsymbolic(edge_labels) +# def _get_median_edge_label(self, edge_labels): +# if len(self._label_names['edge_labels']) > 0: +# return self._get_median_label_symbolic(edge_labels) +# elif len(self._label_names['edge_attrs']) > 0: +# return self._get_median_label_nonsymbolic(edge_labels) # else: # raise Exception('Edge label names are not given.') # # -# def __get_median_label_symbolic(self, labels): +# def _get_median_label_symbolic(self, labels): # f_i = np.inf # # for label in labels: @@ -1560,7 +1560,7 @@ class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undifined # return median_label # # -# def __get_median_label_nonsymbolic(self, labels): +# def _get_median_label_nonsymbolic(self, labels): # if len(labels) == 0: # return {} # @todo # else: @@ -1624,7 +1624,7 @@ def _compute_medoid_parallel(graph_ids, sort, itr): i = itr[1] # @todo: timer not considered here. # if timer.expired(): -# self.__state = AlgorithmState.CALLED +# self._state = AlgorithmState.CALLED # break nb_nodes_g = G_ged_env.get_graph_num_nodes(g_id) sum_of_distances = 0 @@ -1645,13 +1645,13 @@ def _compute_init_node_maps_parallel(gen_median_id, sort, nb_nodes_median, itr): if nb_nodes_median <= nb_nodes_g or not sort: G_ged_env.run_method(gen_median_id, graph_id) node_map = G_ged_env.get_node_map(gen_median_id, graph_id) -# print(self.__node_maps_from_median[graph_id]) +# print(self._node_maps_from_median[graph_id]) else: G_ged_env.run_method(graph_id, gen_median_id) node_map = G_ged_env.get_node_map(graph_id, gen_median_id) node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map sum_of_distance = node_map.induced_cost() -# print(self.__sum_of_distances) +# print(self._sum_of_distances) return graph_id, sum_of_distance, node_map diff --git a/gklearn/ged/median/median_graph_estimator_py.py b/gklearn/ged/median/median_graph_estimator_py.py index 41dc3c9..6741c86 100644 --- a/gklearn/ged/median/median_graph_estimator_py.py +++ b/gklearn/ged/median/median_graph_estimator_py.py @@ -33,51 +33,51 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined constant_node_costs : Boolean Set to True if the node relabeling costs are constant. 
""" - self.__ged_env = ged_env - self.__init_method = 'BRANCH_FAST' - self.__init_options = '' - self.__descent_method = 'BRANCH_FAST' - self.__descent_options = '' - self.__refine_method = 'IPFP' - self.__refine_options = '' - self.__constant_node_costs = constant_node_costs - self.__labeled_nodes = (ged_env.get_num_node_labels() > 1) - self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1, to_dict=False)) - self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1, to_dict=False)) - self.__labeled_edges = (ged_env.get_num_edge_labels() > 1) - self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1, to_dict=False)) - self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1, to_dict=False)) - self.__init_type = 'RANDOM' - self.__num_random_inits = 10 - self.__desired_num_random_inits = 10 - self.__use_real_randomness = True - self.__seed = 0 - self.__parallel = True - self.__update_order = True - self.__sort_graphs = True # sort graphs by size when computing GEDs. - self.__refine = True - self.__time_limit_in_sec = 0 - self.__epsilon = 0.0001 - self.__max_itrs = 100 - self.__max_itrs_without_update = 3 - self.__num_inits_increase_order = 10 - self.__init_type_increase_order = 'K-MEANS++' - self.__max_itrs_increase_order = 10 - self.__print_to_stdout = 2 - self.__median_id = np.inf # @todo: check - self.__node_maps_from_median = {} - self.__sum_of_distances = 0 - self.__best_init_sum_of_distances = np.inf - self.__converged_sum_of_distances = np.inf - self.__runtime = None - self.__runtime_initialized = None - self.__runtime_converged = None - self.__itrs = [] # @todo: check: {} ? - self.__num_decrease_order = 0 - self.__num_increase_order = 0 - self.__num_converged_descents = 0 - self.__state = AlgorithmState.TERMINATED - self.__label_names = {} + self._ged_env = ged_env + self._init_method = 'BRANCH_FAST' + self._init_options = '' + self._descent_method = 'BRANCH_FAST' + self._descent_options = '' + self._refine_method = 'IPFP' + self._refine_options = '' + self._constant_node_costs = constant_node_costs + self._labeled_nodes = (ged_env.get_num_node_labels() > 1) + self._node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1, to_dict=False)) + self._node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1, to_dict=False)) + self._labeled_edges = (ged_env.get_num_edge_labels() > 1) + self._edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1, to_dict=False)) + self._edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1, to_dict=False)) + self._init_type = 'RANDOM' + self._num_random_inits = 10 + self._desired_num_random_inits = 10 + self._use_real_randomness = True + self._seed = 0 + self._parallel = True + self._update_order = True + self._sort_graphs = True # sort graphs by size when computing GEDs. + self._refine = True + self._time_limit_in_sec = 0 + self._epsilon = 0.0001 + self._max_itrs = 100 + self._max_itrs_without_update = 3 + self._num_inits_increase_order = 10 + self._init_type_increase_order = 'K-MEANS++' + self._max_itrs_increase_order = 10 + self._print_to_stdout = 2 + self._median_id = np.inf # @todo: check + self._node_maps_from_median = {} + self._sum_of_distances = 0 + self._best_init_sum_of_distances = np.inf + self._converged_sum_of_distances = np.inf + self._runtime = None + self._runtime_initialized = None + self._runtime_converged = None + self._itrs = [] # @todo: check: {} ? 
+ self._num_decrease_order = 0 + self._num_increase_order = 0 + self._num_converged_descents = 0 + self._state = AlgorithmState.TERMINATED + self._label_names = {} if ged_env is None: raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') @@ -93,142 +93,142 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined options : string String that specifies with which options to run the estimator. """ - self.__set_default_options() + self._set_default_options() options_map = misc.options_string_to_options_map(options) for opt_name, opt_val in options_map.items(): if opt_name == 'init-type': - self.__init_type = opt_val + self._init_type = opt_val if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN': raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"') elif opt_name == 'random-inits': try: - self.__num_random_inits = int(opt_val) - self.__desired_num_random_inits = self.__num_random_inits + self._num_random_inits = int(opt_val) + self._desired_num_random_inits = self._num_random_inits except: raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') - if self.__num_random_inits <= 0: + if self._num_random_inits <= 0: raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') elif opt_name == 'randomness': if opt_val == 'PSEUDO': - self.__use_real_randomness = False + self._use_real_randomness = False elif opt_val == 'REAL': - self.__use_real_randomness = True + self._use_real_randomness = True else: raise Exception('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"') elif opt_name == 'stdout': if opt_val == '0': - self.__print_to_stdout = 0 + self._print_to_stdout = 0 elif opt_val == '1': - self.__print_to_stdout = 1 + self._print_to_stdout = 1 elif opt_val == '2': - self.__print_to_stdout = 2 + self._print_to_stdout = 2 else: raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') elif opt_name == 'parallel': if opt_val == 'TRUE': - self.__parallel = True + self._parallel = True elif opt_val == 'FALSE': - self.__parallel = False + self._parallel = False else: raise Exception('Invalid argument "' + opt_val + '" for option parallel. Usage: options = "[--parallel TRUE|FALSE] [...]"') elif opt_name == 'update-order': if opt_val == 'TRUE': - self.__update_order = True + self._update_order = True elif opt_val == 'FALSE': - self.__update_order = False + self._update_order = False else: raise Exception('Invalid argument "' + opt_val + '" for option update-order. Usage: options = "[--update-order TRUE|FALSE] [...]"') elif opt_name == 'sort-graphs': if opt_val == 'TRUE': - self.__sort_graphs = True + self._sort_graphs = True elif opt_val == 'FALSE': - self.__sort_graphs = False + self._sort_graphs = False else: raise Exception('Invalid argument "' + opt_val + '" for option sort-graphs. Usage: options = "[--sort-graphs TRUE|FALSE] [...]"') elif opt_name == 'refine': if opt_val == 'TRUE': - self.__refine = True + self._refine = True elif opt_val == 'FALSE': - self.__refine = False + self._refine = False else: raise Exception('Invalid argument "' + opt_val + '" for option refine. 
Usage: options = "[--refine TRUE|FALSE] [...]"') elif opt_name == 'time-limit': try: - self.__time_limit_in_sec = float(opt_val) + self._time_limit_in_sec = float(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit ] [...]') elif opt_name == 'max-itrs': try: - self.__max_itrs = int(opt_val) + self._max_itrs = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs ] [...]') elif opt_name == 'max-itrs-without-update': try: - self.__max_itrs_without_update = int(opt_val) + self._max_itrs_without_update = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update ] [...]') elif opt_name == 'seed': try: - self.__seed = int(opt_val) + self._seed = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed ] [...]') elif opt_name == 'epsilon': try: - self.__epsilon = float(opt_val) + self._epsilon = float(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') - if self.__epsilon <= 0: + if self._epsilon <= 0: raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') elif opt_name == 'inits-increase-order': try: - self.__num_inits_increase_order = int(opt_val) + self._num_inits_increase_order = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') - if self.__num_inits_increase_order <= 0: + if self._num_inits_increase_order <= 0: raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') elif opt_name == 'init-type-increase-order': - self.__init_type_increase_order = opt_val + self._init_type_increase_order = opt_val if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') elif opt_name == 'max-itrs-increase-order': try: - self.__max_itrs_increase_order = int(opt_val) + self._max_itrs_increase_order = int(opt_val) except: raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order ] [...]') @@ -255,8 +255,8 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined ----- Has no effect unless "--init-type MEDOID" is passed to set_options(). """ - self.__init_method = init_method; - self.__init_options = init_options; + self._init_method = init_method; + self._init_options = init_options; def set_descent_method(self, descent_method, descent_options=''): @@ -274,8 +274,8 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined ----- Has no effect unless "--init-type MEDOID" is passed to set_options(). """ - self.__descent_method = descent_method; - self.__descent_options = descent_options; + self._descent_method = descent_method; + self._descent_options = descent_options; def set_refine_method(self, refine_method, refine_options): @@ -293,8 +293,8 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined ----- Has no effect if "--refine FALSE" is passed to set_options(). 
""" - self.__refine_method = refine_method - self.__refine_options = refine_options + self._refine_method = refine_method + self._refine_options = refine_options def run(self, graph_ids, set_median_id, gen_median_id): @@ -317,7 +317,7 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined raise Exception('Empty vector of graph IDs, unable to compute median.') all_graphs_empty = True for graph_id in graph_ids: - if self.__ged_env.get_graph_num_nodes(graph_id) > 0: + if self._ged_env.get_graph_num_nodes(graph_id) > 0: all_graphs_empty = False break if all_graphs_empty: @@ -325,16 +325,16 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # Start timer and record start time. start = time.time() - timer = Timer(self.__time_limit_in_sec) - self.__median_id = gen_median_id - self.__state = AlgorithmState.TERMINATED + timer = Timer(self._time_limit_in_sec) + self._median_id = gen_median_id + self._state = AlgorithmState.TERMINATED # Get NetworkX graph representations of the input graphs. graphs = {} for graph_id in graph_ids: # @todo: get_nx_graph() function may need to be modified according to the coming code. - graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id) -# print(self.__ged_env.get_graph_internal_id(0)) + graphs[graph_id] = self._ged_env.get_nx_graph(graph_id) +# print(self._ged_env.get_graph_internal_id(0)) # print(graphs[0].graph) # print(graphs[0].nodes(data=True)) # print(graphs[0].edges(data=True)) @@ -342,27 +342,27 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # Construct initial medians. medians = [] - self.__construct_initial_medians(graph_ids, timer, medians) + self._construct_initial_medians(graph_ids, timer, medians) end_init = time.time() - self.__runtime_initialized = end_init - start + self._runtime_initialized = end_init - start # print(medians[0].graph) # print(medians[0].nodes(data=True)) # print(medians[0].edges(data=True)) # print(nx.adjacency_matrix(medians[0])) # Reset information about iterations and number of times the median decreases and increases. - self.__itrs = [0] * len(medians) - self.__num_decrease_order = 0 - self.__num_increase_order = 0 - self.__num_converged_descents = 0 + self._itrs = [0] * len(medians) + self._num_decrease_order = 0 + self._num_increase_order = 0 + self._num_converged_descents = 0 # Initialize the best median. best_sum_of_distances = np.inf - self.__best_init_sum_of_distances = np.inf + self._best_init_sum_of_distances = np.inf node_maps_from_best_median = {} # Run block gradient descent from all initial medians. - self.__ged_env.set_method(self.__descent_method, self.__descent_options) + self._ged_env.set_method(self._descent_method, self._descent_options) for median_pos in range(0, len(medians)): # Terminate if the timer has expired and at least one SOD has been computed. @@ -370,7 +370,7 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined break # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n===========================================================') print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') print('-----------------------------------------------------------') @@ -379,27 +379,27 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined median = medians[median_pos] # Load initial median into the environment. 
- self.__ged_env.load_nx_graph(median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) + self._ged_env.load_nx_graph(median, gen_median_id) + self._ged_env.init(self._ged_env.get_init_type()) # Compute node maps and sum of distances for initial median. -# xxx = self.__node_maps_from_median - self.__compute_init_node_maps(graph_ids, gen_median_id) -# yyy = self.__node_maps_from_median +# xxx = self._node_maps_from_median + self._compute_init_node_maps(graph_ids, gen_median_id) +# yyy = self._node_maps_from_median - self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) - self.__ged_env.load_nx_graph(median, set_median_id) -# print(self.__best_init_sum_of_distances) + self._best_init_sum_of_distances = min(self._best_init_sum_of_distances, self._sum_of_distances) + self._ged_env.load_nx_graph(median, set_median_id) +# print(self._best_init_sum_of_distances) # Run block gradient descent from initial median. converged = False itrs_without_update = 0 - while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update): + while not self._termination_criterion_met(converged, timer, self._itrs[median_pos], itrs_without_update): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n===========================================================') - print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('Iteration', str(self._itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') print('-----------------------------------------------------------') # Initialize flags that tell us what happened in the iteration. @@ -409,12 +409,12 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined increased_order = False # Update the median. - median_modified = self.__update_median(graphs, median) - if self.__update_order: - if not median_modified or self.__itrs[median_pos] == 0: - decreased_order = self.__decrease_order(graphs, median) - if not decreased_order or self.__itrs[median_pos] == 0: - increased_order = self.__increase_order(graphs, median) + median_modified = self._update_median(graphs, median) + if self._update_order: + if not median_modified or self._itrs[median_pos] == 0: + decreased_order = self._decrease_order(graphs, median) + if not decreased_order or self._itrs[median_pos] == 0: + increased_order = self._increase_order(graphs, median) # Update the number of iterations without update of the median. if median_modified or decreased_order or increased_order: @@ -423,51 +423,51 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined itrs_without_update += 1 # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Loading median to environment: ... ', end='') # Load the median into the environment. # @todo: should this function use the original node label? - self.__ged_env.load_nx_graph(median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) + self._ged_env.load_nx_graph(median, gen_median_id) + self._ged_env.init(self._ged_env.get_init_type()) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Print information about current iteration. 
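For orientation before reading the rest of run(): the public surface introduced above (the constructor, set_options, the set_*_method setters and run itself) is typically driven as in the following sketch. It assumes a GED environment object ged_env that has already been loaded with n input graphs under IDs 0..n-1 plus two spare IDs for the medians; how the environment is built is outside this file, and the option string is only one plausible choice.

from gklearn.ged.median.median_graph_estimator_py import MedianGraphEstimatorPy

mge = MedianGraphEstimatorPy(ged_env, constant_node_costs=True)   # ged_env: assumed, see above
mge.set_options('--init-type RANDOM --random-inits 5 --seed 42 --refine TRUE --stdout 1')
mge.set_descent_method('BRANCH_FAST')

graph_ids = list(range(n))                # the collection to summarise (n: assumed)
set_median_id, gen_median_id = n, n + 1   # spare environment slots for the two medians
mge.run(graph_ids, set_median_id, gen_median_id)
print(mge.get_sum_of_distances())         # refined SOD when --refine TRUE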
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Updating induced costs: ... ', end='') # Compute induced costs of the old node maps w.r.t. the updated median. for graph_id in graph_ids: -# print(self.__node_maps_from_median[graph_id].induced_cost()) -# xxx = self.__node_maps_from_median[graph_id] - self.__ged_env.compute_induced_cost(gen_median_id, graph_id, self.__node_maps_from_median[graph_id]) +# print(self._node_maps_from_median[graph_id].induced_cost()) +# xxx = self._node_maps_from_median[graph_id] + self._ged_env.compute_induced_cost(gen_median_id, graph_id, self._node_maps_from_median[graph_id]) # print('---------------------------------------') -# print(self.__node_maps_from_median[graph_id].induced_cost()) +# print(self._node_maps_from_median[graph_id].induced_cost()) # @todo:!!!!!!!!!!!!!!!!!!!!!!!!!!!!This value is a slight different from the c++ program, which might be a bug! Use it very carefully! # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Update the node maps. - node_maps_modified = self.__update_node_maps() + node_maps_modified = self._update_node_maps() # Update the order of the median if no improvement can be found with the current order. # Update the sum of distances. - old_sum_of_distances = self.__sum_of_distances - self.__sum_of_distances = 0 - for graph_id, node_map in self.__node_maps_from_median.items(): - self.__sum_of_distances += node_map.induced_cost() -# print(self.__sum_of_distances) + old_sum_of_distances = self._sum_of_distances + self._sum_of_distances = 0 + for graph_id, node_map in self._node_maps_from_median.items(): + self._sum_of_distances += node_map.induced_cost() +# print(self._sum_of_distances) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Old local SOD: ', old_sum_of_distances) - print('New local SOD: ', self.__sum_of_distances) + print('New local SOD: ', self._sum_of_distances) print('Best converged SOD: ', best_sum_of_distances) print('Modified median: ', median_modified) print('Modified node maps: ', node_maps_modified) @@ -477,121 +477,121 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined converged = not (median_modified or node_maps_modified or decreased_order or increased_order) - self.__itrs[median_pos] += 1 + self._itrs[median_pos] += 1 # Update the best median. - if self.__sum_of_distances < best_sum_of_distances: - best_sum_of_distances = self.__sum_of_distances - node_maps_from_best_median = self.__node_maps_from_median.copy() # @todo: this is a shallow copy, not sure if it is enough. + if self._sum_of_distances < best_sum_of_distances: + best_sum_of_distances = self._sum_of_distances + node_maps_from_best_median = self._node_maps_from_median.copy() # @todo: this is a shallow copy, not sure if it is enough. best_median = median # Update the number of converged descents. if converged: - self.__num_converged_descents += 1 + self._num_converged_descents += 1 # Store the best encountered median. 
- self.__sum_of_distances = best_sum_of_distances - self.__node_maps_from_median = node_maps_from_best_median - self.__ged_env.load_nx_graph(best_median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) + self._sum_of_distances = best_sum_of_distances + self._node_maps_from_median = node_maps_from_best_median + self._ged_env.load_nx_graph(best_median, gen_median_id) + self._ged_env.init(self._ged_env.get_init_type()) end_descent = time.time() - self.__runtime_converged = end_descent - start + self._runtime_converged = end_descent - start # Refine the sum of distances and the node maps for the converged median. - self.__converged_sum_of_distances = self.__sum_of_distances - if self.__refine: - self.__improve_sum_of_distances(timer) + self._converged_sum_of_distances = self._sum_of_distances + if self._refine: + self._improve_sum_of_distances(timer) # Record end time, set runtime and reset the number of initial medians. end = time.time() - self.__runtime = end - start - self.__num_random_inits = self.__desired_num_random_inits + self._runtime = end - start + self._num_random_inits = self._desired_num_random_inits # Print global information. - if self.__print_to_stdout != 0: + if self._print_to_stdout != 0: print('\n===========================================================') print('Finished computation of generalized median graph.') print('-----------------------------------------------------------') - print('Best SOD after initialization: ', self.__best_init_sum_of_distances) - print('Converged SOD: ', self.__converged_sum_of_distances) - if self.__refine: - print('Refined SOD: ', self.__sum_of_distances) - print('Overall runtime: ', self.__runtime) - print('Runtime of initialization: ', self.__runtime_initialized) - print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized) - if self.__refine: - print('Runtime of refinement: ', self.__runtime - self.__runtime_converged) + print('Best SOD after initialization: ', self._best_init_sum_of_distances) + print('Converged SOD: ', self._converged_sum_of_distances) + if self._refine: + print('Refined SOD: ', self._sum_of_distances) + print('Overall runtime: ', self._runtime) + print('Runtime of initialization: ', self._runtime_initialized) + print('Runtime of block gradient descent: ', self._runtime_converged - self._runtime_initialized) + if self._refine: + print('Runtime of refinement: ', self._runtime - self._runtime_converged) print('Number of initial medians: ', len(medians)) total_itr = 0 num_started_descents = 0 - for itr in self.__itrs: + for itr in self._itrs: total_itr += itr if itr > 0: num_started_descents += 1 print('Size of graph collection: ', len(graph_ids)) print('Number of started descents: ', num_started_descents) - print('Number of converged descents: ', self.__num_converged_descents) + print('Number of converged descents: ', self._num_converged_descents) print('Overall number of iterations: ', total_itr) - print('Overall number of times the order decreased: ', self.__num_decrease_order) - print('Overall number of times the order increased: ', self.__num_increase_order) + print('Overall number of times the order decreased: ', self._num_decrease_order) + print('Overall number of times the order increased: ', self._num_increase_order) print('===========================================================\n') - def __improve_sum_of_distances(self, timer): # @todo: go through and test + def _improve_sum_of_distances(self, timer): # @todo: go through and test # Use method 
selected for refinement phase. - self.__ged_env.set_method(self.__refine_method, self.__refine_options) + self._ged_env.set_method(self._refine_method, self._refine_options) # Print information about current iteration. - if self.__print_to_stdout == 2: - progress = tqdm(desc='Improving node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + if self._print_to_stdout == 2: + progress = tqdm(desc='Improving node maps', total=len(self._node_maps_from_median), file=sys.stdout) print('\n===========================================================') print('Improving node maps and SOD for converged median.') print('-----------------------------------------------------------') progress.update(1) # Improving the node maps. - nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__gen_median_id) - for graph_id, node_map in self.__node_maps_from_median.items(): + nb_nodes_median = self._ged_env.get_graph_num_nodes(self._gen_median_id) + for graph_id, node_map in self._node_maps_from_median.items(): if time.expired(): - if self.__state == AlgorithmState.TERMINATED: - self.__state = AlgorithmState.CONVERGED + if self._state == AlgorithmState.TERMINATED: + self._state = AlgorithmState.CONVERGED break - nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) - if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: - self.__ged_env.run_method(self.__gen_median_id, graph_id) - if self.__ged_env.get_upper_bound(self.__gen_median_id, graph_id) < node_map.induced_cost(): - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__gen_median_id, graph_id) + nb_nodes_g = self._ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self._sort_graphs: + self._ged_env.run_method(self._gen_median_id, graph_id) + if self._ged_env.get_upper_bound(self._gen_median_id, graph_id) < node_map.induced_cost(): + self._node_maps_from_median[graph_id] = self._ged_env.get_node_map(self._gen_median_id, graph_id) else: - self.__ged_env.run_method(graph_id, self.__gen_median_id) - if self.__ged_env.get_upper_bound(graph_id, self.__gen_median_id) < node_map.induced_cost(): - node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__gen_median_id) + self._ged_env.run_method(graph_id, self._gen_median_id) + if self._ged_env.get_upper_bound(graph_id, self._gen_median_id) < node_map.induced_cost(): + node_map_tmp = self._ged_env.get_node_map(graph_id, self._gen_median_id) node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map - self.__node_maps_from_median[graph_id] = node_map_tmp + self._node_maps_from_median[graph_id] = node_map_tmp - self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() + self._sum_of_distances += self._node_maps_from_median[graph_id].induced_cost() # Print information. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) - self.__sum_of_distances = 0.0 - for key, val in self.__node_maps_from_median.items(): - self.__sum_of_distances += val.induced_cost() + self._sum_of_distances = 0.0 + for key, val in self._node_maps_from_median.items(): + self._sum_of_distances += val.induced_cost() # Print information. 
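The refinement loop above repeats a three-line pattern that also appears in _compute_init_node_maps and _update_node_maps below: when --sort-graphs is active, GED is always run from the smaller graph to the larger one, and if the call had to be issued in the reverse direction the resulting node map is flipped before being stored. The pattern pulled out as a hypothetical helper (not part of this patch), using only calls that appear in this file:

def oriented_node_map(ged_env, median_id, graph_id, nb_nodes_median, sort_graphs):
    # Run in the stored direction when the median is the smaller graph or sorting is off.
    if nb_nodes_median <= ged_env.get_graph_num_nodes(graph_id) or not sort_graphs:
        ged_env.run_method(median_id, graph_id)
        return ged_env.get_node_map(median_id, graph_id)
    # Otherwise run the other way round and swap the two directions of the map.
    ged_env.run_method(graph_id, median_id)
    node_map = ged_env.get_node_map(graph_id, median_id)
    node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map
    return node_map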
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('===========================================================\n') - def __median_available(self): - return self.__median_id != np.inf + def _median_available(self): + return self._median_id != np.inf def get_state(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_state().') - return self.__state + return self._state def get_sum_of_distances(self, state=''): @@ -607,92 +607,92 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined float The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned. """ - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().') if state == 'initialized': - return self.__best_init_sum_of_distances + return self._best_init_sum_of_distances if state == 'converged': - return self.__converged_sum_of_distances - return self.__sum_of_distances + return self._converged_sum_of_distances + return self._sum_of_distances def get_runtime(self, state): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_runtime().') if state == AlgorithmState.INITIALIZED: - return self.__runtime_initialized + return self._runtime_initialized if state == AlgorithmState.CONVERGED: - return self.__runtime_converged - return self.__runtime + return self._runtime_converged + return self._runtime def get_num_itrs(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_itrs().') - return self.__itrs + return self._itrs def get_num_times_order_decreased(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_times_order_decreased().') - return self.__num_decrease_order + return self._num_decrease_order def get_num_times_order_increased(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. Call run() before calling get_num_times_order_increased().') - return self.__num_increase_order + return self._num_increase_order def get_num_converged_descents(self): - if not self.__median_available(): + if not self._median_available(): raise Exception('No median has been computed. 
Call run() before calling get_num_converged_descents().') - return self.__num_converged_descents + return self._num_converged_descents def get_ged_env(self): - return self.__ged_env - - - def __set_default_options(self): - self.__init_type = 'RANDOM' - self.__num_random_inits = 10 - self.__desired_num_random_inits = 10 - self.__use_real_randomness = True - self.__seed = 0 - self.__parallel = True - self.__update_order = True - self.__sort_graphs = True - self.__refine = True - self.__time_limit_in_sec = 0 - self.__epsilon = 0.0001 - self.__max_itrs = 100 - self.__max_itrs_without_update = 3 - self.__num_inits_increase_order = 10 - self.__init_type_increase_order = 'K-MEANS++' - self.__max_itrs_increase_order = 10 - self.__print_to_stdout = 2 - self.__label_names = {} + return self._ged_env + + + def _set_default_options(self): + self._init_type = 'RANDOM' + self._num_random_inits = 10 + self._desired_num_random_inits = 10 + self._use_real_randomness = True + self._seed = 0 + self._parallel = True + self._update_order = True + self._sort_graphs = True + self._refine = True + self._time_limit_in_sec = 0 + self._epsilon = 0.0001 + self._max_itrs = 100 + self._max_itrs_without_update = 3 + self._num_inits_increase_order = 10 + self._init_type_increase_order = 'K-MEANS++' + self._max_itrs_increase_order = 10 + self._print_to_stdout = 2 + self._label_names = {} - def __construct_initial_medians(self, graph_ids, timer, initial_medians): + def _construct_initial_medians(self, graph_ids, timer, initial_medians): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n===========================================================') print('Constructing initial median(s).') print('-----------------------------------------------------------') # Compute or sample the initial median(s). initial_medians.clear() - if self.__init_type == 'MEDOID': - self.__compute_medoid(graph_ids, timer, initial_medians) - elif self.__init_type == 'MAX': + if self._init_type == 'MEDOID': + self._compute_medoid(graph_ids, timer, initial_medians) + elif self._init_type == 'MAX': pass # @todo # compute_max_order_graph_(graph_ids, initial_medians) - elif self.__init_type == 'MIN': + elif self._init_type == 'MIN': pass # @todo # compute_min_order_graph_(graph_ids, initial_medians) - elif self.__init_type == 'MEAN': + elif self._init_type == 'MEAN': pass # @todo # compute_mean_order_graph_(graph_ids, initial_medians) else: @@ -700,17 +700,17 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # sample_initial_medians_(graph_ids, initial_medians) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('===========================================================') - def __compute_medoid(self, graph_ids, timer, initial_medians): + def _compute_medoid(self, graph_ids, timer, initial_medians): # Use method selected for initialization phase. - self.__ged_env.set_method(self.__init_method, self.__init_options) + self._ged_env.set_method(self._init_method, self._init_options) # Compute the medoid. - if self.__parallel: - # @todo: notice when parallel self.__ged_env is not modified. + if self._parallel: + # @todo: notice when parallel self._ged_env is not modified. 
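The parallel branch opened just above, like the ones in _compute_init_node_maps and _update_node_maps, hands the GED environment to the worker processes through the Pool initializer rather than through the task arguments, so the environment reaches each worker once instead of travelling with every task. The shape of that pattern, stripped down to a generic sketch (env.do_something is a stand-in, not a real call):

import multiprocessing
from functools import partial

def init_worker(env):
    global G_env              # each worker process keeps its own reference
    G_env = env

def work(fixed_arg, item):
    return item, G_env.do_something(item, fixed_arg)   # hypothetical per-item call

def run_parallel(env, items, fixed_arg):
    n_jobs = multiprocessing.cpu_count()
    chunksize = int(len(items) / n_jobs) + 1 if len(items) < 100 * n_jobs else 100
    with multiprocessing.Pool(processes=n_jobs, initializer=init_worker,
                              initargs=(env,)) as pool:
        return dict(pool.imap_unordered(partial(work, fixed_arg), items, chunksize))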
sum_of_distances_list = [np.inf] * len(graph_ids) len_itr = len(graph_ids) itr = zip(graph_ids, range(0, len(graph_ids))) @@ -722,9 +722,9 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - do_fun = partial(_compute_medoid_parallel, graph_ids, self.__sort_graphs) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) - if self.__print_to_stdout == 2: + do_fun = partial(_compute_medoid_parallel, graph_ids, self._sort_graphs) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self._ged_env,)) + if self._print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), desc='Computing medoid', file=sys.stdout) else: @@ -737,50 +737,50 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined medoid_id = np.argmin(sum_of_distances_list) best_sum_of_distances = sum_of_distances_list[medoid_id] - initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + initial_medians.append(self._ged_env.get_nx_graph(medoid_id)) # @todo else: # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout) medoid_id = graph_ids[0] best_sum_of_distances = np.inf for g_id in graph_ids: if timer.expired(): - self.__state = AlgorithmState.CALLED + self._state = AlgorithmState.CALLED break - nb_nodes_g = self.__ged_env.get_graph_num_nodes(g_id) + nb_nodes_g = self._ged_env.get_graph_num_nodes(g_id) sum_of_distances = 0 for h_id in graph_ids: # @todo: this can be faster, only a half is needed. - nb_nodes_h = self.__ged_env.get_graph_num_nodes(h_id) - if nb_nodes_g <= nb_nodes_h or not self.__sort_graphs: - self.__ged_env.run_method(g_id, h_id) # @todo - sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) + nb_nodes_h = self._ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not self._sort_graphs: + self._ged_env.run_method(g_id, h_id) # @todo + sum_of_distances += self._ged_env.get_upper_bound(g_id, h_id) else: - self.__ged_env.run_method(h_id, g_id) - sum_of_distances += self.__ged_env.get_upper_bound(h_id, g_id) + self._ged_env.run_method(h_id, g_id) + sum_of_distances += self._ged_env.get_upper_bound(h_id, g_id) if sum_of_distances < best_sum_of_distances: best_sum_of_distances = sum_of_distances medoid_id = g_id # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) - initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + initial_medians.append(self._ged_env.get_nx_graph(medoid_id)) # @todo # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n') - def __compute_init_node_maps(self, graph_ids, gen_median_id): + def _compute_init_node_maps(self, graph_ids, gen_median_id): # Compute node maps and sum of distances for initial median. - if self.__parallel: - # @todo: notice when parallel self.__ged_env is not modified. - self.__sum_of_distances = 0 - self.__node_maps_from_median.clear() + if self._parallel: + # @todo: notice when parallel self._ged_env is not modified. 
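Stripped of the GED calls and the progress reporting, the MEDOID initialisation in _compute_medoid above reduces to one rule: keep the graph whose summed distance to all the others is smallest. The same selection on a precomputed distance matrix, as a small sketch with made-up numbers:

import numpy as np

def medoid_index(dist_matrix):
    # The medoid is the element with the smallest total distance to all others.
    return int(np.argmin(np.asarray(dist_matrix).sum(axis=1)))

d = np.array([[0, 2, 9],
              [2, 0, 3],
              [9, 3, 0]])
print(medoid_index(d))   # 1, since 2 + 3 is the smallest row sum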
+ self._sum_of_distances = 0 + self._node_maps_from_median.clear() sum_of_distances_list = [0] * len(graph_ids) len_itr = len(graph_ids) @@ -793,88 +793,88 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) - do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self.__sort_graphs, nb_nodes_median) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) - if self.__print_to_stdout == 2: + nb_nodes_median = self._ged_env.get_graph_num_nodes(gen_median_id) + do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self._sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self._ged_env,)) + if self._print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), desc='Computing initial node maps', file=sys.stdout) else: iterator = pool.imap_unordered(do_fun, itr, chunksize) for g_id, sod, node_maps in iterator: sum_of_distances_list[g_id] = sod - self.__node_maps_from_median[g_id] = node_maps + self._node_maps_from_median[g_id] = node_maps pool.close() pool.join() - self.__sum_of_distances = np.sum(sum_of_distances_list) -# xxx = self.__node_maps_from_median + self._sum_of_distances = np.sum(sum_of_distances_list) +# xxx = self._node_maps_from_median else: # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout) - self.__sum_of_distances = 0 - self.__node_maps_from_median.clear() - nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + self._sum_of_distances = 0 + self._node_maps_from_median.clear() + nb_nodes_median = self._ged_env.get_graph_num_nodes(gen_median_id) for graph_id in graph_ids: - nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) - if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: - self.__ged_env.run_method(gen_median_id, graph_id) - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) + nb_nodes_g = self._ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self._sort_graphs: + self._ged_env.run_method(gen_median_id, graph_id) + self._node_maps_from_median[graph_id] = self._ged_env.get_node_map(gen_median_id, graph_id) else: - self.__ged_env.run_method(graph_id, gen_median_id) - node_map_tmp = self.__ged_env.get_node_map(graph_id, gen_median_id) + self._ged_env.run_method(graph_id, gen_median_id) + node_map_tmp = self._ged_env.get_node_map(graph_id, gen_median_id) node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map - self.__node_maps_from_median[graph_id] = node_map_tmp - # print(self.__node_maps_from_median[graph_id]) - self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() - # print(self.__sum_of_distances) + self._node_maps_from_median[graph_id] = node_map_tmp + # print(self._node_maps_from_median[graph_id]) + self._sum_of_distances += self._node_maps_from_median[graph_id].induced_cost() + # print(self._sum_of_distances) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) # Print information about current iteration. 
- if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n') - def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): - if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): - if self.__state == AlgorithmState.TERMINATED: - self.__state = AlgorithmState.INITIALIZED + def _termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self._max_itrs if self._max_itrs >= 0 else False): + if self._state == AlgorithmState.TERMINATED: + self._state = AlgorithmState.INITIALIZED return True - return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) + return converged or (itrs_without_update > self._max_itrs_without_update if self._max_itrs_without_update >= 0 else False) - def __update_median(self, graphs, median): + def _update_median(self, graphs, median): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Updating median: ', end='') # Store copy of the old median. old_median = median.copy() # @todo: this is just a shallow copy. # Update the node labels. - if self.__labeled_nodes: - self.__update_node_labels(graphs, median) + if self._labeled_nodes: + self._update_node_labels(graphs, median) # Update the edges and their labels. - self.__update_edges(graphs, median) + self._update_edges(graphs, median) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') - return not self.__are_graphs_equal(median, old_median) + return not self._are_graphs_equal(median, old_median) - def __update_node_labels(self, graphs, median): + def _update_node_labels(self, graphs, median): # print('----------------------------') # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('nodes ... ', end='') # Iterate through all nodes of the median. @@ -884,24 +884,24 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined node_labels = [] for graph_id, graph in graphs.items(): # print('graph_id: ', graph_id) -# print(self.__node_maps_from_median[graph_id]) -# print(self.__node_maps_from_median[graph_id].forward_map, self.__node_maps_from_median[graph_id].backward_map) - k = self.__node_maps_from_median[graph_id].image(i) +# print(self._node_maps_from_median[graph_id]) +# print(self._node_maps_from_median[graph_id].forward_map, self._node_maps_from_median[graph_id].backward_map) + k = self._node_maps_from_median[graph_id].image(i) # print('k: ', k) if k != np.inf: node_labels.append(graph.nodes[k]) # Compute the median label and update the median. if len(node_labels) > 0: -# median_label = self.__ged_env.get_median_node_label(node_labels) - median_label = self.__get_median_node_label(node_labels) - if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: +# median_label = self._ged_env.get_median_node_label(node_labels) + median_label = self._get_median_node_label(node_labels) + if self._ged_env.get_node_rel_cost(median.nodes[i], median_label) > self._epsilon: nx.set_node_attributes(median, {i: median_label}) - def __update_edges(self, graphs, median): + def _update_edges(self, graphs, median): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('edges ... 
', end='') # # Clear the adjacency lists of the median and reset number of edges to 0. @@ -917,43 +917,43 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # Collect the labels of the edges to which (i,j) is mapped by the node maps. edge_labels = [] for graph_id, graph in graphs.items(): - k = self.__node_maps_from_median[graph_id].image(i) - l = self.__node_maps_from_median[graph_id].image(j) + k = self._node_maps_from_median[graph_id].image(i) + l = self._node_maps_from_median[graph_id].image(j) if k != np.inf and l != np.inf: if graph.has_edge(k, l): edge_labels.append(graph.edges[(k, l)]) # Compute the median edge label and the overall edge relabeling cost. rel_cost = 0 - median_label = self.__ged_env.get_edge_label(1, to_dict=True) + median_label = self._ged_env.get_edge_label(1, to_dict=True) if median.has_edge(i, j): median_label = median.edges[(i, j)] - if self.__labeled_edges and len(edge_labels) > 0: - new_median_label = self.__get_median_edge_label(edge_labels) - if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: + if self._labeled_edges and len(edge_labels) > 0: + new_median_label = self._get_median_edge_label(edge_labels) + if self._ged_env.get_edge_rel_cost(median_label, new_median_label) > self._epsilon: median_label = new_median_label for edge_label in edge_labels: - rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label) + rel_cost += self._ged_env.get_edge_rel_cost(median_label, edge_label) # Update the median. if median.has_edge(i, j): median.remove_edge(i, j) - if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs): + if rel_cost < (self._edge_ins_cost + self._edge_del_cost) * len(edge_labels) - self._edge_del_cost * len(graphs): median.add_edge(i, j, **median_label) # else: # if median.has_edge(i, j): # median.remove_edge(i, j) - def __update_node_maps(self): + def _update_node_maps(self): # Update the node maps. - if self.__parallel: - # @todo: notice when parallel self.__ged_env is not modified. + if self._parallel: + # @todo: notice when parallel self._ged_env is not modified. 
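The decision at the end of _update_edges above weighs keeping edge (i, j) in the median (one relabeling per graph whose mapped edge exists, one deletion per graph where it does not) against dropping it (one insertion per graph whose mapped edge exists), which is exactly the inequality rel_cost < (ins + del) * |edge_labels| - del * |graphs|. With made-up unit costs this turns into a majority vote:

edge_ins_cost = edge_del_cost = 1.0
num_graphs = 10

def keep_edge(num_graphs_with_edge, rel_cost=0.0):
    # Mirrors: rel_cost < (ins + del) * |edge_labels| - del * |graphs|
    return rel_cost < ((edge_ins_cost + edge_del_cost) * num_graphs_with_edge
                       - edge_del_cost * num_graphs)

print(keep_edge(6))   # True: a majority of the 10 graphs carries the edge
print(keep_edge(5))   # False: exactly half is not enough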
node_maps_were_modified = False -# xxx = self.__node_maps_from_median.copy() +# xxx = self._node_maps_from_median.copy() - len_itr = len(self.__node_maps_from_median) - itr = [item for item in self.__node_maps_from_median.items()] + len_itr = len(self._node_maps_from_median) + itr = [item for item in self._node_maps_from_median.items()] n_jobs = multiprocessing.cpu_count() if len_itr < 100 * n_jobs: chunksize = int(len_itr / n_jobs) + 1 @@ -962,66 +962,66 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) - do_fun = partial(_update_node_maps_parallel, self.__median_id, self.__epsilon, self.__sort_graphs, nb_nodes_median) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) - if self.__print_to_stdout == 2: + nb_nodes_median = self._ged_env.get_graph_num_nodes(self._median_id) + do_fun = partial(_update_node_maps_parallel, self._median_id, self._epsilon, self._sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self._ged_env,)) + if self._print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), desc='Updating node maps', file=sys.stdout) else: iterator = pool.imap_unordered(do_fun, itr, chunksize) for g_id, node_map, nm_modified in iterator: - self.__node_maps_from_median[g_id] = node_map + self._node_maps_from_median[g_id] = node_map if nm_modified: node_maps_were_modified = True pool.close() pool.join() -# yyy = self.__node_maps_from_median.copy() +# yyy = self._node_maps_from_median.copy() else: # Print information about current iteration. - if self.__print_to_stdout == 2: - progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + if self._print_to_stdout == 2: + progress = tqdm(desc='Updating node maps', total=len(self._node_maps_from_median), file=sys.stdout) node_maps_were_modified = False - nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) - for graph_id, node_map in self.__node_maps_from_median.items(): - nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + nb_nodes_median = self._ged_env.get_graph_num_nodes(self._median_id) + for graph_id, node_map in self._node_maps_from_median.items(): + nb_nodes_g = self._ged_env.get_graph_num_nodes(graph_id) - if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: - self.__ged_env.run_method(self.__median_id, graph_id) - if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon: - # xxx = self.__node_maps_from_median[graph_id] - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) + if nb_nodes_median <= nb_nodes_g or not self._sort_graphs: + self._ged_env.run_method(self._median_id, graph_id) + if self._ged_env.get_upper_bound(self._median_id, graph_id) < node_map.induced_cost() - self._epsilon: + # xxx = self._node_maps_from_median[graph_id] + self._node_maps_from_median[graph_id] = self._ged_env.get_node_map(self._median_id, graph_id) node_maps_were_modified = True else: - self.__ged_env.run_method(graph_id, self.__median_id) - if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost() - self.__epsilon: - node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id) + self._ged_env.run_method(graph_id, self._median_id) + if 
self._ged_env.get_upper_bound(graph_id, self._median_id) < node_map.induced_cost() - self._epsilon: + node_map_tmp = self._ged_env.get_node_map(graph_id, self._median_id) node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map - self.__node_maps_from_median[graph_id] = node_map_tmp + self._node_maps_from_median[graph_id] = node_map_tmp node_maps_were_modified = True # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: progress.update(1) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('\n') # Return true if the node maps were modified. return node_maps_were_modified - def __decrease_order(self, graphs, median): + def _decrease_order(self, graphs, median): # Print information about current iteration - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Trying to decrease order: ... ', end='') if nx.number_of_nodes(median) <= 1: - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('median graph has only 1 node, skip decrease.') return False @@ -1030,23 +1030,23 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined decreased_order = False # Decrease the order as long as the best deletion delta is negative. - while self.__compute_best_deletion_delta(graphs, median, id_deleted_node) < -self.__epsilon: + while self._compute_best_deletion_delta(graphs, median, id_deleted_node) < -self._epsilon: decreased_order = True - self.__delete_node_from_median(id_deleted_node[0], median) + self._delete_node_from_median(id_deleted_node[0], median) if nx.number_of_nodes(median) <= 1: - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('decrease stopped because median graph remains only 1 node. ', end='') break # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Return true iff the order was decreased. return decreased_order - def __compute_best_deletion_delta(self, graphs, median, id_deleted_node): + def _compute_best_deletion_delta(self, graphs, median, id_deleted_node): best_delta = 0.0 # Determine node that should be deleted (if any). @@ -1054,22 +1054,22 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # Compute cost delta. delta = 0.0 for graph_id, graph in graphs.items(): - k = self.__node_maps_from_median[graph_id].image(i) + k = self._node_maps_from_median[graph_id].image(i) if k == np.inf: - delta -= self.__node_del_cost + delta -= self._node_del_cost else: - delta += self.__node_ins_cost - self.__ged_env.get_node_rel_cost(median.nodes[i], graph.nodes[k]) + delta += self._node_ins_cost - self._ged_env.get_node_rel_cost(median.nodes[i], graph.nodes[k]) for j, j_label in median[i].items(): - l = self.__node_maps_from_median[graph_id].image(j) + l = self._node_maps_from_median[graph_id].image(j) if k == np.inf or l == np.inf: - delta -= self.__edge_del_cost + delta -= self._edge_del_cost elif not graph.has_edge(k, l): - delta -= self.__edge_del_cost + delta -= self._edge_del_cost else: - delta += self.__edge_ins_cost - self.__ged_env.get_edge_rel_cost(j_label, graph.edges[(k, l)]) + delta += self._edge_ins_cost - self._ged_env.get_edge_rel_cost(j_label, graph.edges[(k, l)]) # Update best deletion delta. 
- if delta < best_delta - self.__epsilon: + if delta < best_delta - self._epsilon: best_delta = delta id_deleted_node[0] = i # id_deleted_node[0] = 3 # @todo: @@ -1077,7 +1077,7 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined return best_delta - def __delete_node_from_median(self, id_deleted_node, median): + def _delete_node_from_median(self, id_deleted_node, median): # Update the median. mapping = {} for i in range(0, nx.number_of_nodes(median)): @@ -1088,8 +1088,8 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined nx.relabel_nodes(median, mapping, copy=False) # Update the node maps. -# xxx = self.__node_maps_from_median - for key, node_map in self.__node_maps_from_median.items(): +# xxx = self._node_maps_from_median + for key, node_map in self._node_maps_from_median.items(): new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) is_unassigned_target_node = [True] * node_map.num_target_nodes() for i in range(0, nx.number_of_nodes(median) + 1): @@ -1102,38 +1102,38 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined for k in range(0, node_map.num_target_nodes()): if is_unassigned_target_node[k]: new_node_map.add_assignment(np.inf, k) -# print(self.__node_maps_from_median[key].forward_map, self.__node_maps_from_median[key].backward_map) +# print(self._node_maps_from_median[key].forward_map, self._node_maps_from_median[key].backward_map) # print(new_node_map.forward_map, new_node_map.backward_map - self.__node_maps_from_median[key] = new_node_map + self._node_maps_from_median[key] = new_node_map # Increase overall number of decreases. - self.__num_decrease_order += 1 + self._num_decrease_order += 1 - def __increase_order(self, graphs, median): + def _increase_order(self, graphs, median): # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('Trying to increase order: ... ', end='') # Initialize the best configuration and the best label of the node that is to be inserted. best_config = {} - best_label = self.__ged_env.get_node_label(1, to_dict=True) + best_label = self._ged_env.get_node_label(1, to_dict=True) increased_order = False # Increase the order as long as the best insertion delta is negative. - while self.__compute_best_insertion_delta(graphs, best_config, best_label) < - self.__epsilon: + while self._compute_best_insertion_delta(graphs, best_config, best_label) < - self._epsilon: increased_order = True - self.__add_node_to_median(best_config, best_label, median) + self._add_node_to_median(best_config, best_label, median) # Print information about current iteration. - if self.__print_to_stdout == 2: + if self._print_to_stdout == 2: print('done.') # Return true iff the order was increased. return increased_order - def __compute_best_insertion_delta(self, graphs, best_config, best_label): + def _compute_best_insertion_delta(self, graphs, best_config, best_label): # Construct sets of inserted nodes. 
no_inserted_node = True inserted_nodes = {} @@ -1141,7 +1141,7 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined inserted_nodes[graph_id] = [] best_config[graph_id] = np.inf for k in range(nx.number_of_nodes(graph)): - if self.__node_maps_from_median[graph_id].pre_image(k) == np.inf: + if self._node_maps_from_median[graph_id].pre_image(k) == np.inf: no_inserted_node = False inserted_nodes[graph_id].append((k, tuple(item for item in graph.nodes[k].items()))) # @todo: can order of label names be garantteed? @@ -1151,34 +1151,34 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # Compute insertion configuration, label, and delta. best_delta = 0.0 # @todo - if len(self.__label_names['node_labels']) == 0 and len(self.__label_names['node_attrs']) == 0: # @todo - best_delta = self.__compute_insertion_delta_unlabeled(inserted_nodes, best_config, best_label) - elif len(self.__label_names['node_labels']) > 0: # self.__constant_node_costs: - best_delta = self.__compute_insertion_delta_constant(inserted_nodes, best_config, best_label) + if len(self._label_names['node_labels']) == 0 and len(self._label_names['node_attrs']) == 0: # @todo + best_delta = self._compute_insertion_delta_unlabeled(inserted_nodes, best_config, best_label) + elif len(self._label_names['node_labels']) > 0: # self._constant_node_costs: + best_delta = self._compute_insertion_delta_constant(inserted_nodes, best_config, best_label) else: - best_delta = self.__compute_insertion_delta_generic(inserted_nodes, best_config, best_label) + best_delta = self._compute_insertion_delta_generic(inserted_nodes, best_config, best_label) # Return the best delta. return best_delta - def __compute_insertion_delta_unlabeled(self, inserted_nodes, best_config, best_label): # @todo: go through and test. + def _compute_insertion_delta_unlabeled(self, inserted_nodes, best_config, best_label): # @todo: go through and test. # Construct the nest configuration and compute its insertion delta. best_delta = 0.0 best_config.clear() for graph_id, node_set in inserted_nodes.items(): if len(node_set) == 0: best_config[graph_id] = np.inf - best_delta += self.__node_del_cost + best_delta += self._node_del_cost else: best_config[graph_id] = node_set[0][0] - best_delta -= self.__node_ins_cost + best_delta -= self._node_ins_cost # Return the best insertion delta. return best_delta - def __compute_insertion_delta_constant(self, inserted_nodes, best_config, best_label): + def _compute_insertion_delta_constant(self, inserted_nodes, best_config, best_label): # Construct histogram and inverse label maps. hist = {} inverse_label_maps = {} @@ -1209,24 +1209,24 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # Construct the best configuration and compute its insertion delta. 
best_config.clear() best_delta = 0.0 - node_rel_cost = self.__ged_env.get_node_rel_cost(self.__ged_env.get_node_label(1, to_dict=False), self.__ged_env.get_node_label(2, to_dict=False)) - triangle_ineq_holds = (node_rel_cost <= self.__node_del_cost + self.__node_ins_cost) + node_rel_cost = self._ged_env.get_node_rel_cost(self._ged_env.get_node_label(1, to_dict=False), self._ged_env.get_node_label(2, to_dict=False)) + triangle_ineq_holds = (node_rel_cost <= self._node_del_cost + self._node_ins_cost) for graph_id, _ in inserted_nodes.items(): if best_label_tuple in inverse_label_maps[graph_id]: best_config[graph_id] = inverse_label_maps[graph_id][best_label_tuple] - best_delta -= self.__node_ins_cost + best_delta -= self._node_ins_cost elif triangle_ineq_holds and not len(inserted_nodes[graph_id]) == 0: best_config[graph_id] = inserted_nodes[graph_id][0][0] - best_delta += node_rel_cost - self.__node_ins_cost + best_delta += node_rel_cost - self._node_ins_cost else: best_config[graph_id] = np.inf - best_delta += self.__node_del_cost + best_delta += self._node_del_cost # Return the best insertion delta. return best_delta - def __compute_insertion_delta_generic(self, inserted_nodes, best_config, best_label): + def _compute_insertion_delta_generic(self, inserted_nodes, best_config, best_label): # Collect all node labels of inserted nodes. node_labels = [] for _, node_set in inserted_nodes.items(): @@ -1235,7 +1235,7 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # Compute node label medians that serve as initial solutions for block gradient descent. initial_node_labels = [] - self.__compute_initial_node_labels(node_labels, initial_node_labels) + self._compute_initial_node_labels(node_labels, initial_node_labels) # Determine best insertion configuration, label, and delta via parallel block gradient descent from all initial node labels. best_delta = 0.0 @@ -1243,15 +1243,15 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # Construct local configuration. config = {} for graph_id, _ in inserted_nodes.items(): - config[graph_id] = tuple((np.inf, self.__ged_env.get_node_label(1, to_dict=False))) + config[graph_id] = tuple((np.inf, self._ged_env.get_node_label(1, to_dict=False))) # Run block gradient descent. converged = False itr = 0 - while not self.__insertion_termination_criterion_met(converged, itr): - converged = not self.__update_config(node_label, inserted_nodes, config, node_labels) + while not self._insertion_termination_criterion_met(converged, itr): + converged = not self._update_config(node_label, inserted_nodes, config, node_labels) node_label_dict = dict(node_label) - converged = converged and (not self.__update_node_label([dict(item) for item in node_labels], node_label_dict)) # @todo: the dict is tupled again in the function, can be better. + converged = converged and (not self._update_node_label([dict(item) for item in node_labels], node_label_dict)) # @todo: the dict is tupled again in the function, can be better. node_label = tuple(item for item in node_label_dict.items()) # @todo: watch out: initial_node_labels[i] is not modified here. 
itr += 1 @@ -1260,12 +1260,12 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined delta = 0.0 for _, node in config.items(): if node[0] == np.inf: - delta += self.__node_del_cost + delta += self._node_del_cost else: - delta += self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost + delta += self._ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self._node_ins_cost # Update best delta and global configuration if improvement has been found. - if delta < best_delta - self.__epsilon: + if delta < best_delta - self._epsilon: best_delta = delta best_label.clear() for key, val in node_label: @@ -1278,16 +1278,16 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined return best_delta - def __compute_initial_node_labels(self, node_labels, median_labels): + def _compute_initial_node_labels(self, node_labels, median_labels): median_labels.clear() - if self.__use_real_randomness: # @todo: may not work if parallelized. + if self._use_real_randomness: # @todo: may not work if parallelized. rng = np.random.randint(0, high=2**32 - 1, size=1) urng = np.random.RandomState(seed=rng[0]) else: - urng = np.random.RandomState(seed=self.__seed) + urng = np.random.RandomState(seed=self._seed) # Generate the initial node label medians. - if self.__init_type_increase_order == 'K-MEANS++': + if self._init_type_increase_order == 'K-MEANS++': # Use k-means++ heuristic to generate the initial node label medians. already_selected = [False] * len(node_labels) selected_label_id = urng.randint(low=0, high=len(node_labels), size=1)[0] # c++ test: 23 @@ -1295,14 +1295,14 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined already_selected[selected_label_id] = True # xxx = [41, 0, 18, 9, 6, 14, 21, 25, 33] for c++ test # iii = 0 for c++ test - while len(median_labels) < self.__num_inits_increase_order: + while len(median_labels) < self._num_inits_increase_order: weights = [np.inf] * len(node_labels) for label_id in range(0, len(node_labels)): if already_selected[label_id]: weights[label_id] = 0 continue for label in median_labels: - weights[label_id] = min(weights[label_id], self.__ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id]))) + weights[label_id] = min(weights[label_id], self._ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id]))) # get non-zero weights. weights_p, idx_p = [], [] @@ -1317,26 +1317,26 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined # iii += 1 for c++ test median_labels.append(node_labels[selected_label_id]) already_selected[selected_label_id] = True - else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self.__num_inits_increase_order. + else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self._num_inits_increase_order. break else: # Compute the initial node medians as the medians of randomly generated clusters of (roughly) equal size. # @todo: go through and test. shuffled_node_labels = [np.inf] * len(node_labels) #@todo: random? # @todo: std::shuffle(shuffled_node_labels.begin(), shuffled_node_labels.end(), urng);? 
- cluster_size = len(node_labels) / self.__num_inits_increase_order + cluster_size = len(node_labels) / self._num_inits_increase_order pos = 0.0 cluster = [] - while len(median_labels) < self.__num_inits_increase_order - 1: + while len(median_labels) < self._num_inits_increase_order - 1: while pos < (len(median_labels) + 1) * cluster_size: cluster.append(shuffled_node_labels[pos]) pos += 1 - median_labels.append(self.__get_median_node_label(cluster)) + median_labels.append(self._get_median_node_label(cluster)) cluster.clear() while pos < len(shuffled_node_labels): pos += 1 cluster.append(shuffled_node_labels[pos]) - median_labels.append(self.__get_median_node_label(cluster)) + median_labels.append(self._get_median_node_label(cluster)) cluster.clear() # Run Lloyd's Algorithm. @@ -1344,8 +1344,8 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined closest_median_ids = [np.inf] * len(node_labels) clusters = [[] for _ in range(len(median_labels))] itr = 1 - while not self.__insertion_termination_criterion_met(converged, itr): - converged = not self.__update_clusters(node_labels, median_labels, closest_median_ids) + while not self._insertion_termination_criterion_met(converged, itr): + converged = not self._update_clusters(node_labels, median_labels, closest_median_ids) if not converged: for cluster in clusters: cluster.clear() @@ -1353,33 +1353,33 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined clusters[closest_median_ids[label_id]].append(node_labels[label_id]) for cluster_id in range(0, len(clusters)): node_label = dict(median_labels[cluster_id]) - self.__update_node_label([dict(item) for item in clusters[cluster_id]], node_label) # @todo: the dict is tupled again in the function, can be better. + self._update_node_label([dict(item) for item in clusters[cluster_id]], node_label) # @todo: the dict is tupled again in the function, can be better. median_labels[cluster_id] = tuple(item for item in node_label.items()) itr += 1 - def __insertion_termination_criterion_met(self, converged, itr): - return converged or (itr >= self.__max_itrs_increase_order if self.__max_itrs_increase_order > 0 else False) + def _insertion_termination_criterion_met(self, converged, itr): + return converged or (itr >= self._max_itrs_increase_order if self._max_itrs_increase_order > 0 else False) - def __update_config(self, node_label, inserted_nodes, config, node_labels): + def _update_config(self, node_label, inserted_nodes, config, node_labels): # Determine the best configuration. 
config_modified = False for graph_id, node_set in inserted_nodes.items(): best_assignment = config[graph_id] best_cost = 0.0 if best_assignment[0] == np.inf: - best_cost = self.__node_del_cost + best_cost = self._node_del_cost else: - best_cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(best_assignment[1])) - self.__node_ins_cost + best_cost = self._ged_env.get_node_rel_cost(dict(node_label), dict(best_assignment[1])) - self._node_ins_cost for node in node_set: - cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost - if cost < best_cost - self.__epsilon: + cost = self._ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self._node_ins_cost + if cost < best_cost - self._epsilon: best_cost = cost best_assignment = node config_modified = True - if self.__node_del_cost < best_cost - self.__epsilon: - best_cost = self.__node_del_cost + if self._node_del_cost < best_cost - self._epsilon: + best_cost = self._node_del_cost best_assignment = tuple((np.inf, best_assignment[1])) config_modified = True config[graph_id] = best_assignment @@ -1394,11 +1394,11 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined return config_modified - def __update_node_label(self, node_labels, node_label): - if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling __update_config(). + def _update_node_label(self, node_labels, node_label): + if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling _update_config(). return False - new_node_label = self.__get_median_node_label(node_labels) - if self.__ged_env.get_node_rel_cost(new_node_label, node_label) > self.__epsilon: + new_node_label = self._get_median_node_label(node_labels) + if self._ged_env.get_node_rel_cost(new_node_label, node_label) > self._epsilon: node_label.clear() for key, val in new_node_label.items(): node_label[key] = val @@ -1406,15 +1406,15 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined return False - def __update_clusters(self, node_labels, median_labels, closest_median_ids): + def _update_clusters(self, node_labels, median_labels, closest_median_ids): # Determine the closest median for each node label. clusters_modified = False for label_id in range(0, len(node_labels)): closest_median_id = np.inf dist_to_closest_median = np.inf for median_id in range(0, len(median_labels)): - dist_to_median = self.__ged_env.get_node_rel_cost(dict(median_labels[median_id]), dict(node_labels[label_id])) - if dist_to_median < dist_to_closest_median - self.__epsilon: + dist_to_median = self._ged_env.get_node_rel_cost(dict(median_labels[median_id]), dict(node_labels[label_id])) + if dist_to_median < dist_to_closest_median - self._epsilon: dist_to_closest_median = dist_to_median closest_median_id = median_id if closest_median_id != closest_median_ids[label_id]: @@ -1425,26 +1425,26 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined return clusters_modified - def __add_node_to_median(self, best_config, best_label, median): + def _add_node_to_median(self, best_config, best_label, median): # Update the median. nb_nodes_median = nx.number_of_nodes(median) median.add_node(nb_nodes_median, **best_label) # Update the node maps. 
- for graph_id, node_map in self.__node_maps_from_median.items(): + for graph_id, node_map in self._node_maps_from_median.items(): node_map_as_rel = [] node_map.as_relation(node_map_as_rel) new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) for assignment in node_map_as_rel: new_node_map.add_assignment(assignment[0], assignment[1]) new_node_map.add_assignment(nx.number_of_nodes(median) - 1, best_config[graph_id]) - self.__node_maps_from_median[graph_id] = new_node_map + self._node_maps_from_median[graph_id] = new_node_map # Increase overall number of increases. - self.__num_increase_order += 1 + self._num_increase_order += 1 - def __are_graphs_equal(self, g1, g2): + def _are_graphs_equal(self, g1, g2): """ Check if the two graphs are equal. @@ -1489,29 +1489,29 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined def set_label_names(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): - self.__label_names = {'node_labels': node_labels, 'edge_labels': edge_labels, + self._label_names = {'node_labels': node_labels, 'edge_labels': edge_labels, 'node_attrs': node_attrs, 'edge_attrs': edge_attrs} - def __get_median_node_label(self, node_labels): - if len(self.__label_names['node_labels']) > 0: - return self.__get_median_label_symbolic(node_labels) - elif len(self.__label_names['node_attrs']) > 0: - return self.__get_median_label_nonsymbolic(node_labels) + def _get_median_node_label(self, node_labels): + if len(self._label_names['node_labels']) > 0: + return self._get_median_label_symbolic(node_labels) + elif len(self._label_names['node_attrs']) > 0: + return self._get_median_label_nonsymbolic(node_labels) else: raise Exception('Node label names are not given.') - def __get_median_edge_label(self, edge_labels): - if len(self.__label_names['edge_labels']) > 0: - return self.__get_median_label_symbolic(edge_labels) - elif len(self.__label_names['edge_attrs']) > 0: - return self.__get_median_label_nonsymbolic(edge_labels) + def _get_median_edge_label(self, edge_labels): + if len(self._label_names['edge_labels']) > 0: + return self._get_median_label_symbolic(edge_labels) + elif len(self._label_names['edge_attrs']) > 0: + return self._get_median_label_nonsymbolic(edge_labels) else: raise Exception('Edge label names are not given.') - def __get_median_label_symbolic(self, labels): + def _get_median_label_symbolic(self, labels): # Construct histogram. hist = {} for label in labels: @@ -1532,7 +1532,7 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined return median_label - def __get_median_label_nonsymbolic(self, labels): + def _get_median_label_nonsymbolic(self, labels): if len(labels) == 0: return {} # @todo else: @@ -1591,11 +1591,11 @@ class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undifined return median_label -# def __get_median_edge_label_symbolic(self, edge_labels): +# def _get_median_edge_label_symbolic(self, edge_labels): # pass -# def __get_median_edge_label_nonsymbolic(self, edge_labels): +# def _get_median_edge_label_nonsymbolic(self, edge_labels): # if len(edge_labels) == 0: # return {} # else: @@ -1659,7 +1659,7 @@ def _compute_medoid_parallel(graph_ids, sort, itr): i = itr[1] # @todo: timer not considered here. 
# if timer.expired(): -# self.__state = AlgorithmState.CALLED +# self._state = AlgorithmState.CALLED # break nb_nodes_g = G_ged_env.get_graph_num_nodes(g_id) sum_of_distances = 0 @@ -1680,13 +1680,13 @@ def _compute_init_node_maps_parallel(gen_median_id, sort, nb_nodes_median, itr): if nb_nodes_median <= nb_nodes_g or not sort: G_ged_env.run_method(gen_median_id, graph_id) node_map = G_ged_env.get_node_map(gen_median_id, graph_id) -# print(self.__node_maps_from_median[graph_id]) +# print(self._node_maps_from_median[graph_id]) else: G_ged_env.run_method(graph_id, gen_median_id) node_map = G_ged_env.get_node_map(graph_id, gen_median_id) node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map sum_of_distance = node_map.induced_cost() -# print(self.__sum_of_distances) +# print(self._sum_of_distances) return graph_id, sum_of_distance, node_map diff --git a/gklearn/ged/median/test_median_graph_estimator.py b/gklearn/ged/median/test_median_graph_estimator.py index 60bce83..a0ebbbb 100644 --- a/gklearn/ged/median/test_median_graph_estimator.py +++ b/gklearn/ged/median/test_median_graph_estimator.py @@ -154,6 +154,6 @@ def test_median_graph_estimator_symb(): return set_median, gen_median -if __name__ == '__main__': +if __name__ == '__main__': # set_median, gen_median = test_median_graph_estimator() set_median, gen_median = test_median_graph_estimator_symb() \ No newline at end of file diff --git a/gklearn/kernels/__init__.py b/gklearn/kernels/__init__.py index 5740c77..6ffef06 100644 --- a/gklearn/kernels/__init__.py +++ b/gklearn/kernels/__init__.py @@ -7,6 +7,8 @@ __version__ = "0.1" __author__ = "Linlin Jia" __date__ = "November 2018" +from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels + from gklearn.kernels.graph_kernel import GraphKernel from gklearn.kernels.common_walk import CommonWalk from gklearn.kernels.marginalized import Marginalized diff --git a/gklearn/kernels/metadata.py b/gklearn/kernels/metadata.py new file mode 100644 index 0000000..d00d5d7 --- /dev/null +++ b/gklearn/kernels/metadata.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Nov 6 10:11:08 2020 + +@author: ljia +""" + +# The metadata of all graph kernels. +GRAPH_KERNELS = { + ### based on walks. + 'common walk': '', + 'marginalized': '', + 'sylvester equation': '', + 'fixed_point': '', + 'conjugate gradient': '', + 'spectral decomposition': '', + ### based on paths. + 'shortest path': '', + 'structural shortest path': '', + 'path up to length h': '', + ### based on non-linear patterns. + 'weisfeiler-lehman subtree': '', + 'treelet': '', + } + + +def list_of_graph_kernels(): + """List names of all graph kernels. + + Returns + ------- + list + The list of all graph kernels. + """ + return [i for i in GRAPH_KERNELS] \ No newline at end of file diff --git a/gklearn/preimage/generate_random_preimages_by_class.py b/gklearn/preimage/generate_random_preimages_by_class.py index 66f6c57..8c604ba 100644 --- a/gklearn/preimage/generate_random_preimages_by_class.py +++ b/gklearn/preimage/generate_random_preimages_by_class.py @@ -126,8 +126,7 @@ def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, sav # save median graphs. if save_preimages: - if not os.path.exists(dir_save + 'preimages/'): - os.makedirs(dir_save + 'preimages/') + os.makedirs(dir_save + 'preimages/', exist_ok=True) print('Saving preimages to files...') fn_best_dataset = dir_save + 'preimages/g_best_dataset.' 
+ 'nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) saveGXL(rpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', @@ -167,8 +166,7 @@ def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, sav def _init_output_file_preimage(ds_name, gkernel, dir_output): - if not os.path.exists(dir_output): - os.makedirs(dir_output) + os.makedirs(dir_output, exist_ok=True) fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' f_detail = open(dir_output + fn_output_detail, 'a') csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'num graphs', diff --git a/gklearn/preimage/remove_best_graph.py b/gklearn/preimage/remove_best_graph.py index 48b2b25..7495c18 100644 --- a/gklearn/preimage/remove_best_graph.py +++ b/gklearn/preimage/remove_best_graph.py @@ -218,8 +218,7 @@ def remove_best_graph(ds_name, mpg_options, kernel_options, ged_options, mge_opt # save median graphs. if save_medians: - if not os.path.exists(dir_save + 'medians/'): - os.makedirs(dir_save + 'medians/') + os.makedirs(dir_save + 'medians/', exist_ok=True) print('Saving median graphs to files...') fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', @@ -375,8 +374,7 @@ def _compute_gram_matrix_unnorm(dataset, kernel_options): def _init_output_file(ds_name, gkernel, fit_method, dir_output): - if not os.path.exists(dir_output): - os.makedirs(dir_output) + os.makedirs(dir_output, exist_ok=True) fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' f_detail = open(dir_output + fn_output_detail, 'a') csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost', diff --git a/gklearn/preimage/utils.py b/gklearn/preimage/utils.py index 0cdfddb..2d43437 100644 --- a/gklearn/preimage/utils.py +++ b/gklearn/preimage/utils.py @@ -230,8 +230,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged # save median graphs. if save_medians: - if not os.path.exists(dir_save + 'medians/'): - os.makedirs(dir_save + 'medians/') + os.makedirs(dir_save + 'medians/', exist_ok=True) print('Saving median graphs to files...') fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default', @@ -308,8 +307,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged def _init_output_file_preimage(ds_name, gkernel, fit_method, dir_output): - if not os.path.exists(dir_output): - os.makedirs(dir_output) + os.makedirs(dir_output, exist_ok=True) # fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv' fn_output_detail = 'results_detail.' + ds_name + '.' 
+ gkernel + '.csv' f_detail = open(dir_output + fn_output_detail, 'a') diff --git a/gklearn/tests/test_graph_kernels.py b/gklearn/tests/test_graph_kernels.py index f1c480a..061b17c 100644 --- a/gklearn/tests/test_graph_kernels.py +++ b/gklearn/tests/test_graph_kernels.py @@ -52,6 +52,14 @@ def chooseDataset(ds_name): return dataset +def test_list_graph_kernels(): + """ + """ + from gklearn.kernels import GRAPH_KERNELS, list_of_graph_kernels + assert list_of_graph_kernels() == [i for i in GRAPH_KERNELS] + + + @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) @pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) @pytest.mark.parametrize('parallel', ['imap_unordered', None]) @@ -433,10 +441,11 @@ def test_WLSubtree(ds_name, parallel): if __name__ == "__main__": + test_list_graph_kernels() # test_spkernel('Alkane', 'imap_unordered') # test_StructuralSP('Fingerprint_edge', 'imap_unordered') - test_WLSubtree('Acyclic', 'imap_unordered') +# test_WLSubtree('Acyclic', 'imap_unordered') # test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') # test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') # test_RandomWalk('Acyclic', 'fp', None, None) -# test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered') \ No newline at end of file +# test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered') diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index 0343c0b..8e225d6 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -13,6 +13,10 @@ import os class Dataset(object): + import warnings + warnings.simplefilter('always', DeprecationWarning) + warnings.warn('This class has been moved to "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning) + def __init__(self, filename=None, filename_targets=None, **kwargs): if filename is None: @@ -803,6 +807,10 @@ class Dataset(object): def split_dataset_by_target(dataset): + import warnings + warnings.simplefilter('always', DeprecationWarning) + warnings.warn('This function has been moved to "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning) + from gklearn.preimage.utils import get_same_item_indices graphs = dataset.graphs diff --git a/gklearn/utils/graph_files.py b/gklearn/utils/graph_files.py index ea2f516..57d0052 100644 --- a/gklearn/utils/graph_files.py +++ b/gklearn/utils/graph_files.py @@ -1,5 +1,9 @@ """ Utilities function to manage graph files """ +import warnings +warnings.simplefilter('always', DeprecationWarning) +warnings.warn('The functions in the module "gklearn.utils.graph_files" will be deprecated and removed since version 0.4.0. Use the corresponding functions in the module "gklearn.dataset" instead.', DeprecationWarning) + from os.path import dirname, splitext @@ -45,6 +49,10 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs): for details. Note here filename is the name of either .txt file in the dataset directory. """ + import warnings + warnings.simplefilter('always', DeprecationWarning) + warnings.warn('The function "gklearn.utils.load_dataset" will be deprecated and removed since version 0.4.0. 
Use the class "gklearn.dataset.DataLoader" instead.', DeprecationWarning) + extension = splitext(filename)[1][1:] if extension == "ds": data, y, label_names = load_from_ds(filename, filename_targets) @@ -66,17 +74,19 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs): def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', **kwargs): """Save list of graphs. """ + import warnings + warnings.simplefilter('always', DeprecationWarning) + warnings.warn('The function "gklearn.utils.save_dataset" will be deprecated and removed since version 0.4.0. Use the class "gklearn.dataset.DataSaver" instead.', DeprecationWarning) + import os dirname_ds = os.path.dirname(filename) if dirname_ds != '': dirname_ds += '/' - if not os.path.exists(dirname_ds) : - os.makedirs(dirname_ds) + os.makedirs(dirname_ds, exist_ok=True) if 'graph_dir' in kwargs: graph_dir = kwargs['graph_dir'] + '/' - if not os.path.exists(graph_dir): - os.makedirs(graph_dir) + os.makedirs(graph_dir, exist_ok=True) del kwargs['graph_dir'] else: graph_dir = dirname_ds diff --git a/gklearn/utils/graph_synthesizer.py b/gklearn/utils/graph_synthesizer.py index 2c5f650..7e83225 100644 --- a/gklearn/utils/graph_synthesizer.py +++ b/gklearn/utils/graph_synthesizer.py @@ -13,6 +13,11 @@ import random class GraphSynthesizer(object): + import warnings + warnings.simplefilter('always', DeprecationWarning) + warnings.warn('This class has been moved to "gklearn.dataset" module. The class "gklearn.utils.graph_synthesizer.GraphSynthesizer" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.2.2.', DeprecationWarning) + + def __init__(self): pass diff --git a/gklearn/utils/graphfiles.py b/gklearn/utils/graphfiles.py index 862cda1..17498d6 100644 --- a/gklearn/utils/graphfiles.py +++ b/gklearn/utils/graphfiles.py @@ -671,13 +671,11 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None dirname_ds = os.path.dirname(filename) if dirname_ds != '': dirname_ds += '/' - if not os.path.exists(dirname_ds) : - os.makedirs(dirname_ds) + os.makedirs(dirname_ds, exist_ok=True) if xparams is not None and 'graph_dir' in xparams: graph_dir = xparams['graph_dir'] + '/' - if not os.path.exists(graph_dir): - os.makedirs(graph_dir) + os.makedirs(graph_dir, exist_ok=True) else: graph_dir = dirname_ds diff --git a/gklearn/utils/model_selection_precomputed.py b/gklearn/utils/model_selection_precomputed.py index 517d30a..d4fc900 100644 --- a/gklearn/utils/model_selection_precomputed.py +++ b/gklearn/utils/model_selection_precomputed.py @@ -91,8 +91,7 @@ def model_selection_for_precomputed_kernel(datafile, tqdm.monitor_interval = 0 output_dir += estimator.__name__ - if not os.path.exists(output_dir): - os.makedirs(output_dir) + os.makedirs(output_dir, exist_ok=True) # a string to save all the results. str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' @@ -604,8 +603,7 @@ def model_selection_for_precomputed_kernel(datafile, str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster) # open file to save all results for this dataset. 
- if not os.path.exists(output_dir): - os.makedirs(output_dir) + os.makedirs(output_dir, exist_ok=True) # print out results as table. str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores, diff --git a/gklearn/utils/utils.py b/gklearn/utils/utils.py index 66c92a8..1a991bb 100644 --- a/gklearn/utils/utils.py +++ b/gklearn/utils/utils.py @@ -458,8 +458,7 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d print() print('4. saving results...') if save_results: - if not os.path.exists(dir_save): - os.makedirs(dir_save) + os.makedirs(dir_save, exist_ok=True) np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=run_time_list) print('\ncomplete.')
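Editor's note (an illustrative aside, not part of the patch): the bulk of the MedianGraphEstimatorPy changes above rename name-mangled double-underscore attributes such as self.__epsilon to single-underscore self._epsilon. A minimal sketch of what the mangling does, using a hypothetical stand-in class _Base:

class _Base:
    def __init__(self):
        self.__eps = 1e-9   # name-mangled: actually stored as '_Base__eps'
        self._seed = 42     # single underscore: stored under its own name

est = _Base()
print(hasattr(est, '_Base__eps'))   # True  -- only reachable via the mangled name
print(hasattr(est, '__eps'))        # False -- subclasses and external helpers cannot use '__eps'
print(est._seed)                    # 42    -- the single-underscore form stays addressable

This is presumably why the single-underscore form is easier to work with from the module-level parallel helpers (_update_node_maps_parallel, _compute_init_node_maps_parallel) and from any subclass, while still signalling "internal" by convention.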
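Likewise, the repeated substitution of os.makedirs(..., exist_ok=True) for the exists-then-makedirs pattern is behaviour-preserving while removing the window between the check and the creation. A small sketch, with 'output/medians/' as a placeholder path not taken from the patch:

import os

dir_save = 'output/medians/'  # placeholder path

# Old pattern: two steps; if another process creates the directory between the
# check and the call, os.makedirs raises FileExistsError.
if not os.path.exists(dir_save):
    os.makedirs(dir_save)

# New pattern: a single idempotent call; an already existing directory is not an error.
os.makedirs(dir_save, exist_ok=True)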