@@ -139,7 +139,20 @@ Fork the library and open a pull request! Make your own contribute to the commun | |||
## Citation | |||
Still waiting... | |||
If you have used `graphkit-learn` in your publication, please cite the following paper: | |||
``` | |||
@article{JIA2021, | |||
title = "graphkit-learn: A Python Library for Graph Kernels Based on Linear Patterns", | |||
journal = "Pattern Recognition Letters", | |||
year = "2021", | |||
issn = "0167-8655", | |||
doi = "https://doi.org/10.1016/j.patrec.2021.01.003", | |||
url = "http://www.sciencedirect.com/science/article/pii/S0167865521000131", | |||
author = "Linlin Jia and Benoit Gaüzère and Paul Honeine", | |||
keywords = "Graph Kernels, Linear Patterns, Python Implementation", | |||
abstract = "This paper presents graphkit-learn, the first Python library for efficient computation of graph kernels based on linear patterns, able to address various types of graphs. Graph kernels based on linear patterns are thoroughly implemented, each with specific computing methods, as well as two well-known graph kernels based on non-linear patterns for comparative analysis. Since computational complexity is an Achilles’ heel of graph kernels, we provide several strategies to address this critical issue, including parallelization, the trie data structure, and the FCSP method that we extend to other kernels and edge comparison. All proposed strategies save orders of magnitudes of computing time and memory usage. Moreover, all the graph kernels can be simply computed with a single Python statement, thus are appealing to researchers and practitioners. For the convenience of use, an advanced model selection procedure is provided for both regression and classification problems. Experiments on synthesized datasets and 11 real-world benchmark datasets show the relevance of the proposed library." | |||
} | |||
``` | |||
## Acknowledgments | |||
@@ -14,7 +14,7 @@ from gklearn.dataset import DATASET_META, DataFetcher, DataLoader | |||
class Dataset(object): | |||
def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs): | |||
def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', remove_null_graphs=True, clean_labels=True, reload=False, verbose=False, **kwargs): | |||
self._substructures = None | |||
self._node_label_dim = None | |||
self._edge_label_dim = None | |||
@@ -82,6 +82,8 @@ class Dataset(object): | |||
else: | |||
raise TypeError('The "inputs" argument cannot be recognized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.') | |||
if remove_null_graphs: | |||
self.trim_dataset(edge_required=False) | |||
def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs): | |||
@@ -537,7 +539,7 @@ class Dataset(object): | |||
def trim_dataset(self, edge_required=False): | |||
if edge_required: | |||
if edge_required: # @todo: there is a possibility that some node labels will be removed. | |||
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)] | |||
else: | |||
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0] | |||
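A minimal usage sketch of the new `remove_null_graphs` option together with `trim_dataset`; the dataset name and root path below are only illustrative:
```
from gklearn.dataset import Dataset

# Empty graphs are dropped at load time because remove_null_graphs defaults to True.
dataset = Dataset('MUTAG', root='datasets', remove_null_graphs=True)

# Additionally drop graphs without edges, e.g. before computing a shortest-path kernel.
dataset.trim_dataset(edge_required=True)
print('Graphs remaining after trimming:', len(dataset.graphs))
```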
@@ -332,7 +332,8 @@ class DataLoader(): | |||
content_targets = ga.read().splitlines() # targets (regression) | |||
targets = [int(i) for i in content_targets] | |||
else: | |||
raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.') | |||
exp_msg = 'Cannot find the targets file. Please make sure there is a "' + ds_name + '_graph_labels.txt" or "' + ds_name + '_graph_attributes.txt" file in your dataset folder.'
raise Exception(exp_msg)
if class_label_map is not None: | |||
targets = [class_label_map[t] for t in targets] | |||
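For illustration, `class_label_map` lets a caller remap the raw integer labels read from `<ds_name>_graph_labels.txt`; the mapping below is hypothetical:
```
raw_targets = [1, -1, 1, -1]      # as read from <ds_name>_graph_labels.txt
class_label_map = {1: 0, -1: 1}   # hypothetical remapping to contiguous class indices
targets = [class_label_map[t] for t in raw_targets]
print(targets)  # [0, 1, 0, 1]
```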
@@ -0,0 +1,11 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Dec 15 18:22:34 2020 | |||
@author: ljia | |||
""" | |||
import os | |||
EXP_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/' | |||
DATASET_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/datasets/' |
@@ -4,7 +4,7 @@ | |||
Created on Wed Oct 20 11:48:02 2020 | |||
@author: ljia | |||
""" | |||
""" | |||
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. | |||
import os | |||
@@ -13,15 +13,15 @@ import pickle | |||
import logging | |||
from gklearn.ged.util import compute_geds | |||
import time | |||
from utils import get_dataset | |||
from utils import get_dataset, set_edit_cost_consts | |||
import sys | |||
from group_results import group_trials | |||
from group_results import group_trials, check_group_existence, update_group_marker | |||
def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial) | |||
# Return if the file exists. | |||
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'): | |||
return None, None | |||
@@ -41,8 +41,11 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
'threads': multiprocessing.cpu_count(), | |||
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
} | |||
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
edit_cost_constants = set_edit_cost_consts(ratio, | |||
node_labeled=len(dataset.node_labels), | |||
edge_labeled=len(dataset.edge_labels), | |||
mode='uniform') | |||
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants] | |||
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb")) | |||
@@ -53,7 +56,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
options['node_attrs'] = dataset.node_attrs | |||
options['edge_attrs'] = dataset.edge_attrs | |||
parallel = True # if num_solutions == 1 else False | |||
"""**5. Compute GED matrix.**""" | |||
ged_mat = 'error' | |||
runtime = 0 | |||
@@ -67,9 +70,9 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
"""**6. Get results.**""" | |||
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f: | |||
pickle.dump(ged_mat, f) | |||
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f: | |||
@@ -77,66 +80,76 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial): | |||
return ged_mat, runtime | |||
def save_trials_as_group(dataset, ds_name, num_solutions, ratio): | |||
# Return if the group file exists. | |||
name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.' | |||
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy' | |||
if os.path.isfile(name_group): | |||
if check_group_existence(name_group): | |||
return | |||
ged_mats = [] | |||
runtimes = [] | |||
for trial in range(1, 101): | |||
num_trials = 100 | |||
for trial in range(1, num_trials + 1): | |||
print() | |||
print('Trial:', trial) | |||
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial) | |||
ged_mats.append(ged_mat) | |||
runtimes.append(runtime) | |||
# Group trials and remove single files.
# @todo: if the program stops between the following lines, then there may be errors. | |||
name_prefix = 'ged_matrix' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) | |||
name_prefix = 'runtime' + name_middle | |||
group_trials(save_dir, name_prefix, True, True, False) | |||
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials) | |||
update_group_marker(name_group) | |||
def results_for_a_dataset(ds_name): | |||
"""**1. Get dataset.**""" | |||
dataset = get_dataset(ds_name) | |||
for num_solutions in num_solutions_list: | |||
for ratio in ratio_list: | |||
print() | |||
print('# of solutions:', num_solutions) | |||
for ratio in ratio_list: | |||
print('Ratio:', ratio) | |||
for num_solutions in num_solutions_list: | |||
print() | |||
print('Ratio:', ratio) | |||
print('# of solutions:', num_solutions) | |||
save_trials_as_group(dataset, ds_name, num_solutions, ratio) | |||
def get_param_lists(ds_name): | |||
def get_param_lists(ds_name, test=False): | |||
if test: | |||
num_solutions_list = [1, 10, 20, 30, 40, 50] | |||
ratio_list = [10] | |||
return num_solutions_list, ratio_list | |||
if ds_name == 'AIDS_symb': | |||
num_solutions_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
else: | |||
num_solutions_list = [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9] | |||
num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100] | |||
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1] | |||
return num_solutions_list, ratio_list | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
ds_name_list = sys.argv[1:] | |||
else: | |||
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/' | |||
ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] | |||
# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled'] | |||
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs(save_dir + 'groups/', exist_ok=True) | |||
for ds_name in ds_name_list: | |||
print() | |||
print('Dataset:', ds_name) | |||
num_solutions_list, ratio_list = get_param_lists(ds_name) | |||
num_solutions_list, ratio_list = get_param_lists(ds_name, test=False) | |||
results_for_a_dataset(ds_name) |
@@ -5,7 +5,7 @@ Created on Thu Oct 29 17:26:43 2020 | |||
@author: ljia | |||
This script groups results together into a single file for the sake of faster | |||
This script groups results together into a single file for the sake of faster | |||
searching and loading. | |||
""" | |||
import os | |||
@@ -16,9 +16,55 @@ from tqdm import tqdm | |||
import sys | |||
def check_group_existence(file_name): | |||
path, name = os.path.split(file_name) | |||
marker_fn = os.path.join(path, 'group_names_finished.pkl') | |||
if os.path.isfile(marker_fn): | |||
with open(marker_fn, 'rb') as f: | |||
fns = pickle.load(f) | |||
if name in fns: | |||
return True | |||
if os.path.isfile(file_name): | |||
return True | |||
return False | |||
def update_group_marker(file_name): | |||
path, name = os.path.split(file_name) | |||
marker_fn = os.path.join(path, 'group_names_finished.pkl') | |||
if os.path.isfile(marker_fn): | |||
with open(marker_fn, 'rb') as f: | |||
fns = pickle.load(f) | |||
if name in fns: | |||
return | |||
else: | |||
fns.add(name) | |||
else: | |||
fns = set({name}) | |||
with open(marker_fn, 'wb') as f: | |||
pickle.dump(fns, f) | |||
def create_group_marker_file(dir_folder, overwrite=True): | |||
if not overwrite: | |||
return | |||
fns = set() | |||
for file in sorted(os.listdir(dir_folder)): | |||
if os.path.isfile(os.path.join(dir_folder, file)): | |||
if file.endswith('.npy'): | |||
fns.add(file) | |||
marker_fn = os.path.join(dir_folder, 'group_names_finished.pkl') | |||
with open(marker_fn, 'wb') as f: | |||
pickle.dump(fns, f) | |||
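A sketch of how these helpers are intended to interact (the group file path is illustrative): a group is only recomputed when it is recorded neither in the marker file nor on disk, and the marker is updated once the group has been written.
```
from group_results import check_group_existence, update_group_marker

name_group = 'outputs/groups/ged_mats.MUTAG.num_sols_40.ratio_1.00.npy'  # illustrative

if not check_group_existence(name_group):
    # ... compute and save the grouped .npy file here ...
    update_group_marker(name_group)  # mark the group as finished
```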
# This function is used by other scripts. Modify it carefully. | |||
def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
def group_trials(dir_folder, name_prefix, overwrite, clear, backup, num_trials=100): | |||
# Get group name. | |||
label_name = name_prefix.split('.')[0] | |||
if label_name == 'ged_matrix': | |||
@@ -33,10 +79,10 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
else: | |||
name_group = dir_folder + 'groups/' + group_label + name_suffix + 'pkl' | |||
if not override and os.path.isfile(name_group): | |||
if not overwrite and os.path.isfile(name_group): | |||
# Check if all trial files exist. | |||
trials_complete = True | |||
for trial in range(1, 101): | |||
for trial in range(1, num_trials + 1): | |||
file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
if not os.path.isfile(file_name): | |||
trials_complete = False | |||
@@ -44,7 +90,7 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
else: | |||
# Get data. | |||
data_group = [] | |||
for trial in range(1, 101): | |||
for trial in range(1, num_trials + 1): | |||
file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
@@ -64,7 +110,7 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
else: # Not all trials are completed. | |||
return | |||
# Write groups. | |||
if label_name == 'ged_matrix': | |||
data_group = np.array(data_group) | |||
@@ -73,31 +119,31 @@ def group_trials(dir_folder, name_prefix, override, clear, backup): | |||
else: | |||
with open(name_group, 'wb') as f: | |||
pickle.dump(data_group, f) | |||
trials_complete = True | |||
if trials_complete: | |||
# Backup. | |||
if backup: | |||
for trial in range(1, 101): | |||
for trial in range(1, num_trials + 1): | |||
src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
dst = dir_folder + 'backups/' + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
copyfile(src, dst) | |||
# Clear. | |||
if clear: | |||
for trial in range(1, 101): | |||
for trial in range(1, num_trials + 1): | |||
src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl' | |||
os.remove(src) | |||
def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): | |||
def group_all_in_folder(dir_folder, overwrite=False, clear=True, backup=True): | |||
# Create folders. | |||
os.makedirs(dir_folder + 'groups/', exist_ok=True) | |||
if backup: | |||
os.makedirs(dir_folder + 'backups', exist_ok=True) | |||
# Iterate all files. | |||
cur_file_prefix = '' | |||
for file in tqdm(sorted(os.listdir(dir_folder)), desc='Grouping', file=sys.stdout): | |||
@@ -106,20 +152,23 @@ def group_all_in_folder(dir_folder, override=False, clear=True, backup=True): | |||
# print(name) | |||
# print(name_prefix) | |||
if name_prefix != cur_file_prefix: | |||
group_trials(dir_folder, name_prefix, override, clear, backup) | |||
group_trials(dir_folder, name_prefix, overwrite, clear, backup) | |||
cur_file_prefix = name_prefix | |||
if __name__ == '__main__': | |||
dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/' | |||
group_all_in_folder(dir_folder) | |||
dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' | |||
group_all_in_folder(dir_folder) | |||
dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' | |||
group_all_in_folder(dir_folder) | |||
dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' | |||
group_all_in_folder(dir_folder) | |||
# dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/' | |||
# group_all_in_folder(dir_folder) | |||
# dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/' | |||
# group_all_in_folder(dir_folder) | |||
# dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/' | |||
# group_all_in_folder(dir_folder) | |||
# dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/' | |||
# group_all_in_folder(dir_folder) | |||
dir_folder = 'outputs/CRIANN/edit_costs.real_data.num_sols.ratios.IPFP/groups/' | |||
create_group_marker_file(dir_folder) |
@@ -15,30 +15,30 @@ def get_job_script(arg): | |||
#SBATCH --exclusive | |||
#SBATCH --job-name="st.""" + arg + r""".IPFP" | |||
#SBATCH --partition=tlong | |||
#SBATCH --partition=court | |||
#SBATCH --mail-type=ALL | |||
#SBATCH --mail-user=jajupmochi@gmail.com | |||
#SBATCH --output="outputs/output_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||
#SBATCH --error="errors/error_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||
#SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||
#SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.IPFP.""" + arg + """.txt" | |||
# | |||
#SBATCH --ntasks=1 | |||
#SBATCH --nodes=1 | |||
#SBATCH --cpus-per-task=1 | |||
#SBATCH --time=300:00:00 | |||
#SBATCH --time=48:00:00 | |||
#SBATCH --mem-per-cpu=4000 | |||
srun hostname | |||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability | |||
srun python3 edit_costs.nums_sols.ratios.IPFP.py """ + arg | |||
srun python3 edit_costs.real_data.nums_sols.ratios.IPFP.py """ + arg | |||
script = script.strip() | |||
script = re.sub('\n\t+', '\n', script) | |||
script = re.sub('\n +', '\n', script) | |||
return script | |||
if __name__ == '__main__': | |||
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'] | |||
for ds_name in [ds_list[i] for i in [0, 3]]: | |||
ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG'] | |||
for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]: | |||
job_script = get_job_script(ds_name) | |||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
# print(command) |
@@ -5,26 +5,251 @@ Created on Thu Oct 29 19:17:36 2020 | |||
@author: ljia | |||
""" | |||
from gklearn.utils import Dataset | |||
import os | |||
import pickle | |||
import numpy as np | |||
from tqdm import tqdm | |||
import sys | |||
from gklearn.dataset import Dataset | |||
from gklearn.experiments import DATASET_ROOT | |||
def get_dataset(ds_name): | |||
# The node/edge labels that will not be used in the computation. | |||
if ds_name == 'MAO': | |||
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||
elif ds_name == 'Monoterpenoides': | |||
irrelevant_labels = {'edge_labels': ['valence']} | |||
elif ds_name == 'MUTAG': | |||
irrelevant_labels = {'edge_labels': ['label_0']} | |||
elif ds_name == 'AIDS_symb': | |||
# if ds_name == 'MAO': | |||
# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||
# if ds_name == 'Monoterpenoides': | |||
# irrelevant_labels = {'edge_labels': ['valence']} | |||
# elif ds_name == 'MUTAG': | |||
# irrelevant_labels = {'edge_labels': ['label_0']} | |||
if ds_name == 'AIDS_symb': | |||
irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']} | |||
ds_name = 'AIDS' | |||
else: | |||
irrelevant_labels = {} | |||
# Initialize a Dataset. | |||
dataset = Dataset() | |||
# Load predefined dataset. | |||
dataset.load_predefined_dataset(ds_name) | |||
dataset = Dataset(ds_name, root=DATASET_ROOT) | |||
# Remove irrelevant labels. | |||
dataset.remove_labels(**irrelevant_labels) | |||
print('dataset size:', len(dataset.graphs)) | |||
return dataset | |||
return dataset | |||
def set_edit_cost_consts(ratio, node_labeled=True, edge_labeled=True, mode='uniform'): | |||
if mode == 'uniform': | |||
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1] | |||
if not node_labeled: | |||
edit_cost_constants[2] = 0 | |||
if not edge_labeled: | |||
edit_cost_constants[5] = 0 | |||
return edit_cost_constants | |||
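A quick sketch of the values returned in 'uniform' mode: the first three constants (node costs) are scaled by `ratio`, the last three (edge costs) stay at 1, and the corresponding substitution constant is zeroed when node or edge labels are absent.
```
print(set_edit_cost_consts(10, node_labeled=True, edge_labeled=True))
# [10, 10, 10, 1, 1, 1]

print(set_edit_cost_consts(10, node_labeled=True, edge_labeled=False))
# [10, 10, 10, 1, 1, 0]
```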
def nested_keys_exists(element, *keys): | |||
''' | |||
Check if *keys (nested) exists in `element` (dict). | |||
''' | |||
if not isinstance(element, dict): | |||
raise AttributeError('nested_keys_exists() expects a dict as its first argument.')
if len(keys) == 0:
raise AttributeError('nested_keys_exists() expects at least two arguments, one given.')
_element = element | |||
for key in keys: | |||
try: | |||
_element = _element[key] | |||
except KeyError: | |||
return False | |||
return True | |||
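For example, on a nested dictionary shaped like the `errors` structure built below (the values are made up):
```
errors = {'MUTAG': {'40': {'1.00': 0.02}}}
print(nested_keys_exists(errors, 'MUTAG', '40', '1.00'))  # True
print(nested_keys_exists(errors, 'MUTAG', '40', '3.00'))  # False
```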
# Check average relative error along elements in two ged matrices. | |||
def matrices_ave_relative_error(m1, m2): | |||
error = 0 | |||
base = 0 | |||
for i in range(m1.shape[0]): | |||
for j in range(m1.shape[1]): | |||
error += np.abs(m1[i, j] - m2[i, j]) | |||
base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2 | |||
return error / base | |||
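A small worked example: the matrices below differ in a single entry, so the summed absolute difference is 1 and the base (the summed mean magnitudes) is 10.5, giving a relative error of about 0.095.
```
import numpy as np

m1 = np.array([[1.0, 2.0], [3.0, 4.0]])
m2 = np.array([[1.0, 2.0], [3.0, 5.0]])
print(matrices_ave_relative_error(m1, m2))  # ~0.0952
```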
def compute_relative_error(ged_mats): | |||
if len(ged_mats) != 0: | |||
# get the smallest "correct" GED matrix. | |||
ged_mat_s = np.ones(ged_mats[0].shape) * np.inf | |||
for i in range(ged_mats[0].shape[0]): | |||
for j in range(ged_mats[0].shape[1]): | |||
ged_mat_s[i, j] = np.min([mat[i, j] for mat in ged_mats]) | |||
# compute average error. | |||
errors = [] | |||
for i, mat in enumerate(ged_mats): | |||
err = matrices_ave_relative_error(mat, ged_mat_s) | |||
# if not per_correct: | |||
# print('matrix # ', str(i)) | |||
# pass | |||
errors.append(err) | |||
else: | |||
errors = [0] | |||
return np.mean(errors) | |||
def parse_group_file_name(fn): | |||
splits_all = fn.split('.') | |||
key1 = splits_all[1] | |||
pos2 = splits_all[2].rfind('_') | |||
# key2 = splits_all[2][:pos2] | |||
val2 = splits_all[2][pos2+1:] | |||
pos3 = splits_all[3].rfind('_') | |||
# key3 = splits_all[3][:pos3] | |||
val3 = splits_all[3][pos3+1:] + '.' + splits_all[4] | |||
return key1, val2, val3 | |||
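Assuming the naming convention used by `save_trials_as_group` in the companion script (`'ged_mats.<ds_name>.num_sols_<n>.ratio_<r>.npy'`), the parser extracts the dataset name, number of solutions, and ratio:
```
fn = 'ged_mats.MUTAG.num_sols_40.ratio_0.70.npy'  # illustrative file name
print(parse_group_file_name(fn))  # ('MUTAG', '40', '0.70')
```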
def get_all_errors(save_dir, errors): | |||
# Loop for each GED matrix file. | |||
for file in tqdm(sorted(os.listdir(save_dir)), desc='Getting errors', file=sys.stdout): | |||
if os.path.isfile(os.path.join(save_dir, file)) and file.startswith('ged_mats.'): | |||
keys = parse_group_file_name(file) | |||
# Check whether this result is already in `errors`.
if not keys[0] in errors: | |||
errors[keys[0]] = {} | |||
if not keys[1] in errors[keys[0]]: | |||
errors[keys[0]][keys[1]] = {} | |||
# Compute the error if it has not been computed yet.
if not keys[2] in errors[keys[0]][keys[1]]: | |||
ged_mats = np.load(os.path.join(save_dir, file)) | |||
errors[keys[0]][keys[1]][keys[2]] = compute_relative_error(ged_mats) | |||
return errors | |||
def get_relative_errors(save_dir, overwrite=False): | |||
""" # Read relative errors from previous computed and saved file. Create the | |||
file, compute the errors, or add and save the new computed errors to the | |||
file if necessary. | |||
Parameters | |||
---------- | |||
save_dir : str
Path of the directory containing the grouped GED matrix files.
overwrite : bool, optional
If True, recompute all errors instead of loading the saved file. The default is False.
Returns | |||
------- | |||
errors : dict
Nested dictionary of relative errors, keyed by dataset name, number of solutions, and ratio.
""" | |||
fn_err = save_dir + '/relative_errors.pkl'
if not overwrite:
# If error file exists. | |||
if os.path.isfile(fn_err): | |||
with open(fn_err, 'rb') as f: | |||
errors = pickle.load(f) | |||
errors = get_all_errors(save_dir, errors) | |||
else: | |||
errors = get_all_errors(save_dir, {}) | |||
else: | |||
errors = get_all_errors(save_dir, {}) | |||
with open(fn_err, 'wb') as f: | |||
pickle.dump(errors, f) | |||
return errors | |||
def interpolate_result(Z, method='linear'): | |||
values = Z.copy() | |||
for i in range(Z.shape[0]): | |||
for j in range(Z.shape[1]): | |||
if np.isnan(Z[i, j]): | |||
# Get the nearest non-nan values. | |||
x_neg = np.nan | |||
for idx, val in enumerate(Z[i, :][j::-1]): | |||
if not np.isnan(val): | |||
x_neg = val | |||
x_neg_off = idx | |||
break | |||
x_pos = np.nan | |||
for idx, val in enumerate(Z[i, :][j:]): | |||
if not np.isnan(val): | |||
x_pos = val | |||
x_pos_off = idx | |||
break | |||
# Interpolate. | |||
if not np.isnan(x_neg) and not np.isnan(x_pos): | |||
val_int = (x_pos_off / (x_neg_off + x_pos_off)) * (x_neg - x_pos) + x_pos | |||
values[i, j] = val_int | |||
break | |||
y_neg = np.nan | |||
for idx, val in enumerate(Z[:, j][i::-1]): | |||
if not np.isnan(val): | |||
y_neg = val | |||
y_neg_off = idx | |||
break | |||
y_pos = np.nan | |||
for idx, val in enumerate(Z[:, j][i:]): | |||
if not np.isnan(val): | |||
y_pos = val | |||
y_pos_off = idx | |||
break | |||
# Interpolate. | |||
if not np.isnan(y_neg) and not np.isnan(y_pos): | |||
val_int = (y_pos_off / (y_neg_off + y_pos_off)) * (y_neg - y_pos) + y_pos
values[i, j] = val_int | |||
break | |||
return values | |||
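A minimal sketch of the interpolation on a small array: the NaN at position (0, 1) lies halfway between 1.0 and 3.0 on its row, so it is replaced with 2.0, while all other entries are returned unchanged.
```
import numpy as np

Z = np.array([[1.0, np.nan, 3.0],
              [4.0, 5.0, 6.0]])
print(interpolate_result(Z))
# [[1. 2. 3.]
#  [4. 5. 6.]]
```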
def set_axis_style(ax): | |||
ax.set_axisbelow(True) | |||
ax.spines['top'].set_visible(False) | |||
ax.spines['bottom'].set_visible(False) | |||
ax.spines['right'].set_visible(False) | |||
ax.spines['left'].set_visible(False) | |||
ax.xaxis.set_ticks_position('none') | |||
ax.yaxis.set_ticks_position('none') | |||
ax.tick_params(labelsize=8, color='w', pad=1, grid_color='w') | |||
ax.tick_params(axis='x', pad=-2) | |||
ax.tick_params(axis='y', labelrotation=-40, pad=-2) | |||
# ax.zaxis._axinfo['juggled'] = (1, 2, 0) | |||
ax.set_xlabel(ax.get_xlabel(), fontsize=10, labelpad=-3) | |||
ax.set_ylabel(ax.get_ylabel(), fontsize=10, labelpad=-2, rotation=50) | |||
ax.set_zlabel(ax.get_zlabel(), fontsize=10, labelpad=-2) | |||
ax.set_title(ax.get_title(), pad=30, fontsize=15) | |||
return | |||
if __name__ == '__main__': | |||
root_dir = 'outputs/CRIANN/' | |||
# for dir_ in sorted(os.listdir(root_dir)): | |||
# if os.path.isdir(root_dir): | |||
# full_dir = os.path.join(root_dir, dir_) | |||
# print('---', full_dir,':') | |||
# save_dir = os.path.join(full_dir, 'groups/') | |||
# if os.path.exists(save_dir): | |||
# try: | |||
# get_relative_errors(save_dir) | |||
# except Exception as exp: | |||
# print('An exception occurred when running this experiment:')
# print(repr(exp)) |
@@ -4,8 +4,15 @@ | |||
``` | |||
python3 -m pip install graphkit-learn | |||
python3 run_xp.py | |||
``` | |||
Plot the results as figures and LaTeX tables:
``` | |||
python3 ged_fit_distance_results_plot.py | |||
``` | |||
# Run xp (deprecated). | |||
``` | |||
export PYTHONPATH="/path/to/gedlibpy:/path/to/py-graph" | |||
python optim_costs.py dataset output_file | |||
``` |
@@ -10,6 +10,9 @@ import numpy as np | |||
import scipy.stats | |||
import matplotlib.pyplot as plt | |||
import matplotlib.gridspec as gridspec | |||
# import matplotlib as mpl | |||
# mpl.rcParams['text.usetex'] = True | |||
# mpl.rcParams['text.latex.preamble'] = [r'\usepackage{amsmath}'] #for \text command | |||
def rounder(x, decimals): | |||
@@ -54,7 +57,7 @@ def df_to_latex_table(df, replace_header=True, end_mid_line=7): | |||
i_end = ltx.find('\\\\\n\\midrule\n') | |||
replace = r"""\begin{tabular}{lll@{~~}c@{~~}c@{~~}c@{~~}c} | |||
\toprule | |||
\multirow{2}[2]{*}{\textbf{Dataset}} & \multirow{2}[2]{*}{\textbf{Distance}} & \multirow{2}[2]{*}{\textbf{Method}} & \multicolumn{2}{c}{\textbf{BIPARTITE}} & \multicolumn{2}{c}{\textbf{IPFP}} \\ | |||
\multirow{2}[2]{*}{\textbf{Dataset}} & \multirow{2}[2]{*}{\textbf{Distance}} & \multirow{2}[2]{*}{\textbf{Method}} & \multicolumn{2}{c}{\textbf{bipartite}} & \multicolumn{2}{c}{\textbf{IPFP}} \\ | |||
\cmidrule(lr){4-5}\cmidrule(lr){6-7} | |||
& & & \textbf{Train errors} & \textbf{Test errors} & \textbf{Train errors} & \textbf{Test errors} \\ | |||
\midrule | |||
@@ -95,6 +98,9 @@ def beautify_df(df): | |||
for idx, index in enumerate(min_indices): | |||
df.loc[(ds, gk, index), min_labels[idx]] = '\\textbf{' + df.loc[(ds, gk, index), min_labels[idx]] + '}' | |||
# Rename indices. | |||
df.index.set_levels([r'Euclidean', r'Manhattan'], level=1, inplace=True) | |||
return df | |||
@@ -118,6 +124,11 @@ def params_to_latex_table(results): | |||
df.loc[idx_r, idx_c] = '-' | |||
# df = beautify_df(df) | |||
# Rename indices. | |||
# df.index.set_levels([r'\texttt{bipartite}', r'\texttt{IPFP}'], level=1, inplace=True) | |||
df.index.set_levels([r'bipartite', r'IPFP'], level=1, inplace=True) | |||
df.index.set_levels([r'Euclidean', r'Manhattan'], level=2, inplace=True) | |||
ltx = df_to_latex_table(df, replace_header=False, end_mid_line=9) | |||
return ltx | |||
@@ -208,14 +219,11 @@ def print_table_results(results_by_xp): | |||
tab.append(["Method", "App","Test"]) | |||
#setups = ["random","expert","fitted"] | |||
for i,setup in enumerate(results_by_xp.keys()): | |||
current_line = [setup] | |||
p = results_by_xp[setup] | |||
current_line.append(f"{p['mean'][0]:.2f} +- {p['interval'][0]:.2f}") | |||
current_line.append(f"{p['mean'][1]:.2f} +- {p['interval'][1]:.2f}") | |||
tab.append(current_line) | |||
print(tabulate(tab, headers="firstrow")) | |||
@@ -342,6 +350,13 @@ def set_figure(nb_rows): | |||
return fig | |||
def get_title(edit_cost, distance): | |||
ed = 'bipartite' if edit_cost == 'BIPARTITE' else 'IPFP' | |||
# ed = r'\texttt{' + ed + r'}' | |||
dis = distance[0].upper() + distance[1:] | |||
return ed + ', ' + dis | |||
if __name__ == '__main__': | |||
from sklearn.model_selection import ParameterGrid | |||
import pickle | |||
@@ -370,7 +385,8 @@ if __name__ == '__main__': | |||
for col, contents in enumerate(row_grid_list): | |||
ax = fig.add_subplot(gs[row, col]) | |||
y_label = (ds_name[:-10] if ds_name.endswith('_unlabeled') else ds_name) if col == 0 else '' | |||
title = contents['edit_cost'] + ', ' + contents['distance'] if row == 0 else '' | |||
title = get_title(contents['edit_cost'], contents['distance']) if row == 0 else '' | |||
p, c = plot_a_task(ax, ds_name, contents['edit_cost'], contents['distance'], title, y_label) | |||
results[(ds_name, contents['distance'], contents['edit_cost'])] = p | |||
params[(ds_name, contents['distance'], contents['edit_cost'])] = c | |||
@@ -10,6 +10,7 @@ This script compares the results with and without FCSP. | |||
from gklearn.dataset import Dataset | |||
from gklearn.utils import get_graph_kernel_by_name | |||
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct | |||
from gklearn.experiments import DATASET_ROOT | |||
import functools | |||
import os | |||
import pickle | |||
@@ -17,50 +18,77 @@ import sys | |||
import logging | |||
def run_all(fcsp): | |||
save_dir = 'outputs/' + ('fscp' if fcsp == True else 'naive') + '/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
# def run_all(fcsp): | |||
# from sklearn.model_selection import ParameterGrid | |||
# Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO', | |||
# 'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens', | |||
# 'Letter-high', 'Letter-med', 'Letter-low', | |||
# 'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD', | |||
# 'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', | |||
# 'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew', | |||
# 'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY', | |||
# 'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full', | |||
# 'Mutagenicity', 'REDDIT-BINARY'] | |||
# Kernel_List = ['ShortestPath', 'StructuralSP'] | |||
# task_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]}) | |||
# for task in list(task_grid): | |||
from sklearn.model_selection import ParameterGrid | |||
# save_file_suffix = '.' + task['kernel'] + '.' + task['dataset'] | |||
# file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl') | |||
# if not os.path.isfile(file_name): | |||
# print() | |||
# print((task['kernel'], task['dataset'])) | |||
Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO', | |||
'PAH_unlabeled', 'PAH', 'MUTAG', 'Letter-high', 'Letter-med', 'Letter-low', | |||
'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD', | |||
'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', | |||
'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew', | |||
'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY', | |||
'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full', | |||
'Mutagenicity', 'REDDIT-BINARY'] | |||
# try: | |||
# gram_matrix, run_time = compute(task['kernel'], task['dataset'], fcsp) | |||
Kernel_List = ['ShortestPath', 'StructuralSP'] | |||
# except Exception as exp: | |||
# print('An exception occurred when running this experiment:')
# LOG_FILENAME = save_dir + 'error.txt' | |||
# logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
# logging.exception('\n--------------' + save_file_suffix + '------------------') | |||
# print(repr(exp)) | |||
# else: | |||
# save_file_suffix = '.' + task['kernel'] + task['dataset'] | |||
work_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]}) | |||
# with open(file_name, 'wb') as f: | |||
# pickle.dump(run_time, f) | |||
for work in list(work_grid): | |||
save_file_suffix = '.' + work['kernel'] + '.' + work['dataset'] | |||
file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl') | |||
if not os.path.isfile(file_name): | |||
print() | |||
print((work['kernel'], work['dataset'])) | |||
try: | |||
gram_matrix, run_time = run_work(work['kernel'], work['dataset'], fcsp) | |||
except Exception as exp: | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt' | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception(save_file_suffix) | |||
print(repr(exp)) | |||
def run_task(kernel_name, ds_name, fcsp): | |||
save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp) | |||
file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl') | |||
save_file_suffix = '.' + work['kernel'] + work['dataset'] | |||
if not os.path.isfile(file_name): | |||
print() | |||
print((kernel_name, ds_name, str(fcsp))) | |||
try: | |||
gram_matrix, run_time = compute(kernel_name, ds_name, fcsp) | |||
except Exception as exp: | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = os.path.join(save_dir, 'error' + save_file_suffix + '.txt') | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception('\n--------------' + save_file_suffix + '------------------') | |||
print(repr(exp)) | |||
else: | |||
with open(file_name, 'wb') as f: | |||
pickle.dump(run_time, f) | |||
def run_work(kernel_name, ds_name, fcsp): | |||
dataset = Dataset(ds_name, verbose=True) | |||
def compute(kernel_name, ds_name, fcsp): | |||
dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True) | |||
if kernel_name == 'ShortestPath': | |||
dataset.trim_dataset(edge_required=True) | |||
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} | |||
@@ -87,8 +115,15 @@ def run_work(kernel_name, ds_name, fcsp): | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
fcsp = True if sys.argv[1] == 'True' else False | |||
kernel_name = sys.argv[1] | |||
ds_name = sys.argv[2] | |||
fcsp = True if sys.argv[3] == 'True' else False | |||
else: | |||
kernel_name = 'ShortestPath' | |||
ds_name = 'Acyclic' | |||
fcsp = True | |||
run_all(fcsp) | |||
save_dir = 'outputs/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
run_task(kernel_name, ds_name, fcsp) |
@@ -0,0 +1,98 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Wed Dec 2 17:41:54 2020 | |||
@author: ljia | |||
This script compares the results with and without FCSP. | |||
""" | |||
from gklearn.dataset import Dataset | |||
from shortest_path import SPSpace | |||
from structural_sp import SSPSpace | |||
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct | |||
from gklearn.experiments import DATASET_ROOT | |||
import functools | |||
import os | |||
import pickle | |||
import sys | |||
import logging | |||
def run_task(kernel_name, ds_name, fcsp): | |||
save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp) | |||
file_name = os.path.join(save_dir, 'space' + save_file_suffix + '.pkl') | |||
# Return if the task is already completed. | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
data = pickle.load(f) | |||
if data['completed']: | |||
return | |||
print() | |||
print((kernel_name, ds_name, str(fcsp))) | |||
try: | |||
gram_matrix, run_time = compute(kernel_name, ds_name, fcsp, file_name) | |||
except Exception as exp: | |||
print('An exception occurred when running this experiment:')
LOG_FILENAME = os.path.join(save_dir, 'error.space' + save_file_suffix + '.txt') | |||
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) | |||
logging.exception('\n--------------' + save_file_suffix + '------------------') | |||
print(repr(exp)) | |||
# else: | |||
# with open(file_name, 'wb') as f: | |||
# pickle.dump(run_time, f) | |||
def compute(kernel_name, ds_name, fcsp, file_name): | |||
dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True) | |||
if kernel_name == 'ShortestPath': | |||
dataset.trim_dataset(edge_required=True) | |||
# dataset.cut_graphs(range(0, 10)) | |||
kernel_class = SPSpace | |||
else: | |||
# dataset.cut_graphs(range(0, 10)) | |||
kernel_class = SSPSpace | |||
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) | |||
node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} | |||
edge_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} | |||
graph_kernel = kernel_class(name=kernel_name, | |||
node_labels=dataset.node_labels, | |||
edge_labels=dataset.edge_labels, | |||
node_attrs=dataset.node_attrs, | |||
edge_attrs=dataset.edge_attrs, | |||
ds_infos=dataset.get_dataset_infos(keys=['directed']), | |||
fcsp=fcsp, | |||
compute_method='naive', | |||
node_kernels=node_kernels, | |||
edge_kernels=edge_kernels, | |||
file_name=file_name | |||
) | |||
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, | |||
parallel=None, | |||
normalize=False, | |||
verbose=2 | |||
) | |||
return gram_matrix, run_time | |||
if __name__ == '__main__': | |||
if len(sys.argv) > 1: | |||
kernel_name = sys.argv[1] | |||
ds_name = sys.argv[2] | |||
fcsp = True if sys.argv[3] == 'True' else False | |||
else: | |||
kernel_name = 'StructuralSP' | |||
ds_name = 'Fingerprint' | |||
fcsp = True | |||
save_dir = 'outputs/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
run_task(kernel_name, ds_name, fcsp) |
@@ -10,27 +10,86 @@ import os | |||
import re | |||
def get_job_script(param): | |||
OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'), | |||
('StructuralSP', 'ENZYMES', 'True'), | |||
('StructuralSP', 'ENZYMES', 'False'), | |||
('StructuralSP', 'AIDS', 'False'), | |||
('ShortestPath', 'NCI1', 'False'), | |||
('StructuralSP', 'NCI1', 'True'), | |||
('StructuralSP', 'NCI1', 'False'), | |||
('ShortestPath', 'NCI109', 'False'), | |||
('StructuralSP', 'NCI109', 'True'), | |||
('StructuralSP', 'NCI109', 'False'), | |||
('ShortestPath', 'DD', 'True'), | |||
('ShortestPath', 'DD', 'False'), | |||
('StructuralSP', 'BZR', 'False'), | |||
('ShortestPath', 'COX2', 'False'), | |||
('StructuralSP', 'COX2', 'False'), | |||
('ShortestPath', 'DHFR', 'False'), | |||
('StructuralSP', 'DHFR', 'False'), | |||
('StructuralSP', 'OHSU', 'True'), | |||
('StructuralSP', 'OHSU', 'False'), | |||
('StructuralSP', 'SYNTHETIC', 'False'), | |||
('StructuralSP', 'SYNTHETIC', 'True'), | |||
('StructuralSP', 'SYNTHETIC', 'False'), | |||
('ShortestPath', 'SYNTHETICnew', 'False'), | |||
('StructuralSP', 'SYNTHETICnew', 'True'), | |||
('StructuralSP', 'SYNTHETICnew', 'False'), | |||
('ShortestPath', 'Synthie', 'False'), | |||
('StructuralSP', 'Synthie', 'True'), | |||
('StructuralSP', 'Synthie', 'False'), | |||
('ShortestPath', 'COIL-DEL', 'False'), | |||
('StructuralSP', 'COIL-DEL', 'True'), | |||
('StructuralSP', 'COIL-DEL', 'False'), | |||
('ShortestPath', 'PROTEINS', 'False'), | |||
('ShortestPath', 'PROTEINS_full', 'False'), | |||
('StructuralSP', 'Mutagenicity', 'True'), | |||
('StructuralSP', 'Mutagenicity', 'False'), | |||
('StructuralSP', 'REDDIT-BINARY', 'True'), | |||
('StructuralSP', 'REDDIT-BINARY', 'False'), | |||
}) | |||
OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'), | |||
('StructuralSP', 'DD', 'False'), | |||
('StructuralSP', 'PROTEINS', 'True'), | |||
('StructuralSP', 'PROTEINS', 'False'), | |||
('StructuralSP', 'PROTEINS_full', 'True'), | |||
('StructuralSP', 'PROTEINS_full', 'False'), | |||
('ShortestPath', 'REDDIT-BINARY', 'True'), | |||
('ShortestPath', 'TWITTER-Real-Graph-Partial', 'True'), | |||
('ShortestPath', 'TWITTER-Real-Graph-Partial', 'False'), | |||
('StructuralSP', 'TWITTER-Real-Graph-Partial', 'True'), | |||
}) | |||
MISS_LABEL_LIST = set({('StructuralSP', 'GREC', 'True'), | |||
('StructuralSP', 'GREC', 'False'), | |||
('StructuralSP', 'Web', 'True'), | |||
('StructuralSP', 'Web', 'False'), | |||
}) | |||
def get_job_script(kernel, dataset, fcsp): | |||
script = r""" | |||
#!/bin/bash | |||
#SBATCH --exclusive | |||
#SBATCH --job-name="fcsp.""" + param + r"""" | |||
#SBATCH --partition=long | |||
##SBATCH --exclusive | |||
#SBATCH --job-name="fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r"""" | |||
#SBATCH --partition=tlong | |||
#SBATCH --mail-type=ALL | |||
#SBATCH --mail-user=jajupmochi@gmail.com | |||
#SBATCH --output="outputs/output_fcsp.""" + param + r""".txt" | |||
#SBATCH --error="errors/error_fcsp.""" + param + r""".txt" | |||
#SBATCH --output="outputs/output_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt" | |||
#SBATCH --error="errors/error_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt" | |||
# | |||
#SBATCH --ntasks=1 | |||
#SBATCH --nodes=1 | |||
#SBATCH --cpus-per-task=1 | |||
#SBATCH --time=100:00:00 | |||
#SBATCH --mem-per-cpu=4000 | |||
#SBATCH --time=300:00:00 | |||
##SBATCH --mem-per-cpu=4000 | |||
#SBATCH --mem=40000 | |||
srun hostname | |||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp | |||
srun python3 compare_fcsp.py """ + param | |||
srun python3 compare_fcsp.py """ + kernel + r" " + dataset + r" " + fcsp | |||
script = script.strip() | |||
script = re.sub('\n\t+', '\n', script) | |||
script = re.sub('\n +', '\n', script) | |||
@@ -38,15 +97,83 @@ srun python3 compare_fcsp.py """ + param | |||
return script | |||
def check_task_status(save_dir, *params): | |||
str_task_id = '.' + '.'.join(params) | |||
# Check if the task is in the out-of-memory, out-of-time, or missing-label lists.
if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST: | |||
return True | |||
# Check if the task is running or in queue of slurm. | |||
command = 'squeue --user $USER --name "fcsp' + str_task_id + '" --format "%.2t" --noheader' | |||
stream = os.popen(command) | |||
output = stream.readlines() | |||
if len(output) > 0: | |||
return True | |||
# Check if 10 or more tlong tasks are already running or queued.
command = 'squeue --user $USER --partition tlong --noheader' | |||
stream = os.popen(command) | |||
output = stream.readlines() | |||
if len(output) >= 10: | |||
return True | |||
# Check if the results are already computed. | |||
file_name = os.path.join(save_dir, 'run_time' + str_task_id + '.pkl') | |||
if os.path.isfile(file_name): | |||
return True | |||
return False | |||
if __name__ == '__main__': | |||
save_dir = 'outputs/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs('outputs/', exist_ok=True) | |||
os.makedirs('errors/', exist_ok=True) | |||
param_list = ['True', 'False'] | |||
for param in param_list[:]: | |||
job_script = get_job_script(param) | |||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
# print(command) | |||
os.system(command) | |||
from sklearn.model_selection import ParameterGrid | |||
Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO', | |||
'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens', | |||
'Letter-high', 'Letter-med', 'Letter-low', | |||
'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD', | |||
# new: not so large. | |||
'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D', | |||
'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY', | |||
'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2', | |||
'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC', | |||
# new: large. | |||
'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7', | |||
'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H', | |||
'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H', | |||
'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H', | |||
'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH', | |||
'COLORS-3', 'DBLP_v1', 'REDDIT-MULTI-12K', | |||
'REDDIT-MULTI-12K', 'REDDIT-MULTI-12K', | |||
'REDDIT-MULTI-12K', 'MSRC_9', 'MSRC_21', 'MSRC_21C', | |||
'COLLAB', 'COIL-DEL', | |||
'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity', | |||
'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K', | |||
'REDDIT-MULTI-12K'] | |||
Kernel_List = ['ShortestPath', 'StructuralSP'] | |||
fcsp_list = ['True', 'False'] | |||
task_grid = ParameterGrid({'kernel': Kernel_List[:], | |||
'dataset': Dataset_List[:], | |||
'fcsp': fcsp_list[:]}) | |||
from tqdm import tqdm | |||
for task in tqdm(list(task_grid), desc='submitting tasks/jobs'): | |||
if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp']) | |||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
# print(command) | |||
os.system(command) | |||
# os.popen(command) | |||
# output = stream.readlines() | |||
# output = stream.readlines() |
@@ -0,0 +1,268 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Mon Dec 14 11:49:43 2020 | |||
@author: ljia | |||
""" | |||
import os | |||
import re | |||
import pickle | |||
OUT_TIME_LIST = [] | |||
OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'), | |||
('ShortestPath', 'REDDIT-BINARY', 'False'), | |||
('StructuralSP', 'ENZYMES', 'False'), | |||
('ShortestPath', 'DD', 'True'), | |||
('ShortestPath', 'DD', 'False'), | |||
('StructuralSP', 'DD', 'True'), | |||
('StructuralSP', 'DD', 'False'), | |||
('StructuralSP', 'COIL-DEL', 'True'), | |||
('ShortestPath', 'COLORS-3', 'True'), | |||
('ShortestPath', 'COLORS-3', 'False'), | |||
('StructuralSP', 'COLORS-3', 'True'), | |||
('StructuralSP', 'COLORS-3', 'False'), | |||
('StructuralSP', 'PROTEINS', 'True'), | |||
('StructuralSP', 'PROTEINS', 'False'), | |||
('StructuralSP', 'PROTEINS_full', 'True'), | |||
('StructuralSP', 'PROTEINS_full', 'False'), | |||
('StructuralSP', 'MSRC_21', 'False'), | |||
('ShortestPath', 'MCF-7', 'True'), | |||
('ShortestPath', 'MCF-7', 'False'), | |||
('StructuralSP', 'MCF-7', 'True'), | |||
('StructuralSP', 'MCF-7', 'False'), | |||
('ShortestPath', 'MCF-7H', 'True'), | |||
('ShortestPath', 'MCF-7H', 'False'), | |||
('StructuralSP', 'MCF-7H', 'True'), | |||
('StructuralSP', 'MCF-7H', 'False'), | |||
('ShortestPath', 'MOLT-4', 'True'), | |||
('ShortestPath', 'MOLT-4', 'False'), | |||
('StructuralSP', 'MOLT-4', 'True'), | |||
('StructuralSP', 'MOLT-4', 'False'), | |||
('ShortestPath', 'MOLT-4H', 'True'), | |||
('ShortestPath', 'MOLT-4H', 'False'), | |||
('StructuralSP', 'MOLT-4H', 'True'), | |||
('StructuralSP', 'MOLT-4H', 'False'), | |||
('ShortestPath', 'P388', 'True'), | |||
('ShortestPath', 'P388', 'False'), | |||
('StructuralSP', 'P388', 'True'), | |||
('StructuralSP', 'P388', 'False'), | |||
('ShortestPath', 'P388H', 'True'), | |||
('ShortestPath', 'P388H', 'False'), | |||
('StructuralSP', 'P388H', 'True'), | |||
('StructuralSP', 'P388H', 'False'), | |||
('ShortestPath', 'NCI-H23', 'True'), | |||
('ShortestPath', 'NCI-H23', 'False'), | |||
('StructuralSP', 'NCI-H23', 'True'), | |||
('StructuralSP', 'NCI-H23', 'False'), | |||
('ShortestPath', 'NCI-H23H', 'True'), | |||
('ShortestPath', 'NCI-H23H', 'False'), | |||
('StructuralSP', 'NCI-H23H', 'True'), | |||
('StructuralSP', 'NCI-H23H', 'False'), | |||
('ShortestPath', 'OVCAR-8', 'True'), | |||
('ShortestPath', 'OVCAR-8', 'False'), | |||
('StructuralSP', 'OVCAR-8', 'True'), | |||
('StructuralSP', 'OVCAR-8', 'False'), | |||
('ShortestPath', 'OVCAR-8H', 'True'), | |||
('ShortestPath', 'OVCAR-8H', 'False'), | |||
('StructuralSP', 'OVCAR-8H', 'True'), | |||
('StructuralSP', 'OVCAR-8H', 'False'), | |||
('ShortestPath', 'SN12C', 'True'), | |||
('ShortestPath', 'SN12C', 'False'), | |||
('StructuralSP', 'SN12C', 'True'), | |||
('StructuralSP', 'SN12C', 'False'), | |||
('ShortestPath', 'SN12CH', 'True'), | |||
('ShortestPath', 'SN12CH', 'False'), | |||
('ShortestPath', 'SF-295', 'True'), | |||
('ShortestPath', 'SF-295', 'False'), | |||
('StructuralSP', 'SF-295', 'True'), | |||
('StructuralSP', 'SF-295', 'False'), | |||
('ShortestPath', 'SF-295H', 'True'), | |||
('ShortestPath', 'SF-295H', 'False'), | |||
('StructuralSP', 'SF-295H', 'True'), | |||
('StructuralSP', 'SF-295H', 'False'), | |||
('ShortestPath', 'SW-620', 'True'), | |||
('ShortestPath', 'SW-620', 'False'), | |||
('StructuralSP', 'SW-620', 'True'), | |||
('StructuralSP', 'SW-620', 'False'), | |||
('ShortestPath', 'SW-620H', 'True'), | |||
('ShortestPath', 'SW-620H', 'False'), | |||
('StructuralSP', 'SW-620H', 'True'), | |||
('StructuralSP', 'SW-620H', 'False'), | |||
('ShortestPath', 'TRIANGLES', 'True'), | |||
('ShortestPath', 'TRIANGLES', 'False'), | |||
('StructuralSP', 'TRIANGLES', 'True'), | |||
('StructuralSP', 'TRIANGLES', 'False'), | |||
('ShortestPath', 'Yeast', 'True'), | |||
('ShortestPath', 'Yeast', 'False'), | |||
('StructuralSP', 'Yeast', 'True'), | |||
('StructuralSP', 'Yeast', 'False'), | |||
('ShortestPath', 'YeastH', 'True'), | |||
('ShortestPath', 'YeastH', 'False'), | |||
('StructuralSP', 'YeastH', 'True'), | |||
('StructuralSP', 'YeastH', 'False'), | |||
('ShortestPath', 'FRANKENSTEIN', 'True'), | |||
('ShortestPath', 'FRANKENSTEIN', 'False'), | |||
('StructuralSP', 'FRANKENSTEIN', 'True'), | |||
('StructuralSP', 'FRANKENSTEIN', 'False'), | |||
('StructuralSP', 'SN12CH', 'True'), | |||
('StructuralSP', 'SN12CH', 'False'), | |||
('ShortestPath', 'UACC257', 'True'), | |||
('ShortestPath', 'UACC257', 'False'), | |||
('StructuralSP', 'UACC257', 'True'), | |||
('StructuralSP', 'UACC257', 'False'), | |||
('ShortestPath', 'UACC257H', 'True'), | |||
('ShortestPath', 'UACC257H', 'False'), | |||
('StructuralSP', 'UACC257H', 'True'), | |||
('StructuralSP', 'UACC257H', 'False'), | |||
('ShortestPath', 'PC-3', 'True'), | |||
('ShortestPath', 'PC-3', 'False'), | |||
('StructuralSP', 'PC-3', 'True'), | |||
('StructuralSP', 'PC-3', 'False'), | |||
('ShortestPath', 'PC-3H', 'True'), | |||
('ShortestPath', 'PC-3H', 'False'), | |||
('StructuralSP', 'PC-3H', 'True'), | |||
('StructuralSP', 'PC-3H', 'False'), | |||
('ShortestPath', 'DBLP_v1', 'True'), | |||
('ShortestPath', 'DBLP_v1', 'False'), | |||
('StructuralSP', 'DBLP_v1', 'True'), | |||
('ShortestPath', 'COLLAB', 'True'), | |||
('ShortestPath', 'COLLAB', 'False'), | |||
('StructuralSP', 'COLLAB', 'True'), | |||
('StructuralSP', 'COLLAB', 'False'), | |||
('ShortestPath', 'REDDIT-BINARY', 'False'), | |||
('StructuralSP', 'REDDIT-BINARY', 'True'), | |||
('StructuralSP', 'REDDIT-BINARY', 'False'), | |||
('ShortestPath', 'REDDIT-MULTI-5K', 'True'), | |||
('ShortestPath', 'REDDIT-MULTI-5K', 'False'), | |||
('StructuralSP', 'REDDIT-MULTI-5K', 'True'), | |||
('StructuralSP', 'REDDIT-MULTI-5K', 'False'), | |||
('ShortestPath', 'REDDIT-MULTI-12K', 'True'), | |||
('ShortestPath', 'REDDIT-MULTI-12K', 'False'), | |||
('StructuralSP', 'REDDIT-MULTI-12K', 'True'), | |||
('StructuralSP', 'REDDIT-MULTI-12K', 'False'), | |||
('ShortestPath', 'TWITTER-Real-Graph-Partial', 'True'), | |||
('ShortestPath', 'TWITTER-Real-Graph-Partial', 'False'), | |||
('StructuralSP', 'TWITTER-Real-Graph-Partial', 'True'), | |||
('StructuralSP', 'TWITTER-Real-Graph-Partial', 'False'), | |||
}) | |||
MISS_LABEL_LIST = set({('StructuralSP', 'GREC', 'True'), | |||
('StructuralSP', 'GREC', 'False'), | |||
('StructuralSP', 'Web', 'True'), | |||
('StructuralSP', 'Web', 'False'), | |||
}) | |||
def get_job_script(kernel, dataset, fcsp): | |||
# if (kernel, dataset, fcsp) in OUT_MEM_LIST: | |||
# mem = '2560000' | |||
# else: | |||
mem = '4000' | |||
script = r""" | |||
#!/bin/bash | |||
##SBATCH --exclusive | |||
#SBATCH --job-name="fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r"""" | |||
#SBATCH --partition=""" + (r"court" if kernel == 'ShortestPath' else r"court") + r""" | |||
#SBATCH --mail-type=ALL | |||
#SBATCH --mail-user=jajupmochi@gmail.com | |||
#SBATCH --output="outputs/output_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt" | |||
#SBATCH --error="errors/error_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt" | |||
# | |||
#SBATCH --ntasks=1 | |||
#SBATCH --nodes=1 | |||
#SBATCH --cpus-per-task=1 | |||
#SBATCH --time=""" + (r"48" if kernel == 'ShortestPath' else r"48") + r""":00:00 | |||
##SBATCH --mem-per-cpu=""" + mem + r""" | |||
#SBATCH --mem=4000 | |||
srun hostname | |||
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp | |||
srun python3 compare_fcsp_space.py """ + kernel + r" " + dataset + r" " + fcsp | |||
script = script.strip() | |||
script = re.sub('\n\t+', '\n', script) | |||
script = re.sub('\n +', '\n', script) | |||
return script | |||
def check_task_status(save_dir, *params): | |||
str_task_id = '.' + '.'.join(params) | |||
# Check if the task is in the out-of-memory, out-of-time, or missing-label lists.
if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST: | |||
return True | |||
# Check if the task is running or in queue of slurm. | |||
command = 'squeue --user $USER --name "fcsp.space' + str_task_id + '" --format "%.2t" --noheader' | |||
stream = os.popen(command) | |||
output = stream.readlines() | |||
if len(output) > 0: | |||
return True | |||
# Check if the task is already computed. | |||
file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl') | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
data = pickle.load(f) | |||
if data['completed']: | |||
return True | |||
return False | |||
if __name__ == '__main__': | |||
save_dir = 'outputs/' | |||
os.makedirs(save_dir, exist_ok=True) | |||
os.makedirs('outputs/', exist_ok=True) | |||
os.makedirs('errors/', exist_ok=True) | |||
from sklearn.model_selection import ParameterGrid | |||
Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO', | |||
'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens', | |||
'Letter-high', 'Letter-med', 'Letter-low', | |||
'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD', | |||
# new: not so large. | |||
'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D', | |||
'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY', | |||
'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2', | |||
'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC', | |||
# new: large. | |||
'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7', | |||
'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H', | |||
'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H', | |||
'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H', | |||
'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH', | |||
'COLORS-3', 'DBLP_v1', 'REDDIT-MULTI-12K', | |||
'REDDIT-MULTI-12K', 'REDDIT-MULTI-12K', | |||
'REDDIT-MULTI-12K', 'MSRC_9', 'MSRC_21', 'MSRC_21C', | |||
'COLLAB', 'COIL-DEL', | |||
'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity', | |||
'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K', | |||
'REDDIT-MULTI-12K'] | |||
Kernel_List = ['ShortestPath', 'StructuralSP'] | |||
fcsp_list = ['True', 'False'] | |||
task_grid = ParameterGrid({'kernel': Kernel_List[:], | |||
'dataset': Dataset_List[:], | |||
'fcsp': fcsp_list[:]}) | |||
from tqdm import tqdm | |||
for task in tqdm(list(task_grid), desc='submitting tasks/jobs'): | |||
if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp']) | |||
command = 'sbatch <<EOF\n' + job_script + '\nEOF' | |||
# print(command) | |||
os.system(command) | |||
# os.popen(command) | |||
# output = stream.readlines() |
@@ -0,0 +1,253 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Tue Apr 7 15:24:58 2020 | |||
@author: ljia | |||
@references: | |||
[1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData | |||
Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE. | |||
""" | |||
import sys | |||
from itertools import product | |||
# from functools import partial | |||
from gklearn.utils import get_iters | |||
import numpy as np | |||
from gklearn.utils.utils import getSPGraph | |||
from gklearn.kernels import ShortestPath | |||
import os | |||
import pickle | |||
from pympler import asizeof | |||
import time | |||
import networkx as nx | |||
def load_results(file_name, fcsp): | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
return pickle.load(f) | |||
else: | |||
results = {'nb_comparison': [], 'i': -1, 'j': -1, 'completed': False} | |||
if fcsp: | |||
results['vk_dict_mem'] = [] | |||
return results | |||
def save_results(file_name, results): | |||
with open(file_name, 'wb') as f: | |||
pickle.dump(results, f) | |||
def estimate_vk_memory(obj, nb_nodes1, nb_nodes2): | |||
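"""Roughly estimate the memory footprint of the vertex-kernel dict built by FCSP.
The byte counts below are hand-tuned estimates (assuming 64-bit CPython): the flat
dict size from sys.getsizeof, a fixed per-key cost, 24 bytes per float value, and
28 bytes per node id. A sketch only; pympler.asizeof gives an exact but slower figure.
"""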
# asizeof.asized(obj, detail=1).format() | |||
# return asizeof.asizeof(obj) | |||
key, val = next(iter(obj.items())) | |||
# key = dict.iterkeys().next() | |||
# key_mem = asizeof.asizeof(key) | |||
dict_flat = sys.getsizeof(obj) | |||
key_mem = 64 | |||
if isinstance(val, float): | |||
val_mem = 24 | |||
mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2) | |||
else: # value is True or False | |||
mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2) | |||
# print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n') | |||
return mem | |||
def compute_stats(file_name, results): | |||
del results['i'] | |||
del results['j'] | |||
results['nb_comparison'] = np.mean(results['nb_comparison']) | |||
results['completed'] = True | |||
if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0: | |||
results['vk_dict_mem'] = np.mean(results['vk_dict_mem']) | |||
save_results(file_name, results) | |||
class SPSpace(ShortestPath): | |||
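"""Instrumented variant of ShortestPath (intent inferred from the code below): it
counts node-kernel comparisons and estimates the FCSP dict memory instead of filling
the Gram matrix (which is returned as zeros), checkpointing statistics to `file_name`.
"""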
def __init__(self, **kwargs): | |||
super().__init__(**kwargs) | |||
self._file_name = kwargs.get('file_name') | |||
# @profile | |||
def _compute_gm_series(self): | |||
self._all_graphs_have_edges(self._graphs) | |||
# get shortest path graph of each graph. | |||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] | |||
results = load_results(self._file_name, self._fcsp) | |||
# compute Gram matrix. | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
from itertools import combinations_with_replacement | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', | |||
length=len_itr, file=sys.stdout,verbose=(self._verbose >= 2)) | |||
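# Resume from the last checkpointed pair (i, j) and dump partial results every
# 600 s, so an interrupted (e.g. slurm) job can restart without redoing work.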
time0 = time.time() | |||
for i, j in iterator: | |||
if i > results['i'] or (i == results['i'] and j > results['j']): | |||
data = self._sp_do_space(self._graphs[i], self._graphs[j]) | |||
if self._fcsp: | |||
results['nb_comparison'].append(data[0]) | |||
if data[1] != {}: | |||
results['vk_dict_mem'].append(estimate_vk_memory(data[1], | |||
nx.number_of_nodes(self._graphs[i]), | |||
nx.number_of_nodes(self._graphs[j]))) | |||
else: | |||
results['nb_comparison'].append(data) | |||
results['i'] = i | |||
results['j'] = j | |||
time1 = time.time() | |||
if time1 - time0 > 600: | |||
save_results(self._file_name, results) | |||
time0 = time1 | |||
compute_stats(self._file_name, results) | |||
return gram_matrix | |||
def _sp_do_space(self, g1, g2): | |||
if self._fcsp: # @todo: it may be put outside the _sp_do(). | |||
return self._sp_do_fcsp(g1, g2) | |||
else: | |||
return self._sp_do_naive(g1, g2) | |||
def _sp_do_fcsp(self, g1, g2): | |||
nb_comparison = 0 | |||
# compute shortest path matrices first, method borrowed from FCSP. | |||
vk_dict = {} # shortest path matrices dict | |||
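# FCSP precomputes the kernel between every pair of vertices exactly once; the
# (commented-out) graph-kernel part below would then only look values up in vk_dict.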
if len(self._node_labels) > 0: # @todo: it may be put outside the _sp_do(). | |||
# node symb and non-symb labeled
if len(self._node_attrs) > 0: | |||
kn = self._node_kernels['mix'] | |||
for n1, n2 in product( | |||
g1.nodes(data=True), g2.nodes(data=True)): | |||
n1_labels = [n1[1][nl] for nl in self._node_labels] | |||
n2_labels = [n2[1][nl] for nl in self._node_labels] | |||
n1_attrs = [n1[1][na] for na in self._node_attrs] | |||
n2_attrs = [n2[1][na] for na in self._node_attrs] | |||
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) | |||
nb_comparison += 1 | |||
# node symb labeled | |||
else: | |||
kn = self._node_kernels['symb'] | |||
for n1 in g1.nodes(data=True): | |||
for n2 in g2.nodes(data=True): | |||
n1_labels = [n1[1][nl] for nl in self._node_labels] | |||
n2_labels = [n2[1][nl] for nl in self._node_labels] | |||
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) | |||
nb_comparison += 1 | |||
else: | |||
# node non-symb labeled
if len(self._node_attrs) > 0: | |||
kn = self._node_kernels['nsymb'] | |||
for n1 in g1.nodes(data=True): | |||
for n2 in g2.nodes(data=True): | |||
n1_attrs = [n1[1][na] for na in self._node_attrs] | |||
n2_attrs = [n2[1][na] for na in self._node_attrs] | |||
vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) | |||
nb_comparison += 1 | |||
# node unlabeled | |||
else: | |||
for e1, e2 in product( | |||
g1.edges(data=True), g2.edges(data=True)): | |||
pass | |||
# if e1[2]['cost'] == e2[2]['cost']: | |||
# kernel += 1 | |||
# nb_comparison += 1 | |||
return nb_comparison, vk_dict | |||
# # compute graph kernels | |||
# if self._ds_infos['directed']: | |||
# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
# if e1[2]['cost'] == e2[2]['cost']: | |||
# nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])] | |||
# kn1 = nk11 * nk22 | |||
# kernel += kn1 | |||
# else: | |||
# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
# if e1[2]['cost'] == e2[2]['cost']: | |||
# # each edge walk is counted twice, starting from both its extreme nodes. | |||
# nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( | |||
# e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])] | |||
# kn1 = nk11 * nk22 | |||
# kn2 = nk12 * nk21 | |||
# kernel += kn1 + kn2 | |||
def _sp_do_naive(self, g1, g2): | |||
nb_comparison = 0 | |||
# Define the function to compute kernels between vertices in each condition. | |||
if len(self._node_labels) > 0: | |||
# node symb and non-symb labeled
if len(self._node_attrs) > 0: | |||
def compute_vk(n1, n2): | |||
kn = self._node_kernels['mix'] | |||
n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels] | |||
n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels] | |||
n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs] | |||
n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs] | |||
return kn(n1_labels, n2_labels, n1_attrs, n2_attrs) | |||
# node symb labeled | |||
else: | |||
def compute_vk(n1, n2): | |||
kn = self._node_kernels['symb'] | |||
n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels] | |||
n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels] | |||
return kn(n1_labels, n2_labels) | |||
else: | |||
# node non-symb labeled
if len(self._node_attrs) > 0: | |||
def compute_vk(n1, n2): | |||
kn = self._node_kernels['nsymb'] | |||
n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs] | |||
n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs] | |||
return kn(n1_attrs, n2_attrs) | |||
# node unlabeled | |||
else: | |||
# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
# if e1[2]['cost'] == e2[2]['cost']: | |||
# kernel += 1 | |||
return 0 | |||
# compute graph kernels | |||
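# Only count the vertex-kernel evaluations the naive method would perform (2 per
# matching edge pair for directed graphs, 4 for undirected, since each edge walk is
# counted from both end nodes); the kernel values themselves stay commented out in
# this space-analysis variant.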
if self._ds_infos['directed']: | |||
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
if e1[2]['cost'] == e2[2]['cost']: | |||
# nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1]) | |||
# kn1 = nk11 * nk22 | |||
# kernel += kn1 | |||
nb_comparison += 2 | |||
else: | |||
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
if e1[2]['cost'] == e2[2]['cost']: | |||
# each edge walk is counted twice, starting from both its extreme nodes. | |||
# nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk( | |||
# e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1]) | |||
# kn1 = nk11 * nk22 | |||
# kn2 = nk12 * nk21 | |||
# kernel += kn1 + kn2 | |||
nb_comparison += 4 | |||
return nb_comparison |
@@ -0,0 +1,439 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Mon Mar 30 11:59:57 2020 | |||
@author: ljia | |||
@references: | |||
[1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For | |||
Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). | |||
""" | |||
import sys | |||
from itertools import product | |||
from gklearn.utils import get_iters | |||
import numpy as np | |||
import time | |||
import os, errno | |||
import pickle | |||
from pympler import asizeof | |||
import networkx as nx | |||
from gklearn.utils.utils import get_shortest_paths | |||
from gklearn.kernels import StructuralSP | |||
def load_splist(file_name): | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
return pickle.load(f) | |||
else: | |||
results_path = {'splist': [], 'i': -1, 'completed': False} | |||
return results_path | |||
def load_results(file_name, fcsp): | |||
if os.path.isfile(file_name): | |||
with open(file_name, 'rb') as f: | |||
return pickle.load(f) | |||
else: | |||
results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False} | |||
if fcsp: | |||
results['vk_dict_mem'] = [] | |||
results['ek_dict_mem'] = [] | |||
return results | |||
def save_results(file_name, results): | |||
with open(file_name, 'wb') as f: | |||
pickle.dump(results, f) | |||
def estimate_vk_memory(obj, nb_nodes1, nb_nodes2): | |||
# asizeof.asized(obj, detail=1).format() | |||
# return asizeof.asizeof(obj) | |||
key, val = next(iter(obj.items())) | |||
# key = dict.iterkeys().next() | |||
# key_mem = asizeof.asizeof(key) | |||
dict_flat = sys.getsizeof(obj) | |||
key_mem = 64 | |||
if isinstance(val, float): | |||
val_mem = 24 | |||
mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2) | |||
else: # value is True or False | |||
mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2) | |||
# print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n') | |||
return mem | |||
def estimate_ek_memory(obj, nb_nodes1, nb_nodes2): | |||
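"""Same rough estimate as estimate_vk_memory, but for the edge-kernel dict; the
assumed per-key cost is larger since each key holds a pair of edges (four node ids).
"""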
# asizeof.asized(obj, detail=1).format() | |||
# return asizeof.asizeof(obj) | |||
key, val = next(iter(obj.items())) | |||
# key = dict.iterkeys().next() | |||
# key_mem = asizeof.asizeof(key) | |||
dict_flat = sys.getsizeof(obj) | |||
key_mem = 192 | |||
if isinstance(val, float): | |||
val_mem = 24 | |||
mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2) | |||
else: # value is True or False | |||
mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2) | |||
# print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n') | |||
return mem | |||
def compute_stats(file_name, results, splist): | |||
del results['i'] | |||
del results['j'] | |||
results['nb_v_comparison'] = np.mean(results['nb_v_comparison']) | |||
# if len(results['nb_e_comparison']) > 0: | |||
results['nb_e_comparison'] = np.mean(results['nb_e_comparison']) | |||
results['completed'] = True | |||
if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0: | |||
results['vk_dict_mem'] = np.mean(results['vk_dict_mem']) | |||
if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0: | |||
results['ek_dict_mem'] = np.mean(results['ek_dict_mem']) | |||
results['nb_sp_ave'] = np.mean([len(ps) for ps in splist]) | |||
results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist]) | |||
results['sp_mem_all'] = asizeof.asizeof(splist) | |||
save_results(file_name, results) | |||
class SSPSpace(StructuralSP): | |||
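"""Instrumented variant of StructuralSP, analogous to SPSpace above (intent inferred
from the code): it counts node- and edge-kernel comparisons and estimates FCSP dict
sizes, checkpointing partial results to `file_name`.
"""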
def __init__(self, **kwargs): | |||
super().__init__(**kwargs) | |||
self._file_name = kwargs.get('file_name') | |||
# @profile | |||
def _compute_gm_series(self): | |||
# get shortest paths of each graph in the graphs. | |||
fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl' | |||
results_path = load_splist(fn_paths) | |||
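# Shortest-path lists are expensive to build, so they are checkpointed to a side
# file ('<file_name>.paths.pkl') and reused when the job restarts; the file is
# deleted once the Gram-matrix pass completes.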
if not results_path['completed']: | |||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for g in iterator:
results_path['splist'].append(self._get_sps_as_trie(g))
else: | |||
time0 = time.time() | |||
for i, g in enumerate(iterator): | |||
if i > results_path['i']: | |||
results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed'])) | |||
results_path['i'] = i | |||
time1 = time.time() | |||
if time1 - time0 > 600: | |||
save_results(fn_paths, results_path) | |||
time0 = time1 | |||
del results_path['i'] | |||
results_path['completed'] = True | |||
save_results(fn_paths, results_path) | |||
######### | |||
splist = results_path['splist'] | |||
results = load_results(self._file_name, self._fcsp) | |||
# compute Gram matrix. | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
from itertools import combinations_with_replacement | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
length=len_itr, verbose=(self._verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for i, j in iterator: | |||
kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) | |||
gram_matrix[i][j] = kernel | |||
gram_matrix[j][i] = kernel | |||
else: | |||
time0 = time.time() | |||
for i, j in iterator: | |||
if i > results['i'] or (i == results['i'] and j > results['j']): | |||
data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j]) | |||
results['nb_v_comparison'].append(data[0]) | |||
results['nb_e_comparison'].append(data[1]) | |||
if self._fcsp: | |||
if data[2] != {}: | |||
results['vk_dict_mem'].append(estimate_vk_memory(data[2], | |||
nx.number_of_nodes(self._graphs[i]), | |||
nx.number_of_nodes(self._graphs[j]))) | |||
if data[3] != {}: | |||
results['ek_dict_mem'].append(estimate_ek_memory(data[3], | |||
nx.number_of_nodes(self._graphs[i]), | |||
nx.number_of_nodes(self._graphs[j]))) | |||
results['i'] = i | |||
results['j'] = j | |||
time1 = time.time() | |||
if time1 - time0 > 600: | |||
save_results(self._file_name, results) | |||
time0 = time1 | |||
compute_stats(self._file_name, results, splist) | |||
# @todo: the path file may not be removed if the program stops exactly here.
try: | |||
os.remove(fn_paths) | |||
except OSError as e: | |||
if e.errno != errno.ENOENT: | |||
raise | |||
return gram_matrix | |||
def _ssp_do_naive_space(self, g1, g2, spl1, spl2): | |||
if self._fcsp: # @todo: it may be put outside the _sp_do(). | |||
return self._sp_do_naive_fcsp(g1, g2, spl1, spl2) | |||
else: | |||
return self._sp_do_naive_naive(g1, g2, spl1, spl2) | |||
def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2): | |||
# First, compute shortest path matrices, method borrowed from FCSP. | |||
vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2) | |||
# Then, compute kernels between all pairs of edges, extending the idea of
# FCSP to edges. This suits sparse graphs, the most common case we came
# across; for dense graphs it would be slow.
ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2) | |||
return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict | |||
def _sp_do_naive_naive(self, g1, g2, spl1, spl2): | |||
nb_v_comparison = 0 | |||
nb_e_comparison = 0 | |||
# Define the function to compute kernels between vertices in each condition. | |||
if len(self._node_labels) > 0: | |||
# node symb and non-symb labeled
if len(self._node_attrs) > 0: | |||
def compute_vk(n1, n2): | |||
kn = self._node_kernels['mix'] | |||
n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels] | |||
n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels] | |||
n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs] | |||
n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs] | |||
return kn(n1_labels, n2_labels, n1_attrs, n2_attrs) | |||
# node symb labeled | |||
else: | |||
def compute_vk(n1, n2): | |||
kn = self._node_kernels['symb'] | |||
n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels] | |||
n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels] | |||
return kn(n1_labels, n2_labels) | |||
else: | |||
# node non-symb labeled
if len(self._node_attrs) > 0: | |||
def compute_vk(n1, n2): | |||
kn = self._node_kernels['nsymb'] | |||
n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs] | |||
n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs] | |||
return kn(n1_attrs, n2_attrs) | |||
# # node unlabeled | |||
# else: | |||
# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
# if e1[2]['cost'] == e2[2]['cost']: | |||
# kernel += 1 | |||
# return kernel | |||
# Define the function to compute kernels between edges in each condition. | |||
if len(self._edge_labels) > 0: | |||
# edge symb and non-symb labeled
if len(self._edge_attrs) > 0: | |||
def compute_ek(e1, e2): | |||
ke = self._edge_kernels['mix'] | |||
e1_labels = [g1.edges[e1][el] for el in self._edge_labels] | |||
e2_labels = [g2.edges[e2][el] for el in self._edge_labels] | |||
e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs] | |||
e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs] | |||
return ke(e1_labels, e2_labels, e1_attrs, e2_attrs) | |||
# edge symb labeled | |||
else: | |||
def compute_ek(e1, e2): | |||
ke = self._edge_kernels['symb'] | |||
e1_labels = [g1.edges[e1][el] for el in self._edge_labels] | |||
e2_labels = [g2.edges[e2][el] for el in self._edge_labels] | |||
return ke(e1_labels, e2_labels) | |||
else: | |||
# edge non-symb labeled
if len(self._edge_attrs) > 0: | |||
def compute_ek(e1, e2): | |||
ke = self._edge_kernels['nsymb'] | |||
e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs] | |||
e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs] | |||
return ke(e1_attrs, e2_attrs) | |||
# compute graph kernels | |||
if len(self._node_labels) > 0 or len(self._node_attrs) > 0: | |||
if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0: | |||
for p1, p2 in product(spl1, spl2): | |||
if len(p1) == len(p2): | |||
# nb_v_comparison = len(p1) | |||
# nb_e_comparison = len(p1) - 1 | |||
kpath = compute_vk(p1[0], p2[0]) | |||
nb_v_comparison += 1 | |||
if kpath: | |||
for idx in range(1, len(p1)): | |||
kpath *= compute_vk(p1[idx], p2[idx]) * \ | |||
compute_ek((p1[idx-1], p1[idx]), | |||
(p2[idx-1], p2[idx])) | |||
nb_v_comparison += 1 | |||
nb_e_comparison += 1 | |||
if not kpath: | |||
break | |||
# kernel += kpath # add up kernels of all paths | |||
else: | |||
for p1, p2 in product(spl1, spl2): | |||
if len(p1) == len(p2): | |||
kpath = compute_vk(p1[0], p2[0]) | |||
nb_v_comparison += 1 | |||
if kpath: | |||
for idx in range(1, len(p1)): | |||
kpath *= compute_vk(p1[idx], p2[idx]) | |||
nb_v_comparison += 1 | |||
if not kpath: | |||
break | |||
# kernel += kpath # add up kernels of all paths | |||
else: | |||
if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0: | |||
for p1, p2 in product(spl1, spl2): | |||
if len(p1) == len(p2): | |||
if len(p1) == 0: | |||
pass | |||
else: | |||
kpath = 1 | |||
for idx in range(0, len(p1) - 1): | |||
kpath *= compute_ek((p1[idx], p1[idx+1]), | |||
(p2[idx], p2[idx+1])) | |||
nb_e_comparison += 1 | |||
if not kpath: | |||
break | |||
else: | |||
pass | |||
# for p1, p2 in product(spl1, spl2): | |||
# if len(p1) == len(p2): | |||
# kernel += 1 | |||
# try: | |||
# kernel = kernel / (len(spl1) * len(spl2)) # Compute mean average | |||
# except ZeroDivisionError: | |||
# print(spl1, spl2) | |||
# print(g1.nodes(data=True)) | |||
# print(g1.edges(data=True)) | |||
# raise Exception | |||
return nb_v_comparison, nb_e_comparison | |||
def _get_all_node_kernels(self, g1, g2): | |||
nb_comparison = 0 | |||
vk_dict = {} # shortest path matrices dict | |||
if len(self._node_labels) > 0: | |||
# node symb and non-symb labeled
if len(self._node_attrs) > 0: | |||
kn = self._node_kernels['mix'] | |||
for n1 in g1.nodes(data=True): | |||
for n2 in g2.nodes(data=True): | |||
n1_labels = [n1[1][nl] for nl in self._node_labels] | |||
n2_labels = [n2[1][nl] for nl in self._node_labels] | |||
n1_attrs = [n1[1][na] for na in self._node_attrs] | |||
n2_attrs = [n2[1][na] for na in self._node_attrs] | |||
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) | |||
nb_comparison += 1 | |||
# node symb labeled | |||
else: | |||
kn = self._node_kernels['symb'] | |||
for n1 in g1.nodes(data=True): | |||
for n2 in g2.nodes(data=True): | |||
n1_labels = [n1[1][nl] for nl in self._node_labels] | |||
n2_labels = [n2[1][nl] for nl in self._node_labels] | |||
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) | |||
nb_comparison += 1 | |||
else: | |||
# node non-symb labeled
if len(self._node_attrs) > 0: | |||
kn = self._node_kernels['nsymb'] | |||
for n1 in g1.nodes(data=True): | |||
for n2 in g2.nodes(data=True): | |||
n1_attrs = [n1[1][na] for na in self._node_attrs] | |||
n2_attrs = [n2[1][na] for na in self._node_attrs] | |||
vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) | |||
nb_comparison += 1 | |||
# node unlabeled | |||
else: | |||
pass # @todo: add edge weights. | |||
# for e1 in g1.edges(data=True): | |||
# for e2 in g2.edges(data=True): | |||
# if e1[2]['cost'] == e2[2]['cost']: | |||
# kernel += 1 | |||
# return kernel | |||
return vk_dict, nb_comparison | |||
def _get_all_edge_kernels(self, g1, g2): | |||
nb_comparison = 0 | |||
# compute kernels between all pairs of edges, extending the idea of FCSP
# to edges. This suits sparse graphs, the most common case we came across;
# for dense graphs it would be slow.
ek_dict = {} # dict of edge kernels | |||
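# Each edge pair is stored under all four key orientations, so later lookups do
# not need to normalize edge direction.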
if len(self._edge_labels) > 0: | |||
# edge symb and non-symb labeled
if len(self._edge_attrs) > 0: | |||
ke = self._edge_kernels['mix'] | |||
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): | |||
e1_labels = [e1[2][el] for el in self._edge_labels] | |||
e2_labels = [e2[2][el] for el in self._edge_labels] | |||
e1_attrs = [e1[2][ea] for ea in self._edge_attrs] | |||
e2_attrs = [e2[2][ea] for ea in self._edge_attrs] | |||
ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs) | |||
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp | |||
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp | |||
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp | |||
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp | |||
nb_comparison += 1 | |||
# edge symb labeled | |||
else: | |||
ke = self._edge_kernels['symb'] | |||
for e1 in g1.edges(data=True): | |||
for e2 in g2.edges(data=True): | |||
e1_labels = [e1[2][el] for el in self._edge_labels] | |||
e2_labels = [e2[2][el] for el in self._edge_labels] | |||
ek_temp = ke(e1_labels, e2_labels) | |||
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp | |||
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp | |||
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp | |||
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp | |||
nb_comparison += 1 | |||
else: | |||
# edge non-symb labeled
if len(self._edge_attrs) > 0: | |||
ke = self._edge_kernels['nsymb'] | |||
for e1 in g1.edges(data=True): | |||
for e2 in g2.edges(data=True): | |||
e1_attrs = [e1[2][ea] for ea in self._edge_attrs] | |||
e2_attrs = [e2[2][ea] for ea in self._edge_attrs] | |||
ek_temp = ke(e1_attrs, e2_attrs) | |||
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp | |||
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp | |||
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp | |||
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp | |||
nb_comparison += 1 | |||
# edge unlabeled | |||
else: | |||
pass | |||
return ek_dict, nb_comparison |
@@ -1,3 +1,3 @@ | |||
from gklearn.ged.util.lsape_solver import LSAPESolver | |||
from gklearn.ged.util.util import compute_geds, ged_options_to_string | |||
from gklearn.ged.util.util import pairwise_ged, compute_geds, get_nb_edit_operations, ged_options_to_string | |||
from gklearn.ged.util.util import compute_geds_cml, label_costs_to_matrix |
@@ -11,9 +11,10 @@ import multiprocessing | |||
from multiprocessing import Pool | |||
from functools import partial | |||
import sys | |||
# from tqdm import tqdm | |||
import networkx as nx | |||
from gklearn.ged.env import GEDEnv | |||
from gklearn.utils import get_iters | |||
def compute_ged(g1, g2, options): | |||
@@ -23,7 +24,7 @@ def compute_ged(g1, g2, options): | |||
ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) | |||
ged_env.add_nx_graph(g1, '') | |||
ged_env.add_nx_graph(g2, '') | |||
listID = ged_env.get_all_graph_ids() | |||
ged_env.init(init_type=options['init_option']) | |||
ged_env.set_method(options['method'], ged_options_to_string(options)) | |||
ged_env.init_method() | |||
@@ -33,9 +34,46 @@ def compute_ged(g1, g2, options): | |||
ged_env.run_method(g, h) | |||
pi_forward = ged_env.get_forward_map(g, h) | |||
pi_backward = ged_env.get_backward_map(g, h) | |||
upper = ged_env.get_upper_bound(g, h) | |||
dis = upper | |||
# convert the maps back to original node ids (a removed node is mapped to np.inf)
nodes1 = [n for n in g1.nodes()] | |||
nodes2 = [n for n in g2.nodes()] | |||
nb1 = nx.number_of_nodes(g1) | |||
nb2 = nx.number_of_nodes(g2) | |||
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] | |||
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] | |||
# print(pi_forward) | |||
return dis, pi_forward, pi_backward | |||
def pairwise_ged(g1, g2, options={}, sort=True, repeats=1, parallel=False, verbose=True): | |||
from gklearn.gedlib import librariesImport, gedlibpy | |||
ged_env = gedlibpy.GEDEnv() | |||
ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) | |||
ged_env.add_nx_graph(g1, '') | |||
ged_env.add_nx_graph(g2, '') | |||
listID = ged_env.get_all_graph_ids() | |||
ged_env.init(init_option=(options['init_option'] if 'init_option' in options else 'EAGER_WITHOUT_SHUFFLED_COPIES')) | |||
ged_env.set_method(options['method'], ged_options_to_string(options)) | |||
ged_env.init_method() | |||
g = listID[0] | |||
h = listID[1] | |||
dis_min = np.inf | |||
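# Run the (possibly randomized) GED method `repeats` times and keep the node maps
# of the run with the smallest upper bound.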
for i in range(0, repeats): | |||
ged_env.run_method(g, h) | |||
upper = ged_env.get_upper_bound(g, h) | |||
dis = upper | |||
if dis < dis_min: | |||
dis_min = dis | |||
pi_forward = ged_env.get_forward_map(g, h) | |||
pi_backward = ged_env.get_backward_map(g, h) | |||
# lower = ged_env.get_lower_bound(g, h) | |||
# convert the maps back to original node ids (a removed node is mapped to np.inf)
nodes1 = [n for n in g1.nodes()] | |||
nodes2 = [n for n in g2.nodes()] | |||
@@ -56,7 +94,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True | |||
for g in graphs: | |||
ged_env.add_nx_graph(g, '') | |||
listID = ged_env.get_all_graph_ids() | |||
node_labels = ged_env.get_all_node_labels() | |||
edge_labels = ged_env.get_all_edge_labels() | |||
node_label_costs = label_costs_to_matrix(options['node_label_costs'], len(node_labels)) if 'node_label_costs' in options else None | |||
@@ -73,7 +111,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True | |||
if node_label_costs is None and edge_label_costs is None: | |||
neo_options = {'edit_cost': options['edit_cost'], | |||
'is_cml': False, | |||
'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], | |||
'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']} | |||
else: | |||
neo_options = {'edit_cost': options['edit_cost'], | |||
@@ -98,11 +136,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True | |||
G_listID = listID_toshare | |||
do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort) | |||
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID)) | |||
if verbose: | |||
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | |||
desc='computing GEDs', file=sys.stdout) | |||
else: | |||
iterator = pool.imap_unordered(do_partial, itr, chunksize) | |||
iterator = get_iters(pool.imap_unordered(do_partial, itr, chunksize), desc='computing GEDs', file=sys.stdout, length=len_itr, verbose=verbose) | |||
# iterator = pool.imap_unordered(do_partial, itr, chunksize) | |||
for i, j, dis, n_eo_tmp in iterator: | |||
idx_itr = int(len(graphs) * i + j - (i + 1) * (i + 2) / 2) | |||
@@ -114,14 +148,11 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True | |||
# print(i, j, idx_itr, dis) | |||
pool.close() | |||
pool.join() | |||
else: | |||
ged_vec = [] | |||
n_edit_operations = [] | |||
if verbose: | |||
iterator = tqdm(range(len(graphs)), desc='computing GEDs', file=sys.stdout) | |||
else: | |||
iterator = range(len(graphs)) | |||
iterator = get_iters(range(len(graphs)), desc='computing GEDs', file=sys.stdout, length=len(graphs), verbose=verbose) | |||
for i in iterator: | |||
# for i in range(len(graphs)): | |||
for j in range(i + 1, len(graphs)): | |||
@@ -138,7 +169,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True | |||
return ged_vec, ged_mat, n_edit_operations | |||
def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbose=True): | |||
def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, n_jobs=None, verbose=True): | |||
from gklearn.gedlib import librariesImport, gedlibpy | |||
# initialize ged env. | |||
@@ -146,7 +177,7 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo | |||
ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) | |||
for g in graphs: | |||
ged_env.add_nx_graph(g, '') | |||
listID = ged_env.get_all_graph_ids() | |||
listID = ged_env.get_all_graph_ids() | |||
ged_env.init() | |||
if parallel: | |||
options['threads'] = 1 | |||
@@ -155,7 +186,7 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo | |||
# compute ged. | |||
neo_options = {'edit_cost': options['edit_cost'], | |||
'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], | |||
'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']} | |||
ged_mat = np.zeros((len(graphs), len(graphs))) | |||
if parallel: | |||
@@ -163,7 +194,8 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo | |||
ged_vec = [0 for i in range(len_itr)] | |||
n_edit_operations = [0 for i in range(len_itr)] | |||
itr = combinations(range(0, len(graphs)), 2) | |||
n_jobs = multiprocessing.cpu_count() | |||
if n_jobs is None: | |||
n_jobs = multiprocessing.cpu_count() | |||
if len_itr < 100 * n_jobs: | |||
chunksize = int(len_itr / n_jobs) + 1 | |||
else: | |||
@@ -175,11 +207,7 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo | |||
G_listID = listID_toshare | |||
do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort, repeats) | |||
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID)) | |||
if verbose: | |||
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), | |||
desc='computing GEDs', file=sys.stdout) | |||
else: | |||
iterator = pool.imap_unordered(do_partial, itr, chunksize) | |||
iterator = get_iters(pool.imap_unordered(do_partial, itr, chunksize), desc='computing GEDs', file=sys.stdout, length=len_itr, verbose=verbose) | |||
# iterator = pool.imap_unordered(do_partial, itr, chunksize) | |||
for i, j, dis, n_eo_tmp in iterator: | |||
idx_itr = int(len(graphs) * i + j - (i + 1) * (i + 2) / 2) | |||
@@ -191,14 +219,11 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo | |||
# print(i, j, idx_itr, dis) | |||
pool.close() | |||
pool.join() | |||
else: | |||
ged_vec = [] | |||
n_edit_operations = [] | |||
if verbose: | |||
iterator = tqdm(range(len(graphs)), desc='computing GEDs', file=sys.stdout) | |||
else: | |||
iterator = range(len(graphs)) | |||
iterator = get_iters(range(len(graphs)), desc='computing GEDs', file=sys.stdout, length=len(graphs), verbose=verbose) | |||
for i in iterator: | |||
# for i in range(len(graphs)): | |||
for j in range(i + 1, len(graphs)): | |||
@@ -232,14 +257,14 @@ def _compute_ged_parallel(env, gid1, gid2, g1, g2, options, sort, repeats): | |||
def _compute_ged(env, gid1, gid2, g1, g2, repeats): | |||
dis_min = np.inf # @todo: maybe compare distance and then do others (faster). | |||
for i in range(0, repeats): | |||
env.run_method(gid1, gid2) | |||
pi_forward = env.get_forward_map(gid1, gid2) | |||
pi_backward = env.get_backward_map(gid1, gid2) | |||
upper = env.get_upper_bound(gid1, gid2) | |||
dis = upper | |||
# convert the maps back to original node ids (a removed node is mapped to np.inf)
nodes1 = [n for n in g1.nodes()] | |||
nodes2 = [n for n in g2.nodes()] | |||
@@ -247,7 +272,7 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats): | |||
nb2 = nx.number_of_nodes(g2) | |||
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] | |||
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] | |||
if dis < dis_min: | |||
dis_min = dis | |||
pi_forward_min = pi_forward | |||
@@ -268,7 +293,7 @@ def label_costs_to_matrix(costs, nb_labels): | |||
Returns | |||
------- | |||
cost_matrix : numpy.array. | |||
The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData. | |||
""" | |||
# Initialize label cost matrix. | |||
@@ -282,13 +307,13 @@ def label_costs_to_matrix(costs, nb_labels): | |||
for row in range(1, nb_labels + 1): | |||
cost_matrix[row, 0] = costs[i] | |||
i += 1 | |||
# Costs of substitutions. | |||
for row in range(1, nb_labels + 1): | |||
for col in range(row + 1, nb_labels + 1): | |||
cost_matrix[row, col] = costs[i] | |||
cost_matrix[col, row] = costs[i] | |||
i += 1 | |||
return cost_matrix | |||
@@ -299,7 +324,7 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is | |||
edge_labels = kwargs.get('edge_labels', []) | |||
return get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, | |||
node_labels=node_labels, edge_labels=edge_labels) | |||
else: | |||
raise Exception('Edit cost "', edit_cost, '" is not supported.') | |||
else: | |||
if edit_cost == 'LETTER' or edit_cost == 'LETTER2': | |||
@@ -307,21 +332,21 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is | |||
elif edit_cost == 'NON_SYMBOLIC': | |||
node_attrs = kwargs.get('node_attrs', []) | |||
edge_attrs = kwargs.get('edge_attrs', []) | |||
return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, | |||
node_attrs=node_attrs, edge_attrs=edge_attrs) | |||
elif edit_cost == 'CONSTANT': | |||
node_labels = kwargs.get('node_labels', []) | |||
edge_labels = kwargs.get('edge_labels', []) | |||
return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, | |||
node_labels=node_labels, edge_labels=edge_labels) | |||
else: | |||
return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map) | |||
def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, | |||
node_labels=[], edge_labels=[]): | |||
"""Compute times that edit operations are used in an edit path for symbolic-labeled graphs, where the costs are different for each pair of nodes. | |||
Returns | |||
------- | |||
list | |||
@@ -330,7 +355,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, | |||
# Initialize. | |||
nb_ops_node = np.zeros((1 + len(node_labels), 1 + len(node_labels))) | |||
nb_ops_edge = np.zeros((1 + len(edge_labels), 1 + len(edge_labels))) | |||
# For nodes. | |||
nodes1 = [n for n in g1.nodes()] | |||
for i, map_i in enumerate(forward_map): | |||
@@ -350,7 +375,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, | |||
label = tuple(g2.nodes[nodes2[i]].items()) | |||
idx_label = node_labels.index(label) # @todo: faster | |||
nb_ops_node[0, idx_label + 1] += 1 | |||
# For edges. | |||
edges1 = [e for e in g1.edges()] | |||
edges2_marked = [] | |||
@@ -371,7 +396,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, | |||
label2 = tuple(g2.edges[(nf2, nt2)].items()) | |||
if label1 != label2: | |||
idx_label2 = edge_labels.index(label2) # @todo: faster | |||
nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1 | |||
# Switch nf2 and nt2, for directed graphs. | |||
elif (nt2, nf2) in g2.edges(): | |||
edges2_marked.append((nt2, nf2)) | |||
@@ -389,7 +414,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, | |||
label = tuple(g2.edges[(nt, nf)].items()) | |||
idx_label = edge_labels.index(label) # @todo: faster | |||
nb_ops_edge[0, idx_label + 1] += 1 | |||
# Reform the numbers of edit operations into a vector.
nb_eo_vector = [] | |||
# node insertion. | |||
@@ -412,9 +437,9 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, | |||
for i in range(1, len(nb_ops_edge)): | |||
for j in range(i + 1, len(nb_ops_edge)): | |||
nb_eo_vector.append(nb_ops_edge[i, j]) | |||
return nb_eo_vector | |||
def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, | |||
node_labels=[], edge_labels=[]): | |||
@@ -426,7 +451,7 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, | |||
n_ei = 0 | |||
n_er = 0 | |||
n_es = 0 | |||
nodes1 = [n for n in g1.nodes()] | |||
for i, map_i in enumerate(forward_map): | |||
if map_i == np.inf: | |||
@@ -441,9 +466,9 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, | |||
for map_i in backward_map: | |||
if map_i == np.inf: | |||
n_vi += 1 | |||
# idx_nodes1 = range(0, len(node1)) | |||
edges1 = [e for e in g1.edges()] | |||
nb_edges2_cnted = 0 | |||
for n1, n2 in edges1: | |||
@@ -475,7 +500,7 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, | |||
else: | |||
n_er += 1 | |||
n_ei = nx.number_of_edges(g2) - nb_edges2_cnted | |||
return n_vi, n_vr, n_vs, n_ei, n_er, n_es | |||
@@ -488,7 +513,7 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): | |||
sod_vs = 0 | |||
n_ei = 0 | |||
n_er = 0 | |||
nodes1 = [n for n in g1.nodes()] | |||
for i, map_i in enumerate(forward_map): | |||
if map_i == np.inf: | |||
@@ -501,9 +526,9 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): | |||
for map_i in backward_map: | |||
if map_i == np.inf: | |||
n_vi += 1 | |||
# idx_nodes1 = range(0, len(node1)) | |||
edges1 = [e for e in g1.edges()] | |||
nb_edges2_cnted = 0 | |||
for n1, n2 in edges1: | |||
@@ -520,7 +545,7 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): | |||
else: | |||
n_er += 1 | |||
n_ei = nx.number_of_edges(g2) - nb_edges2_cnted | |||
return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er | |||
@@ -536,7 +561,7 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, | |||
n_er = 0 | |||
n_es = 0 | |||
sod_es = 0 | |||
nodes1 = [n for n in g1.nodes()] | |||
for i, map_i in enumerate(forward_map): | |||
if map_i == np.inf: | |||
@@ -551,9 +576,9 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, | |||
for map_i in backward_map: | |||
if map_i == np.inf: | |||
n_vi += 1 | |||
# idx_nodes1 = range(0, len(node1)) | |||
edges1 = [e for e in g1.edges()] | |||
for n1, n2 in edges1: | |||
idx1 = nodes1.index(n1) | |||
@@ -582,7 +607,7 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, | |||
else: | |||
n_er += 1 | |||
n_ei = nx.number_of_edges(g2) - n_es | |||
return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es | |||
@@ -615,7 +640,7 @@ def ged_options_to_string(options): | |||
opt_str += '--log ' + str(val) + ' ' | |||
elif key == 'randomness': | |||
opt_str += '--randomness ' + str(val) + ' ' | |||
# if not isinstance(val, list): | |||
# opt_str += '--' + key.replace('_', '-') + ' ' | |||
# if val == False: | |||
@@ -37,7 +37,7 @@ class GraphKernel(object): | |||
elif len(graphs[0]) == 0: | |||
raise Exception('The graph list given is empty. No computation was performed.') | |||
else: | |||
self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. | |||
self._gram_matrix = self._compute_gram_matrix() | |||
self._gram_matrix_unnorm = np.copy(self._gram_matrix) | |||
if self._normalize: | |||
@@ -14,7 +14,7 @@ import sys | |||
from itertools import product | |||
# from functools import partial | |||
from multiprocessing import Pool | |||
from tqdm import tqdm | |||
from gklearn.utils import get_iters | |||
# import networkx as nx | |||
import numpy as np | |||
from gklearn.utils.parallel import parallel_gm, parallel_me | |||
@@ -41,10 +41,7 @@ class StructuralSP(GraphKernel): | |||
def _compute_gm_series(self): | |||
# get shortest paths of each graph in the graphs. | |||
splist = [] | |||
if self._verbose >= 2: | |||
iterator = tqdm(self._graphs, desc='getting sp graphs', file=sys.stdout) | |||
else: | |||
iterator = self._graphs | |||
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for g in iterator: | |||
splist.append(self._get_sps_as_trie(g)) | |||
@@ -57,10 +54,9 @@ class StructuralSP(GraphKernel): | |||
from itertools import combinations_with_replacement | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
if self._verbose >= 2: | |||
iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) | |||
else: | |||
iterator = itr | |||
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) | |||
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, | |||
length=len_itr, verbose=(self._verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for i, j in iterator: | |||
kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) | |||
@@ -91,11 +87,9 @@ class StructuralSP(GraphKernel): | |||
get_sps_fun = self._wrapper_get_sps_trie | |||
else: | |||
get_sps_fun = self._wrapper_get_sps_naive | |||
if self.verbose >= 2: | |||
iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize), | |||
desc='getting shortest paths', file=sys.stdout) | |||
else: | |||
iterator = pool.imap_unordered(get_sps_fun, itr, chunksize) | |||
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | |||
desc='getting shortest paths', file=sys.stdout, | |||
length=len(self._graphs), verbose=(self._verbose >= 2)) | |||
for i, sp in iterator: | |||
splist[i] = sp | |||
pool.close() | |||
@@ -122,10 +116,8 @@ class StructuralSP(GraphKernel): | |||
# get shortest paths of g1 and each graph in g_list. | |||
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) | |||
splist = [] | |||
if self._verbose >= 2: | |||
iterator = tqdm(g_list, desc='getting sp graphs', file=sys.stdout) | |||
else: | |||
iterator = g_list | |||
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, | |||
verbose=(self._verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for g in iterator: | |||
splist.append(self._get_sps_as_trie(g)) | |||
@@ -135,10 +127,8 @@ class StructuralSP(GraphKernel): | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
if self._verbose >= 2: | |||
iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) | |||
else: | |||
iterator = range(len(g_list)) | |||
iterator = get_iters(range(len(g_list)), desc='Computing kernels', | |||
file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) | |||
if self._compute_method == 'trie': | |||
for i in iterator: | |||
kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i]) | |||
@@ -166,11 +156,9 @@ class StructuralSP(GraphKernel): | |||
get_sps_fun = self._wrapper_get_sps_trie | |||
else: | |||
get_sps_fun = self._wrapper_get_sps_naive | |||
if self.verbose >= 2: | |||
iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize), | |||
desc='getting shortest paths', file=sys.stdout) | |||
else: | |||
iterator = pool.imap_unordered(get_sps_fun, itr, chunksize) | |||
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), | |||
desc='getting shortest paths', file=sys.stdout, | |||
length=len(g_list), verbose=(self._verbose >= 2)) | |||
for i, sp in iterator: | |||
splist[i] = sp | |||
pool.close() | |||
@@ -12,13 +12,13 @@ import os | |||
class Dataset(object): | |||
import warnings | |||
warnings.simplefilter('always', DeprecationWarning) | |||
warnings.warn('This class has been moved to "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed as of version 0.4.0.', DeprecationWarning)
def __init__(self, filename=None, filename_targets=None, **kwargs): | |||
import warnings | |||
warnings.simplefilter('always', DeprecationWarning) | |||
warnings.warn('This class has been moved to "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed as of version 0.4.0.', DeprecationWarning)
if filename is None: | |||
self._graphs = None | |||
self._targets = None | |||
@@ -28,7 +28,7 @@ class Dataset(object): | |||
self._edge_attrs = None | |||
else: | |||
self.load_dataset(filename, filename_targets=filename_targets, **kwargs) | |||
self._substructures = None | |||
self._node_label_dim = None | |||
self._edge_label_dim = None | |||
@@ -53,8 +53,8 @@ class Dataset(object): | |||
self._node_attr_dim = None | |||
self._edge_attr_dim = None | |||
self._class_number = None | |||
def load_dataset(self, filename, filename_targets=None, **kwargs): | |||
self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs) | |||
self._node_labels = label_names['node_labels'] | |||
@@ -62,15 +62,15 @@ class Dataset(object): | |||
self._edge_labels = label_names['edge_labels'] | |||
self._edge_attrs = label_names['edge_attrs'] | |||
self.clean_labels() | |||
def load_graphs(self, graphs, targets=None): | |||
# this has to be followed by set_labels(). | |||
self._graphs = graphs | |||
self._targets = targets | |||
# self.set_labels_attrs() # @todo | |||
def load_predefined_dataset(self, ds_name): | |||
current_path = os.path.dirname(os.path.realpath(__file__)) + '/' | |||
if ds_name == 'Acyclic': | |||
@@ -130,7 +130,7 @@ class Dataset(object): | |||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||
elif ds_name == 'NCI109': | |||
ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt' | |||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||
elif ds_name == 'PAH': | |||
ds_file = current_path + '../../datasets/PAH/dataset.ds' | |||
self._graphs, self._targets, label_names = load_dataset(ds_file) | |||
@@ -143,13 +143,13 @@ class Dataset(object): | |||
pass | |||
else: | |||
raise Exception('The dataset name "', ds_name, '" is not pre-defined.') | |||
self._node_labels = label_names['node_labels'] | |||
self._node_attrs = label_names['node_attrs'] | |||
self._edge_labels = label_names['edge_labels'] | |||
self._edge_attrs = label_names['edge_attrs'] | |||
self.clean_labels() | |||
def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]): | |||
self._node_labels = node_labels | |||
@@ -157,7 +157,7 @@ class Dataset(object): | |||
self._edge_labels = edge_labels | |||
self._edge_attrs = edge_attrs | |||
def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None): | |||
# @todo: remove labels which have only one possible value.
if node_labels is None: | |||
@@ -183,86 +183,86 @@ class Dataset(object): | |||
# if 'attributes' in e[2]: | |||
# return len(e[2]['attributes']) | |||
# return 0 | |||
def get_dataset_infos(self, keys=None, params=None): | |||
"""Computes and returns the structure and property information of the graph dataset. | |||
Parameters | |||
---------- | |||
keys : list, optional | |||
A list of strings indicating which information will be returned. The
possible choices include:
'substructures': sub-structures the graphs contain, including 'linear', 'non
linear' and 'cyclic'.
'node_label_dim': whether vertices have symbolic labels.
'edge_label_dim': whether edges have symbolic labels.
'directed': whether graphs in dataset are directed. | |||
'dataset_size': number of graphs in dataset. | |||
'total_node_num': total number of vertices of all graphs in dataset. | |||
'ave_node_num': average number of vertices of graphs in dataset. | |||
'min_node_num': minimum number of vertices of graphs in dataset. | |||
'max_node_num': maximum number of vertices of graphs in dataset. | |||
'total_edge_num': total number of edges of all graphs in dataset. | |||
'ave_edge_num': average number of edges of graphs in dataset. | |||
'min_edge_num': minimum number of edges of graphs in dataset. | |||
'max_edge_num': maximum number of edges of graphs in dataset. | |||
'ave_node_degree': average vertex degree of graphs in dataset. | |||
'min_node_degree': minimum vertex degree of graphs in dataset. | |||
'max_node_degree': maximum vertex degree of graphs in dataset. | |||
'ave_fill_factor': average fill factor (number_of_edges / | |||
(number_of_nodes ** 2)) of graphs in dataset. | |||
'min_fill_factor': minimum fill factor of graphs in dataset. | |||
'max_fill_factor': maximum fill factor of graphs in dataset. | |||
'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset. | |||
'edge_label_nums': list of numbers of symbolic edge labels of graphs in dataset.
'node_attr_dim': number of dimensions of non-symbolic vertex labels. | |||
Extracted from the 'attributes' attribute of graph nodes. | |||
'edge_attr_dim': number of dimensions of non-symbolic edge labels. | |||
Extracted from the 'attributes' attribute of graph edges. | |||
'class_number': number of classes. Only available for classification problems. | |||
'all_degree_entropy': the entropy of degree distribution of each graph. | |||
'ave_degree_entropy': the average entropy of degree distribution of all graphs. | |||
All the information above will be returned if `keys` is not given.
params: dict of dict, optional | |||
A dictionary which contains extra parameters for each possible
element in ``keys``. | |||
Returns
-------
dict | |||
Information of the graph dataset keyed by `keys`. | |||
""" | |||
infos = {} | |||
if keys is None:
keys = [ | |||
'substructures', | |||
@@ -292,13 +292,13 @@ class Dataset(object): | |||
'all_degree_entropy', | |||
'ave_degree_entropy' | |||
] | |||
# dataset size | |||
if 'dataset_size' in keys: | |||
if self._dataset_size is None: | |||
self._dataset_size = self._get_dataset_size() | |||
infos['dataset_size'] = self._dataset_size | |||
# graph node number | |||
if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']): | |||
all_node_nums = self._get_all_node_nums() | |||
@@ -307,22 +307,22 @@ class Dataset(object): | |||
if self._total_node_num is None: | |||
self._total_node_num = self._get_total_node_num(all_node_nums) | |||
infos['total_node_num'] = self._total_node_num | |||
if 'ave_node_num' in keys: | |||
if self._ave_node_num is None: | |||
self._ave_node_num = self._get_ave_node_num(all_node_nums) | |||
infos['ave_node_num'] = self._ave_node_num | |||
if 'min_node_num' in keys: | |||
if self._min_node_num is None: | |||
self._min_node_num = self._get_min_node_num(all_node_nums) | |||
infos['min_node_num'] = self._min_node_num | |||
if 'max_node_num' in keys: | |||
if self._max_node_num is None: | |||
self._max_node_num = self._get_max_node_num(all_node_nums) | |||
infos['max_node_num'] = self._max_node_num | |||
# graph edge number | |||
if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']): | |||
all_edge_nums = self._get_all_edge_nums() | |||
@@ -331,12 +331,12 @@ class Dataset(object): | |||
if self._total_edge_num is None: | |||
self._total_edge_num = self._get_total_edge_num(all_edge_nums) | |||
infos['total_edge_num'] = self._total_edge_num | |||
if 'ave_edge_num' in keys: | |||
if self._ave_edge_num is None: | |||
self._ave_edge_num = self._get_ave_edge_num(all_edge_nums) | |||
infos['ave_edge_num'] = self._ave_edge_num | |||
if 'max_edge_num' in keys: | |||
if self._max_edge_num is None: | |||
self._max_edge_num = self._get_max_edge_num(all_edge_nums) | |||
@@ -346,120 +346,120 @@ class Dataset(object): | |||
if self._min_edge_num is None: | |||
self._min_edge_num = self._get_min_edge_num(all_edge_nums) | |||
infos['min_edge_num'] = self._min_edge_num | |||
# label number | |||
if 'node_label_dim' in keys: | |||
if self._node_label_dim is None: | |||
self._node_label_dim = self._get_node_label_dim() | |||
infos['node_label_dim'] = self._node_label_dim | |||
if 'node_label_nums' in keys: | |||
if self._node_label_nums is None: | |||
self._node_label_nums = {} | |||
for node_label in self._node_labels: | |||
self._node_label_nums[node_label] = self._get_node_label_num(node_label) | |||
infos['node_label_nums'] = self._node_label_nums | |||
if 'edge_label_dim' in keys: | |||
if self._edge_label_dim is None: | |||
self._edge_label_dim = self._get_edge_label_dim() | |||
infos['edge_label_dim'] = self._edge_label_dim | |||
if 'edge_label_nums' in keys: | |||
if self._edge_label_nums is None: | |||
self._edge_label_nums = {} | |||
for edge_label in self._edge_labels: | |||
self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label) | |||
infos['edge_label_nums'] = self._edge_label_nums | |||
if 'directed' in keys or 'substructures' in keys: | |||
if self._directed is None: | |||
self._directed = self._is_directed() | |||
infos['directed'] = self._directed | |||
# node degree | |||
if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']): | |||
all_node_degrees = self._get_all_node_degrees() | |||
if 'ave_node_degree' in keys: | |||
if self._ave_node_degree is None: | |||
self._ave_node_degree = self._get_ave_node_degree(all_node_degrees) | |||
infos['ave_node_degree'] = self._ave_node_degree | |||
if 'max_node_degree' in keys: | |||
if self._max_node_degree is None: | |||
self._max_node_degree = self._get_max_node_degree(all_node_degrees) | |||
infos['max_node_degree'] = self._max_node_degree | |||
if 'min_node_degree' in keys: | |||
if self._min_node_degree is None: | |||
self._min_node_degree = self._get_min_node_degree(all_node_degrees) | |||
infos['min_node_degree'] = self._min_node_degree | |||
# fill factor | |||
if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']): | |||
all_fill_factors = self._get_all_fill_factors() | |||
if 'ave_fill_factor' in keys: | |||
if self._ave_fill_factor is None: | |||
self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors) | |||
infos['ave_fill_factor'] = self._ave_fill_factor | |||
if 'max_fill_factor' in keys: | |||
if self._max_fill_factor is None: | |||
self._max_fill_factor = self._get_max_fill_factor(all_fill_factors) | |||
infos['max_fill_factor'] = self._max_fill_factor | |||
if 'min_fill_factor' in keys: | |||
if self._min_fill_factor is None: | |||
self._min_fill_factor = self._get_min_fill_factor(all_fill_factors) | |||
infos['min_fill_factor'] = self._min_fill_factor | |||
if 'substructures' in keys: | |||
if self._substructures is None: | |||
self._substructures = self._get_substructures() | |||
infos['substructures'] = self._substructures | |||
if 'class_number' in keys: | |||
if self._class_number is None: | |||
self._class_number = self._get_class_number() | |||
infos['class_number'] = self._class_number | |||
if 'node_attr_dim' in keys: | |||
if self._node_attr_dim is None: | |||
self._node_attr_dim = self._get_node_attr_dim() | |||
infos['node_attr_dim'] = self._node_attr_dim | |||
if 'edge_attr_dim' in keys: | |||
if self._edge_attr_dim is None: | |||
self._edge_attr_dim = self._get_edge_attr_dim() | |||
infos['edge_attr_dim'] = self._edge_attr_dim | |||
# entropy of degree distribution. | |||
if 'all_degree_entropy' in keys: | |||
if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']): | |||
base = params['all_degree_entropy']['base'] | |||
else: | |||
base = None | |||
infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base) | |||
if 'ave_degree_entropy' in keys: | |||
if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']): | |||
base = params['ave_degree_entropy']['base'] | |||
else: | |||
base = None | |||
infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) | |||
return infos | |||
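The block above only computes the statistics requested through `keys`. A minimal usage sketch (not part of the diff) follows; the method name `get_dataset_infos` and the import path are assumptions, and 'MUTAG' is only an illustrative predefined dataset name.

```python
# Sketch only: assuming the statistics code above is the body of something
# like Dataset.get_dataset_infos(keys, params); names below are assumptions.
from gklearn.dataset import Dataset  # import path assumed

ds = Dataset('MUTAG')  # a predefined dataset name, fetched/loaded on demand
infos = ds.get_dataset_infos(
    keys=['ave_edge_num', 'node_label_dim', 'ave_degree_entropy'],
    params={'ave_degree_entropy': {'base': 2}})  # 'base' only affects the entropy key
print(infos)
```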
def print_graph_infos(self, infos): | |||
from collections import OrderedDict | |||
keys = list(infos.keys()) | |||
print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0])))) | |||
def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): | |||
node_labels = [item for item in node_labels if item in self._node_labels] | |||
edge_labels = [item for item in edge_labels if item in self._edge_labels] | |||
@@ -485,8 +485,8 @@ class Dataset(object): | |||
self._node_attrs = [na for na in self._node_attrs if na not in node_attrs] | |||
if len(edge_attrs) > 0: | |||
self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs] | |||
def clean_labels(self): | |||
labels = [] | |||
for name in self._node_labels: | |||
@@ -543,8 +543,8 @@ class Dataset(object): | |||
for ed in G.edges(): | |||
del G.edges[ed][name] | |||
self._edge_attrs = labels | |||
def cut_graphs(self, range_): | |||
self._graphs = [self._graphs[i] for i in range_] | |||
if self._targets is not None: | |||
@@ -561,8 +561,8 @@ class Dataset(object): | |||
self._graphs = [p[1] for p in trimed_pairs] | |||
self._targets = [self._targets[i] for i in idx] | |||
self.clean_labels() | |||
def copy(self): | |||
dataset = Dataset() | |||
graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None | |||
@@ -575,8 +575,8 @@ class Dataset(object): | |||
dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) | |||
# @todo: clean_labels and add other class members? | |||
return dataset | |||
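`copy()` rebuilds a new `Dataset` from copies of the graphs, so a subset can be cut without mutating the original. A small hedged sketch, where `dataset` stands for any already-loaded `Dataset` instance:

```python
# Sketch: copy() duplicates the graphs and label lists, so cutting the copy
# leaves the original untouched. 'dataset' is an assumed existing Dataset.
sub = dataset.copy()
sub.cut_graphs(range(0, 10))   # keep only the first 10 graphs and their targets
print(len(sub.graphs), len(dataset.graphs))
```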
def get_all_node_labels(self): | |||
node_labels = [] | |||
for g in self._graphs: | |||
@@ -585,8 +585,8 @@ class Dataset(object): | |||
if nl not in node_labels: | |||
node_labels.append(nl) | |||
return node_labels | |||
def get_all_edge_labels(self): | |||
edge_labels = [] | |||
for g in self._graphs: | |||
@@ -595,94 +595,94 @@ class Dataset(object): | |||
if el not in edge_labels: | |||
edge_labels.append(el) | |||
return edge_labels | |||
def _get_dataset_size(self): | |||
return len(self._graphs) | |||
def _get_all_node_nums(self): | |||
return [nx.number_of_nodes(G) for G in self._graphs] | |||
def _get_total_node_nums(self, all_node_nums): | |||
return np.sum(all_node_nums) | |||
def _get_ave_node_num(self, all_node_nums): | |||
return np.mean(all_node_nums) | |||
def _get_min_node_num(self, all_node_nums): | |||
return np.amin(all_node_nums) | |||
def _get_max_node_num(self, all_node_nums): | |||
return np.amax(all_node_nums) | |||
def _get_all_edge_nums(self): | |||
return [nx.number_of_edges(G) for G in self._graphs] | |||
def _get_total_edge_num(self, all_edge_nums):
return np.sum(all_edge_nums) | |||
def _get_ave_edge_num(self, all_edge_nums): | |||
return np.mean(all_edge_nums) | |||
def _get_min_edge_num(self, all_edge_nums): | |||
return np.amin(all_edge_nums) | |||
def _get_max_edge_num(self, all_edge_nums): | |||
return np.amax(all_edge_nums) | |||
def _get_node_label_dim(self): | |||
return len(self._node_labels) | |||
def _get_node_label_num(self, node_label): | |||
nl = set() | |||
for G in self._graphs: | |||
nl = nl | set(nx.get_node_attributes(G, node_label).values()) | |||
return len(nl) | |||
def _get_edge_label_dim(self): | |||
return len(self._edge_labels) | |||
def _get_edge_label_num(self, edge_label): | |||
el = set() | |||
for G in self._graphs: | |||
el = el | set(nx.get_edge_attributes(G, edge_label).values()) | |||
return len(el) | |||
def _is_directed(self): | |||
return nx.is_directed(self._graphs[0]) | |||
def _get_all_node_degrees(self): | |||
return [np.mean(list(dict(G.degree()).values())) for G in self._graphs] | |||
def _get_ave_node_degree(self, all_node_degrees): | |||
return np.mean(all_node_degrees) | |||
def _get_max_node_degree(self, all_node_degrees): | |||
return np.amax(all_node_degrees) | |||
def _get_min_node_degree(self, all_node_degrees): | |||
return np.amin(all_node_degrees) | |||
def _get_all_fill_factors(self): | |||
"""Get fill factor, the number of non-zero entries in the adjacency matrix. | |||
@@ -692,20 +692,20 @@ class Dataset(object): | |||
List of fill factors for all graphs. | |||
""" | |||
return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs] | |||
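The fill factor computed above is simply the number of edges divided by the squared number of nodes. A quick stand-alone check of the formula (not library code):

```python
# Quick check of the fill-factor formula used above (sketch, not library code).
import networkx as nx

g = nx.path_graph(4)   # 4 nodes, 3 edges
print(nx.number_of_edges(g) / (nx.number_of_nodes(g) ** 2))   # 3 / 16 = 0.1875
```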
def _get_ave_fill_factor(self, all_fill_factors): | |||
return np.mean(all_fill_factors) | |||
def _get_max_fill_factor(self, all_fill_factors): | |||
return np.amax(all_fill_factors) | |||
def _get_min_fill_factor(self, all_fill_factors): | |||
return np.amin(all_fill_factors) | |||
def _get_substructures(self): | |||
subs = set() | |||
for G in self._graphs: | |||
@@ -737,22 +737,22 @@ class Dataset(object): | |||
# if any(len(i) > 2 for i in cyc): | |||
# subs.add('cyclic') | |||
# break | |||
return subs | |||
def _get_class_number(self):
return len(set(self._targets)) | |||
def _get_node_attr_dim(self): | |||
return len(self._node_attrs) | |||
def _get_edge_attr_dim(self): | |||
return len(self._edge_attrs) | |||
def _compute_all_degree_entropy(self, base=None): | |||
"""Compute the entropy of degree distribution of each graph. | |||
@@ -767,15 +767,15 @@ class Dataset(object): | |||
The calculated entropy. | |||
""" | |||
from gklearn.utils.stats import entropy | |||
degree_entropy = [] | |||
for g in self._graphs: | |||
degrees = list(dict(g.degree()).values()) | |||
en = entropy(degrees, base=base) | |||
degree_entropy.append(en) | |||
return degree_entropy | |||
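A stand-alone sketch of what a per-graph degree entropy could look like; treating `entropy(degrees, base)` as the Shannon entropy of the empirical degree distribution is an assumption about `gklearn.utils.stats.entropy`, not a description of its actual implementation:

```python
# Assumption: entropy(degrees, base) is read here as the Shannon entropy of
# the empirical distribution of node degrees; this is a sketch, not gklearn code.
import networkx as nx
import numpy as np

def degree_entropy(g, base=None):
    degrees = np.array(list(dict(g.degree()).values()), dtype=float)
    _, counts = np.unique(degrees, return_counts=True)
    p = counts / counts.sum()                  # empirical degree distribution
    h = -(p * np.log(p)).sum()                 # natural log by default
    return h / np.log(base) if base else h     # change of base if requested

print(degree_entropy(nx.path_graph(5), base=2))
```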
@property | |||
def graphs(self): | |||
return self._graphs | |||
@@ -784,8 +784,8 @@ class Dataset(object): | |||
@property | |||
def targets(self): | |||
return self._targets | |||
@property | |||
def node_labels(self): | |||
return self._node_labels | |||
@@ -794,25 +794,25 @@ class Dataset(object): | |||
@property | |||
def edge_labels(self): | |||
return self._edge_labels | |||
@property | |||
def node_attrs(self): | |||
return self._node_attrs | |||
@property | |||
def edge_attrs(self): | |||
return self._edge_attrs | |||
def split_dataset_by_target(dataset): | |||
import warnings | |||
warnings.simplefilter('always', DeprecationWarning) | |||
warnings.warn('This function has been moved to "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning) | |||
from gklearn.preimage.utils import get_same_item_indices | |||
graphs = dataset.graphs | |||
targets = dataset.targets | |||
datasets = [] | |||
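The body of `split_dataset_by_target` is truncated here; the core idea is to group graph indices by their class target. An illustration of that grouping (explicitly not the gklearn implementation, which relies on `get_same_item_indices`):

```python
# Illustration only: group graph indices by target value, the core step behind
# split_dataset_by_target. Not the gklearn implementation.
from collections import defaultdict

def group_indices_by_target(targets):
    groups = defaultdict(list)
    for idx, t in enumerate(targets):
        groups[t].append(idx)
    return dict(groups)

print(group_indices_by_target([1, 0, 1, 1, 0]))  # {1: [0, 2, 3], 0: [1, 4]}
```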
@@ -1,8 +1,8 @@ | |||
""" Utilities function to manage graph files | |||
""" | |||
import warnings | |||
warnings.simplefilter('always', DeprecationWarning) | |||
warnings.warn('The functions in the module "gklearn.utils.graph_files" will be deprecated and removed since version 0.4.0. Use the corresponding functions in the module "gklearn.dataset" instead.', DeprecationWarning) | |||
# import warnings | |||
# warnings.simplefilter('always', DeprecationWarning) | |||
# warnings.warn('The functions in the module "gklearn.utils.graph_files" will be deprecated and removed since version 0.4.0. Use the corresponding functions in the module "gklearn.dataset" instead.', DeprecationWarning) | |||
from os.path import dirname, splitext | |||
@@ -26,17 +26,17 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs): | |||
y : List | |||
Targets corresponding to graphs. | |||
Notes | |||
----- | |||
This function supports the following graph dataset formats:
'ds': load data from .ds file. See comments of function load_from_ds for an example.
'cxl': load data from Graph eXchange Language file (.cxl file). See | |||
`here <http://www.gupro.de/GXL/Introduction/background.html>`__ for detail. | |||
'sdf': load data from structured data file (.sdf file). See | |||
`here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__ | |||
for details. | |||
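A usage sketch based on the commented examples at the bottom of this module; the `.ds` path is illustrative and the whole module is deprecated in favour of `gklearn.dataset`:

```python
# Usage sketch; path taken from the commented examples at the end of this
# (deprecated) module.
from gklearn.utils.graph_files import load_dataset

ds_file = '../../datasets/monoterpenoides/dataset_10+.ds'  # symbolic node/edge labels
Gn, y, label_names = load_dataset(ds_file)
print(len(Gn), label_names['node_labels'])
```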
@@ -77,20 +77,20 @@ def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', **kwargs): | |||
import warnings | |||
warnings.simplefilter('always', DeprecationWarning) | |||
warnings.warn('The function "gklearn.utils.save_dataset" will be deprecated and removed since version 0.4.0. Use the class "gklearn.dataset.DataSaver" instead.', DeprecationWarning) | |||
import os | |||
dirname_ds = os.path.dirname(filename) | |||
if dirname_ds != '': | |||
dirname_ds += '/' | |||
os.makedirs(dirname_ds, exist_ok=True) | |||
if 'graph_dir' in kwargs: | |||
graph_dir = kwargs['graph_dir'] + '/' | |||
os.makedirs(graph_dir, exist_ok=True) | |||
del kwargs['graph_dir'] | |||
else: | |||
graph_dir = dirname_ds | |||
if group == 'xml' and gformat == 'gxl': | |||
with open(filename + '.xml', 'w') as fgroup: | |||
fgroup.write("<?xml version=\"1.0\"?>") | |||
@@ -122,7 +122,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he | |||
1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo | |||
2 3 1 1 | |||
Check `CTFile Formats file <https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=10&ved=2ahUKEwivhaSdjsTlAhVhx4UKHczHA8gQFjAJegQIARAC&url=https%3A%2F%2Fwww.daylight.com%2Fmeetings%2Fmug05%2FKappler%2Fctfile.pdf&usg=AOvVaw1cDNrrmMClkFPqodlF2inS>`__ | |||
for a detailed format description.
""" | |||
@@ -144,7 +144,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he | |||
if count_line_tags[i] != '': # if not obsoleted | |||
g.graph[count_line_tags[i]] = tmp[i].strip() | |||
i += 1 | |||
# read the atom block. | |||
atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag'] | |||
for i in range(0, nb_atoms): | |||
@@ -156,7 +156,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he | |||
if atom_tags[j] != '': | |||
g.nodes[i][atom_tags[j]] = tmp[j].strip() | |||
j += 1 | |||
# read the bond block. | |||
bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status'] | |||
for i in range(0, nb_bonds): | |||
@@ -169,7 +169,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he | |||
if bond_tags[j] != '': | |||
g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip() | |||
j += 1 | |||
# get label names. | |||
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} | |||
atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1] | |||
@@ -188,7 +188,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he | |||
else: | |||
label_names['edge_attrs'].append(key) | |||
break | |||
return g, label_names | |||
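`load_ct` parses a single CTFile molecule and returns the graph together with the `label_names` dictionary. A hedged sketch with a hypothetical file path:

```python
# Sketch: the path is hypothetical. load_ct returns the molecule as a NetworkX
# graph plus a label_names dict with keys 'node_labels', 'edge_labels',
# 'node_attrs' and 'edge_attrs'.
from gklearn.utils.graph_files import load_ct

g, label_names = load_ct('../../datasets/Alkane/dataset/molecule1.ct')  # hypothetical path
print(g.nodes(data=True))
print(label_names['node_labels'])   # e.g. ['atom_symbol', ...]
```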
@@ -215,19 +215,19 @@ def load_gxl(filename): # @todo: directed graphs. | |||
for attr in edge.iter('attr'): | |||
labels[attr.attrib['name']] = attr[0].text | |||
g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) | |||
# get label names. | |||
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} | |||
for node in root.iter('node'): | |||
for attr in node.iter('attr'): | |||
if attr[0].tag == 'int': # @todo: this maybe wrong, and slow. | |||
label_names['node_labels'].append(attr.attrib['name']) | |||
else: | |||
label_names['node_attrs'].append(attr.attrib['name']) | |||
break | |||
for edge in root.iter('edge'): | |||
for attr in edge.iter('attr'): | |||
if attr[0].tag == 'int': # @todo: this maybe wrong, and slow. | |||
label_names['edge_labels'].append(attr.attrib['name']) | |||
else: | |||
label_names['edge_attrs'].append(attr.attrib['name']) | |||
@@ -249,20 +249,20 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[], | |||
gxl_file.write("<graph id=\"" + name + "\" edgeids=\"false\" edgemode=\"undirected\">\n") | |||
for v, attrs in graph.nodes(data=True): | |||
gxl_file.write("<node id=\"_" + str(v) + "\">") | |||
for l_name in node_labels: | |||
gxl_file.write("<attr name=\"" + l_name + "\"><int>" + | |||
str(attrs[l_name]) + "</int></attr>") | |||
for a_name in node_attrs: | |||
gxl_file.write("<attr name=\"" + a_name + "\"><float>" + | |||
str(attrs[a_name]) + "</float></attr>") | |||
gxl_file.write("</node>\n") | |||
for v1, v2, attrs in graph.edges(data=True): | |||
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") | |||
for l_name in edge_labels: | |||
gxl_file.write("<attr name=\"" + l_name + "\"><int>" + | |||
str(attrs[l_name]) + "</int></attr>") | |||
for a_name in edge_attrs: | |||
gxl_file.write("<attr name=\"" + a_name + "\"><float>" + | |||
str(attrs[a_name]) + "</float></attr>") | |||
gxl_file.write("</edge>\n") | |||
gxl_file.write("</graph>\n") | |||
@@ -276,7 +276,7 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[], | |||
attr['edgeids'] = 'true' | |||
attr['edgemode'] = 'undirected' | |||
graph_node = ET.SubElement(root_node, 'graph', attrib=attr) | |||
for v in graph: | |||
current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) | |||
for attr in graph.nodes[v].keys(): | |||
@@ -285,7 +285,7 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[], | |||
cur_value = ET.SubElement(cur_attr, | |||
graph.nodes[v][attr].__class__.__name__) | |||
cur_value.text = graph.nodes[v][attr] | |||
for v1 in graph: | |||
for v2 in graph[v1]: | |||
if (v1 < v2): # Non oriented graphs | |||
@@ -302,7 +302,7 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[], | |||
cur_value = ET.SubElement( | |||
cur_attr, graph[v1][v2][attr].__class__.__name__) | |||
cur_value.text = str(graph[v1][v2][attr]) | |||
tree = ET.ElementTree(root_node) | |||
tree.write(filename) | |||
elif method == 'gedlib': | |||
@@ -458,11 +458,11 @@ def load_mat(filename, order): # @todo: need to be updated (auto order) or depre | |||
g.add_edge(col, row) | |||
data.append(g) | |||
# print(g.edges(data=True)) | |||
label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} | |||
if order[1] == 0: | |||
label_names['edge_labels'].append('label_1') | |||
return data, y, label_names | |||
@@ -477,12 +477,12 @@ def load_tud(filename): | |||
import networkx as nx | |||
from os import listdir | |||
from os.path import dirname, basename | |||
def get_infos_from_readme(frm): # @todo: add README (cuniform), maybe node/edge label maps. | |||
"""Get information from DS_label_readme.txt file. | |||
""" | |||
def get_label_names_from_line(line): | |||
"""Get names of labels/attributes from a line. | |||
""" | |||
@@ -490,8 +490,8 @@ def load_tud(filename): | |||
names = str_names.split(',') | |||
names = [attr.strip() for attr in names] | |||
return names | |||
def get_class_label_map(label_map_strings): | |||
label_map = {} | |||
for string in label_map_strings: | |||
@@ -500,7 +500,7 @@ def load_tud(filename): | |||
return label_map | |||
label_names = {'node_labels': [], 'node_attrs': [], | |||
'edge_labels': [], 'edge_attrs': []} | |||
class_label_map = None | |||
class_label_map_strings = [] | |||
@@ -528,16 +528,16 @@ def load_tud(filename): | |||
line = content_rm[i].strip() | |||
class_label_map = get_class_label_map(class_label_map_strings) | |||
i += 1 | |||
return label_names, class_label_map | |||
# get dataset name. | |||
dirname_dataset = dirname(filename) | |||
filename = basename(filename) | |||
fn_split = filename.split('_A') | |||
ds_name = fn_split[0].strip() | |||
# load data file names | |||
for name in listdir(dirname_dataset): | |||
if ds_name + '_A' in name: | |||
@@ -561,20 +561,20 @@ def load_tud(filename): | |||
# this is supposed to be the node attrs, make sure to put this as the last 'elif' | |||
elif ds_name + '_attributes' in name: | |||
fna = dirname_dataset + '/' + name | |||
# get labels and attributes names. | |||
if 'frm' in locals(): | |||
label_names, class_label_map = get_infos_from_readme(frm) | |||
else: | |||
label_names = {'node_labels': [], 'node_attrs': [], | |||
'edge_labels': [], 'edge_attrs': []} | |||
class_label_map = None | |||
with open(fgi) as gi: | |||
content_gi = gi.read().splitlines() # graph indicator | |||
with open(fam) as am: | |||
content_am = am.read().splitlines() # adjacency matrix | |||
# load targets. | |||
if 'fgl' in locals(): | |||
with open(fgl) as gl: | |||
@@ -609,7 +609,7 @@ def load_tud(filename): | |||
else: | |||
for i, line in enumerate(content_gi): | |||
data[int(line) - 1].add_node(i) | |||
# add edges | |||
for line in content_am: | |||
tmp = line.split(',') | |||
@@ -670,7 +670,7 @@ def load_tud(filename): | |||
data[g].edges[n[0], n[1]][a_name] = attrs[i] | |||
return data, targets, label_names | |||
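`load_tud` expects the path of the `<dataset>_A.txt` adjacency file and infers the companion files (graph indicator, graph/node/edge labels, attributes) from the same folder. A sketch using a path that appears in the commented tests of this module:

```python
# Sketch: load_tud discovers the companion TUDataset files from the folder of
# the "<dataset>_A.txt" file; the path below comes from the commented tests.
from gklearn.utils.graph_files import load_tud

data, targets, label_names = load_tud('../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt')
print(len(data), len(set(targets)), label_names)
```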
def load_from_ds(filename, filename_targets): | |||
"""Load data from .ds file. | |||
@@ -681,9 +681,9 @@ def load_from_ds(filename, filename_targets): | |||
'.gxl': see function load_gxl for details.
Note these graph formats are checked automatically by the extensions of | |||
graph files. | |||
""" | |||
""" | |||
dirname_dataset = dirname(filename) | |||
data = [] | |||
y = [] | |||
@@ -695,7 +695,7 @@ def load_from_ds(filename, filename_targets): | |||
load_file_fun = load_ct | |||
elif extension == 'gxl' or extension == 'sdf': # @todo: .sdf not tested yet. | |||
load_file_fun = load_gxl | |||
if filename_targets is None or filename_targets == '': | |||
for i in range(0, len(content)): | |||
tmp = content[i].split(' ') | |||
@@ -711,7 +711,7 @@ def load_from_ds(filename, filename_targets): | |||
g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1)) | |||
data.append(g) | |||
_append_label_names(label_names, l_names) | |||
with open(filename_targets) as fnt: | |||
content_y = fnt.read().splitlines() | |||
# assume entries in filename and filename_targets have the same order. | |||
@@ -719,13 +719,13 @@ def load_from_ds(filename, filename_targets): | |||
tmp = item.split(' ') | |||
# assume the 3rd entry in a line is y (for Alkane dataset) | |||
y.append(float(tmp[2])) | |||
return data, y, label_names | |||
# def load_from_cxl(filename): | |||
# import xml.etree.ElementTree as ET | |||
# | |||
# dirname_dataset = dirname(filename) | |||
# tree = ET.parse(filename) | |||
# root = tree.getroot() | |||
@@ -736,11 +736,11 @@ def load_from_ds(filename, filename_targets): | |||
# mol_class = graph.attrib['class'] | |||
# data.append(load_gxl(dirname_dataset + '/' + mol_filename)) | |||
# y.append(mol_class) | |||
def load_from_xml(filename, dir_dataset=None): | |||
import xml.etree.ElementTree as ET | |||
if dir_dataset is not None: | |||
dir_dataset = dir_dataset | |||
else: | |||
@@ -757,16 +757,16 @@ def load_from_xml(filename, dir_dataset=None): | |||
data.append(g) | |||
_append_label_names(label_names, l_names) | |||
y.append(mol_class) | |||
return data, y, label_names | |||
def _append_label_names(label_names, new_names): | |||
for key, val in label_names.items(): | |||
label_names[key] += [name for name in new_names[key] if name not in val] | |||
if __name__ == '__main__': | |||
# ### Load dataset from .ds file. | |||
# # .ct files. | |||
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', | |||
@@ -782,7 +782,7 @@ if __name__ == '__main__': | |||
# print(Gn[1].nodes(data=True)) | |||
# print(Gn[1].edges(data=True)) | |||
# print(targets[1]) | |||
# # .gxl file. | |||
# ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb | |||
# Gn, y, label_names = load_dataset(ds_file) | |||
@@ -803,7 +803,7 @@ if __name__ == '__main__': | |||
# ### Convert graph from one format to another. | |||
# # .gxl file. | |||
# import networkx as nx | |||
# ds = {'name': 'monoterpenoides', | |||
# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb | |||
# Gn, y = loadDataset(ds['dataset']) | |||
# y = [int(i) for i in y] | |||
@@ -826,13 +826,13 @@ if __name__ == '__main__': | |||
# filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' | |||
# xparams = {'method': 'gedlib'} | |||
# saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) | |||
# save dataset. | |||
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', | |||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) | |||
# saveDataset(Gn, y, group='xml', filename='temp/temp') | |||
# test - new way to add labels and attributes. | |||
# dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' | |||
# filename = '../../datasets/Fingerprint/Fingerprint_A.txt' | |||
@@ -5,345 +5,345 @@ This file is for old version of graphkit-learn. | |||
def get_dataset_attributes(Gn, | |||
target=None, | |||
attr_names=[], | |||
node_label=None, | |||
edge_label=None): | |||
"""Returns the structure and property information of the graph dataset Gn. | |||
Parameters | |||
---------- | |||
Gn : List of NetworkX graph | |||
List of graphs whose information will be returned. | |||
target : list | |||
The list of classification targets corresponding to Gn. Only works for | |||
classification problems. | |||
attr_names : list | |||
List of strings which indicate which information will be returned. The
possible choices include:
'substructures': sub-structures Gn contains, including 'linear', 'non | |||
linear' and 'cyclic'. | |||
'node_labeled': whether vertices have symbolic labels.
'edge_labeled': whether edges have symbolic labels.
'is_directed': whether graphs in Gn are directed.
'dataset_size': number of graphs in Gn.
'ave_node_num': average number of vertices of graphs in Gn.
'min_node_num': minimum number of vertices of graphs in Gn.
'max_node_num': maximum number of vertices of graphs in Gn.
'ave_edge_num': average number of edges of graphs in Gn.
'min_edge_num': minimum number of edges of graphs in Gn.
'max_edge_num': maximum number of edges of graphs in Gn.
'ave_node_degree': average vertex degree of graphs in Gn.
'min_node_degree': minimum vertex degree of graphs in Gn.
'max_node_degree': maximum vertex degree of graphs in Gn.
'ave_fill_factor': average fill factor (number_of_edges /
(number_of_nodes ** 2)) of graphs in Gn.
'min_fill_factor': minimum fill factor of graphs in Gn.
'max_fill_factor': maximum fill factor of graphs in Gn.
'node_label_num': number of symbolic vertex labels.
'edge_label_num': number of symbolic edge labels.
'node_attr_dim': number of dimensions of non-symbolic vertex labels.
Extracted from the 'attributes' attribute of graph nodes.
'edge_attr_dim': number of dimensions of non-symbolic edge labels.
Extracted from the 'attributes' attribute of graph edges.
'class_number': number of classes. Only available for classification problems.
node_label : string
Node attribute used as label. The default node label is atom. Mandatory
when 'node_labeled' or 'node_label_num' is required.
edge_label : string
Edge attribute used as label. The default edge label is bond_type.
Mandatory when 'edge_labeled' or 'edge_label_num' is required.
Return | |||
------ | |||
attrs : dict | |||
Value for each property. | |||
""" | |||
import networkx as nx | |||
import numpy as np | |||
attrs = {} | |||
def get_dataset_size(Gn): | |||
return len(Gn) | |||
def get_all_node_num(Gn): | |||
return [nx.number_of_nodes(G) for G in Gn] | |||
def get_ave_node_num(all_node_num): | |||
return np.mean(all_node_num) | |||
def get_min_node_num(all_node_num): | |||
return np.amin(all_node_num) | |||
def get_max_node_num(all_node_num): | |||
return np.amax(all_node_num) | |||
def get_all_edge_num(Gn): | |||
return [nx.number_of_edges(G) for G in Gn] | |||
def get_ave_edge_num(all_edge_num): | |||
return np.mean(all_edge_num) | |||
def get_min_edge_num(all_edge_num): | |||
return np.amin(all_edge_num) | |||
def get_max_edge_num(all_edge_num): | |||
return np.amax(all_edge_num) | |||
def is_node_labeled(Gn): | |||
return False if node_label is None else True | |||
def get_node_label_num(Gn): | |||
nl = set() | |||
for G in Gn: | |||
nl = nl | set(nx.get_node_attributes(G, node_label).values()) | |||
return len(nl) | |||
def is_edge_labeled(Gn): | |||
return False if edge_label is None else True | |||
def get_edge_label_num(Gn): | |||
el = set() | |||
for G in Gn: | |||
el = el | set(nx.get_edge_attributes(G, edge_label).values()) | |||
return len(el) | |||
def is_directed(Gn): | |||
return nx.is_directed(Gn[0]) | |||
def get_ave_node_degree(Gn): | |||
return np.mean([np.mean(list(dict(G.degree()).values())) for G in Gn]) | |||
def get_max_node_degree(Gn): | |||
return np.amax([np.mean(list(dict(G.degree()).values())) for G in Gn]) | |||
def get_min_node_degree(Gn): | |||
return np.amin([np.mean(list(dict(G.degree()).values())) for G in Gn]) | |||
# get fill factor, the number of non-zero entries in the adjacency matrix. | |||
def get_ave_fill_factor(Gn): | |||
return np.mean([nx.number_of_edges(G) / (nx.number_of_nodes(G) | |||
* nx.number_of_nodes(G)) for G in Gn]) | |||
def get_max_fill_factor(Gn): | |||
return np.amax([nx.number_of_edges(G) / (nx.number_of_nodes(G) | |||
* nx.number_of_nodes(G)) for G in Gn]) | |||
def get_min_fill_factor(Gn): | |||
return np.amin([nx.number_of_edges(G) / (nx.number_of_nodes(G) | |||
* nx.number_of_nodes(G)) for G in Gn]) | |||
def get_substructures(Gn): | |||
subs = set() | |||
for G in Gn: | |||
degrees = list(dict(G.degree()).values()) | |||
if any(i == 2 for i in degrees): | |||
subs.add('linear') | |||
if np.amax(degrees) >= 3: | |||
subs.add('non linear') | |||
if 'linear' in subs and 'non linear' in subs: | |||
break | |||
if is_directed(Gn): | |||
for G in Gn: | |||
if len(list(nx.find_cycle(G))) > 0: | |||
subs.add('cyclic') | |||
break | |||
# else: | |||
# # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way. | |||
# upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10 | |||
# for G in Gn: | |||
# if (nx.number_of_edges(G) < upper): | |||
# cyc = list(nx.simple_cycles(G.to_directed())) | |||
# if any(len(i) > 2 for i in cyc): | |||
# subs.add('cyclic') | |||
# break | |||
# if 'cyclic' not in subs: | |||
# for G in Gn: | |||
# cyc = list(nx.simple_cycles(G.to_directed())) | |||
# if any(len(i) > 2 for i in cyc): | |||
# subs.add('cyclic') | |||
# break | |||
return subs | |||
def get_class_num(target): | |||
return len(set(target)) | |||
def get_node_attr_dim(Gn): | |||
for G in Gn: | |||
for n in G.nodes(data=True): | |||
if 'attributes' in n[1]: | |||
return len(n[1]['attributes']) | |||
return 0 | |||
def get_edge_attr_dim(Gn): | |||
for G in Gn: | |||
if nx.number_of_edges(G) > 0: | |||
for e in G.edges(data=True): | |||
if 'attributes' in e[2]: | |||
return len(e[2]['attributes']) | |||
return 0 | |||
if attr_names == []: | |||
attr_names = [ | |||
'substructures', | |||
'node_labeled', | |||
'edge_labeled', | |||
'is_directed', | |||
'dataset_size', | |||
'ave_node_num', | |||
'min_node_num', | |||
'max_node_num', | |||
'ave_edge_num', | |||
'min_edge_num', | |||
'max_edge_num', | |||
'ave_node_degree', | |||
'min_node_degree', | |||
'max_node_degree', | |||
'ave_fill_factor', | |||
'min_fill_factor', | |||
'max_fill_factor', | |||
'node_label_num', | |||
'edge_label_num', | |||
'node_attr_dim', | |||
'edge_attr_dim', | |||
'class_number', | |||
] | |||
# dataset size | |||
if 'dataset_size' in attr_names: | |||
attrs.update({'dataset_size': get_dataset_size(Gn)}) | |||
# graph node number | |||
if any(i in attr_names | |||
for i in ['ave_node_num', 'min_node_num', 'max_node_num']): | |||
all_node_num = get_all_node_num(Gn) | |||
if 'ave_node_num' in attr_names: | |||
attrs.update({'ave_node_num': get_ave_node_num(all_node_num)}) | |||
if 'min_node_num' in attr_names: | |||
attrs.update({'min_node_num': get_min_node_num(all_node_num)}) | |||
if 'max_node_num' in attr_names: | |||
attrs.update({'max_node_num': get_max_node_num(all_node_num)}) | |||
# graph edge number | |||
if any(i in attr_names for i in | |||
['ave_edge_num', 'min_edge_num', 'max_edge_num']): | |||
all_edge_num = get_all_edge_num(Gn) | |||
if 'ave_edge_num' in attr_names:
attrs.update({'ave_edge_num': get_ave_edge_num(all_edge_num)})
if 'max_edge_num' in attr_names:
attrs.update({'max_edge_num': get_max_edge_num(all_edge_num)})
if 'min_edge_num' in attr_names:
attrs.update({'min_edge_num': get_min_edge_num(all_edge_num)})
# label number
if any(i in attr_names for i in ['node_labeled', 'node_label_num']):
is_nl = is_node_labeled(Gn)
node_label_num = get_node_label_num(Gn)
if 'node_labeled' in attr_names:
# graphs are considered node unlabeled if all nodes have the same label.
attrs.update({'node_labeled': is_nl if node_label_num > 1 else False})
if 'node_label_num' in attr_names:
attrs.update({'node_label_num': node_label_num})
if any(i in attr_names for i in ['edge_labeled', 'edge_label_num']):
is_el = is_edge_labeled(Gn)
edge_label_num = get_edge_label_num(Gn)
if 'edge_labeled' in attr_names:
# graphs are considered edge unlabeled if all edges have the same label.
attrs.update({'edge_labeled': is_el if edge_label_num > 1 else False})
if 'edge_label_num' in attr_names:
attrs.update({'edge_label_num': edge_label_num})
if 'is_directed' in attr_names:
attrs.update({'is_directed': is_directed(Gn)})
if 'ave_node_degree' in attr_names:
attrs.update({'ave_node_degree': get_ave_node_degree(Gn)})
if 'max_node_degree' in attr_names:
attrs.update({'max_node_degree': get_max_node_degree(Gn)})
if 'min_node_degree' in attr_names:
attrs.update({'min_node_degree': get_min_node_degree(Gn)})
if 'ave_fill_factor' in attr_names:
attrs.update({'ave_fill_factor': get_ave_fill_factor(Gn)})
if 'max_fill_factor' in attr_names:
attrs.update({'max_fill_factor': get_max_fill_factor(Gn)})
if 'min_fill_factor' in attr_names:
attrs.update({'min_fill_factor': get_min_fill_factor(Gn)})
if 'substructures' in attr_names:
attrs.update({'substructures': get_substructures(Gn)})
if 'class_number' in attr_names:
attrs.update({'class_number': get_class_num(target)})
if 'node_attr_dim' in attr_names:
attrs['node_attr_dim'] = get_node_attr_dim(Gn)
if 'edge_attr_dim' in attr_names:
attrs['edge_attr_dim'] = get_edge_attr_dim(Gn)
from collections import OrderedDict
return OrderedDict(
sorted(attrs.items(), key=lambda i: attr_names.index(i[0])))
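A usage sketch for this legacy helper; the import path is an assumption, and the two toy graphs carry an 'atom' node label and a 'bond_type' edge label to match the documented defaults:

```python
# Sketch only: import path assumed; toy graphs built to match the default
# 'atom' node label and 'bond_type' edge label described in the docstring.
import networkx as nx
from gklearn.utils.graphdataset import get_dataset_attributes  # assumed location

g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1, bond_type='1')
g2 = g1.copy()

attrs = get_dataset_attributes(
    [g1, g2], target=[0, 1],
    attr_names=['dataset_size', 'ave_node_num', 'node_label_num', 'class_number'],
    node_label='atom', edge_label='bond_type')
print(attrs)
```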
def load_predefined_dataset(ds_name): | |||
import os | |||
from gklearn.utils.graphfiles import loadDataset | |||
current_path = os.path.dirname(os.path.realpath(__file__)) + '/' | |||
if ds_name == 'Acyclic': | |||
ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds' | |||
@@ -415,5 +415,5 @@ def load_predefined_dataset(ds_name): | |||
pass | |||
else: | |||
raise Exception('The dataset name "', ds_name, '" is not pre-defined.') | |||
return graphs, targets |
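A sketch of loading one of the bundled datasets by name; only the names handled in the if/elif chain above are valid, anything else raises an exception:

```python
# Sketch: 'Acyclic' is one of the names handled above; unknown names raise.
graphs, targets = load_predefined_dataset('Acyclic')
print(len(graphs), targets[:5])
```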
@@ -18,8 +18,8 @@ def deltakernel(x, y): | |||
References | |||
---------- | |||
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||
labeled graphs. In Proceedings of the 20th International Conference on | |||
Machine Learning, Washington, DC, United States, 2003. | |||
""" | |||
return x == y #(1 if condition else 0) | |||
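The delta (Dirac) kernel just tests equality; the boolean result behaves as 0/1 when it is summed or multiplied inside a graph kernel. For example:

```python
# The boolean return value acts as 1 (equal) or 0 (different) in kernel sums.
print(deltakernel('C', 'C'))   # True  -> counts as 1
print(deltakernel('C', 'N'))   # False -> counts as 0
```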
@@ -68,7 +68,7 @@ def polynomialkernel(x, y, d=1, c=0): | |||
x, y : array | |||
d : integer, default 1 | |||
c : float, default 0 | |||
Returns | |||
@@ -89,7 +89,7 @@ def linearkernel(x, y): | |||
x, y : array | |||
Returns | |||