
Merge pull request #44 from jajupmochi/v0.2.x

V0.2.x
master
linlin (GitHub), 4 years ago
commit 830f96acbe
25 changed files with 6784 additions and 755 deletions
  1. README.md (+14, -1)
  2. gklearn/dataset/dataset.py (+4, -2)
  3. gklearn/dataset/file_managers.py (+2, -1)
  4. gklearn/experiments/__init__.py (+11, -0)
  5. gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py (+46, -33)
  6. gklearn/experiments/ged/stability/group_results.py (+77, -28)
  7. gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py (+8, -8)
  8. gklearn/experiments/ged/stability/utils.py (+237, -12)
  9. gklearn/experiments/thesis/ged/fit_distances/README.md (+7, -0)
  10. gklearn/experiments/thesis/ged/fit_distances/ged_fit_distance_results.eps (+4459, -0)
  11. gklearn/experiments/thesis/ged/fit_distances/ged_fit_distance_results_plot.py (+21, -5)
  12. gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp.py (+68, -33)
  13. gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp_space.py (+98, -0)
  14. gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py (+143, -16)
  15. gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py (+268, -0)
  16. gklearn/experiments/thesis/graph_kernels/fcsp/shortest_path.py (+253, -0)
  17. gklearn/experiments/thesis/graph_kernels/fcsp/structural_sp.py (+439, -0)
  18. gklearn/ged/util/__init__.py (+1, -1)
  19. gklearn/ged/util/util.py (+90, -65)
  20. gklearn/kernels/graph_kernel.py (+1, -1)
  21. gklearn/kernels/structural_sp.py (+15, -27)
  22. gklearn/utils/dataset.py (+170, -170)
  23. gklearn/utils/graph_files.py (+63, -63)
  24. gklearn/utils/graphdataset.py (+285, -285)
  25. gklearn/utils/kernels.py (+4, -4)

README.md (+14, -1)

@@ -139,7 +139,20 @@ Fork the library and open a pull request! Make your own contribute to the commun


## Citation


Still waiting...
If you have used `graphkit-learn` in your publication, please cite the following paper:
```
@article{JIA2021,
title = "graphkit-learn: A Python Library for Graph Kernels Based on Linear Patterns",
journal = "Pattern Recognition Letters",
year = "2021",
issn = "0167-8655",
doi = "https://doi.org/10.1016/j.patrec.2021.01.003",
url = "http://www.sciencedirect.com/science/article/pii/S0167865521000131",
author = "Linlin Jia and Benoit Gaüzère and Paul Honeine",
keywords = "Graph Kernels, Linear Patterns, Python Implementation",
abstract = "This paper presents graphkit-learn, the first Python library for efficient computation of graph kernels based on linear patterns, able to address various types of graphs. Graph kernels based on linear patterns are thoroughly implemented, each with specific computing methods, as well as two well-known graph kernels based on non-linear patterns for comparative analysis. Since computational complexity is an Achilles’ heel of graph kernels, we provide several strategies to address this critical issue, including parallelization, the trie data structure, and the FCSP method that we extend to other kernels and edge comparison. All proposed strategies save orders of magnitudes of computing time and memory usage. Moreover, all the graph kernels can be simply computed with a single Python statement, thus are appealing to researchers and practitioners. For the convenience of use, an advanced model selection procedure is provided for both regression and classification problems. Experiments on synthesized datasets and 11 real-world benchmark datasets show the relevance of the proposed library."
}
```


## Acknowledgments




gklearn/dataset/dataset.py (+4, -2)

@@ -14,7 +14,7 @@ from gklearn.dataset import DATASET_META, DataFetcher, DataLoader
class Dataset(object):




def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs):
def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', remove_null_graphs=True, clean_labels=True, reload=False, verbose=False, **kwargs):
self._substructures = None
self._node_label_dim = None
self._edge_label_dim = None
@@ -82,6 +82,8 @@ class Dataset(object):
else:
raise TypeError('The "inputs" argument cannot be recognized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.')


if remove_null_graphs:
self.trim_dataset(edge_required=False)




def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs):
@@ -537,7 +539,7 @@ class Dataset(object):




def trim_dataset(self, edge_required=False):
if edge_required:
if edge_required: # @todo: there is a possibility that some node labels will be removed.
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
else:
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
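
A minimal usage sketch of the new `remove_null_graphs` flag added to `Dataset.__init__` above; the dataset name is only an example, and by default graphs without nodes are now trimmed right after loading:

```
from gklearn.dataset import Dataset

# Default behaviour after this change: graphs with no nodes are dropped on load.
ds = Dataset('MUTAG', root='datasets')

# Keep every graph, including null ones, by switching the new flag off.
ds_raw = Dataset('MUTAG', root='datasets', remove_null_graphs=False)
```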


gklearn/dataset/file_managers.py (+2, -1)

@@ -332,7 +332,8 @@ class DataLoader():
content_targets = ga.read().splitlines() # targets (regression)
targets = [int(i) for i in content_targets]
else:
raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.')
exp_msg = 'Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.'
raise Exception(exp_msg)
if class_label_map is not None:
targets = [class_label_map[t] for t in targets]
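
Note that `exp_msg` above is assembled with commas, so it is a tuple rather than a single string. A purely illustrative alternative (not part of this commit) that yields one flat message:

```
# Illustrative sketch only: an f-string builds the same hint as a single string.
exp_msg = (f'Can not find targets file. Please make sure there is a '
           f'"{ds_name}_graph_labels.txt" or "{ds_name}_graph_attributes.txt" '
           f'file in your dataset folder.')
raise Exception(exp_msg)
```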




gklearn/experiments/__init__.py (+11, -0)

@@ -0,0 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 15 18:22:34 2020

@author: ljia
"""

import os
EXP_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/'
DATASET_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/datasets/'
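
These two constants give the experiment scripts a stable anchor for paths. A short sketch of how `DATASET_ROOT` is consumed later in this pull request (see `gklearn/experiments/ged/stability/utils.py` below); the dataset name is only an example:

```
from gklearn.dataset import Dataset
from gklearn.experiments import DATASET_ROOT

# Datasets used by the experiment scripts are resolved under gklearn/experiments/datasets/.
dataset = Dataset('MUTAG', root=DATASET_ROOT)
```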

gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py → gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py

@@ -4,7 +4,7 @@
Created on Wed Oct 20 11:48:02 2020


@author: ljia
"""
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1].


import os
@@ -13,15 +13,15 @@ import pickle
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset
from utils import get_dataset, set_edit_cost_consts
import sys
from group_results import group_trials
from group_results import group_trials, check_group_existence, update_group_marker




def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):


save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
# Return if the file exists.
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
return None, None
@@ -41,8 +41,11 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
'threads': multiprocessing.cpu_count(),
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]

edit_cost_constants = set_edit_cost_consts(ratio,
node_labeled=len(dataset.node_labels),
edge_labeled=len(dataset.edge_labels),
mode='uniform')
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))


@@ -53,7 +56,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
options['node_attrs'] = dataset.node_attrs
options['edge_attrs'] = dataset.edge_attrs
parallel = True # if num_solutions == 1 else False
"""**5. Compute GED matrix.**"""
ged_mat = 'error'
runtime = 0
@@ -67,9 +70,9 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception(save_file_suffix)
print(repr(exp))
"""**6. Get results.**"""
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(ged_mat, f)
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
@@ -77,66 +80,76 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):


return ged_mat, runtime


def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
# Return if the group file exists.
name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
if os.path.isfile(name_group):
if check_group_existence(name_group):
return
ged_mats = []
runtimes = []
for trial in range(1, 101):
num_trials = 100
for trial in range(1, num_trials + 1):
print()
print('Trial:', trial)
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
ged_mats.append(ged_mat)
runtimes.append(runtime)
# Group trials and Remove single files.
# @todo: if the program stops between the following lines, then there may be errors.
name_prefix = 'ged_matrix' + name_middle
group_trials(save_dir, name_prefix, True, True, False)
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
name_prefix = 'runtime' + name_middle name_prefix = 'runtime' + name_middle
group_trials(save_dir, name_prefix, True, True, False)
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
update_group_marker(name_group)




def results_for_a_dataset(ds_name):
"""**1. Get dataset.**"""
dataset = get_dataset(ds_name)
for num_solutions in num_solutions_list:
for ratio in ratio_list:
print()
print('# of solutions:', num_solutions)
for ratio in ratio_list:
print('Ratio:', ratio)
for num_solutions in num_solutions_list:
print()
print('Ratio:', ratio)
print('# of solutions:', num_solutions)
save_trials_as_group(dataset, ds_name, num_solutions, ratio)
def get_param_lists(ds_name):


def get_param_lists(ds_name, test=False):
if test:
num_solutions_list = [1, 10, 20, 30, 40, 50]
ratio_list = [10]
return num_solutions_list, ratio_list

if ds_name == 'AIDS_symb':
num_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
else:
num_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1]
return num_solutions_list, ratio_list


if __name__ == '__main__':
if len(sys.argv) > 1:
ds_name_list = sys.argv[1:]
else:
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled']
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir + 'groups/', exist_ok=True)
for ds_name in ds_name_list:
print()
print('Dataset:', ds_name)
num_solutions_list, ratio_list = get_param_lists(ds_name)
num_solutions_list, ratio_list = get_param_lists(ds_name, test=False)
results_for_a_dataset(ds_name)
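
For orientation, a small sketch of the file-name scheme this script produces for one (dataset, num_solutions, ratio) setting, composed exactly as in `xp_compute_ged_matrix` and `save_trials_as_group` above (the concrete values are only examples):

```
ds_name, num_solutions, ratio, trial = 'MUTAG', 20, 0.1, 1
save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/'

# Per-trial result, checked/written by xp_compute_ged_matrix().
save_file_suffix = ('.' + ds_name + '.num_sols_' + str(num_solutions)
                    + '.ratio_' + '{:.2f}'.format(ratio) + '.trial_' + str(trial))
print(save_dir + 'ged_matrix' + save_file_suffix + '.pkl')
# outputs/edit_costs.real_data.num_sols.ratios.IPFP/ged_matrix.MUTAG.num_sols_20.ratio_0.10.trial_1.pkl

# Grouped result over all trials, checked via check_group_existence() in save_trials_as_group().
name_middle = ('.' + ds_name + '.num_sols_' + str(num_solutions)
               + '.ratio_' + '{:.2f}'.format(ratio) + '.')
print(save_dir + 'groups/ged_mats' + name_middle + 'npy')
# outputs/edit_costs.real_data.num_sols.ratios.IPFP/groups/ged_mats.MUTAG.num_sols_20.ratio_0.10.npy
```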

gklearn/experiments/ged/stability/group_results.py (+77, -28)

@@ -5,7 +5,7 @@ Created on Thu Oct 29 17:26:43 2020


@author: ljia


This script groups results together into a single file for the sake of faster
searching and loading.
"""
import os
@@ -16,9 +16,55 @@ from tqdm import tqdm
import sys




def check_group_existence(file_name):
path, name = os.path.split(file_name)
marker_fn = os.path.join(path, 'group_names_finished.pkl')
if os.path.isfile(marker_fn):
with open(marker_fn, 'rb') as f:
fns = pickle.load(f)
if name in fns:
return True

if os.path.isfile(file_name):
return True

return False


def update_group_marker(file_name):
path, name = os.path.split(file_name)
marker_fn = os.path.join(path, 'group_names_finished.pkl')
if os.path.isfile(marker_fn):
with open(marker_fn, 'rb') as f:
fns = pickle.load(f)
if name in fns:
return
else:
fns.add(name)
else:
fns = set({name})
with open(marker_fn, 'wb') as f:
pickle.dump(fns, f)


def create_group_marker_file(dir_folder, overwrite=True):
if not overwrite:
return

fns = set()
for file in sorted(os.listdir(dir_folder)):
if os.path.isfile(os.path.join(dir_folder, file)):
if file.endswith('.npy'):
fns.add(file)

marker_fn = os.path.join(dir_folder, 'group_names_finished.pkl')
with open(marker_fn, 'wb') as f:
pickle.dump(fns, f)


# This function is used by other scripts. Modify it carefully.
def group_trials(dir_folder, name_prefix, override, clear, backup):
def group_trials(dir_folder, name_prefix, overwrite, clear, backup, num_trials=100):
# Get group name.
label_name = name_prefix.split('.')[0]
if label_name == 'ged_matrix':
@@ -33,10 +79,10 @@ def group_trials(dir_folder, name_prefix, override, clear, backup):
else:
name_group = dir_folder + 'groups/' + group_label + name_suffix + 'pkl'


if not override and os.path.isfile(name_group):
if not overwrite and os.path.isfile(name_group):
# Check if all trial files exist.
trials_complete = True
for trial in range(1, 101):
for trial in range(1, num_trials + 1):
file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
if not os.path.isfile(file_name):
trials_complete = False
@@ -44,7 +90,7 @@ def group_trials(dir_folder, name_prefix, override, clear, backup):
else:
# Get data.
data_group = []
for trial in range(1, 101):
for trial in range(1, num_trials + 1):
file_name = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
if os.path.isfile(file_name):
with open(file_name, 'rb') as f:
@@ -64,7 +110,7 @@ def group_trials(dir_folder, name_prefix, override, clear, backup):


else: # Not all trials are completed.
return
# Write groups.
if label_name == 'ged_matrix':
data_group = np.array(data_group)
@@ -73,31 +119,31 @@ def group_trials(dir_folder, name_prefix, override, clear, backup):
else:
with open(name_group, 'wb') as f:
pickle.dump(data_group, f)
trials_complete = True


if trials_complete:
# Backup.
if backup:
for trial in range(1, 101):
for trial in range(1, num_trials + 1):
src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
dst = dir_folder + 'backups/' + name_prefix + 'trial_' + str(trial) + '.pkl'
copyfile(src, dst)
# Clear.
if clear:
for trial in range(1, 101):
for trial in range(1, num_trials + 1):
src = dir_folder + name_prefix + 'trial_' + str(trial) + '.pkl'
os.remove(src)




def group_all_in_folder(dir_folder, override=False, clear=True, backup=True):
def group_all_in_folder(dir_folder, overwrite=False, clear=True, backup=True):
# Create folders.
os.makedirs(dir_folder + 'groups/', exist_ok=True)
if backup:
os.makedirs(dir_folder + 'backups', exist_ok=True)
# Iterate all files.
cur_file_prefix = ''
for file in tqdm(sorted(os.listdir(dir_folder)), desc='Grouping', file=sys.stdout):
@@ -106,20 +152,23 @@ def group_all_in_folder(dir_folder, override=False, clear=True, backup=True):
# print(name)
# print(name_prefix)
if name_prefix != cur_file_prefix:
group_trials(dir_folder, name_prefix, override, clear, backup)
group_trials(dir_folder, name_prefix, overwrite, clear, backup)
cur_file_prefix = name_prefix


if __name__ == '__main__':
dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/'
group_all_in_folder(dir_folder)
dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/'
group_all_in_folder(dir_folder)
dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/'
group_all_in_folder(dir_folder)
dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/'
group_all_in_folder(dir_folder)
# dir_folder = 'outputs/CRIANN/edit_costs.num_sols.ratios.IPFP/'
# group_all_in_folder(dir_folder)

# dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.IPFP/'
# group_all_in_folder(dir_folder)

# dir_folder = 'outputs/CRIANN/edit_costs.max_num_sols.ratios.bipartite/'
# group_all_in_folder(dir_folder)

# dir_folder = 'outputs/CRIANN/edit_costs.repeats.ratios.bipartite/'
# group_all_in_folder(dir_folder)

dir_folder = 'outputs/CRIANN/edit_costs.real_data.num_sols.ratios.IPFP/groups/'
create_group_marker_file(dir_folder)
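
The three helpers added above cooperate around a single marker file, `group_names_finished.pkl`, stored next to the group files. A minimal sketch of that flow, using the group name composed in `save_trials_as_group` (the concrete name is only an example):

```
from group_results import check_group_existence, update_group_marker

name_group = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/groups/ged_mats.MUTAG.num_sols_20.ratio_0.10.npy'

if not check_group_existence(name_group):   # a marker entry or an existing .npy both count as done
    # ... group the per-trial files into name_group here ...
    update_group_marker(name_group)         # record the group in groups/group_names_finished.pkl
```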

gklearn/experiments/ged/stability/run_job_edit_costs.nums_sols.ratios.IPFP.py → gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py

@@ -15,30 +15,30 @@ def get_job_script(arg):


#SBATCH --exclusive
#SBATCH --job-name="st.""" + arg + r""".IPFP"
#SBATCH --partition=tlong
#SBATCH --partition=court
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt"
#SBATCH --error="errors/error_edit_costs.nums_sols.ratios.IPFP.""" + arg + """.txt"
#SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.IPFP.""" + arg + """.txt"
#SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.IPFP.""" + arg + """.txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=300:00:00
#SBATCH --time=48:00:00
#SBATCH --mem-per-cpu=4000


srun hostname
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability
srun python3 edit_costs.nums_sols.ratios.IPFP.py """ + arg
srun python3 edit_costs.real_data.nums_sols.ratios.IPFP.py """ + arg
script = script.strip()
script = re.sub('\n\t+', '\n', script)
script = re.sub('\n +', '\n', script)
return script


if __name__ == '__main__':
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
for ds_name in [ds_list[i] for i in [0, 3]]:
ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]:
job_script = get_job_script(ds_name)
command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# print(command)

gklearn/experiments/ged/stability/utils.py (+237, -12)

@@ -5,26 +5,251 @@ Created on Thu Oct 29 19:17:36 2020


@author: ljia
"""
from gklearn.utils import Dataset
import os
import pickle
import numpy as np
from tqdm import tqdm
import sys
from gklearn.dataset import Dataset
from gklearn.experiments import DATASET_ROOT




def get_dataset(ds_name):
# The node/edge labels that will not be used in the computation.
if ds_name == 'MAO':
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
elif ds_name == 'Monoterpenoides':
irrelevant_labels = {'edge_labels': ['valence']}
elif ds_name == 'MUTAG':
irrelevant_labels = {'edge_labels': ['label_0']}
elif ds_name == 'AIDS_symb':
# if ds_name == 'MAO':
# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
# if ds_name == 'Monoterpenoides':
# irrelevant_labels = {'edge_labels': ['valence']}
# elif ds_name == 'MUTAG':
# irrelevant_labels = {'edge_labels': ['label_0']}
if ds_name == 'AIDS_symb':
irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']}
ds_name = 'AIDS'
else:
irrelevant_labels = {}


# Initialize a Dataset.
dataset = Dataset()
# Load predefined dataset.
dataset.load_predefined_dataset(ds_name)
dataset = Dataset(ds_name, root=DATASET_ROOT)
# Remove irrelevant labels.
dataset.remove_labels(**irrelevant_labels)
print('dataset size:', len(dataset.graphs))
return dataset


def set_edit_cost_consts(ratio, node_labeled=True, edge_labeled=True, mode='uniform'):
if mode == 'uniform':
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]

if not node_labeled:
edit_cost_constants[2] = 0
if not edge_labeled:
edit_cost_constants[5] = 0

return edit_cost_constants


def nested_keys_exists(element, *keys):
'''
Check if *keys (nested) exists in `element` (dict).
'''
if not isinstance(element, dict):
raise AttributeError('keys_exists() expects dict as first argument.')
if len(keys) == 0:
raise AttributeError('keys_exists() expects at least two arguments, one given.')

_element = element
for key in keys:
try:
_element = _element[key]
except KeyError:
return False
return True



# Check average relative error along elements in two ged matrices.
def matrices_ave_relative_error(m1, m2):
error = 0
base = 0
for i in range(m1.shape[0]):
for j in range(m1.shape[1]):
error += np.abs(m1[i, j] - m2[i, j])
base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2

return error / base


def compute_relative_error(ged_mats):

if len(ged_mats) != 0:
# get the smallest "correct" GED matrix.
ged_mat_s = np.ones(ged_mats[0].shape) * np.inf
for i in range(ged_mats[0].shape[0]):
for j in range(ged_mats[0].shape[1]):
ged_mat_s[i, j] = np.min([mat[i, j] for mat in ged_mats])

# compute average error.
errors = []
for i, mat in enumerate(ged_mats):
err = matrices_ave_relative_error(mat, ged_mat_s)
# if not per_correct:
# print('matrix # ', str(i))
# pass
errors.append(err)
else:
errors = [0]

return np.mean(errors)


def parse_group_file_name(fn):
splits_all = fn.split('.')
key1 = splits_all[1]

pos2 = splits_all[2].rfind('_')
# key2 = splits_all[2][:pos2]
val2 = splits_all[2][pos2+1:]

pos3 = splits_all[3].rfind('_')
# key3 = splits_all[3][:pos3]
val3 = splits_all[3][pos3+1:] + '.' + splits_all[4]

return key1, val2, val3


def get_all_errors(save_dir, errors):

# Loop for each GED matrix file.
for file in tqdm(sorted(os.listdir(save_dir)), desc='Getting errors', file=sys.stdout):
if os.path.isfile(os.path.join(save_dir, file)) and file.startswith('ged_mats.'):
keys = parse_group_file_name(file)

# Check if the results is in the errors.
if not keys[0] in errors:
errors[keys[0]] = {}
if not keys[1] in errors[keys[0]]:
errors[keys[0]][keys[1]] = {}
# Compute the error if not exist.
if not keys[2] in errors[keys[0]][keys[1]]:
ged_mats = np.load(os.path.join(save_dir, file))
errors[keys[0]][keys[1]][keys[2]] = compute_relative_error(ged_mats)

return errors


def get_relative_errors(save_dir, overwrite=False):
""" # Read relative errors from previous computed and saved file. Create the
file, compute the errors, or add and save the new computed errors to the
file if necessary.

Parameters
----------
save_dir : TYPE
DESCRIPTION.
overwrite : TYPE, optional
DESCRIPTION. The default is False.

Returns
-------
None.
"""
if not overwrite:
fn_err = save_dir + '/relative_errors.pkl'

# If error file exists.
if os.path.isfile(fn_err):
with open(fn_err, 'rb') as f:
errors = pickle.load(f)
errors = get_all_errors(save_dir, errors)
else:
errors = get_all_errors(save_dir, {})

else:
errors = get_all_errors(save_dir, {})

with open(fn_err, 'wb') as f:
pickle.dump(errors, f)

return errors


def interpolate_result(Z, method='linear'):
values = Z.copy()
for i in range(Z.shape[0]):
for j in range(Z.shape[1]):
if np.isnan(Z[i, j]):

# Get the nearest non-nan values.
x_neg = np.nan
for idx, val in enumerate(Z[i, :][j::-1]):
if not np.isnan(val):
x_neg = val
x_neg_off = idx
break
x_pos = np.nan
for idx, val in enumerate(Z[i, :][j:]):
if not np.isnan(val):
x_pos = val
x_pos_off = idx
break

# Interpolate.
if not np.isnan(x_neg) and not np.isnan(x_pos):
val_int = (x_pos_off / (x_neg_off + x_pos_off)) * (x_neg - x_pos) + x_pos
values[i, j] = val_int
break

y_neg = np.nan
for idx, val in enumerate(Z[:, j][i::-1]):
if not np.isnan(val):
y_neg = val
y_neg_off = idx
break
y_pos = np.nan
for idx, val in enumerate(Z[:, j][i:]):
if not np.isnan(val):
y_pos = val
y_pos_off = idx
break

# Interpolate.
if not np.isnan(y_neg) and not np.isnan(y_pos):
val_int = (y_pos_off / (y_neg_off + y_neg_off)) * (y_neg - y_pos) + y_pos
values[i, j] = val_int
break

return values


def set_axis_style(ax):
ax.set_axisbelow(True)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
ax.tick_params(labelsize=8, color='w', pad=1, grid_color='w')
ax.tick_params(axis='x', pad=-2)
ax.tick_params(axis='y', labelrotation=-40, pad=-2)
# ax.zaxis._axinfo['juggled'] = (1, 2, 0)
ax.set_xlabel(ax.get_xlabel(), fontsize=10, labelpad=-3)
ax.set_ylabel(ax.get_ylabel(), fontsize=10, labelpad=-2, rotation=50)
ax.set_zlabel(ax.get_zlabel(), fontsize=10, labelpad=-2)
ax.set_title(ax.get_title(), pad=30, fontsize=15)
return


if __name__ == '__main__':
root_dir = 'outputs/CRIANN/'
# for dir_ in sorted(os.listdir(root_dir)):
# if os.path.isdir(root_dir):
# full_dir = os.path.join(root_dir, dir_)
# print('---', full_dir,':')
# save_dir = os.path.join(full_dir, 'groups/')
# if os.path.exists(save_dir):
# try:
# get_relative_errors(save_dir)
# except Exception as exp:
# print('An exception occured when running this experiment:')
# print(repr(exp))
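
Two quick worked examples of the new helpers above, evaluated by hand from the code as written (the inputs are only illustrative):

```
# Uniform edit costs with ratio 10 for a dataset with unlabeled nodes and labeled edges:
set_edit_cost_consts(10, node_labeled=False, edge_labeled=True)
# -> [10, 10, 0, 1, 1, 1]  (node substitution cost zeroed because nodes are unlabeled)

# Parsing a grouped GED-matrix file name produced by the stability experiments:
parse_group_file_name('ged_mats.MUTAG.num_sols_20.ratio_0.10.npy')
# -> ('MUTAG', '20', '0.10')  (dataset, number of solutions, ratio)
```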

gklearn/experiments/thesis/ged/fit_distances/README.md (+7, -0)

@@ -4,8 +4,15 @@
```
python3 -m pip install graphkit-learn
python3 run_xp.py
```

Plot results in a figure and LaTeX tables:
```
python3 ged_fit_distance_results_plot.py
``` ```


# Run xp (deprecated).
```
export PYTHONPATH="/path/to/gedlibpy:/path/to/py-graph"
python optim_costs.py dataset output_file
```

gklearn/experiments/thesis/ged/fit_distances/ged_fit_distance_results.eps (+4459, -0; file diff suppressed because it is too large)


gklearn/experiments/thesis/ged/fit_distances/ged_fit_distance_results_plot.py (+21, -5)

@@ -10,6 +10,9 @@ import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# import matplotlib as mpl
# mpl.rcParams['text.usetex'] = True
# mpl.rcParams['text.latex.preamble'] = [r'\usepackage{amsmath}'] #for \text command




def rounder(x, decimals):
@@ -54,7 +57,7 @@ def df_to_latex_table(df, replace_header=True, end_mid_line=7):
i_end = ltx.find('\\\\\n\\midrule\n')
replace = r"""\begin{tabular}{lll@{~~}c@{~~}c@{~~}c@{~~}c}
\toprule
\multirow{2}[2]{*}{\textbf{Dataset}} & \multirow{2}[2]{*}{\textbf{Distance}} & \multirow{2}[2]{*}{\textbf{Method}} & \multicolumn{2}{c}{\textbf{BIPARTITE}} & \multicolumn{2}{c}{\textbf{IPFP}} \\
\multirow{2}[2]{*}{\textbf{Dataset}} & \multirow{2}[2]{*}{\textbf{Distance}} & \multirow{2}[2]{*}{\textbf{Method}} & \multicolumn{2}{c}{\textbf{bipartite}} & \multicolumn{2}{c}{\textbf{IPFP}} \\
\cmidrule(lr){4-5}\cmidrule(lr){6-7}
& & & \textbf{Train errors} & \textbf{Test errors} & \textbf{Train errors} & \textbf{Test errors} \\
\midrule
@@ -95,6 +98,9 @@ def beautify_df(df):
for idx, index in enumerate(min_indices):
df.loc[(ds, gk, index), min_labels[idx]] = '\\textbf{' + df.loc[(ds, gk, index), min_labels[idx]] + '}'


# Rename indices.
df.index.set_levels([r'Euclidean', r'Manhattan'], level=1, inplace=True)

return df




@@ -118,6 +124,11 @@ def params_to_latex_table(results):
df.loc[idx_r, idx_c] = '-'


# df = beautify_df(df)
# Rename indices.
# df.index.set_levels([r'\texttt{bipartite}', r'\texttt{IPFP}'], level=1, inplace=True)
df.index.set_levels([r'bipartite', r'IPFP'], level=1, inplace=True)
df.index.set_levels([r'Euclidean', r'Manhattan'], level=2, inplace=True)

ltx = df_to_latex_table(df, replace_header=False, end_mid_line=9)
return ltx


@@ -208,14 +219,11 @@ def print_table_results(results_by_xp):
tab.append(["Method", "App","Test"])
#setups = ["random","expert","fitted"]



for i,setup in enumerate(results_by_xp.keys()):
current_line = [setup]
p = results_by_xp[setup]
current_line.append(f"{p['mean'][0]:.2f} +- {p['interval'][0]:.2f}")

current_line.append(f"{p['mean'][1]:.2f} +- {p['interval'][1]:.2f}")

tab.append(current_line)


print(tabulate(tab, headers="firstrow"))
@@ -342,6 +350,13 @@ def set_figure(nb_rows):
return fig




def get_title(edit_cost, distance):
ed = 'bipartite' if edit_cost == 'BIPARTITE' else 'IPFP'
# ed = r'\texttt{' + ed + r'}'
dis = distance[0].upper() + distance[1:]
return ed + ', ' + dis


if __name__ == '__main__':
from sklearn.model_selection import ParameterGrid
import pickle
@@ -370,7 +385,8 @@ if __name__ == '__main__':
for col, contents in enumerate(row_grid_list):
ax = fig.add_subplot(gs[row, col])
y_label = (ds_name[:-10] if ds_name.endswith('_unlabeled') else ds_name) if col == 0 else ''
title = contents['edit_cost'] + ', ' + contents['distance'] if row == 0 else ''

title = get_title(contents['edit_cost'], contents['distance']) if row == 0 else ''
p, c = plot_a_task(ax, ds_name, contents['edit_cost'], contents['distance'], title, y_label)
results[(ds_name, contents['distance'], contents['edit_cost'])] = p
params[(ds_name, contents['distance'], contents['edit_cost'])] = c
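
A tiny worked example of the new `get_title` helper above, evaluated from the code as written (inputs are only examples):

```
get_title('BIPARTITE', 'euclidean')   # -> 'bipartite, Euclidean'
get_title('IPFP', 'manhattan')        # -> 'IPFP, Manhattan'
```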


gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp.py (+68, -33)

@@ -10,6 +10,7 @@ This script compares the results with and without FCSP.
from gklearn.dataset import Dataset
from gklearn.utils import get_graph_kernel_by_name
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.experiments import DATASET_ROOT
import functools
import os
import pickle
@@ -17,50 +18,77 @@ import sys
import logging




def run_all(fcsp):
save_dir = 'outputs/' + ('fscp' if fcsp == True else 'naive') + '/'
os.makedirs(save_dir, exist_ok=True)
# def run_all(fcsp):

# from sklearn.model_selection import ParameterGrid

# Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
# 'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
# 'Letter-high', 'Letter-med', 'Letter-low',
# 'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
# 'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
# 'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
# 'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
# 'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
# 'Mutagenicity', 'REDDIT-BINARY']

# Kernel_List = ['ShortestPath', 'StructuralSP']

# task_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})

# for task in list(task_grid):


from sklearn.model_selection import ParameterGrid
# save_file_suffix = '.' + task['kernel'] + '.' + task['dataset']
# file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
# if not os.path.isfile(file_name):
# print()
# print((task['kernel'], task['dataset']))


Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
'PAH_unlabeled', 'PAH', 'MUTAG', 'Letter-high', 'Letter-med', 'Letter-low',
'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
'Mutagenicity', 'REDDIT-BINARY']
# try:
# gram_matrix, run_time = compute(task['kernel'], task['dataset'], fcsp)


Kernel_List = ['ShortestPath', 'StructuralSP']
# except Exception as exp:
# print('An exception occured when running this experiment:')
# LOG_FILENAME = save_dir + 'error.txt'
# logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
# logging.exception('\n--------------' + save_file_suffix + '------------------')
# print(repr(exp))
# else:
# save_file_suffix = '.' + task['kernel'] + task['dataset']


work_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})
# with open(file_name, 'wb') as f:
# pickle.dump(run_time, f)


for work in list(work_grid):


save_file_suffix = '.' + work['kernel'] + '.' + work['dataset']
file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
if not os.path.isfile(file_name):
print()
print((work['kernel'], work['dataset']))


try:
gram_matrix, run_time = run_work(work['kernel'], work['dataset'], fcsp)
except Exception as exp:
print('An exception occured when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception(save_file_suffix)
print(repr(exp))
def run_task(kernel_name, ds_name, fcsp):
save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')


save_file_suffix = '.' + work['kernel'] + work['dataset']
if not os.path.isfile(file_name):
print()
print((kernel_name, ds_name, str(fcsp)))


try:
gram_matrix, run_time = compute(kernel_name, ds_name, fcsp)

except Exception as exp:
print('An exception occured when running this experiment:')
LOG_FILENAME = os.path.join(save_dir, 'error' + save_file_suffix + '.txt')
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception('\n--------------' + save_file_suffix + '------------------')
print(repr(exp))

else:
with open(file_name, 'wb') as f:
pickle.dump(run_time, f)




def run_work(kernel_name, ds_name, fcsp):
dataset = Dataset(ds_name, verbose=True)
def compute(kernel_name, ds_name, fcsp):
dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
if kernel_name == 'ShortestPath':
dataset.trim_dataset(edge_required=True)



mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
@@ -87,8 +115,15 @@ def run_work(kernel_name, ds_name, fcsp):


if __name__ == '__main__':
if len(sys.argv) > 1:
fcsp = True if sys.argv[1] == 'True' else False
kernel_name = sys.argv[1]
ds_name = sys.argv[2]
fcsp = True if sys.argv[3] == 'True' else False
else:
kernel_name = 'ShortestPath'
ds_name = 'Acyclic'
fcsp = True
run_all(fcsp)


save_dir = 'outputs/'
os.makedirs(save_dir, exist_ok=True)

run_task(kernel_name, ds_name, fcsp)
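
The entry point now runs one task per invocation instead of a full sweep. A minimal sketch of driving a single configuration directly (assumes this file is importable as a module named `compare_fcsp` and that the datasets resolve under `DATASET_ROOT`):

```
import compare_fcsp

compare_fcsp.save_dir = 'outputs/'                       # run_task() reads this module-level name
compare_fcsp.run_task('ShortestPath', 'Acyclic', True)   # writes outputs/run_time.ShortestPath.Acyclic.True.pkl on success
```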

gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp_space.py (+98, -0)

@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 2 17:41:54 2020

@author: ljia

This script compares the results with and without FCSP.
"""
from gklearn.dataset import Dataset
from shortest_path import SPSpace
from structural_sp import SSPSpace
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.experiments import DATASET_ROOT
import functools
import os
import pickle
import sys
import logging


def run_task(kernel_name, ds_name, fcsp):
save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
file_name = os.path.join(save_dir, 'space' + save_file_suffix + '.pkl')

# Return if the task is already completed.
if os.path.isfile(file_name):
with open(file_name, 'rb') as f:
data = pickle.load(f)
if data['completed']:
return

print()
print((kernel_name, ds_name, str(fcsp)))

try:
gram_matrix, run_time = compute(kernel_name, ds_name, fcsp, file_name)

except Exception as exp:
print('An exception occured when running this experiment:')
LOG_FILENAME = os.path.join(save_dir, 'error.space' + save_file_suffix + '.txt')
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception('\n--------------' + save_file_suffix + '------------------')
print(repr(exp))

# else:
# with open(file_name, 'wb') as f:
# pickle.dump(run_time, f)


def compute(kernel_name, ds_name, fcsp, file_name):
dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
if kernel_name == 'ShortestPath':
dataset.trim_dataset(edge_required=True)
# dataset.cut_graphs(range(0, 10))
kernel_class = SPSpace
else:
# dataset.cut_graphs(range(0, 10))
kernel_class = SSPSpace

mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
edge_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}

graph_kernel = kernel_class(name=kernel_name,
node_labels=dataset.node_labels,
edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs,
edge_attrs=dataset.edge_attrs,
ds_infos=dataset.get_dataset_infos(keys=['directed']),
fcsp=fcsp,
compute_method='naive',
node_kernels=node_kernels,
edge_kernels=edge_kernels,
file_name=file_name
)
gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
parallel=None,
normalize=False,
verbose=2
)
return gram_matrix, run_time


if __name__ == '__main__':
if len(sys.argv) > 1:
kernel_name = sys.argv[1]
ds_name = sys.argv[2]
fcsp = True if sys.argv[3] == 'True' else False
else:
kernel_name = 'StructuralSP'
ds_name = 'Fingerprint'
fcsp = True

save_dir = 'outputs/'
os.makedirs(save_dir, exist_ok=True)

run_task(kernel_name, ds_name, fcsp)
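
Unlike `compare_fcsp.py`, a task here only counts as finished when the saved pickle carries a `completed` flag, as read by `run_task` above. A minimal sketch of inspecting one result file (the path is only an example; the `completed` key is assumed to be written by `SPSpace`/`SSPSpace`):

```
import pickle

with open('outputs/space.ShortestPath.Acyclic.True.pkl', 'rb') as f:
    data = pickle.load(f)
print(data['completed'])   # True once the space measurement ran to the end
```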

gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py (+143, -16)

@@ -10,27 +10,86 @@ import os
import re




def get_job_script(param):
OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'),
('StructuralSP', 'ENZYMES', 'True'),
('StructuralSP', 'ENZYMES', 'False'),
('StructuralSP', 'AIDS', 'False'),
('ShortestPath', 'NCI1', 'False'),
('StructuralSP', 'NCI1', 'True'),
('StructuralSP', 'NCI1', 'False'),
('ShortestPath', 'NCI109', 'False'),
('StructuralSP', 'NCI109', 'True'),
('StructuralSP', 'NCI109', 'False'),
('ShortestPath', 'DD', 'True'),
('ShortestPath', 'DD', 'False'),
('StructuralSP', 'BZR', 'False'),
('ShortestPath', 'COX2', 'False'),
('StructuralSP', 'COX2', 'False'),
('ShortestPath', 'DHFR', 'False'),
('StructuralSP', 'DHFR', 'False'),
('StructuralSP', 'OHSU', 'True'),
('StructuralSP', 'OHSU', 'False'),
('StructuralSP', 'SYNTHETIC', 'False'),
('StructuralSP', 'SYNTHETIC', 'True'),
('StructuralSP', 'SYNTHETIC', 'False'),
('ShortestPath', 'SYNTHETICnew', 'False'),
('StructuralSP', 'SYNTHETICnew', 'True'),
('StructuralSP', 'SYNTHETICnew', 'False'),
('ShortestPath', 'Synthie', 'False'),
('StructuralSP', 'Synthie', 'True'),
('StructuralSP', 'Synthie', 'False'),
('ShortestPath', 'COIL-DEL', 'False'),
('StructuralSP', 'COIL-DEL', 'True'),
('StructuralSP', 'COIL-DEL', 'False'),
('ShortestPath', 'PROTEINS', 'False'),
('ShortestPath', 'PROTEINS_full', 'False'),
('StructuralSP', 'Mutagenicity', 'True'),
('StructuralSP', 'Mutagenicity', 'False'),
('StructuralSP', 'REDDIT-BINARY', 'True'),
('StructuralSP', 'REDDIT-BINARY', 'False'),
})

OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'),
('StructuralSP', 'DD', 'False'),
('StructuralSP', 'PROTEINS', 'True'),
('StructuralSP', 'PROTEINS', 'False'),
('StructuralSP', 'PROTEINS_full', 'True'),
('StructuralSP', 'PROTEINS_full', 'False'),
('ShortestPath', 'REDDIT-BINARY', 'True'),
('ShortestPath', 'TWITTER-Real-Graph-Partial', 'True'),
('ShortestPath', 'TWITTER-Real-Graph-Partial', 'False'),
('StructuralSP', 'TWITTER-Real-Graph-Partial', 'True'),
})

MISS_LABEL_LIST = set({('StructuralSP', 'GREC', 'True'),
('StructuralSP', 'GREC', 'False'),
('StructuralSP', 'Web', 'True'),
('StructuralSP', 'Web', 'False'),
})


def get_job_script(kernel, dataset, fcsp):
script = r"""
#!/bin/bash


#SBATCH --exclusive
#SBATCH --job-name="fcsp.""" + param + r""""
#SBATCH --partition=long
##SBATCH --exclusive
#SBATCH --job-name="fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""""
#SBATCH --partition=tlong
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_fcsp.""" + param + r""".txt"
#SBATCH --error="errors/error_fcsp.""" + param + r""".txt"
#SBATCH --output="outputs/output_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#SBATCH --error="errors/error_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=100:00:00
#SBATCH --mem-per-cpu=4000
#SBATCH --time=300:00:00
##SBATCH --mem-per-cpu=4000
#SBATCH --mem=40000


srun hostname
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
srun python3 compare_fcsp.py """ + param
srun python3 compare_fcsp.py """ + kernel + r" " + dataset + r" " + fcsp
script = script.strip()
script = re.sub('\n\t+', '\n', script)
script = re.sub('\n +', '\n', script)
@@ -38,15 +97,83 @@ srun python3 compare_fcsp.py """ + param
return script




def check_task_status(save_dir, *params):
str_task_id = '.' + '.'.join(params)

# Check if the task is in out of memeory or out of space lists or missing labels.
if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
return True

# Check if the task is running or in queue of slurm.
command = 'squeue --user $USER --name "fcsp' + str_task_id + '" --format "%.2t" --noheader'
stream = os.popen(command)
output = stream.readlines()
if len(output) > 0:
return True

# Check if there are more than 10 tlong tasks running.
command = 'squeue --user $USER --partition tlong --noheader'
stream = os.popen(command)
output = stream.readlines()
if len(output) >= 10:
return True


# Check if the results are already computed.
file_name = os.path.join(save_dir, 'run_time' + str_task_id + '.pkl')
if os.path.isfile(file_name):
return True

return False


if __name__ == '__main__':
save_dir = 'outputs/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs('outputs/', exist_ok=True)
os.makedirs('errors/', exist_ok=True)


param_list = ['True', 'False']
for param in param_list[:]:
job_script = get_job_script(param)
command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# print(command)
os.system(command)
from sklearn.model_selection import ParameterGrid

Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
'Letter-high', 'Letter-med', 'Letter-low',
'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
# new: not so large.
'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
# new: large.
'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
'COLORS-3', 'DBLP_v1', 'REDDIT-MULTI-12K',
'REDDIT-MULTI-12K', 'REDDIT-MULTI-12K',
'REDDIT-MULTI-12K', 'MSRC_9', 'MSRC_21', 'MSRC_21C',
'COLLAB', 'COIL-DEL',
'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K',
'REDDIT-MULTI-12K']

Kernel_List = ['ShortestPath', 'StructuralSP']

fcsp_list = ['True', 'False']

task_grid = ParameterGrid({'kernel': Kernel_List[:],
'dataset': Dataset_List[:],
'fcsp': fcsp_list[:]})

from tqdm import tqdm

for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):

if False == check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# print(command)
os.system(command)
# os.popen(command)
# output = stream.readlines()
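
The skip lists above feed `check_task_status`, which is what keeps already-problematic configurations from being resubmitted. A tiny illustrative lookup, with the tuple format matching the `(kernel, dataset, fcsp)` strings used in the job grid:

```
params = ('StructuralSP', 'DD', 'True')
print(params in OUT_MEM_LIST)   # True, so check_task_status() treats the task as handled and no job is submitted
```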

gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py (+268, -0)

@@ -0,0 +1,268 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 14 11:49:43 2020

@author: ljia
"""

import os
import re
import pickle


OUT_TIME_LIST = []


OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'),
('ShortestPath', 'REDDIT-BINARY', 'False'),
('StructuralSP', 'ENZYMES', 'False'),
('ShortestPath', 'DD', 'True'),
('ShortestPath', 'DD', 'False'),
('StructuralSP', 'DD', 'True'),
('StructuralSP', 'DD', 'False'),
('StructuralSP', 'COIL-DEL', 'True'),
('ShortestPath', 'COLORS-3', 'True'),
('ShortestPath', 'COLORS-3', 'False'),
('StructuralSP', 'COLORS-3', 'True'),
('StructuralSP', 'COLORS-3', 'False'),
('StructuralSP', 'PROTEINS', 'True'),
('StructuralSP', 'PROTEINS', 'False'),
('StructuralSP', 'PROTEINS_full', 'True'),
('StructuralSP', 'PROTEINS_full', 'False'),
('StructuralSP', 'MSRC_21', 'False'),
('ShortestPath', 'MCF-7', 'True'),
('ShortestPath', 'MCF-7', 'False'),
('StructuralSP', 'MCF-7', 'True'),
('StructuralSP', 'MCF-7', 'False'),
('ShortestPath', 'MCF-7H', 'True'),
('ShortestPath', 'MCF-7H', 'False'),
('StructuralSP', 'MCF-7H', 'True'),
('StructuralSP', 'MCF-7H', 'False'),
('ShortestPath', 'MOLT-4', 'True'),
('ShortestPath', 'MOLT-4', 'False'),
('StructuralSP', 'MOLT-4', 'True'),
('StructuralSP', 'MOLT-4', 'False'),
('ShortestPath', 'MOLT-4H', 'True'),
('ShortestPath', 'MOLT-4H', 'False'),
('StructuralSP', 'MOLT-4H', 'True'),
('StructuralSP', 'MOLT-4H', 'False'),
('ShortestPath', 'P388', 'True'),
('ShortestPath', 'P388', 'False'),
('StructuralSP', 'P388', 'True'),
('StructuralSP', 'P388', 'False'),
('ShortestPath', 'P388H', 'True'),
('ShortestPath', 'P388H', 'False'),
('StructuralSP', 'P388H', 'True'),
('StructuralSP', 'P388H', 'False'),
('ShortestPath', 'NCI-H23', 'True'),
('ShortestPath', 'NCI-H23', 'False'),
('StructuralSP', 'NCI-H23', 'True'),
('StructuralSP', 'NCI-H23', 'False'),
('ShortestPath', 'NCI-H23H', 'True'),
('ShortestPath', 'NCI-H23H', 'False'),
('StructuralSP', 'NCI-H23H', 'True'),
('StructuralSP', 'NCI-H23H', 'False'),
('ShortestPath', 'OVCAR-8', 'True'),
('ShortestPath', 'OVCAR-8', 'False'),
('StructuralSP', 'OVCAR-8', 'True'),
('StructuralSP', 'OVCAR-8', 'False'),
('ShortestPath', 'OVCAR-8H', 'True'),
('ShortestPath', 'OVCAR-8H', 'False'),
('StructuralSP', 'OVCAR-8H', 'True'),
('StructuralSP', 'OVCAR-8H', 'False'),
('ShortestPath', 'SN12C', 'True'),
('ShortestPath', 'SN12C', 'False'),
('StructuralSP', 'SN12C', 'True'),
('StructuralSP', 'SN12C', 'False'),
('ShortestPath', 'SN12CH', 'True'),
('ShortestPath', 'SN12CH', 'False'),
('ShortestPath', 'SF-295', 'True'),
('ShortestPath', 'SF-295', 'False'),
('StructuralSP', 'SF-295', 'True'),
('StructuralSP', 'SF-295', 'False'),
('ShortestPath', 'SF-295H', 'True'),
('ShortestPath', 'SF-295H', 'False'),
('StructuralSP', 'SF-295H', 'True'),
('StructuralSP', 'SF-295H', 'False'),
('ShortestPath', 'SW-620', 'True'),
('ShortestPath', 'SW-620', 'False'),
('StructuralSP', 'SW-620', 'True'),
('StructuralSP', 'SW-620', 'False'),
('ShortestPath', 'SW-620H', 'True'),
('ShortestPath', 'SW-620H', 'False'),
('StructuralSP', 'SW-620H', 'True'),
('StructuralSP', 'SW-620H', 'False'),
('ShortestPath', 'TRIANGLES', 'True'),
('ShortestPath', 'TRIANGLES', 'False'),
('StructuralSP', 'TRIANGLES', 'True'),
('StructuralSP', 'TRIANGLES', 'False'),
('ShortestPath', 'Yeast', 'True'),
('ShortestPath', 'Yeast', 'False'),
('StructuralSP', 'Yeast', 'True'),
('StructuralSP', 'Yeast', 'False'),
('ShortestPath', 'YeastH', 'True'),
('ShortestPath', 'YeastH', 'False'),
('StructuralSP', 'YeastH', 'True'),
('StructuralSP', 'YeastH', 'False'),
('ShortestPath', 'FRANKENSTEIN', 'True'),
('ShortestPath', 'FRANKENSTEIN', 'False'),
('StructuralSP', 'FRANKENSTEIN', 'True'),
('StructuralSP', 'FRANKENSTEIN', 'False'),
('StructuralSP', 'SN12CH', 'True'),
('StructuralSP', 'SN12CH', 'False'),
('ShortestPath', 'UACC257', 'True'),
('ShortestPath', 'UACC257', 'False'),
('StructuralSP', 'UACC257', 'True'),
('StructuralSP', 'UACC257', 'False'),
('ShortestPath', 'UACC257H', 'True'),
('ShortestPath', 'UACC257H', 'False'),
('StructuralSP', 'UACC257H', 'True'),
('StructuralSP', 'UACC257H', 'False'),
('ShortestPath', 'PC-3', 'True'),
('ShortestPath', 'PC-3', 'False'),
('StructuralSP', 'PC-3', 'True'),
('StructuralSP', 'PC-3', 'False'),
('ShortestPath', 'PC-3H', 'True'),
('ShortestPath', 'PC-3H', 'False'),
('StructuralSP', 'PC-3H', 'True'),
('StructuralSP', 'PC-3H', 'False'),
('ShortestPath', 'DBLP_v1', 'True'),
('ShortestPath', 'DBLP_v1', 'False'),
('StructuralSP', 'DBLP_v1', 'True'),
('ShortestPath', 'COLLAB', 'True'),
('ShortestPath', 'COLLAB', 'False'),
('StructuralSP', 'COLLAB', 'True'),
('StructuralSP', 'COLLAB', 'False'),
('ShortestPath', 'REDDIT-BINARY', 'False'),
('StructuralSP', 'REDDIT-BINARY', 'True'),
('StructuralSP', 'REDDIT-BINARY', 'False'),
('ShortestPath', 'REDDIT-MULTI-5K', 'True'),
('ShortestPath', 'REDDIT-MULTI-5K', 'False'),
('StructuralSP', 'REDDIT-MULTI-5K', 'True'),
('StructuralSP', 'REDDIT-MULTI-5K', 'False'),
('ShortestPath', 'REDDIT-MULTI-12K', 'True'),
('ShortestPath', 'REDDIT-MULTI-12K', 'False'),
('StructuralSP', 'REDDIT-MULTI-12K', 'True'),
('StructuralSP', 'REDDIT-MULTI-12K', 'False'),
('ShortestPath', 'TWITTER-Real-Graph-Partial', 'True'),
('ShortestPath', 'TWITTER-Real-Graph-Partial', 'False'),
('StructuralSP', 'TWITTER-Real-Graph-Partial', 'True'),
('StructuralSP', 'TWITTER-Real-Graph-Partial', 'False'),
})

MISS_LABEL_LIST = set({('StructuralSP', 'GREC', 'True'),
('StructuralSP', 'GREC', 'False'),
('StructuralSP', 'Web', 'True'),
('StructuralSP', 'Web', 'False'),
})


def get_job_script(kernel, dataset, fcsp):
# if (kernel, dataset, fcsp) in OUT_MEM_LIST:
# mem = '2560000'
# else:
mem = '4000'
script = r"""
#!/bin/bash

##SBATCH --exclusive
#SBATCH --job-name="fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""""
#SBATCH --partition=""" + (r"court" if kernel == 'ShortestPath' else r"court") + r"""
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#SBATCH --error="errors/error_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=""" + (r"48" if kernel == 'ShortestPath' else r"48") + r""":00:00
##SBATCH --mem-per-cpu=""" + mem + r"""
#SBATCH --mem=4000

srun hostname
cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
srun python3 compare_fcsp_space.py """ + kernel + r" " + dataset + r" " + fcsp
script = script.strip()
script = re.sub('\n\t+', '\n', script)
script = re.sub('\n +', '\n', script)

return script
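# A quick sanity check (hypothetical usage, assuming this module is imported
# interactively): preview the generated SLURM script for one task before
# submitting anything, e.g.
#
#   print(get_job_script('ShortestPath', 'MUTAG', 'True'))
#
# which should show a job named "fcsp.space.ShortestPath.MUTAG.True" that runs
# "python3 compare_fcsp_space.py ShortestPath MUTAG True" on a single CPU.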


def check_task_status(save_dir, *params):
str_task_id = '.' + '.'.join(params)

# Check if the task is in the out-of-memory, out-of-time, or missing-label lists.
if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
return True

# Check if the task is running or queued in SLURM.
command = 'squeue --user $USER --name "fcsp.space' + str_task_id + '" --format "%.2t" --noheader'
stream = os.popen(command)
output = stream.readlines()
if len(output) > 0:
return True

# Check if the task is already computed.
file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl')
if os.path.isfile(file_name):
with open(file_name, 'rb') as f:
data = pickle.load(f)
if data['completed']:
return True

return False
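# Note on the checks above (illustrative): the squeue query matches the job
# name set in get_job_script, i.e. "fcsp.space.<kernel>.<dataset>.<fcsp>", and
# the pickle is the result file presumably written by compare_fcsp_space.py;
# e.g. for the task ('ShortestPath', 'MUTAG', 'True') the file checked is
# "outputs/space.ShortestPath.MUTAG.True.pkl" and its boolean 'completed' key.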


if __name__ == '__main__':
save_dir = 'outputs/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs('outputs/', exist_ok=True)
os.makedirs('errors/', exist_ok=True)

from sklearn.model_selection import ParameterGrid

Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
'Letter-high', 'Letter-med', 'Letter-low',
'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
# new: not so large.
'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
# new: large.
'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
'COLORS-3', 'DBLP_v1',
'MSRC_9', 'MSRC_21', 'MSRC_21C',
'COLLAB', 'COIL-DEL',
'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K',
'REDDIT-MULTI-12K']

Kernel_List = ['ShortestPath', 'StructuralSP']

fcsp_list = ['True', 'False']

task_grid = ParameterGrid({'kernel': Kernel_List[:],
'dataset': Dataset_List[:],
'fcsp': fcsp_list[:]})

from tqdm import tqdm

for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):

if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# print(command)
os.system(command)
# os.popen(command)
# output = stream.readlines()
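# A minimal dry run (hypothetical): to verify the task grid without submitting
# anything, replace os.system(command) above with print(command), or preview a
# few generated scripts directly:
#
#   for task in list(task_grid)[:3]:
#       print(get_job_script(task['kernel'], task['dataset'], task['fcsp']))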

+ 253
- 0
gklearn/experiments/thesis/graph_kernels/fcsp/shortest_path.py View File

@@ -0,0 +1,253 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 15:24:58 2020

@author: ljia

@references:

[1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data
Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
"""

import sys
from itertools import product
# from functools import partial
from gklearn.utils import get_iters
import numpy as np
from gklearn.utils.utils import getSPGraph
from gklearn.kernels import ShortestPath
import os
import pickle
from pympler import asizeof
import time
import networkx as nx


def load_results(file_name, fcsp):
if os.path.isfile(file_name):
with open(file_name, 'rb') as f:
return pickle.load(f)
else:
results = {'nb_comparison': [], 'i': -1, 'j': -1, 'completed': False}
if fcsp:
results['vk_dict_mem'] = []
return results


def save_results(file_name, results):
with open(file_name, 'wb') as f:
pickle.dump(results, f)


def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
# asizeof.asized(obj, detail=1).format()
# return asizeof.asizeof(obj)
key, val = next(iter(obj.items()))
# key = dict.iterkeys().next()
# key_mem = asizeof.asizeof(key)
dict_flat = sys.getsizeof(obj)
key_mem = 64

if isinstance(val, float):
val_mem = 24
mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
else: # value is True or False
mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

# print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
return mem
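# Worked example of the heuristic above (hypothetical numbers): for a vk_dict
# holding the 6 float-valued kernels between a 2-node and a 3-node graph,
#
#   vk_dict = {(i, j): 1.0 for i in range(2) for j in range(2, 5)}
#   mem = (64 + 24) * len(vk_dict) + sys.getsizeof(vk_dict) + 28 * (2 + 3)
#
# i.e. an assumed 88 bytes per (key, value) pair, plus the flat dict object
# itself, plus an assumed 28 bytes per node id referenced by the keys.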


def compute_stats(file_name, results):
del results['i']
del results['j']
results['nb_comparison'] = np.mean(results['nb_comparison'])
results['completed'] = True
if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
save_results(file_name, results)


class SPSpace(ShortestPath):

def __init__(self, **kwargs):
super().__init__(**kwargs)
self._file_name = kwargs.get('file_name')

# @profile
def _compute_gm_series(self):
self._all_graphs_have_edges(self._graphs)
# get shortest path graph of each graph.
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]


results = load_results(self._file_name, self._fcsp)

# compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels',
length=len_itr, file=sys.stdout,verbose=(self._verbose >= 2))

time0 = time.time()
for i, j in iterator:
if i > results['i'] or (i == results['i'] and j > results['j']):
data = self._sp_do_space(self._graphs[i], self._graphs[j])
if self._fcsp:
results['nb_comparison'].append(data[0])
if data[1] != {}:
results['vk_dict_mem'].append(estimate_vk_memory(data[1],
nx.number_of_nodes(self._graphs[i]),
nx.number_of_nodes(self._graphs[j])))
else:
results['nb_comparison'].append(data)
results['i'] = i
results['j'] = j

time1 = time.time()
if time1 - time0 > 600:
save_results(self._file_name, results)
time0 = time1

compute_stats(self._file_name, results)

return gram_matrix


def _sp_do_space(self, g1, g2):

if self._fcsp: # @todo: it may be put outside the _sp_do().
return self._sp_do_fcsp(g1, g2)
else:
return self._sp_do_naive(g1, g2)


def _sp_do_fcsp(self, g1, g2):

nb_comparison = 0

# compute kernels between all pairs of nodes first, following the FCSP method.
vk_dict = {} # node kernel dict
if len(self._node_labels) > 0: # @todo: it may be put outside the _sp_do().
# node symb and non-symb labeled
if len(self._node_attrs) > 0:
kn = self._node_kernels['mix']
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
n1_labels = [n1[1][nl] for nl in self._node_labels]
n2_labels = [n2[1][nl] for nl in self._node_labels]
n1_attrs = [n1[1][na] for na in self._node_attrs]
n2_attrs = [n2[1][na] for na in self._node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
nb_comparison += 1
# node symb labeled
else:
kn = self._node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in self._node_labels]
n2_labels = [n2[1][nl] for nl in self._node_labels]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
nb_comparison += 1
else:
# node non-symb labeled
if len(self._node_attrs) > 0:
kn = self._node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_attrs = [n1[1][na] for na in self._node_attrs]
n2_attrs = [n2[1][na] for na in self._node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
nb_comparison += 1
# node unlabeled
else:
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
pass
# if e1[2]['cost'] == e2[2]['cost']:
# kernel += 1
# nb_comparison += 1

return nb_comparison, vk_dict

# # compute graph kernels
# if self._ds_infos['directed']:
# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
# kn1 = nk11 * nk22
# kernel += kn1
# else:
# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# # each edge walk is counted twice, starting from both its extreme nodes.
# nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
# e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
# kn1 = nk11 * nk22
# kn2 = nk12 * nk21
# kernel += kn1 + kn2


def _sp_do_naive(self, g1, g2):

nb_comparison = 0

# Define the function to compute kernels between vertices in each condition.
if len(self._node_labels) > 0:
# node symb and non-symb labeled
if len(self._node_attrs) > 0:
def compute_vk(n1, n2):
kn = self._node_kernels['mix']
n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
# node symb labeled
else:
def compute_vk(n1, n2):
kn = self._node_kernels['symb']
n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
return kn(n1_labels, n2_labels)
else:
# node non-symb labeled
if len(self._node_attrs) > 0:
def compute_vk(n1, n2):
kn = self._node_kernels['nsymb']
n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
return kn(n1_attrs, n2_attrs)
# node unlabeled
else:
# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kernel += 1
return 0

# compute graph kernels
if self._ds_infos['directed']:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
# nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
# kn1 = nk11 * nk22
# kernel += kn1
nb_comparison += 2
else:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
# each edge walk is counted twice, starting from both its extreme nodes.
# nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
# e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
# kn1 = nk11 * nk22
# kn2 = nk12 * nk21
# kernel += kn1 + kn2
nb_comparison += 4

return nb_comparison
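# Rough comparison of what is counted above (hypothetical numbers):
# _sp_do_fcsp evaluates the node kernel once per node pair, i.e. |V1| * |V2|
# comparisons cached in vk_dict, while _sp_do_naive would evaluate it 2
# (directed) or 4 (undirected) times per pair of shortest-path edges with
# equal 'cost'. For two undirected graphs with 20 nodes and 50 shortest-path
# edges each, assuming every edge pair matches:
#
#   fcsp_cmp = 20 * 20        # 400 node-kernel calls
#   naive_cmp = 4 * 50 * 50   # 10000 node-kernel calls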

+ 439
- 0
gklearn/experiments/thesis/graph_kernels/fcsp/structural_sp.py View File

@@ -0,0 +1,439 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 11:59:57 2020

@author: ljia

@references:

[1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""
import sys
from itertools import product
from gklearn.utils import get_iters
import numpy as np
import time
import os, errno
import pickle
from pympler import asizeof
import networkx as nx
from gklearn.utils.utils import get_shortest_paths
from gklearn.kernels import StructuralSP


def load_splist(file_name):
if os.path.isfile(file_name):
with open(file_name, 'rb') as f:
return pickle.load(f)
else:
results_path = {'splist': [], 'i': -1, 'completed': False}
return results_path


def load_results(file_name, fcsp):
if os.path.isfile(file_name):
with open(file_name, 'rb') as f:
return pickle.load(f)
else:
results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False}
if fcsp:
results['vk_dict_mem'] = []
results['ek_dict_mem'] = []
return results


def save_results(file_name, results):
with open(file_name, 'wb') as f:
pickle.dump(results, f)


def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
# asizeof.asized(obj, detail=1).format()
# return asizeof.asizeof(obj)
key, val = next(iter(obj.items()))
# key = dict.iterkeys().next()
# key_mem = asizeof.asizeof(key)
dict_flat = sys.getsizeof(obj)
key_mem = 64

if isinstance(val, float):
val_mem = 24
mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
else: # value is True or False
mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

# print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
return mem


def estimate_ek_memory(obj, nb_nodes1, nb_nodes2):
# asizeof.asized(obj, detail=1).format()
# return asizeof.asizeof(obj)
key, val = next(iter(obj.items()))
# key = dict.iterkeys().next()
# key_mem = asizeof.asizeof(key)
dict_flat = sys.getsizeof(obj)
key_mem = 192

if isinstance(val, float):
val_mem = 24
mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
else: # value is True or False
mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

# print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
return mem


def compute_stats(file_name, results, splist):
del results['i']
del results['j']
results['nb_v_comparison'] = np.mean(results['nb_v_comparison'])
# if len(results['nb_e_comparison']) > 0:
results['nb_e_comparison'] = np.mean(results['nb_e_comparison'])
results['completed'] = True
if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0:
results['ek_dict_mem'] = np.mean(results['ek_dict_mem'])
results['nb_sp_ave'] = np.mean([len(ps) for ps in splist])
results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist])
results['sp_mem_all'] = asizeof.asizeof(splist)
save_results(file_name, results)


class SSPSpace(StructuralSP):

def __init__(self, **kwargs):
super().__init__(**kwargs)
self._file_name = kwargs.get('file_name')

# @profile
def _compute_gm_series(self):
# get shortest paths of each graph in the graphs.
fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl'
results_path = load_splist(fn_paths)

if not results_path['completed']:

iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
if self._compute_method == 'trie':
for g in iterator:
results_path['splist'].append(self._get_sps_as_trie(g))
else:
time0 = time.time()
for i, g in enumerate(iterator):
if i > results_path['i']:
results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))
results_path['i'] = i

time1 = time.time()
if time1 - time0 > 600:
save_results(fn_paths, results_path)
time0 = time1

del results_path['i']
results_path['completed'] = True
save_results(fn_paths, results_path)

#########
splist = results_path['splist']
results = load_results(self._file_name, self._fcsp)

# compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self._verbose >= 2))
if self._compute_method == 'trie':
for i, j in iterator:
kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
gram_matrix[i][j] = kernel
gram_matrix[j][i] = kernel
else:
time0 = time.time()
for i, j in iterator:
if i > results['i'] or (i == results['i'] and j > results['j']):
data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j])
results['nb_v_comparison'].append(data[0])
results['nb_e_comparison'].append(data[1])
if self._fcsp:
if data[2] != {}:
results['vk_dict_mem'].append(estimate_vk_memory(data[2],
nx.number_of_nodes(self._graphs[i]),
nx.number_of_nodes(self._graphs[j])))
if data[3] != {}:
results['ek_dict_mem'].append(estimate_ek_memory(data[3],
nx.number_of_nodes(self._graphs[i]),
nx.number_of_nodes(self._graphs[j])))
results['i'] = i
results['j'] = j

time1 = time.time()
if time1 - time0 > 600:
save_results(self._file_name, results)
time0 = time1

compute_stats(self._file_name, results, splist)
# @todo: the path file may not be removed if the program stops exactly here.
try:
os.remove(fn_paths)
except OSError as e:
if e.errno != errno.ENOENT:
raise

return gram_matrix


def _ssp_do_naive_space(self, g1, g2, spl1, spl2):
if self._fcsp: # @todo: it may be put outside the _sp_do().
return self._sp_do_naive_fcsp(g1, g2, spl1, spl2)
else:
return self._sp_do_naive_naive(g1, g2, spl1, spl2)


def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2):

# First, compute kernels between all pairs of nodes, following the FCSP method.
vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2)
# Then, compute kernels between all pairs of edges, extending the FCSP idea
# to edges. This suits sparse graphs, which is the most common case we
# encountered. For dense graphs, this would be slow.
ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2)

return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict
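# Illustrative cost sketch (hypothetical numbers): the FCSP-style
# precomputation above performs roughly |V1| * |V2| node-kernel evaluations
# (vk_dict) and |E1| * |E2| edge-kernel evaluations (ek_dict), after which
# every path comparison reduces to dictionary lookups; e.g. for
# |V1| = |V2| = 20 and |E1| = |E2| = 25 that is 400 + 625 kernel calls in
# total. The naive variant below instead re-evaluates the kernels inside every
# pair of equal-length shortest paths, so the same value may be recomputed
# many times when there are many or long paths.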


def _sp_do_naive_naive(self, g1, g2, spl1, spl2):

nb_v_comparison = 0
nb_e_comparison = 0

# Define the function to compute kernels between vertices in each condition.
if len(self._node_labels) > 0:
# node symb and non-symb labeled
if len(self._node_attrs) > 0:
def compute_vk(n1, n2):
kn = self._node_kernels['mix']
n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
# node symb labeled
else:
def compute_vk(n1, n2):
kn = self._node_kernels['symb']
n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
return kn(n1_labels, n2_labels)
else:
# node non-symb labeled
if len(self._node_attrs) > 0:
def compute_vk(n1, n2):
kn = self._node_kernels['nsymb']
n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
return kn(n1_attrs, n2_attrs)
# # node unlabeled
# else:
# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# if e1[2]['cost'] == e2[2]['cost']:
# kernel += 1
# return kernel

# Define the function to compute kernels between edges in each condition.
if len(self._edge_labels) > 0:
# edge symb and non-symb labeled
if len(self._edge_attrs) > 0:
def compute_ek(e1, e2):
ke = self._edge_kernels['mix']
e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
# edge symb labeled
else:
def compute_ek(e1, e2):
ke = self._edge_kernels['symb']
e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
return ke(e1_labels, e2_labels)
else:
# edge non-symb labeled
if len(self._edge_attrs) > 0:
def compute_ek(e1, e2):
ke = self._edge_kernels['nsymb']
e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
return ke(e1_attrs, e2_attrs)


# compute graph kernels
if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
# nb_v_comparison = len(p1)
# nb_e_comparison = len(p1) - 1
kpath = compute_vk(p1[0], p2[0])
nb_v_comparison += 1
if kpath:
for idx in range(1, len(p1)):
kpath *= compute_vk(p1[idx], p2[idx]) * \
compute_ek((p1[idx-1], p1[idx]),
(p2[idx-1], p2[idx]))
nb_v_comparison += 1
nb_e_comparison += 1
if not kpath:
break
# kernel += kpath # add up kernels of all paths
else:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
kpath = compute_vk(p1[0], p2[0])
nb_v_comparison += 1
if kpath:
for idx in range(1, len(p1)):
kpath *= compute_vk(p1[idx], p2[idx])
nb_v_comparison += 1
if not kpath:
break
# kernel += kpath # add up kernels of all paths
else:
if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
if len(p1) == 0:
pass
else:
kpath = 1
for idx in range(0, len(p1) - 1):
kpath *= compute_ek((p1[idx], p1[idx+1]),
(p2[idx], p2[idx+1]))
nb_e_comparison += 1
if not kpath:
break
else:
pass
# for p1, p2 in product(spl1, spl2):
# if len(p1) == len(p2):
# kernel += 1
# try:
# kernel = kernel / (len(spl1) * len(spl2)) # Compute mean average
# except ZeroDivisionError:
# print(spl1, spl2)
# print(g1.nodes(data=True))
# print(g1.edges(data=True))
# raise Exception

return nb_v_comparison, nb_e_comparison


def _get_all_node_kernels(self, g1, g2):
nb_comparison = 0

vk_dict = {} # node kernel dict
if len(self._node_labels) > 0:
# node symb and non-symb labeled
if len(self._node_attrs) > 0:
kn = self._node_kernels['mix']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in self._node_labels]
n2_labels = [n2[1][nl] for nl in self._node_labels]
n1_attrs = [n1[1][na] for na in self._node_attrs]
n2_attrs = [n2[1][na] for na in self._node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
nb_comparison += 1
# node symb labeled
else:
kn = self._node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in self._node_labels]
n2_labels = [n2[1][nl] for nl in self._node_labels]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
nb_comparison += 1
else:
# node non-symb labeled
if len(self._node_attrs) > 0:
kn = self._node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_attrs = [n1[1][na] for na in self._node_attrs]
n2_attrs = [n2[1][na] for na in self._node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
nb_comparison += 1
# node unlabeled
else:
pass # @todo: add edge weights.
# for e1 in g1.edges(data=True):
# for e2 in g2.edges(data=True):
# if e1[2]['cost'] == e2[2]['cost']:
# kernel += 1
# return kernel

return vk_dict, nb_comparison


def _get_all_edge_kernels(self, g1, g2):
nb_comparison = 0

# compute kernels between all pairs of edges, extending the FCSP idea to
# edges. This suits sparse graphs, which is the most common case we
# encountered. For dense graphs, this would be slow.
ek_dict = {} # dict of edge kernels
if len(self._edge_labels) > 0:
# edge symb and non-symb labeled
if len(self._edge_attrs) > 0:
ke = self._edge_kernels['mix']
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
e1_labels = [e1[2][el] for el in self._edge_labels]
e2_labels = [e2[2][el] for el in self._edge_labels]
e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
nb_comparison += 1
# edge symb labeled
else:
ke = self._edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
e1_labels = [e1[2][el] for el in self._edge_labels]
e2_labels = [e2[2][el] for el in self._edge_labels]
ek_temp = ke(e1_labels, e2_labels)
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
nb_comparison += 1
else:
# edge non-symb labeled
if len(self._edge_attrs) > 0:
ke = self._edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
ek_temp = ke(e1_attrs, e2_attrs)
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
nb_comparison += 1
# edge unlabeled
else:
pass

return ek_dict, nb_comparison
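# Illustrative note on the symmetric storage above: ek_dict keeps each
# compared edge pair under all four endpoint orderings, so later lookups
# succeed whichever direction a shortest path traverses an edge; e.g. for a
# hypothetical e1 = (0, 1) in g1 and e2 = (5, 7) in g2:
#
#   ek_dict[((0, 1), (5, 7))] == ek_dict[((1, 0), (5, 7))] \
#       == ek_dict[((0, 1), (7, 5))] == ek_dict[((1, 0), (7, 5))]
#
# at the cost of four dictionary entries per compared edge pair.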

+ 1
- 1
gklearn/ged/util/__init__.py View File

@@ -1,3 +1,3 @@
from gklearn.ged.util.lsape_solver import LSAPESolver from gklearn.ged.util.lsape_solver import LSAPESolver
from gklearn.ged.util.util import compute_geds, ged_options_to_string
from gklearn.ged.util.util import pairwise_ged, compute_geds, get_nb_edit_operations, ged_options_to_string
from gklearn.ged.util.util import compute_geds_cml, label_costs_to_matrix from gklearn.ged.util.util import compute_geds_cml, label_costs_to_matrix

+ 90
- 65
gklearn/ged/util/util.py View File

@@ -11,9 +11,10 @@ import multiprocessing
from multiprocessing import Pool from multiprocessing import Pool
from functools import partial from functools import partial
import sys import sys
from tqdm import tqdm
# from tqdm import tqdm
import networkx as nx import networkx as nx
from gklearn.ged.env import GEDEnv from gklearn.ged.env import GEDEnv
from gklearn.utils import get_iters




def compute_ged(g1, g2, options): def compute_ged(g1, g2, options):
@@ -23,7 +24,7 @@ def compute_ged(g1, g2, options):
ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants'])
ged_env.add_nx_graph(g1, '') ged_env.add_nx_graph(g1, '')
ged_env.add_nx_graph(g2, '') ged_env.add_nx_graph(g2, '')
listID = ged_env.get_all_graph_ids()
listID = ged_env.get_all_graph_ids()
ged_env.init(init_type=options['init_option']) ged_env.init(init_type=options['init_option'])
ged_env.set_method(options['method'], ged_options_to_string(options)) ged_env.set_method(options['method'], ged_options_to_string(options))
ged_env.init_method() ged_env.init_method()
@@ -33,9 +34,46 @@ def compute_ged(g1, g2, options):
ged_env.run_method(g, h) ged_env.run_method(g, h)
pi_forward = ged_env.get_forward_map(g, h) pi_forward = ged_env.get_forward_map(g, h)
pi_backward = ged_env.get_backward_map(g, h) pi_backward = ged_env.get_backward_map(g, h)
upper = ged_env.get_upper_bound(g, h)
upper = ged_env.get_upper_bound(g, h)
dis = upper dis = upper

# make the map label correct (label remove map as np.inf)
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
# print(pi_forward)

return dis, pi_forward, pi_backward


def pairwise_ged(g1, g2, options={}, sort=True, repeats=1, parallel=False, verbose=True):
from gklearn.gedlib import librariesImport, gedlibpy

ged_env = gedlibpy.GEDEnv()
ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants'])
ged_env.add_nx_graph(g1, '')
ged_env.add_nx_graph(g2, '')
listID = ged_env.get_all_graph_ids()
ged_env.init(init_option=(options['init_option'] if 'init_option' in options else 'EAGER_WITHOUT_SHUFFLED_COPIES'))
ged_env.set_method(options['method'], ged_options_to_string(options))
ged_env.init_method()

g = listID[0]
h = listID[1]
dis_min = np.inf
for i in range(0, repeats):
ged_env.run_method(g, h)
upper = ged_env.get_upper_bound(g, h)
dis = upper
if dis < dis_min:
dis_min = dis
pi_forward = ged_env.get_forward_map(g, h)
pi_backward = ged_env.get_backward_map(g, h)
# lower = ged_env.get_lower_bound(g, h)

# make the map label correct (label remove map as np.inf) # make the map label correct (label remove map as np.inf)
nodes1 = [n for n in g1.nodes()] nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()] nodes2 = [n for n in g2.nodes()]
@@ -56,7 +94,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
for g in graphs: for g in graphs:
ged_env.add_nx_graph(g, '') ged_env.add_nx_graph(g, '')
listID = ged_env.get_all_graph_ids() listID = ged_env.get_all_graph_ids()
node_labels = ged_env.get_all_node_labels() node_labels = ged_env.get_all_node_labels()
edge_labels = ged_env.get_all_edge_labels() edge_labels = ged_env.get_all_edge_labels()
node_label_costs = label_costs_to_matrix(options['node_label_costs'], len(node_labels)) if 'node_label_costs' in options else None node_label_costs = label_costs_to_matrix(options['node_label_costs'], len(node_labels)) if 'node_label_costs' in options else None
@@ -73,7 +111,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
if node_label_costs is None and edge_label_costs is None: if node_label_costs is None and edge_label_costs is None:
neo_options = {'edit_cost': options['edit_cost'], neo_options = {'edit_cost': options['edit_cost'],
'is_cml': False, 'is_cml': False,
'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'],
'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'],
'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']} 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']}
else: else:
neo_options = {'edit_cost': options['edit_cost'], neo_options = {'edit_cost': options['edit_cost'],
@@ -98,11 +136,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
G_listID = listID_toshare G_listID = listID_toshare
do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort) do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID)) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID))
if verbose:
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
else:
iterator = pool.imap_unordered(do_partial, itr, chunksize)
iterator = get_iters(pool.imap_unordered(do_partial, itr, chunksize), desc='computing GEDs', file=sys.stdout, length=len_itr, verbose=verbose)
# iterator = pool.imap_unordered(do_partial, itr, chunksize) # iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, j, dis, n_eo_tmp in iterator: for i, j, dis, n_eo_tmp in iterator:
idx_itr = int(len(graphs) * i + j - (i + 1) * (i + 2) / 2) idx_itr = int(len(graphs) * i + j - (i + 1) * (i + 2) / 2)
@@ -114,14 +148,11 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
# print(i, j, idx_itr, dis) # print(i, j, idx_itr, dis)
pool.close() pool.close()
pool.join() pool.join()
else: else:
ged_vec = [] ged_vec = []
n_edit_operations = [] n_edit_operations = []
if verbose:
iterator = tqdm(range(len(graphs)), desc='computing GEDs', file=sys.stdout)
else:
iterator = range(len(graphs))
iterator = get_iters(range(len(graphs)), desc='computing GEDs', file=sys.stdout, length=len(graphs), verbose=verbose)
for i in iterator: for i in iterator:
# for i in range(len(graphs)): # for i in range(len(graphs)):
for j in range(i + 1, len(graphs)): for j in range(i + 1, len(graphs)):
@@ -138,7 +169,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
return ged_vec, ged_mat, n_edit_operations return ged_vec, ged_mat, n_edit_operations




def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbose=True):
def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, n_jobs=None, verbose=True):
from gklearn.gedlib import librariesImport, gedlibpy from gklearn.gedlib import librariesImport, gedlibpy


# initialize ged env. # initialize ged env.
@@ -146,7 +177,7 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo
ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants'])
for g in graphs: for g in graphs:
ged_env.add_nx_graph(g, '') ged_env.add_nx_graph(g, '')
listID = ged_env.get_all_graph_ids()
listID = ged_env.get_all_graph_ids()
ged_env.init() ged_env.init()
if parallel: if parallel:
options['threads'] = 1 options['threads'] = 1
@@ -155,7 +186,7 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo


# compute ged. # compute ged.
neo_options = {'edit_cost': options['edit_cost'], neo_options = {'edit_cost': options['edit_cost'],
'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'],
'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'],
'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']} 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']}
ged_mat = np.zeros((len(graphs), len(graphs))) ged_mat = np.zeros((len(graphs), len(graphs)))
if parallel: if parallel:
@@ -163,7 +194,8 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo
ged_vec = [0 for i in range(len_itr)] ged_vec = [0 for i in range(len_itr)]
n_edit_operations = [0 for i in range(len_itr)] n_edit_operations = [0 for i in range(len_itr)]
itr = combinations(range(0, len(graphs)), 2) itr = combinations(range(0, len(graphs)), 2)
n_jobs = multiprocessing.cpu_count()
if n_jobs is None:
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs: if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1 chunksize = int(len_itr / n_jobs) + 1
else: else:
@@ -175,11 +207,7 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo
G_listID = listID_toshare G_listID = listID_toshare
do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort, repeats) do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort, repeats)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID)) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID))
if verbose:
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
else:
iterator = pool.imap_unordered(do_partial, itr, chunksize)
iterator = get_iters(pool.imap_unordered(do_partial, itr, chunksize), desc='computing GEDs', file=sys.stdout, length=len_itr, verbose=verbose)
# iterator = pool.imap_unordered(do_partial, itr, chunksize) # iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, j, dis, n_eo_tmp in iterator: for i, j, dis, n_eo_tmp in iterator:
idx_itr = int(len(graphs) * i + j - (i + 1) * (i + 2) / 2) idx_itr = int(len(graphs) * i + j - (i + 1) * (i + 2) / 2)
@@ -191,14 +219,11 @@ def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, verbo
# print(i, j, idx_itr, dis) # print(i, j, idx_itr, dis)
pool.close() pool.close()
pool.join() pool.join()
else: else:
ged_vec = [] ged_vec = []
n_edit_operations = [] n_edit_operations = []
if verbose:
iterator = tqdm(range(len(graphs)), desc='computing GEDs', file=sys.stdout)
else:
iterator = range(len(graphs))
iterator = get_iters(range(len(graphs)), desc='computing GEDs', file=sys.stdout, length=len(graphs), verbose=verbose)
for i in iterator: for i in iterator:
# for i in range(len(graphs)): # for i in range(len(graphs)):
for j in range(i + 1, len(graphs)): for j in range(i + 1, len(graphs)):
@@ -232,14 +257,14 @@ def _compute_ged_parallel(env, gid1, gid2, g1, g2, options, sort, repeats):




def _compute_ged(env, gid1, gid2, g1, g2, repeats): def _compute_ged(env, gid1, gid2, g1, g2, repeats):
dis_min = np.inf
dis_min = np.inf # @todo: maybe compare distance and then do others (faster).
for i in range(0, repeats): for i in range(0, repeats):
env.run_method(gid1, gid2) env.run_method(gid1, gid2)
pi_forward = env.get_forward_map(gid1, gid2) pi_forward = env.get_forward_map(gid1, gid2)
pi_backward = env.get_backward_map(gid1, gid2) pi_backward = env.get_backward_map(gid1, gid2)
upper = env.get_upper_bound(gid1, gid2)
upper = env.get_upper_bound(gid1, gid2)
dis = upper dis = upper
# make the map label correct (label remove map as np.inf) # make the map label correct (label remove map as np.inf)
nodes1 = [n for n in g1.nodes()] nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()] nodes2 = [n for n in g2.nodes()]
@@ -247,7 +272,7 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats):
nb2 = nx.number_of_nodes(g2) nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
if dis < dis_min: if dis < dis_min:
dis_min = dis dis_min = dis
pi_forward_min = pi_forward pi_forward_min = pi_forward
@@ -268,7 +293,7 @@ def label_costs_to_matrix(costs, nb_labels):


Returns Returns
------- -------
cost_matrix : numpy.array.
cost_matrix : numpy.array.
The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData. The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
""" """
# Initialize label cost matrix. # Initialize label cost matrix.
@@ -282,13 +307,13 @@ def label_costs_to_matrix(costs, nb_labels):
for row in range(1, nb_labels + 1): for row in range(1, nb_labels + 1):
cost_matrix[row, 0] = costs[i] cost_matrix[row, 0] = costs[i]
i += 1 i += 1
# Costs of substitutions.
# Costs of substitutions.
for row in range(1, nb_labels + 1): for row in range(1, nb_labels + 1):
for col in range(row + 1, nb_labels + 1): for col in range(row + 1, nb_labels + 1):
cost_matrix[row, col] = costs[i] cost_matrix[row, col] = costs[i]
cost_matrix[col, row] = costs[i] cost_matrix[col, row] = costs[i]
i += 1 i += 1
return cost_matrix return cost_matrix




@@ -299,7 +324,7 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is
edge_labels = kwargs.get('edge_labels', []) edge_labels = kwargs.get('edge_labels', [])
return get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, return get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
node_labels=node_labels, edge_labels=edge_labels) node_labels=node_labels, edge_labels=edge_labels)
else:
else:
raise Exception('Edit cost "', edit_cost, '" is not supported.') raise Exception('Edit cost "', edit_cost, '" is not supported.')
else: else:
if edit_cost == 'LETTER' or edit_cost == 'LETTER2': if edit_cost == 'LETTER' or edit_cost == 'LETTER2':
@@ -307,21 +332,21 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is
elif edit_cost == 'NON_SYMBOLIC': elif edit_cost == 'NON_SYMBOLIC':
node_attrs = kwargs.get('node_attrs', []) node_attrs = kwargs.get('node_attrs', [])
edge_attrs = kwargs.get('edge_attrs', []) edge_attrs = kwargs.get('edge_attrs', [])
return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
node_attrs=node_attrs, edge_attrs=edge_attrs) node_attrs=node_attrs, edge_attrs=edge_attrs)
elif edit_cost == 'CONSTANT': elif edit_cost == 'CONSTANT':
node_labels = kwargs.get('node_labels', []) node_labels = kwargs.get('node_labels', [])
edge_labels = kwargs.get('edge_labels', []) edge_labels = kwargs.get('edge_labels', [])
return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
node_labels=node_labels, edge_labels=edge_labels) node_labels=node_labels, edge_labels=edge_labels)
else:
return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map)
def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
else:
return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map)
def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
node_labels=[], edge_labels=[]): node_labels=[], edge_labels=[]):
"""Compute times that edit operations are used in an edit path for symbolic-labeled graphs, where the costs are different for each pair of nodes. """Compute times that edit operations are used in an edit path for symbolic-labeled graphs, where the costs are different for each pair of nodes.
Returns Returns
------- -------
list list
@@ -330,7 +355,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
# Initialize. # Initialize.
nb_ops_node = np.zeros((1 + len(node_labels), 1 + len(node_labels))) nb_ops_node = np.zeros((1 + len(node_labels), 1 + len(node_labels)))
nb_ops_edge = np.zeros((1 + len(edge_labels), 1 + len(edge_labels))) nb_ops_edge = np.zeros((1 + len(edge_labels), 1 + len(edge_labels)))
# For nodes. # For nodes.
nodes1 = [n for n in g1.nodes()] nodes1 = [n for n in g1.nodes()]
for i, map_i in enumerate(forward_map): for i, map_i in enumerate(forward_map):
@@ -350,7 +375,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
label = tuple(g2.nodes[nodes2[i]].items()) label = tuple(g2.nodes[nodes2[i]].items())
idx_label = node_labels.index(label) # @todo: faster idx_label = node_labels.index(label) # @todo: faster
nb_ops_node[0, idx_label + 1] += 1 nb_ops_node[0, idx_label + 1] += 1
# For edges. # For edges.
edges1 = [e for e in g1.edges()] edges1 = [e for e in g1.edges()]
edges2_marked = [] edges2_marked = []
@@ -371,7 +396,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
label2 = tuple(g2.edges[(nf2, nt2)].items()) label2 = tuple(g2.edges[(nf2, nt2)].items())
if label1 != label2: if label1 != label2:
idx_label2 = edge_labels.index(label2) # @todo: faster idx_label2 = edge_labels.index(label2) # @todo: faster
nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1
nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1
# Switch nf2 and nt2, for directed graphs. # Switch nf2 and nt2, for directed graphs.
elif (nt2, nf2) in g2.edges(): elif (nt2, nf2) in g2.edges():
edges2_marked.append((nt2, nf2)) edges2_marked.append((nt2, nf2))
@@ -389,7 +414,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
label = tuple(g2.edges[(nt, nf)].items()) label = tuple(g2.edges[(nt, nf)].items())
idx_label = edge_labels.index(label) # @todo: faster idx_label = edge_labels.index(label) # @todo: faster
nb_ops_edge[0, idx_label + 1] += 1 nb_ops_edge[0, idx_label + 1] += 1
# Reform the numbers of edit operations into a vector. # Reform the numbers of edit operations into a vector.
nb_eo_vector = [] nb_eo_vector = []
# node insertion. # node insertion.
@@ -412,9 +437,9 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
for i in range(1, len(nb_ops_edge)): for i in range(1, len(nb_ops_edge)):
for j in range(i + 1, len(nb_ops_edge)): for j in range(i + 1, len(nb_ops_edge)):
nb_eo_vector.append(nb_ops_edge[i, j]) nb_eo_vector.append(nb_ops_edge[i, j])
return nb_eo_vector return nb_eo_vector


def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
node_labels=[], edge_labels=[]): node_labels=[], edge_labels=[]):
@@ -426,7 +451,7 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
n_ei = 0 n_ei = 0
n_er = 0 n_er = 0
n_es = 0 n_es = 0
nodes1 = [n for n in g1.nodes()] nodes1 = [n for n in g1.nodes()]
for i, map_i in enumerate(forward_map): for i, map_i in enumerate(forward_map):
if map_i == np.inf: if map_i == np.inf:
@@ -441,9 +466,9 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
for map_i in backward_map: for map_i in backward_map:
if map_i == np.inf: if map_i == np.inf:
n_vi += 1 n_vi += 1
# idx_nodes1 = range(0, len(node1)) # idx_nodes1 = range(0, len(node1))
edges1 = [e for e in g1.edges()] edges1 = [e for e in g1.edges()]
nb_edges2_cnted = 0 nb_edges2_cnted = 0
for n1, n2 in edges1: for n1, n2 in edges1:
@@ -475,7 +500,7 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
else: else:
n_er += 1 n_er += 1
n_ei = nx.number_of_edges(g2) - nb_edges2_cnted n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
return n_vi, n_vr, n_vs, n_ei, n_er, n_es return n_vi, n_vr, n_vs, n_ei, n_er, n_es




@@ -488,7 +513,7 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map):
sod_vs = 0 sod_vs = 0
n_ei = 0 n_ei = 0
n_er = 0 n_er = 0
nodes1 = [n for n in g1.nodes()] nodes1 = [n for n in g1.nodes()]
for i, map_i in enumerate(forward_map): for i, map_i in enumerate(forward_map):
if map_i == np.inf: if map_i == np.inf:
@@ -501,9 +526,9 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map):
for map_i in backward_map: for map_i in backward_map:
if map_i == np.inf: if map_i == np.inf:
n_vi += 1 n_vi += 1
# idx_nodes1 = range(0, len(node1)) # idx_nodes1 = range(0, len(node1))
edges1 = [e for e in g1.edges()] edges1 = [e for e in g1.edges()]
nb_edges2_cnted = 0 nb_edges2_cnted = 0
for n1, n2 in edges1: for n1, n2 in edges1:
@@ -520,7 +545,7 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map):
else: else:
n_er += 1 n_er += 1
n_ei = nx.number_of_edges(g2) - nb_edges2_cnted n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er




@@ -536,7 +561,7 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
n_er = 0 n_er = 0
n_es = 0 n_es = 0
sod_es = 0 sod_es = 0
nodes1 = [n for n in g1.nodes()] nodes1 = [n for n in g1.nodes()]
for i, map_i in enumerate(forward_map): for i, map_i in enumerate(forward_map):
if map_i == np.inf: if map_i == np.inf:
@@ -551,9 +576,9 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
for map_i in backward_map: for map_i in backward_map:
if map_i == np.inf: if map_i == np.inf:
n_vi += 1 n_vi += 1
# idx_nodes1 = range(0, len(node1)) # idx_nodes1 = range(0, len(node1))
edges1 = [e for e in g1.edges()] edges1 = [e for e in g1.edges()]
for n1, n2 in edges1: for n1, n2 in edges1:
idx1 = nodes1.index(n1) idx1 = nodes1.index(n1)
@@ -582,7 +607,7 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
else: else:
n_er += 1 n_er += 1
n_ei = nx.number_of_edges(g2) - n_es n_ei = nx.number_of_edges(g2) - n_es
return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es




@@ -615,7 +640,7 @@ def ged_options_to_string(options):
opt_str += '--log ' + str(val) + ' ' opt_str += '--log ' + str(val) + ' '
elif key == 'randomness': elif key == 'randomness':
opt_str += '--randomness ' + str(val) + ' ' opt_str += '--randomness ' + str(val) + ' '
# if not isinstance(val, list): # if not isinstance(val, list):
# opt_str += '--' + key.replace('_', '-') + ' ' # opt_str += '--' + key.replace('_', '-') + ' '
# if val == False: # if val == False:


+ 1
- 1
gklearn/kernels/graph_kernel.py View File

@@ -37,7 +37,7 @@ class GraphKernel(object):
elif len(graphs[0]) == 0: elif len(graphs[0]) == 0:
raise Exception('The graph list given is empty. No computation was performed.') raise Exception('The graph list given is empty. No computation was performed.')
else: else:
self._graphs = [g.copy() for g in graphs[0]]
self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow.
self._gram_matrix = self._compute_gram_matrix() self._gram_matrix = self._compute_gram_matrix()
self._gram_matrix_unnorm = np.copy(self._gram_matrix) self._gram_matrix_unnorm = np.copy(self._gram_matrix)
if self._normalize: if self._normalize:


+ 15
- 27
gklearn/kernels/structural_sp.py View File

@@ -14,7 +14,7 @@ import sys
from itertools import product from itertools import product
# from functools import partial # from functools import partial
from multiprocessing import Pool from multiprocessing import Pool
from tqdm import tqdm
from gklearn.utils import get_iters
# import networkx as nx # import networkx as nx
import numpy as np import numpy as np
from gklearn.utils.parallel import parallel_gm, parallel_me from gklearn.utils.parallel import parallel_gm, parallel_me
@@ -41,10 +41,7 @@ class StructuralSP(GraphKernel):
def _compute_gm_series(self): def _compute_gm_series(self):
# get shortest paths of each graph in the graphs. # get shortest paths of each graph in the graphs.
splist = [] splist = []
if self._verbose >= 2:
iterator = tqdm(self._graphs, desc='getting sp graphs', file=sys.stdout)
else:
iterator = self._graphs
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
if self._compute_method == 'trie': if self._compute_method == 'trie':
for g in iterator: for g in iterator:
splist.append(self._get_sps_as_trie(g)) splist.append(self._get_sps_as_trie(g))
@@ -57,10 +54,9 @@ class StructuralSP(GraphKernel):


from itertools import combinations_with_replacement from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
if self._verbose >= 2:
iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
else:
iterator = itr
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self._verbose >= 2))
if self._compute_method == 'trie': if self._compute_method == 'trie':
for i, j in iterator: for i, j in iterator:
kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
@@ -91,11 +87,9 @@ class StructuralSP(GraphKernel):
get_sps_fun = self._wrapper_get_sps_trie get_sps_fun = self._wrapper_get_sps_trie
else: else:
get_sps_fun = self._wrapper_get_sps_naive get_sps_fun = self._wrapper_get_sps_naive
if self.verbose >= 2:
iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize),
desc='getting shortest paths', file=sys.stdout)
else:
iterator = pool.imap_unordered(get_sps_fun, itr, chunksize)
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize),
desc='getting shortest paths', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
for i, sp in iterator: for i, sp in iterator:
splist[i] = sp splist[i] = sp
pool.close() pool.close()
@@ -122,10 +116,8 @@ class StructuralSP(GraphKernel):
# get shortest paths of g1 and each graph in g_list. # get shortest paths of g1 and each graph in g_list.
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed'])
splist = [] splist = []
if self._verbose >= 2:
iterator = tqdm(g_list, desc='getting sp graphs', file=sys.stdout)
else:
iterator = g_list
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout,
verbose=(self._verbose >= 2))
if self._compute_method == 'trie': if self._compute_method == 'trie':
for g in iterator: for g in iterator:
splist.append(self._get_sps_as_trie(g)) splist.append(self._get_sps_as_trie(g))
@@ -135,10 +127,8 @@ class StructuralSP(GraphKernel):


# compute kernel list. # compute kernel list.
kernel_list = [None] * len(g_list) kernel_list = [None] * len(g_list)
if self._verbose >= 2:
iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
else:
iterator = range(len(g_list))
iterator = get_iters(range(len(g_list)), desc='Computing kernels',
file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
if self._compute_method == 'trie': if self._compute_method == 'trie':
for i in iterator: for i in iterator:
kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i]) kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i])
@@ -166,11 +156,9 @@ class StructuralSP(GraphKernel):
get_sps_fun = self._wrapper_get_sps_trie get_sps_fun = self._wrapper_get_sps_trie
else: else:
get_sps_fun = self._wrapper_get_sps_naive get_sps_fun = self._wrapper_get_sps_naive
if self.verbose >= 2:
iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize),
desc='getting shortest paths', file=sys.stdout)
else:
iterator = pool.imap_unordered(get_sps_fun, itr, chunksize)
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize),
desc='getting shortest paths', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
for i, sp in iterator: for i, sp in iterator:
splist[i] = sp splist[i] = sp
pool.close() pool.close()


+ 170
- 170
gklearn/utils/dataset.py View File

@@ -12,13 +12,13 @@ import os




class Dataset(object): class Dataset(object):
import warnings
warnings.simplefilter('always', DeprecationWarning)
warnings.warn('This class has been moved to "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning)


def __init__(self, filename=None, filename_targets=None, **kwargs): def __init__(self, filename=None, filename_targets=None, **kwargs):
import warnings
warnings.simplefilter('always', DeprecationWarning)
warnings.warn('This class has been moved to "gklearn.dataset" module. The class "gklearn.utils.dataset.Dataset" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning)

if filename is None: if filename is None:
self._graphs = None self._graphs = None
self._targets = None self._targets = None
@@ -28,7 +28,7 @@ class Dataset(object):
self._edge_attrs = None self._edge_attrs = None
else: else:
self.load_dataset(filename, filename_targets=filename_targets, **kwargs) self.load_dataset(filename, filename_targets=filename_targets, **kwargs)
self._substructures = None self._substructures = None
self._node_label_dim = None self._node_label_dim = None
self._edge_label_dim = None self._edge_label_dim = None
@@ -53,8 +53,8 @@ class Dataset(object):
self._node_attr_dim = None self._node_attr_dim = None
self._edge_attr_dim = None self._edge_attr_dim = None
self._class_number = None self._class_number = None
def load_dataset(self, filename, filename_targets=None, **kwargs): def load_dataset(self, filename, filename_targets=None, **kwargs):
self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs) self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs)
self._node_labels = label_names['node_labels'] self._node_labels = label_names['node_labels']
@@ -62,15 +62,15 @@ class Dataset(object):
self._edge_labels = label_names['edge_labels'] self._edge_labels = label_names['edge_labels']
self._edge_attrs = label_names['edge_attrs'] self._edge_attrs = label_names['edge_attrs']
self.clean_labels() self.clean_labels()
def load_graphs(self, graphs, targets=None): def load_graphs(self, graphs, targets=None):
# this has to be followed by set_labels(). # this has to be followed by set_labels().
self._graphs = graphs self._graphs = graphs
self._targets = targets self._targets = targets
# self.set_labels_attrs() # @todo # self.set_labels_attrs() # @todo
def load_predefined_dataset(self, ds_name): def load_predefined_dataset(self, ds_name):
current_path = os.path.dirname(os.path.realpath(__file__)) + '/' current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
if ds_name == 'Acyclic': if ds_name == 'Acyclic':
@@ -130,7 +130,7 @@ class Dataset(object):
			self._graphs, self._targets, label_names = load_dataset(ds_file)
		elif ds_name == 'NCI109':
			ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt'
			self._graphs, self._targets, label_names = load_dataset(ds_file)
		elif ds_name == 'PAH':
			ds_file = current_path + '../../datasets/PAH/dataset.ds'
			self._graphs, self._targets, label_names = load_dataset(ds_file)
@@ -143,13 +143,13 @@ class Dataset(object):
			pass
		else:
			raise Exception('The dataset name "', ds_name, '" is not pre-defined.')
		self._node_labels = label_names['node_labels']
		self._node_attrs = label_names['node_attrs']
		self._edge_labels = label_names['edge_labels']
		self._edge_attrs = label_names['edge_attrs']
		self.clean_labels()


	def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
		self._node_labels = node_labels
@@ -157,7 +157,7 @@ class Dataset(object):
		self._edge_labels = edge_labels
		self._edge_attrs = edge_attrs


	def set_labels_attrs(self, node_labels=None, node_attrs=None, edge_labels=None, edge_attrs=None):
		# @todo: remove labels which have only one possible value.
		if node_labels is None:
@@ -183,86 +183,86 @@ class Dataset(object):
	# if 'attributes' in e[2]:
	# return len(e[2]['attributes'])
	# return 0
	def get_dataset_infos(self, keys=None, params=None):
		"""Computes and returns the structure and property information of the graph dataset.

		Parameters
		----------
		keys : list, optional
			A list of strings which indicate which information will be returned. The
			possible choices include:
			'substructures': sub-structures graphs contain, including 'linear', 'non
				linear' and 'cyclic'.
			'node_label_dim': whether vertices have symbolic labels.
			'edge_label_dim': whether edges have symbolic labels.
			'directed': whether graphs in dataset are directed.
			'dataset_size': number of graphs in dataset.
			'total_node_num': total number of vertices of all graphs in dataset.
			'ave_node_num': average number of vertices of graphs in dataset.
			'min_node_num': minimum number of vertices of graphs in dataset.
			'max_node_num': maximum number of vertices of graphs in dataset.
			'total_edge_num': total number of edges of all graphs in dataset.
			'ave_edge_num': average number of edges of graphs in dataset.
			'min_edge_num': minimum number of edges of graphs in dataset.
			'max_edge_num': maximum number of edges of graphs in dataset.
			'ave_node_degree': average vertex degree of graphs in dataset.
			'min_node_degree': minimum vertex degree of graphs in dataset.
			'max_node_degree': maximum vertex degree of graphs in dataset.
			'ave_fill_factor': average fill factor (number_of_edges /
				(number_of_nodes ** 2)) of graphs in dataset.
			'min_fill_factor': minimum fill factor of graphs in dataset.
			'max_fill_factor': maximum fill factor of graphs in dataset.
			'node_label_nums': list of numbers of symbolic vertex labels of graphs in dataset.
			'edge_label_nums': list of numbers of symbolic edge labels of graphs in dataset.
			'node_attr_dim': number of dimensions of non-symbolic vertex labels.
				Extracted from the 'attributes' attribute of graph nodes.
			'edge_attr_dim': number of dimensions of non-symbolic edge labels.
				Extracted from the 'attributes' attribute of graph edges.
			'class_number': number of classes. Only available for classification problems.
			'all_degree_entropy': the entropy of degree distribution of each graph.
			'ave_degree_entropy': the average entropy of degree distribution of all graphs.
			All information above will be returned if `keys` is not given.

		params: dict of dict, optional
			A dictionary which contains extra parameters for each possible
			element in ``keys``.

		Return
		------
		dict
			Information of the graph dataset keyed by `keys`.
		"""
		infos = {}

		if keys == None:
			keys = [
				'substructures',
@@ -292,13 +292,13 @@ class Dataset(object):
				'all_degree_entropy',
				'ave_degree_entropy'
			]
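The keys listed in the docstring correspond to lazily computed, cached members checked below. A usage sketch (paths are assumed to be available as in the predefined datasets above; 'PAH' is one of the names handled by `load_predefined_dataset`):

```
from gklearn.utils.dataset import Dataset

ds = Dataset()
ds.load_predefined_dataset('PAH')
# request only what is needed; each key is computed on demand and cached
infos = ds.get_dataset_infos(
	keys=['dataset_size', 'ave_node_num', 'ave_degree_entropy'],
	params={'ave_degree_entropy': {'base': 2}})  # extra parameters per key
ds.print_graph_infos(infos)
```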
# dataset size # dataset size
if 'dataset_size' in keys: if 'dataset_size' in keys:
if self._dataset_size is None: if self._dataset_size is None:
self._dataset_size = self._get_dataset_size() self._dataset_size = self._get_dataset_size()
infos['dataset_size'] = self._dataset_size infos['dataset_size'] = self._dataset_size
# graph node number # graph node number
if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']): if any(i in keys for i in ['total_node_num', 'ave_node_num', 'min_node_num', 'max_node_num']):
all_node_nums = self._get_all_node_nums() all_node_nums = self._get_all_node_nums()
@@ -307,22 +307,22 @@ class Dataset(object):
if self._total_node_num is None: if self._total_node_num is None:
self._total_node_num = self._get_total_node_num(all_node_nums) self._total_node_num = self._get_total_node_num(all_node_nums)
infos['total_node_num'] = self._total_node_num infos['total_node_num'] = self._total_node_num
if 'ave_node_num' in keys: if 'ave_node_num' in keys:
if self._ave_node_num is None: if self._ave_node_num is None:
self._ave_node_num = self._get_ave_node_num(all_node_nums) self._ave_node_num = self._get_ave_node_num(all_node_nums)
infos['ave_node_num'] = self._ave_node_num infos['ave_node_num'] = self._ave_node_num
if 'min_node_num' in keys: if 'min_node_num' in keys:
if self._min_node_num is None: if self._min_node_num is None:
self._min_node_num = self._get_min_node_num(all_node_nums) self._min_node_num = self._get_min_node_num(all_node_nums)
infos['min_node_num'] = self._min_node_num infos['min_node_num'] = self._min_node_num
if 'max_node_num' in keys: if 'max_node_num' in keys:
if self._max_node_num is None: if self._max_node_num is None:
self._max_node_num = self._get_max_node_num(all_node_nums) self._max_node_num = self._get_max_node_num(all_node_nums)
infos['max_node_num'] = self._max_node_num infos['max_node_num'] = self._max_node_num
# graph edge number # graph edge number
if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']): if any(i in keys for i in ['total_edge_num', 'ave_edge_num', 'min_edge_num', 'max_edge_num']):
all_edge_nums = self._get_all_edge_nums() all_edge_nums = self._get_all_edge_nums()
@@ -331,12 +331,12 @@ class Dataset(object):
if self._total_edge_num is None: if self._total_edge_num is None:
self._total_edge_num = self._get_total_edge_num(all_edge_nums) self._total_edge_num = self._get_total_edge_num(all_edge_nums)
infos['total_edge_num'] = self._total_edge_num infos['total_edge_num'] = self._total_edge_num
if 'ave_edge_num' in keys: if 'ave_edge_num' in keys:
if self._ave_edge_num is None: if self._ave_edge_num is None:
self._ave_edge_num = self._get_ave_edge_num(all_edge_nums) self._ave_edge_num = self._get_ave_edge_num(all_edge_nums)
infos['ave_edge_num'] = self._ave_edge_num infos['ave_edge_num'] = self._ave_edge_num
if 'max_edge_num' in keys: if 'max_edge_num' in keys:
if self._max_edge_num is None: if self._max_edge_num is None:
self._max_edge_num = self._get_max_edge_num(all_edge_nums) self._max_edge_num = self._get_max_edge_num(all_edge_nums)
@@ -346,120 +346,120 @@ class Dataset(object):
if self._min_edge_num is None: if self._min_edge_num is None:
self._min_edge_num = self._get_min_edge_num(all_edge_nums) self._min_edge_num = self._get_min_edge_num(all_edge_nums)
infos['min_edge_num'] = self._min_edge_num infos['min_edge_num'] = self._min_edge_num
# label number # label number
if 'node_label_dim' in keys: if 'node_label_dim' in keys:
if self._node_label_dim is None: if self._node_label_dim is None:
self._node_label_dim = self._get_node_label_dim() self._node_label_dim = self._get_node_label_dim()
infos['node_label_dim'] = self._node_label_dim
infos['node_label_dim'] = self._node_label_dim
if 'node_label_nums' in keys: if 'node_label_nums' in keys:
if self._node_label_nums is None: if self._node_label_nums is None:
self._node_label_nums = {} self._node_label_nums = {}
for node_label in self._node_labels: for node_label in self._node_labels:
self._node_label_nums[node_label] = self._get_node_label_num(node_label) self._node_label_nums[node_label] = self._get_node_label_num(node_label)
infos['node_label_nums'] = self._node_label_nums infos['node_label_nums'] = self._node_label_nums
if 'edge_label_dim' in keys: if 'edge_label_dim' in keys:
if self._edge_label_dim is None: if self._edge_label_dim is None:
self._edge_label_dim = self._get_edge_label_dim() self._edge_label_dim = self._get_edge_label_dim()
infos['edge_label_dim'] = self._edge_label_dim
infos['edge_label_dim'] = self._edge_label_dim
if 'edge_label_nums' in keys: if 'edge_label_nums' in keys:
if self._edge_label_nums is None: if self._edge_label_nums is None:
self._edge_label_nums = {} self._edge_label_nums = {}
for edge_label in self._edge_labels: for edge_label in self._edge_labels:
self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label) self._edge_label_nums[edge_label] = self._get_edge_label_num(edge_label)
infos['edge_label_nums'] = self._edge_label_nums infos['edge_label_nums'] = self._edge_label_nums
if 'directed' in keys or 'substructures' in keys: if 'directed' in keys or 'substructures' in keys:
if self._directed is None: if self._directed is None:
self._directed = self._is_directed() self._directed = self._is_directed()
infos['directed'] = self._directed infos['directed'] = self._directed
# node degree # node degree
if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']): if any(i in keys for i in ['ave_node_degree', 'max_node_degree', 'min_node_degree']):
all_node_degrees = self._get_all_node_degrees() all_node_degrees = self._get_all_node_degrees()
if 'ave_node_degree' in keys: if 'ave_node_degree' in keys:
if self._ave_node_degree is None: if self._ave_node_degree is None:
self._ave_node_degree = self._get_ave_node_degree(all_node_degrees) self._ave_node_degree = self._get_ave_node_degree(all_node_degrees)
infos['ave_node_degree'] = self._ave_node_degree infos['ave_node_degree'] = self._ave_node_degree
if 'max_node_degree' in keys: if 'max_node_degree' in keys:
if self._max_node_degree is None: if self._max_node_degree is None:
self._max_node_degree = self._get_max_node_degree(all_node_degrees) self._max_node_degree = self._get_max_node_degree(all_node_degrees)
infos['max_node_degree'] = self._max_node_degree infos['max_node_degree'] = self._max_node_degree
if 'min_node_degree' in keys: if 'min_node_degree' in keys:
if self._min_node_degree is None: if self._min_node_degree is None:
self._min_node_degree = self._get_min_node_degree(all_node_degrees) self._min_node_degree = self._get_min_node_degree(all_node_degrees)
infos['min_node_degree'] = self._min_node_degree infos['min_node_degree'] = self._min_node_degree
# fill factor # fill factor
if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']): if any(i in keys for i in ['ave_fill_factor', 'max_fill_factor', 'min_fill_factor']):
all_fill_factors = self._get_all_fill_factors() all_fill_factors = self._get_all_fill_factors()
if 'ave_fill_factor' in keys: if 'ave_fill_factor' in keys:
if self._ave_fill_factor is None: if self._ave_fill_factor is None:
self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors) self._ave_fill_factor = self._get_ave_fill_factor(all_fill_factors)
infos['ave_fill_factor'] = self._ave_fill_factor infos['ave_fill_factor'] = self._ave_fill_factor
if 'max_fill_factor' in keys: if 'max_fill_factor' in keys:
if self._max_fill_factor is None: if self._max_fill_factor is None:
self._max_fill_factor = self._get_max_fill_factor(all_fill_factors) self._max_fill_factor = self._get_max_fill_factor(all_fill_factors)
infos['max_fill_factor'] = self._max_fill_factor infos['max_fill_factor'] = self._max_fill_factor
if 'min_fill_factor' in keys: if 'min_fill_factor' in keys:
if self._min_fill_factor is None: if self._min_fill_factor is None:
self._min_fill_factor = self._get_min_fill_factor(all_fill_factors) self._min_fill_factor = self._get_min_fill_factor(all_fill_factors)
infos['min_fill_factor'] = self._min_fill_factor infos['min_fill_factor'] = self._min_fill_factor
if 'substructures' in keys: if 'substructures' in keys:
if self._substructures is None: if self._substructures is None:
self._substructures = self._get_substructures() self._substructures = self._get_substructures()
infos['substructures'] = self._substructures infos['substructures'] = self._substructures
if 'class_number' in keys: if 'class_number' in keys:
if self._class_number is None: if self._class_number is None:
self._class_number = self._get_class_number() self._class_number = self._get_class_number()
infos['class_number'] = self._class_number infos['class_number'] = self._class_number
if 'node_attr_dim' in keys: if 'node_attr_dim' in keys:
if self._node_attr_dim is None: if self._node_attr_dim is None:
self._node_attr_dim = self._get_node_attr_dim() self._node_attr_dim = self._get_node_attr_dim()
infos['node_attr_dim'] = self._node_attr_dim infos['node_attr_dim'] = self._node_attr_dim
if 'edge_attr_dim' in keys: if 'edge_attr_dim' in keys:
if self._edge_attr_dim is None: if self._edge_attr_dim is None:
self._edge_attr_dim = self._get_edge_attr_dim() self._edge_attr_dim = self._get_edge_attr_dim()
infos['edge_attr_dim'] = self._edge_attr_dim infos['edge_attr_dim'] = self._edge_attr_dim
# entropy of degree distribution. # entropy of degree distribution.
if 'all_degree_entropy' in keys: if 'all_degree_entropy' in keys:
if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']): if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
base = params['all_degree_entropy']['base'] base = params['all_degree_entropy']['base']
else: else:
base = None base = None
infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base) infos['all_degree_entropy'] = self._compute_all_degree_entropy(base=base)
if 'ave_degree_entropy' in keys: if 'ave_degree_entropy' in keys:
if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']): if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
base = params['ave_degree_entropy']['base'] base = params['ave_degree_entropy']['base']
else: else:
base = None base = None
infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base)) infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))
return infos return infos
def print_graph_infos(self, infos): def print_graph_infos(self, infos):
from collections import OrderedDict from collections import OrderedDict
keys = list(infos.keys()) keys = list(infos.keys())
print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0])))) print(OrderedDict(sorted(infos.items(), key=lambda i: keys.index(i[0]))))
def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): def remove_labels(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
node_labels = [item for item in node_labels if item in self._node_labels] node_labels = [item for item in node_labels if item in self._node_labels]
edge_labels = [item for item in edge_labels if item in self._edge_labels] edge_labels = [item for item in edge_labels if item in self._edge_labels]
@@ -485,8 +485,8 @@ class Dataset(object):
self._node_attrs = [na for na in self._node_attrs if na not in node_attrs] self._node_attrs = [na for na in self._node_attrs if na not in node_attrs]
if len(edge_attrs) > 0: if len(edge_attrs) > 0:
self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs] self._edge_attrs = [ea for ea in self._edge_attrs if ea not in edge_attrs]
def clean_labels(self): def clean_labels(self):
labels = [] labels = []
for name in self._node_labels: for name in self._node_labels:
@@ -543,8 +543,8 @@ class Dataset(object):
for ed in G.edges(): for ed in G.edges():
del G.edges[ed][name] del G.edges[ed][name]
self._edge_attrs = labels self._edge_attrs = labels
def cut_graphs(self, range_): def cut_graphs(self, range_):
self._graphs = [self._graphs[i] for i in range_] self._graphs = [self._graphs[i] for i in range_]
if self._targets is not None: if self._targets is not None:
@@ -561,8 +561,8 @@ class Dataset(object):
self._graphs = [p[1] for p in trimed_pairs] self._graphs = [p[1] for p in trimed_pairs]
self._targets = [self._targets[i] for i in idx] self._targets = [self._targets[i] for i in idx]
self.clean_labels() self.clean_labels()
def copy(self): def copy(self):
dataset = Dataset() dataset = Dataset()
graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None graphs = [g.copy() for g in self._graphs] if self._graphs is not None else None
@@ -575,8 +575,8 @@ class Dataset(object):
dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
# @todo: clean_labels and add other class members? # @todo: clean_labels and add other class members?
return dataset return dataset
def get_all_node_labels(self): def get_all_node_labels(self):
node_labels = [] node_labels = []
for g in self._graphs: for g in self._graphs:
@@ -585,8 +585,8 @@ class Dataset(object):
if nl not in node_labels: if nl not in node_labels:
node_labels.append(nl) node_labels.append(nl)
return node_labels return node_labels
def get_all_edge_labels(self): def get_all_edge_labels(self):
edge_labels = [] edge_labels = []
for g in self._graphs: for g in self._graphs:
@@ -595,94 +595,94 @@ class Dataset(object):
if el not in edge_labels: if el not in edge_labels:
edge_labels.append(el) edge_labels.append(el)
return edge_labels return edge_labels
def _get_dataset_size(self): def _get_dataset_size(self):
return len(self._graphs) return len(self._graphs)
def _get_all_node_nums(self): def _get_all_node_nums(self):
return [nx.number_of_nodes(G) for G in self._graphs] return [nx.number_of_nodes(G) for G in self._graphs]
def _get_total_node_nums(self, all_node_nums): def _get_total_node_nums(self, all_node_nums):
return np.sum(all_node_nums) return np.sum(all_node_nums)
def _get_ave_node_num(self, all_node_nums): def _get_ave_node_num(self, all_node_nums):
return np.mean(all_node_nums) return np.mean(all_node_nums)
def _get_min_node_num(self, all_node_nums): def _get_min_node_num(self, all_node_nums):
return np.amin(all_node_nums) return np.amin(all_node_nums)
def _get_max_node_num(self, all_node_nums): def _get_max_node_num(self, all_node_nums):
return np.amax(all_node_nums) return np.amax(all_node_nums)
def _get_all_edge_nums(self): def _get_all_edge_nums(self):
return [nx.number_of_edges(G) for G in self._graphs] return [nx.number_of_edges(G) for G in self._graphs]
def _get_total_edge_nums(self, all_edge_nums): def _get_total_edge_nums(self, all_edge_nums):
return np.sum(all_edge_nums) return np.sum(all_edge_nums)
def _get_ave_edge_num(self, all_edge_nums): def _get_ave_edge_num(self, all_edge_nums):
return np.mean(all_edge_nums) return np.mean(all_edge_nums)
def _get_min_edge_num(self, all_edge_nums): def _get_min_edge_num(self, all_edge_nums):
return np.amin(all_edge_nums) return np.amin(all_edge_nums)
def _get_max_edge_num(self, all_edge_nums): def _get_max_edge_num(self, all_edge_nums):
return np.amax(all_edge_nums) return np.amax(all_edge_nums)
def _get_node_label_dim(self): def _get_node_label_dim(self):
return len(self._node_labels) return len(self._node_labels)
def _get_node_label_num(self, node_label): def _get_node_label_num(self, node_label):
nl = set() nl = set()
for G in self._graphs: for G in self._graphs:
nl = nl | set(nx.get_node_attributes(G, node_label).values()) nl = nl | set(nx.get_node_attributes(G, node_label).values())
return len(nl) return len(nl)
def _get_edge_label_dim(self): def _get_edge_label_dim(self):
return len(self._edge_labels) return len(self._edge_labels)
def _get_edge_label_num(self, edge_label): def _get_edge_label_num(self, edge_label):
el = set() el = set()
for G in self._graphs: for G in self._graphs:
el = el | set(nx.get_edge_attributes(G, edge_label).values()) el = el | set(nx.get_edge_attributes(G, edge_label).values())
return len(el) return len(el)
def _is_directed(self): def _is_directed(self):
return nx.is_directed(self._graphs[0]) return nx.is_directed(self._graphs[0])
def _get_all_node_degrees(self): def _get_all_node_degrees(self):
return [np.mean(list(dict(G.degree()).values())) for G in self._graphs] return [np.mean(list(dict(G.degree()).values())) for G in self._graphs]
def _get_ave_node_degree(self, all_node_degrees): def _get_ave_node_degree(self, all_node_degrees):
return np.mean(all_node_degrees) return np.mean(all_node_degrees)
def _get_max_node_degree(self, all_node_degrees): def _get_max_node_degree(self, all_node_degrees):
return np.amax(all_node_degrees) return np.amax(all_node_degrees)
def _get_min_node_degree(self, all_node_degrees): def _get_min_node_degree(self, all_node_degrees):
return np.amin(all_node_degrees) return np.amin(all_node_degrees)
	def _get_all_fill_factors(self):
		"""Get fill factor, the number of non-zero entries in the adjacency matrix.


@@ -692,20 +692,20 @@ class Dataset(object):
			List of fill factors for all graphs.
		"""
		return [nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2) for G in self._graphs]


def _get_ave_fill_factor(self, all_fill_factors): def _get_ave_fill_factor(self, all_fill_factors):
return np.mean(all_fill_factors) return np.mean(all_fill_factors)
def _get_max_fill_factor(self, all_fill_factors): def _get_max_fill_factor(self, all_fill_factors):
return np.amax(all_fill_factors) return np.amax(all_fill_factors)
def _get_min_fill_factor(self, all_fill_factors): def _get_min_fill_factor(self, all_fill_factors):
return np.amin(all_fill_factors) return np.amin(all_fill_factors)
def _get_substructures(self): def _get_substructures(self):
subs = set() subs = set()
for G in self._graphs: for G in self._graphs:
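For reference, the fill factor computed a few lines above is simply number_of_edges / number_of_nodes**2. A quick standalone check on a toy graph, using only networkx:

```
import networkx as nx

G = nx.path_graph(4)            # 4 nodes, 3 edges
fill = nx.number_of_edges(G) / (nx.number_of_nodes(G) ** 2)
print(fill)                     # 3 / 16 = 0.1875
```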
@@ -737,22 +737,22 @@ class Dataset(object):
# if any(len(i) > 2 for i in cyc): # if any(len(i) > 2 for i in cyc):
# subs.add('cyclic') # subs.add('cyclic')
# break # break
return subs return subs
def _get_class_num(self): def _get_class_num(self):
return len(set(self._targets)) return len(set(self._targets))
def _get_node_attr_dim(self): def _get_node_attr_dim(self):
return len(self._node_attrs) return len(self._node_attrs)
def _get_edge_attr_dim(self): def _get_edge_attr_dim(self):
return len(self._edge_attrs) return len(self._edge_attrs)


	def _compute_all_degree_entropy(self, base=None):
		"""Compute the entropy of degree distribution of each graph.


@@ -767,15 +767,15 @@ class Dataset(object):
			The calculated entropy.
		"""
		from gklearn.utils.stats import entropy

		degree_entropy = []
		for g in self._graphs:
			degrees = list(dict(g.degree()).values())
			en = entropy(degrees, base=base)
			degree_entropy.append(en)
		return degree_entropy


	@property
	def graphs(self):
		return self._graphs
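The `entropy(degrees, base=base)` call above presumably computes the Shannon entropy of the empirical degree distribution; the actual helper lives in `gklearn.utils.stats` and is not shown in this diff. A self-contained sketch of that computation, for reference only:

```
from collections import Counter
import numpy as np


def degree_entropy(degrees, base=None):
	# empirical probability of each distinct degree value
	counts = np.array(list(Counter(degrees).values()), dtype=float)
	probs = counts / counts.sum()
	log = np.log(probs) if base is None else np.log(probs) / np.log(base)
	return float(-(probs * log).sum())


print(degree_entropy([1, 2, 2, 1], base=2))  # 1.0: two degree values, equally likely
```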
@@ -784,8 +784,8 @@ class Dataset(object):
@property @property
def targets(self): def targets(self):
return self._targets return self._targets
@property @property
def node_labels(self): def node_labels(self):
return self._node_labels return self._node_labels
@@ -794,25 +794,25 @@ class Dataset(object):
@property @property
def edge_labels(self): def edge_labels(self):
return self._edge_labels return self._edge_labels
@property @property
def node_attrs(self): def node_attrs(self):
return self._node_attrs return self._node_attrs
@property @property
def edge_attrs(self): def edge_attrs(self):
return self._edge_attrs return self._edge_attrs
def split_dataset_by_target(dataset): def split_dataset_by_target(dataset):
import warnings import warnings
warnings.simplefilter('always', DeprecationWarning) warnings.simplefilter('always', DeprecationWarning)
warnings.warn('This function has been moved to "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning) warnings.warn('This function has been moved to "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.', DeprecationWarning)
from gklearn.preimage.utils import get_same_item_indices from gklearn.preimage.utils import get_same_item_indices
graphs = dataset.graphs graphs = dataset.graphs
targets = dataset.targets targets = dataset.targets
datasets = [] datasets = []
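`split_dataset_by_target` groups the graphs of a classification dataset by their target value. A usage sketch, assuming (the body is cut off in this diff) that it returns one `Dataset` per distinct target, which is what `get_same_item_indices` suggests:

```
from gklearn.utils.dataset import Dataset, split_dataset_by_target

ds = Dataset()
ds.load_predefined_dataset('NCI109')
datasets = split_dataset_by_target(ds)          # assumed: one Dataset per distinct target
for sub_ds in datasets:
	print(sub_ds.get_dataset_infos(keys=['dataset_size']))
```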


+ 63
- 63
gklearn/utils/graph_files.py

@@ -1,8 +1,8 @@
""" Utilities function to manage graph files """ Utilities function to manage graph files
""" """
import warnings
warnings.simplefilter('always', DeprecationWarning)
warnings.warn('The functions in the module "gklearn.utils.graph_files" will be deprecated and removed since version 0.4.0. Use the corresponding functions in the module "gklearn.dataset" instead.', DeprecationWarning)
# import warnings
# warnings.simplefilter('always', DeprecationWarning)
# warnings.warn('The functions in the module "gklearn.utils.graph_files" will be deprecated and removed since version 0.4.0. Use the corresponding functions in the module "gklearn.dataset" instead.', DeprecationWarning)


from os.path import dirname, splitext from os.path import dirname, splitext


@@ -26,17 +26,17 @@ def load_dataset(filename, filename_targets=None, gformat=None, **kwargs):
	y : List

		Targets corresponding to graphs.

	Notes
	-----
	This function supports the following graph dataset formats:

	'ds': load data from .ds file. See comments of function loadFromDS for an example.

	'cxl': load data from Graph eXchange Language file (.cxl file). See
	`here <http://www.gupro.de/GXL/Introduction/background.html>`__ for detail.

	'sdf': load data from structured data file (.sdf file). See
	`here <http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx>`__
	for details.
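A usage sketch of `load_dataset` for the '.ds' format described above; the path mirrors the commented examples at the bottom of this module and is illustrative only:

```
from gklearn.utils.graph_files import load_dataset

ds_file = '../../datasets/monoterpenoides/dataset_10+.ds'  # node/edge symbolic labels
Gn, y, label_names = load_dataset(ds_file)
print(len(Gn), label_names)
```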


@@ -77,20 +77,20 @@ def save_dataset(Gn, y, gformat='gxl', group=None, filename='gfile', **kwargs):
	import warnings
	warnings.simplefilter('always', DeprecationWarning)
	warnings.warn('The function "gklearn.utils.save_dataset" will be deprecated and removed since version 0.4.0. Use the class "gklearn.dataset.DataSaver" instead.', DeprecationWarning)

	import os
	dirname_ds = os.path.dirname(filename)
	if dirname_ds != '':
		dirname_ds += '/'
		os.makedirs(dirname_ds, exist_ok=True)

	if 'graph_dir' in kwargs:
		graph_dir = kwargs['graph_dir'] + '/'
		os.makedirs(graph_dir, exist_ok=True)
		del kwargs['graph_dir']
	else:
		graph_dir = dirname_ds

	if group == 'xml' and gformat == 'gxl':
		with open(filename + '.xml', 'w') as fgroup:
			fgroup.write("<?xml version=\"1.0\"?>")
@@ -122,7 +122,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he
	1 3 1 1 <- each line describes an edge : to, from, bond type, bond stereo

	2 3 1 1

	Check `CTFile Formats file <https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=10&ved=2ahUKEwivhaSdjsTlAhVhx4UKHczHA8gQFjAJegQIARAC&url=https%3A%2F%2Fwww.daylight.com%2Fmeetings%2Fmug05%2FKappler%2Fctfile.pdf&usg=AOvVaw1cDNrrmMClkFPqodlF2inS>`__
	for detailed format description.
	"""
@@ -144,7 +144,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he
if count_line_tags[i] != '': # if not obsoleted if count_line_tags[i] != '': # if not obsoleted
g.graph[count_line_tags[i]] = tmp[i].strip() g.graph[count_line_tags[i]] = tmp[i].strip()
i += 1 i += 1
# read the atom block. # read the atom block.
atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag'] atom_tags = ['x', 'y', 'z', 'atom_symbol', 'mass_difference', 'charge', 'atom_stereo_parity', 'hydrogen_count_plus_1', 'stereo_care_box', 'valence', 'h0_designator', '', '', 'atom_atom_mapping_number', 'inversion_retention_flag', 'exact_change_flag']
for i in range(0, nb_atoms): for i in range(0, nb_atoms):
@@ -156,7 +156,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he
if atom_tags[j] != '': if atom_tags[j] != '':
g.nodes[i][atom_tags[j]] = tmp[j].strip() g.nodes[i][atom_tags[j]] = tmp[j].strip()
j += 1 j += 1
# read the bond block. # read the bond block.
bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status'] bond_tags = ['first_atom_number', 'second_atom_number', 'bond_type', 'bond_stereo', '', 'bond_topology', 'reacting_center_status']
for i in range(0, nb_bonds): for i in range(0, nb_bonds):
@@ -169,7 +169,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he
if bond_tags[j] != '': if bond_tags[j] != '':
g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip() g.edges[(n1, n2)][bond_tags[j]] = tmp[j].strip()
j += 1 j += 1
# get label names. # get label names.
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1] atom_symbolic = [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, None, None, 1, 1, 1]
@@ -188,7 +188,7 @@ def load_ct(filename): # @todo: this function is only tested on CTFile V2000; he
else: else:
label_names['edge_attrs'].append(key) label_names['edge_attrs'].append(key)
break break
return g, label_names return g, label_names




@@ -215,19 +215,19 @@ def load_gxl(filename): # @todo: directed graphs.
for attr in edge.iter('attr'): for attr in edge.iter('attr'):
labels[attr.attrib['name']] = attr[0].text labels[attr.attrib['name']] = attr[0].text
g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels) g.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], **labels)
# get label names. # get label names.
label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
for node in root.iter('node'): for node in root.iter('node'):
for attr in node.iter('attr'): for attr in node.iter('attr'):
if attr[0].tag == 'int': # @todo: this maybe wrong, and slow.
if attr[0].tag == 'int': # @todo: this maybe wrong, and slow.
label_names['node_labels'].append(attr.attrib['name']) label_names['node_labels'].append(attr.attrib['name'])
else: else:
label_names['node_attrs'].append(attr.attrib['name']) label_names['node_attrs'].append(attr.attrib['name'])
break break
for edge in root.iter('edge'): for edge in root.iter('edge'):
for attr in edge.iter('attr'): for attr in edge.iter('attr'):
if attr[0].tag == 'int': # @todo: this maybe wrong, and slow.
if attr[0].tag == 'int': # @todo: this maybe wrong, and slow.
label_names['edge_labels'].append(attr.attrib['name']) label_names['edge_labels'].append(attr.attrib['name'])
else: else:
label_names['edge_attrs'].append(attr.attrib['name']) label_names['edge_attrs'].append(attr.attrib['name'])
@@ -249,20 +249,20 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[],
gxl_file.write("<graph id=\"" + name + "\" edgeids=\"false\" edgemode=\"undirected\">\n") gxl_file.write("<graph id=\"" + name + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True): for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">") gxl_file.write("<node id=\"_" + str(v) + "\">")
for l_name in node_labels:
gxl_file.write("<attr name=\"" + l_name + "\"><int>" +
for l_name in node_labels:
gxl_file.write("<attr name=\"" + l_name + "\"><int>" +
str(attrs[l_name]) + "</int></attr>") str(attrs[l_name]) + "</int></attr>")
for a_name in node_attrs:
gxl_file.write("<attr name=\"" + a_name + "\"><float>" +
for a_name in node_attrs:
gxl_file.write("<attr name=\"" + a_name + "\"><float>" +
str(attrs[a_name]) + "</float></attr>") str(attrs[a_name]) + "</float></attr>")
gxl_file.write("</node>\n") gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True): for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
for l_name in edge_labels:
gxl_file.write("<attr name=\"" + l_name + "\"><int>" +
for l_name in edge_labels:
gxl_file.write("<attr name=\"" + l_name + "\"><int>" +
str(attrs[l_name]) + "</int></attr>") str(attrs[l_name]) + "</int></attr>")
for a_name in edge_attrs:
gxl_file.write("<attr name=\"" + a_name + "\"><float>" +
for a_name in edge_attrs:
gxl_file.write("<attr name=\"" + a_name + "\"><float>" +
str(attrs[a_name]) + "</float></attr>") str(attrs[a_name]) + "</float></attr>")
gxl_file.write("</edge>\n") gxl_file.write("</edge>\n")
gxl_file.write("</graph>\n") gxl_file.write("</graph>\n")
@@ -276,7 +276,7 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[],
attr['edgeids'] = 'true' attr['edgeids'] = 'true'
attr['edgemode'] = 'undirected' attr['edgemode'] = 'undirected'
graph_node = ET.SubElement(root_node, 'graph', attrib=attr) graph_node = ET.SubElement(root_node, 'graph', attrib=attr)
for v in graph: for v in graph:
current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)}) current_node = ET.SubElement(graph_node, 'node', attrib={'id': str(v)})
for attr in graph.nodes[v].keys(): for attr in graph.nodes[v].keys():
@@ -285,7 +285,7 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[],
cur_value = ET.SubElement(cur_attr, cur_value = ET.SubElement(cur_attr,
graph.nodes[v][attr].__class__.__name__) graph.nodes[v][attr].__class__.__name__)
cur_value.text = graph.nodes[v][attr] cur_value.text = graph.nodes[v][attr]
for v1 in graph: for v1 in graph:
for v2 in graph[v1]: for v2 in graph[v1]:
if (v1 < v2): # Non oriented graphs if (v1 < v2): # Non oriented graphs
@@ -302,7 +302,7 @@ def save_gxl(graph, filename, method='default', node_labels=[], edge_labels=[],
cur_value = ET.SubElement( cur_value = ET.SubElement(
cur_attr, graph[v1][v2][attr].__class__.__name__) cur_attr, graph[v1][v2][attr].__class__.__name__)
cur_value.text = str(graph[v1][v2][attr]) cur_value.text = str(graph[v1][v2][attr])
tree = ET.ElementTree(root_node) tree = ET.ElementTree(root_node)
tree.write(filename) tree.write(filename)
elif method == 'gedlib': elif method == 'gedlib':
@@ -458,11 +458,11 @@ def load_mat(filename, order): # @todo: need to be updated (auto order) or depre
g.add_edge(col, row) g.add_edge(col, row)
data.append(g) data.append(g)
# print(g.edges(data=True)) # print(g.edges(data=True))
label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} label_names = {'node_labels': ['label_1'], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
if order[1] == 0: if order[1] == 0:
label_names['edge_labels'].append('label_1') label_names['edge_labels'].append('label_1')
return data, y, label_names return data, y, label_names




@@ -477,12 +477,12 @@ def load_tud(filename):
import networkx as nx import networkx as nx
from os import listdir from os import listdir
from os.path import dirname, basename from os.path import dirname, basename
def get_infos_from_readme(frm): # @todo: add README (cuniform), maybe node/edge label maps. def get_infos_from_readme(frm): # @todo: add README (cuniform), maybe node/edge label maps.
"""Get information from DS_label_readme.txt file. """Get information from DS_label_readme.txt file.
""" """
def get_label_names_from_line(line): def get_label_names_from_line(line):
"""Get names of labels/attributes from a line. """Get names of labels/attributes from a line.
""" """
@@ -490,8 +490,8 @@ def load_tud(filename):
names = str_names.split(',') names = str_names.split(',')
names = [attr.strip() for attr in names] names = [attr.strip() for attr in names]
return names return names
def get_class_label_map(label_map_strings): def get_class_label_map(label_map_strings):
label_map = {} label_map = {}
for string in label_map_strings: for string in label_map_strings:
@@ -500,7 +500,7 @@ def load_tud(filename):
return label_map return label_map




label_names = {'node_labels': [], 'node_attrs': [],
label_names = {'node_labels': [], 'node_attrs': [],
'edge_labels': [], 'edge_attrs': []} 'edge_labels': [], 'edge_attrs': []}
class_label_map = None class_label_map = None
class_label_map_strings = [] class_label_map_strings = []
@@ -528,16 +528,16 @@ def load_tud(filename):
line = content_rm[i].strip() line = content_rm[i].strip()
class_label_map = get_class_label_map(class_label_map_strings) class_label_map = get_class_label_map(class_label_map_strings)
i += 1 i += 1
return label_names, class_label_map return label_names, class_label_map


# get dataset name. # get dataset name.
dirname_dataset = dirname(filename) dirname_dataset = dirname(filename)
filename = basename(filename) filename = basename(filename)
fn_split = filename.split('_A') fn_split = filename.split('_A')
ds_name = fn_split[0].strip() ds_name = fn_split[0].strip()
# load data file names # load data file names
for name in listdir(dirname_dataset): for name in listdir(dirname_dataset):
if ds_name + '_A' in name: if ds_name + '_A' in name:
@@ -561,20 +561,20 @@ def load_tud(filename):
# this is supposed to be the node attrs, make sure to put this as the last 'elif' # this is supposed to be the node attrs, make sure to put this as the last 'elif'
elif ds_name + '_attributes' in name: elif ds_name + '_attributes' in name:
fna = dirname_dataset + '/' + name fna = dirname_dataset + '/' + name
# get labels and attributes names. # get labels and attributes names.
if 'frm' in locals(): if 'frm' in locals():
label_names, class_label_map = get_infos_from_readme(frm) label_names, class_label_map = get_infos_from_readme(frm)
else: else:
label_names = {'node_labels': [], 'node_attrs': [],
label_names = {'node_labels': [], 'node_attrs': [],
'edge_labels': [], 'edge_attrs': []} 'edge_labels': [], 'edge_attrs': []}
class_label_map = None class_label_map = None
with open(fgi) as gi: with open(fgi) as gi:
content_gi = gi.read().splitlines() # graph indicator content_gi = gi.read().splitlines() # graph indicator
with open(fam) as am: with open(fam) as am:
content_am = am.read().splitlines() # adjacency matrix content_am = am.read().splitlines() # adjacency matrix
# load targets. # load targets.
if 'fgl' in locals(): if 'fgl' in locals():
with open(fgl) as gl: with open(fgl) as gl:
@@ -609,7 +609,7 @@ def load_tud(filename):
else: else:
for i, line in enumerate(content_gi): for i, line in enumerate(content_gi):
data[int(line) - 1].add_node(i) data[int(line) - 1].add_node(i)
# add edges # add edges
for line in content_am: for line in content_am:
tmp = line.split(',') tmp = line.split(',')
@@ -670,7 +670,7 @@ def load_tud(filename):
data[g].edges[n[0], n[1]][a_name] = attrs[i] data[g].edges[n[0], n[1]][a_name] = attrs[i]


return data, targets, label_names return data, targets, label_names


def load_from_ds(filename, filename_targets): def load_from_ds(filename, filename_targets):
"""Load data from .ds file. """Load data from .ds file.
@@ -681,9 +681,9 @@ def load_from_ds(filename, filename_targets):


	'.gxl': see function load_gxl for detail.

	Note these graph formats are checked automatically by the extensions of
	graph files.
	"""
dirname_dataset = dirname(filename) dirname_dataset = dirname(filename)
data = [] data = []
y = [] y = []
@@ -695,7 +695,7 @@ def load_from_ds(filename, filename_targets):
load_file_fun = load_ct load_file_fun = load_ct
elif extension == 'gxl' or extension == 'sdf': # @todo: .sdf not tested yet. elif extension == 'gxl' or extension == 'sdf': # @todo: .sdf not tested yet.
load_file_fun = load_gxl load_file_fun = load_gxl
if filename_targets is None or filename_targets == '': if filename_targets is None or filename_targets == '':
for i in range(0, len(content)): for i in range(0, len(content)):
tmp = content[i].split(' ') tmp = content[i].split(' ')
@@ -711,7 +711,7 @@ def load_from_ds(filename, filename_targets):
g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1)) g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1))
data.append(g) data.append(g)
_append_label_names(label_names, l_names) _append_label_names(label_names, l_names)
with open(filename_targets) as fnt: with open(filename_targets) as fnt:
content_y = fnt.read().splitlines() content_y = fnt.read().splitlines()
# assume entries in filename and filename_targets have the same order. # assume entries in filename and filename_targets have the same order.
@@ -719,13 +719,13 @@ def load_from_ds(filename, filename_targets):
tmp = item.split(' ') tmp = item.split(' ')
# assume the 3rd entry in a line is y (for Alkane dataset) # assume the 3rd entry in a line is y (for Alkane dataset)
y.append(float(tmp[2])) y.append(float(tmp[2]))
return data, y, label_names return data, y, label_names




# def load_from_cxl(filename): # def load_from_cxl(filename):
# import xml.etree.ElementTree as ET # import xml.etree.ElementTree as ET
#
#
# dirname_dataset = dirname(filename) # dirname_dataset = dirname(filename)
# tree = ET.parse(filename) # tree = ET.parse(filename)
# root = tree.getroot() # root = tree.getroot()
@@ -736,11 +736,11 @@ def load_from_ds(filename, filename_targets):
# mol_class = graph.attrib['class'] # mol_class = graph.attrib['class']
# data.append(load_gxl(dirname_dataset + '/' + mol_filename)) # data.append(load_gxl(dirname_dataset + '/' + mol_filename))
# y.append(mol_class) # y.append(mol_class)
def load_from_xml(filename, dir_dataset=None): def load_from_xml(filename, dir_dataset=None):
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
if dir_dataset is not None: if dir_dataset is not None:
dir_dataset = dir_dataset dir_dataset = dir_dataset
else: else:
@@ -757,16 +757,16 @@ def load_from_xml(filename, dir_dataset=None):
data.append(g) data.append(g)
_append_label_names(label_names, l_names) _append_label_names(label_names, l_names)
y.append(mol_class) y.append(mol_class)
return data, y, label_names return data, y, label_names




def _append_label_names(label_names, new_names): def _append_label_names(label_names, new_names):
for key, val in label_names.items(): for key, val in label_names.items():
label_names[key] += [name for name in new_names[key] if name not in val] label_names[key] += [name for name in new_names[key] if name not in val]
if __name__ == '__main__':
if __name__ == '__main__':
# ### Load dataset from .ds file. # ### Load dataset from .ds file.
# # .ct files. # # .ct files.
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', # ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
@@ -782,7 +782,7 @@ if __name__ == '__main__':
# print(Gn[1].nodes(data=True)) # print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True)) # print(Gn[1].edges(data=True))
# print(targets[1]) # print(targets[1])
# # .gxl file. # # .gxl file.
# ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb # ds_file = '../../datasets/monoterpenoides/dataset_10+.ds' # node/edge symb
# Gn, y, label_names = load_dataset(ds_file) # Gn, y, label_names = load_dataset(ds_file)
@@ -803,7 +803,7 @@ if __name__ == '__main__':
# ### Convert graph from one format to another. # ### Convert graph from one format to another.
# # .gxl file. # # .gxl file.
# import networkx as nx # import networkx as nx
# ds = {'name': 'monoterpenoides',
# ds = {'name': 'monoterpenoides',
# 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb # 'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# Gn, y = loadDataset(ds['dataset']) # Gn, y = loadDataset(ds['dataset'])
# y = [int(i) for i in y] # y = [int(i) for i in y]
@@ -826,13 +826,13 @@ if __name__ == '__main__':
# filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides' # filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides'
# xparams = {'method': 'gedlib'} # xparams = {'method': 'gedlib'}
# saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams) # saveDataset(Gn, y, gformat='gxl', group='xml', filename=filename, xparams=xparams)
# save dataset. # save dataset.
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat', # ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb # 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params']) # Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# saveDataset(Gn, y, group='xml', filename='temp/temp') # saveDataset(Gn, y, group='xml', filename='temp/temp')
# test - new way to add labels and attributes. # test - new way to add labels and attributes.
# dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' # dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
# filename = '../../datasets/Fingerprint/Fingerprint_A.txt' # filename = '../../datasets/Fingerprint/Fingerprint_A.txt'
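To round off this module: saving mirrors loading, and the commented test block above already hints at the call (the camelCase `saveDataset` in those comments is the pre-0.2 spelling). A sketch with the current function names:

```
from gklearn.utils.graph_files import load_dataset, save_dataset

Gn, y, label_names = load_dataset('../../datasets/monoterpenoides/dataset_10+.ds')
save_dataset(Gn, y, gformat='gxl', group='xml', filename='temp/temp')
# note the DeprecationWarning: gklearn.dataset.DataSaver is the maintained replacement
```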


+ 285
- 285
gklearn/utils/graphdataset.py

@@ -5,345 +5,345 @@ This file is for old version of graphkit-learn.




def get_dataset_attributes(Gn,
target=None,
attr_names=[],
node_label=None,
edge_label=None):
"""Returns the structure and property information of the graph dataset Gn.
Parameters
----------
Gn : List of NetworkX graph
List of graphs whose information will be returned.
target : list
The list of classification targets corresponding to Gn. Only works for
classification problems.
attr_names : list
		List of strings which indicate which information will be returned. The
		possible choices include:
'substructures': sub-structures Gn contains, including 'linear', 'non
target=None,
attr_names=[],
node_label=None,
edge_label=None):
"""Returns the structure and property information of the graph dataset Gn.
Parameters
----------
Gn : List of NetworkX graph
List of graphs whose information will be returned.
target : list
The list of classification targets corresponding to Gn. Only works for
classification problems.
attr_names : list
		List of strings which indicate which information will be returned. The
		possible choices include:
'substructures': sub-structures Gn contains, including 'linear', 'non
			linear' and 'cyclic'.


'node_labeled': whether vertices have symbolic labels.
'node_labeled': whether vertices have symbolic labels.


		'edge_labeled': whether edges have symbolic labels.


'is_directed': whether graphs in Gn are directed.
'is_directed': whether graphs in Gn are directed.


'dataset_size': number of graphs in Gn.
'dataset_size': number of graphs in Gn.


'ave_node_num': average number of vertices of graphs in Gn.
'ave_node_num': average number of vertices of graphs in Gn.


'min_node_num': minimum number of vertices of graphs in Gn.
'min_node_num': minimum number of vertices of graphs in Gn.


'max_node_num': maximum number of vertices of graphs in Gn.
'max_node_num': maximum number of vertices of graphs in Gn.


'ave_edge_num': average number of edges of graphs in Gn.
'ave_edge_num': average number of edges of graphs in Gn.


'min_edge_num': minimum number of edges of graphs in Gn.
'min_edge_num': minimum number of edges of graphs in Gn.


'max_edge_num': maximum number of edges of graphs in Gn.
'max_edge_num': maximum number of edges of graphs in Gn.


'ave_node_degree': average vertex degree of graphs in Gn.
'ave_node_degree': average vertex degree of graphs in Gn.


'min_node_degree': minimum vertex degree of graphs in Gn.
'min_node_degree': minimum vertex degree of graphs in Gn.


'max_node_degree': maximum vertex degree of graphs in Gn.
'max_node_degree': maximum vertex degree of graphs in Gn.


'ave_fill_factor': average fill factor (number_of_edges /
'ave_fill_factor': average fill factor (number_of_edges /
			(number_of_nodes ** 2)) of graphs in Gn.


'min_fill_factor': minimum fill factor of graphs in Gn.
'min_fill_factor': minimum fill factor of graphs in Gn.


'max_fill_factor': maximum fill factor of graphs in Gn.
'max_fill_factor': maximum fill factor of graphs in Gn.


'node_label_num': number of symbolic vertex labels.
'node_label_num': number of symbolic vertex labels.


'edge_label_num': number of symbolic edge labels.
'edge_label_num': number of symbolic edge labels.


'node_attr_dim': number of dimensions of non-symbolic vertex labels.
'node_attr_dim': number of dimensions of non-symbolic vertex labels.
			Extracted from the 'attributes' attribute of graph nodes.


'edge_attr_dim': number of dimensions of non-symbolic edge labels.
'edge_attr_dim': number of dimensions of non-symbolic edge labels.
			Extracted from the 'attributes' attribute of graph edges.


'class_number': number of classes. Only available for classification problems.
'class_number': number of classes. Only available for classification problems.


node_label : string
Node attribute used as label. The default node label is atom. Mandatory
when 'node_labeled' or 'node_label_num' is required.
node_label : string
Node attribute used as label. The default node label is atom. Mandatory
when 'node_labeled' or 'node_label_num' is required.


edge_label : string
Edge attribute used as label. The default edge label is bond_type.
Mandatory when 'edge_labeled' or 'edge_label_num' is required.

Return
------
attrs : dict
Value for each property.
"""
import networkx as nx
import numpy as np

attrs = {}

def get_dataset_size(Gn):
return len(Gn)

def get_all_node_num(Gn):
return [nx.number_of_nodes(G) for G in Gn]

def get_ave_node_num(all_node_num):
return np.mean(all_node_num)

def get_min_node_num(all_node_num):
return np.amin(all_node_num)

def get_max_node_num(all_node_num):
return np.amax(all_node_num)

def get_all_edge_num(Gn):
return [nx.number_of_edges(G) for G in Gn]

def get_ave_edge_num(all_edge_num):
return np.mean(all_edge_num)

def get_min_edge_num(all_edge_num):
return np.amin(all_edge_num)

def get_max_edge_num(all_edge_num):
return np.amax(all_edge_num)

def is_node_labeled(Gn):
return False if node_label is None else True

def get_node_label_num(Gn):
nl = set()
for G in Gn:
nl = nl | set(nx.get_node_attributes(G, node_label).values())
return len(nl)

def is_edge_labeled(Gn):
return False if edge_label is None else True

def get_edge_label_num(Gn):
el = set()
for G in Gn:
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return len(el)

def is_directed(Gn):
return nx.is_directed(Gn[0])

def get_ave_node_degree(Gn):
    return np.mean([np.mean(list(dict(G.degree()).values())) for G in Gn])

def get_max_node_degree(Gn):
    # maximum over graphs of each graph's average vertex degree.
    return np.amax([np.mean(list(dict(G.degree()).values())) for G in Gn])

def get_min_node_degree(Gn):
    # minimum over graphs of each graph's average vertex degree.
    return np.amin([np.mean(list(dict(G.degree()).values())) for G in Gn])

# fill factor: number_of_edges / (number_of_nodes ** 2), i.e. the fraction of
# adjacency-matrix entries that are non-zero (each undirected edge counted once).
def get_ave_fill_factor(Gn):
    return np.mean([nx.number_of_edges(G) / (nx.number_of_nodes(G)
                    * nx.number_of_nodes(G)) for G in Gn])

def get_max_fill_factor(Gn):
    return np.amax([nx.number_of_edges(G) / (nx.number_of_nodes(G)
                    * nx.number_of_nodes(G)) for G in Gn])

def get_min_fill_factor(Gn):
    return np.amin([nx.number_of_edges(G) / (nx.number_of_nodes(G)
                    * nx.number_of_nodes(G)) for G in Gn])
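# Since nx.number_of_edges() counts each undirected edge once, the fill factor
# of a simple undirected graph is at most (n - 1) / (2 * n), i.e. below 0.5.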

def get_substructures(Gn):
    subs = set()
    for G in Gn:
        degrees = list(dict(G.degree()).values())
        if any(i == 2 for i in degrees):
            subs.add('linear')
        if np.amax(degrees) >= 3:
            subs.add('non linear')
        if 'linear' in subs and 'non linear' in subs:
            break

    if is_directed(Gn):
        for G in Gn:
            if len(list(nx.find_cycle(G))) > 0:
                subs.add('cyclic')
                break
    # else:
    #     # @todo: this method does not work for big graphs with a large number
    #     # of edges, such as D&D; try a better way.
    #     upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10
    #     for G in Gn:
    #         if (nx.number_of_edges(G) < upper):
    #             cyc = list(nx.simple_cycles(G.to_directed()))
    #             if any(len(i) > 2 for i in cyc):
    #                 subs.add('cyclic')
    #                 break
    #     if 'cyclic' not in subs:
    #         for G in Gn:
    #             cyc = list(nx.simple_cycles(G.to_directed()))
    #             if any(len(i) > 2 for i in cyc):
    #                 subs.add('cyclic')
    #                 break

    return subs
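# Note: cycle detection is only attempted for directed graphs; the undirected
# variant based on nx.simple_cycles is left commented out above as a @todo
# because it is too slow for large, dense datasets such as D&D.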

def get_class_num(target):
    return len(set(target))

def get_node_attr_dim(Gn):
    for G in Gn:
        for n in G.nodes(data=True):
            if 'attributes' in n[1]:
                return len(n[1]['attributes'])
    return 0

def get_edge_attr_dim(Gn):
    for G in Gn:
        if nx.number_of_edges(G) > 0:
            for e in G.edges(data=True):
                if 'attributes' in e[2]:
                    return len(e[2]['attributes'])
    return 0
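# Note: the attribute dimension is read from the first node (resp. edge) that
# carries an 'attributes' entry; datasets without such attributes yield 0.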

if attr_names == []:
    attr_names = [
        'substructures',
        'node_labeled',
        'edge_labeled',
        'is_directed',
        'dataset_size',
        'ave_node_num',
        'min_node_num',
        'max_node_num',
        'ave_edge_num',
        'min_edge_num',
        'max_edge_num',
        'ave_node_degree',
        'min_node_degree',
        'max_node_degree',
        'ave_fill_factor',
        'min_fill_factor',
        'max_fill_factor',
        'node_label_num',
        'edge_label_num',
        'node_attr_dim',
        'edge_attr_dim',
        'class_number',
    ]
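# With the default (empty) attr_names, every property above is computed; the
# OrderedDict returned at the end preserves the order given in attr_names.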

# dataset size
if 'dataset_size' in attr_names:
    attrs.update({'dataset_size': get_dataset_size(Gn)})

# graph node number
if any(i in attr_names
       for i in ['ave_node_num', 'min_node_num', 'max_node_num']):
    all_node_num = get_all_node_num(Gn)

    if 'ave_node_num' in attr_names:
        attrs.update({'ave_node_num': get_ave_node_num(all_node_num)})

    if 'min_node_num' in attr_names:
        attrs.update({'min_node_num': get_min_node_num(all_node_num)})

    if 'max_node_num' in attr_names:
        attrs.update({'max_node_num': get_max_node_num(all_node_num)})

# graph edge number
if any(i in attr_names for i in
       ['ave_edge_num', 'min_edge_num', 'max_edge_num']):
    all_edge_num = get_all_edge_num(Gn)
    if 'ave_edge_num' in attr_names:
        attrs.update({'ave_edge_num': get_ave_edge_num(all_edge_num)})

    if 'max_edge_num' in attr_names:
        attrs.update({'max_edge_num': get_max_edge_num(all_edge_num)})

    if 'min_edge_num' in attr_names:
        attrs.update({'min_edge_num': get_min_edge_num(all_edge_num)})

# label number
if any(i in attr_names for i in ['node_labeled', 'node_label_num']):
    is_nl = is_node_labeled(Gn)
    node_label_num = get_node_label_num(Gn)

    if 'node_labeled' in attr_names:
        # graphs are considered node unlabeled if all nodes have the same label.
        attrs.update({'node_labeled': is_nl if node_label_num > 1 else False})

    if 'node_label_num' in attr_names:
        attrs.update({'node_label_num': node_label_num})

if any(i in attr_names for i in ['edge_labeled', 'edge_label_num']):
    is_el = is_edge_labeled(Gn)
    edge_label_num = get_edge_label_num(Gn)

    if 'edge_labeled' in attr_names:
        # graphs are considered edge unlabeled if all edges have the same label.
        attrs.update({'edge_labeled': is_el if edge_label_num > 1 else False})

    if 'edge_label_num' in attr_names:
        attrs.update({'edge_label_num': edge_label_num})

if 'is_directed' in attr_names:
    attrs.update({'is_directed': is_directed(Gn)})

if 'ave_node_degree' in attr_names:
    attrs.update({'ave_node_degree': get_ave_node_degree(Gn)})

if 'max_node_degree' in attr_names:
    attrs.update({'max_node_degree': get_max_node_degree(Gn)})

if 'min_node_degree' in attr_names:
    attrs.update({'min_node_degree': get_min_node_degree(Gn)})

if 'ave_fill_factor' in attr_names:
    attrs.update({'ave_fill_factor': get_ave_fill_factor(Gn)})

if 'max_fill_factor' in attr_names:
    attrs.update({'max_fill_factor': get_max_fill_factor(Gn)})

if 'min_fill_factor' in attr_names:
    attrs.update({'min_fill_factor': get_min_fill_factor(Gn)})

if 'substructures' in attr_names:
    attrs.update({'substructures': get_substructures(Gn)})

if 'class_number' in attr_names:
    attrs.update({'class_number': get_class_num(target)})

if 'node_attr_dim' in attr_names:
    attrs['node_attr_dim'] = get_node_attr_dim(Gn)

if 'edge_attr_dim' in attr_names:
    attrs['edge_attr_dim'] = get_edge_attr_dim(Gn)

from collections import OrderedDict
return OrderedDict(
    sorted(attrs.items(), key=lambda i: attr_names.index(i[0])))
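
For orientation, a minimal usage sketch of this attribute extractor. It assumes the function is exposed as `get_dataset_attributes` in `gklearn.utils.graphdataset`; that name is not visible in this hunk, so treat the import as an assumption.

```python
import networkx as nx
from gklearn.utils.graphdataset import get_dataset_attributes  # assumed import path

# Two toy labeled graphs.
G1 = nx.Graph()
G1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
G1.add_edge(0, 1, bond_type='1')

G2 = nx.path_graph(3)
nx.set_node_attributes(G2, 'C', 'atom')
nx.set_edge_attributes(G2, '1', 'bond_type')

attrs = get_dataset_attributes(
    [G1, G2],
    attr_names=['ave_node_num', 'node_label_num', 'edge_label_num'],
    node_label='atom', edge_label='bond_type')
print(attrs)  # OrderedDict, keys in the order given by attr_names
```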




def load_predefined_dataset(ds_name):
    import os
    from gklearn.utils.graphfiles import loadDataset
    current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
    if ds_name == 'Acyclic':
        ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
@@ -415,5 +415,5 @@ def load_predefined_dataset(ds_name):
        pass
    else:
        raise Exception('The dataset name "', ds_name, '" is not pre-defined.')
    return graphs, targets
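
A short usage sketch based on the names visible in this hunk ('Acyclic' is one of the pre-defined dataset names; an unrecognized name raises the Exception above). The module path is assumed from the changed-files list (gklearn/utils/graphdataset.py).

```python
from gklearn.utils.graphdataset import load_predefined_dataset

graphs, targets = load_predefined_dataset('Acyclic')  # list of nx graphs + targets
print(len(graphs), len(targets))
```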

+ 4
- 4
gklearn/utils/kernels.py View File

@@ -18,8 +18,8 @@ def deltakernel(x, y):


    References
    ----------
    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
    labeled graphs. In Proceedings of the 20th International Conference on
    Machine Learning, Washington, DC, United States, 2003.
    """
    return x == y  # (1 if condition else 0)
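
The delta kernel simply tests equality of its arguments, and the boolean result acts as 1/0 in sums. A minimal check, assuming the function is imported from `gklearn.utils.kernels` as the file header above indicates:

```python
from gklearn.utils.kernels import deltakernel

print(deltakernel(1, 1))      # True  -> contributes 1 when summed
print(deltakernel('C', 'O'))  # False -> contributes 0
```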
@@ -68,7 +68,7 @@ def polynomialkernel(x, y, d=1, c=0):
    x, y : array

    d : integer, default 1
    c : float, default 0

    Returns
@@ -89,7 +89,7 @@ def linearkernel(x, y):
    x, y : array

    d : integer, default 1
    c : float, default 0

    Returns

