
Merge pull request #48 from jajupmochi/v0.2.x

V0.2.x
Branch: master
Author: linlin (GitHub), 3 years ago
Parent commit: 8b853895fa
23 changed files with 2139 additions and 549 deletions
  1. +2 -0      .travis.yml
  2. +2 -1      README.md
  3. +0 -147    gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py
  4. +6 -5      gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py
  5. +172 -0    gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py
  6. +1 -0      gklearn/experiments/ged/stability/group_results.py
  7. +18 -9     gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py
  8. +17 -1     gklearn/experiments/ged/stability/utils.py
  9. +1 -0      gklearn/ged/__init__.py
  10. +43 -0    gklearn/ged/model/distances.py
  11. +97 -0    gklearn/ged/model/ged_com.py
  12. +724 -0   gklearn/ged/model/ged_model.py
  13. +149 -0   gklearn/ged/model/optim_costs.py
  14. +179 -31  gklearn/ged/util/util.py
  15. +39 -20   gklearn/kernels/graph_kernel.py
  16. +39 -27   gklearn/kernels/treelet.py
  17. +52 -103  gklearn/kernels/weisfeiler_lehman.py
  18. +24 -0    gklearn/model_selection/__init__.py
  19. +287 -0   gklearn/model_selection/_split.py
  20. +12 -3    gklearn/utils/kernels.py
  21. +272 -199 gklearn/utils/utils.py
  22. +1 -1     requirements.txt
  23. +2 -2     requirements_pypi.txt

+2 -0  .travis.yml

@@ -4,6 +4,8 @@ python:
- '3.6'
- '3.7'
- '3.8'
- '3.9'
#- '3.10'

before_install:
- python --version


+2 -1  README.md

@@ -1,5 +1,6 @@
# graphkit-learn
[![Build Status](https://travis-ci.com/jajupmochi/graphkit-learn.svg?branch=master)](https://travis-ci.com/jajupmochi/graphkit-learn)

[![Build Status](https://app.travis-ci.com/jajupmochi/graphkit-learn.svg?branch=master)](https://app.travis-ci.com/jajupmochi/graphkit-learn)
[![Build status](https://ci.appveyor.com/api/projects/status/bdxsolk0t1uji9rd?svg=true)](https://ci.appveyor.com/project/jajupmochi/graphkit-learn)
[![codecov](https://codecov.io/gh/jajupmochi/graphkit-learn/branch/master/graph/badge.svg)](https://codecov.io/gh/jajupmochi/graphkit-learn)
[![Documentation Status](https://readthedocs.org/projects/graphkit-learn/badge/?version=master)](https://graphkit-learn.readthedocs.io/en/master/?badge=master)


+0 -147  gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py

@@ -1,147 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 2 16:17:01 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solutions over the given numbers of repeats are computed.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset
import sys
from group_results import group_trials


def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial):
save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
# Return if the file exists.
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
return None, None

"""**2. Set parameters.**"""

# Parameters for GED computation.
ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic.
# 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
'lsape_model': 'ECBP', #
# When bigger than 1, the method is considered mIPFP.
# the actual number of computed solutions might be smaller than the specified value
'max_num_solutions': max_num_solutions,
'edit_cost': 'CONSTANT', # use CONSTANT cost.
'greedy_method': 'BASIC', #
# the distance between non-symbolic node/edge labels is computed by euclidean distance.
'attr_distance': 'euclidean',
'optimal': True, # if TRUE, the option --greedy-method has no effect
# Parallel threads. Does not work if mpg_options['parallel'] = False.
'threads': multiprocessing.cpu_count(),
'centrality_method': 'NONE',
'centrality_weight': 0.7,
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))

options = ged_options.copy()
options['edit_cost_constants'] = edit_cost_constants
options['node_labels'] = dataset.node_labels
options['edge_labels'] = dataset.edge_labels
options['node_attrs'] = dataset.node_attrs
options['edge_attrs'] = dataset.edge_attrs
parallel = True # if num_solutions == 1 else False
"""**5. Compute GED matrix.**"""
ged_mat = 'error'
runtime = 0
try:
time0 = time.time()
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True)
runtime = time.time() - time0
except Exception as exp:
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception(save_file_suffix)
print(repr(exp))
"""**6. Get results.**"""
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(ged_mat, f)
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(runtime, f)

return ged_mat, runtime

def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio):
# Return if the group file exists.
name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
if os.path.isfile(name_group):
return
ged_mats = []
runtimes = []
for trial in range(1, 101):
print()
print('Trial:', trial)
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial)
ged_mats.append(ged_mat)
runtimes.append(runtime)
# Group trials and Remove single files.
name_prefix = 'ged_matrix' + name_middle
group_trials(save_dir, name_prefix, True, True, False)
name_prefix = 'runtime' + name_middle
group_trials(save_dir, name_prefix, True, True, False)


def results_for_a_dataset(ds_name):
"""**1. Get dataset.**"""
dataset = get_dataset(ds_name)
for max_num_solutions in mnum_solutions_list:
print()
print('Max # of solutions:', max_num_solutions)
for ratio in ratio_list:
print()
print('Ratio:', ratio)
save_trials_as_group(dataset, ds_name, max_num_solutions, ratio)
def get_param_lists(ds_name):
if ds_name == 'AIDS_symb':
mnum_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
else:
mnum_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
return mnum_solutions_list, ratio_list

if __name__ == '__main__':
if len(sys.argv) > 1:
ds_name_list = sys.argv[1:]
else:
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir + 'groups/', exist_ok=True)
for ds_name in ds_name_list:
print()
print('Dataset:', ds_name)
mnum_solutions_list, ratio_list = get_param_lists(ds_name)
results_for_a_dataset(ds_name)

+6 -5  gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py

@@ -13,7 +13,7 @@ import pickle
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation
from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids
import sys
from group_results import group_trials, check_group_existence, update_group_marker

@@ -125,9 +125,10 @@ def get_param_lists(ds_name, mode='test'):

elif mode == 'simple':
from sklearn.model_selection import ParameterGrid
param_grid = ParameterGrid([
{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]},
{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}])
param_grid = mix_param_grids([list(ParameterGrid([
{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])),
list(ParameterGrid([
{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))])
# print(list(param_grid))

if ds_name == 'AIDS_symb':
@@ -148,7 +149,7 @@ if __name__ == '__main__':
# ds_name_list = ['MUTAG'] # 'Alkane_unlabeled']
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/'
save_dir = 'outputs/CRIANN/edit_costs.real_data.num_sols.ratios.IPFP/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir + 'groups/', exist_ok=True)



+172 -0  gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py

@@ -0,0 +1,172 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 2 16:17:01 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solutions over the given numbers of repeats are computed.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids
import sys
from group_results import group_trials, check_group_existence, update_group_marker


def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):

save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

# Return if the file exists.
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
return None, None

"""**2. Set parameters.**"""

# Parameters for GED computation.
ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic.
# 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
'lsape_model': 'ECBP', #
# When bigger than 1, the method is considered mIPFP.
# the actual number of computed solutions might be smaller than the specified value
'max_num_solutions': 1, # @ max_num_solutions,
'edit_cost': 'CONSTANT', # use CONSTANT cost.
'greedy_method': 'BASIC', #
# the distance between non-symbolic node/edge labels is computed by euclidean distance.
'attr_distance': 'euclidean',
'optimal': True, # if TRUE, the option --greedy-method has no effect
# Parallel threads. Does not work if mpg_options['parallel'] = False.
'threads': multiprocessing.cpu_count(),
'centrality_method': 'NONE',
'centrality_weight': 0.7,
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}

edit_cost_constants = set_edit_cost_consts(ratio,
node_labeled=len(dataset.node_labels),
edge_labeled=len(dataset.edge_labels),
mode='uniform')
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))


options = ged_options.copy()
options['edit_cost_constants'] = edit_cost_constants
options['node_labels'] = dataset.node_labels
options['edge_labels'] = dataset.edge_labels
options['node_attrs'] = dataset.node_attrs
options['edge_attrs'] = dataset.edge_attrs
parallel = True # if num_solutions == 1 else False

"""**5. Compute GED matrix.**"""
ged_mat = 'error'
runtime = 0
try:
time0 = time.time()
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs,
options=options,
repeats=num_solutions,
permute_nodes=True,
random_state=None,
parallel=parallel,
verbose=True)
runtime = time.time() - time0
except Exception as exp:
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception(save_file_suffix)
print(repr(exp))

"""**6. Get results.**"""

with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(ged_mat, f)
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(runtime, f)

return ged_mat, runtime


def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
# Return if the group file exists.
name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
if check_group_existence(name_group):
return

ged_mats = []
runtimes = []
num_trials = 100
for trial in range(1, num_trials + 1):
print()
print('Trial:', trial)
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
ged_mats.append(ged_mat)
runtimes.append(runtime)

# Group trials and remove single files.
# @todo: if the program stops between the following lines, then there may be errors.
name_prefix = 'ged_matrix' + name_middle
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
name_prefix = 'runtime' + name_middle
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
update_group_marker(name_group)


def results_for_a_dataset(ds_name):
"""**1. Get dataset.**"""
dataset = get_dataset(ds_name)

for params in list(param_grid):
print()
print(params)
save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio'])


def get_param_lists(ds_name, mode='test'):
if mode == 'test':
num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
ratio_list = [10]
return num_solutions_list, ratio_list

elif mode == 'simple':
from sklearn.model_selection import ParameterGrid
param_grid = mix_param_grids([list(ParameterGrid([
{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])),
list(ParameterGrid([
{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))])
# print(list(param_grid))

if ds_name == 'AIDS_symb':
num_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
else:
num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1]

return param_grid


if __name__ == '__main__':
if len(sys.argv) > 1:
ds_name_list = sys.argv[1:]
else:
ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
# ds_name_list = ['MUTAG'] # 'Alkane_unlabeled']
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

save_dir = 'outputs/CRIANN/edit_costs.real_data.nums_sols.ratios.bipartite/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir + 'groups/', exist_ok=True)

for ds_name in ds_name_list:
print()
print('Dataset:', ds_name)
param_grid = get_param_lists(ds_name, mode='simple')
results_for_a_dataset(ds_name)

+1 -0  gklearn/experiments/ged/stability/group_results.py

@@ -32,6 +32,7 @@ def check_group_existence(file_name):


def update_group_marker(file_name):
# @todo: possible error when several tasks are using this file at the same time.
path, name = os.path.split(file_name)
marker_fn = os.path.join(path, 'group_names_finished.pkl')
if os.path.isfile(marker_fn):


gklearn/experiments/ged/stability/run_job_edit_costs.max_nums_sols.ratios.bipartite.py → gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py

@@ -9,36 +9,45 @@ import os
import re


cur_path = os.path.dirname(os.path.abspath(__file__))


def get_job_script(arg):
script = r"""
#!/bin/bash

#SBATCH --exclusive
#SBATCH --job-name="st.""" + arg + r""".bp"
#SBATCH --partition=tlong
#SBATCH --partition=court
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt"
#SBATCH --error="errors/error_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt"
#SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt"
#SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=300:00:00
#SBATCH --time=48:00:00
#SBATCH --mem-per-cpu=4000

srun hostname
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability
srun python3 edit_costs.max_nums_sols.ratios.bipartite.py """ + arg
cd """ + cur_path + r"""
echo Working directory : $PWD
echo Local work dir : $LOCAL_WORK_DIR
python3 edit_costs.real_data.nums_sols.ratios.bipartite.py """ + arg
script = script.strip()
script = re.sub('\n\t+', '\n', script)
script = re.sub('\n +', '\n', script)
return script

if __name__ == '__main__':
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]:

os.makedirs('outputs/', exist_ok=True)
os.makedirs('errors/', exist_ok=True)

ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]:
job_script = get_job_script(ds_name)
command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# print(command)

+17 -1  gklearn/experiments/ged/stability/utils.py

@@ -325,6 +325,22 @@ def dichotomous_permutation(arr, layer=0):
# return new_arr


def mix_param_grids(list_of_grids):
mixed_grids = []
not_finished = [True] * len(list_of_grids)
idx = 0
while sum(not_finished) > 0:
for g_idx, grid in enumerate(list_of_grids):
if idx < len(grid):
mixed_grids.append(grid[idx])
else:
not_finished[g_idx] = False
idx += 1

return mixed_grids



if __name__ == '__main__':
root_dir = 'outputs/CRIANN/'
# for dir_ in sorted(os.listdir(root_dir)):
@@ -337,4 +353,4 @@ if __name__ == '__main__':
# get_relative_errors(save_dir)
# except Exception as exp:
# print('An exception occured when running this experiment:')
# print(repr(exp))
# print(repr(exp))
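
For reference, a minimal sketch of what the new mix_param_grids helper returns: it interleaves the given grids round-robin by index until every grid is exhausted. The grids below are illustrative placeholders, not values from this commit.

from utils import mix_param_grids  # as imported by the experiment scripts above

grid_a = [{'num_solutions': 1, 'ratio': 10},
          {'num_solutions': 2, 'ratio': 10},
          {'num_solutions': 3, 'ratio': 10}]
grid_b = [{'num_solutions': 10, 'ratio': 0.1},
          {'num_solutions': 10, 'ratio': 0.3}]

mixed = mix_param_grids([grid_a, grid_b])
# mixed == [grid_a[0], grid_b[0], grid_a[1], grid_b[1], grid_a[2]]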

+1 -0  gklearn/ged/__init__.py

@@ -0,0 +1 @@
from gklearn.ged.model.ged_model import GEDModel

+43 -0  gklearn/ged/model/distances.py

@@ -0,0 +1,43 @@
import numpy as np


def sum_squares(a, b):
"""
Return the sum of squared differences between a and b (SSE)
"""
return np.sum([(a[i] - b[i])**2 for i in range(len(a))])


def euclid_d(x, y):
"""
1D euclidean distance
"""
return np.sqrt((x-y)**2)


def man_d(x, y):
"""
1D manhattan distance
"""
return np.abs((x-y))


def classif_d(x, y):
"""
Function adapted to classification problems
"""
return np.array(0 if x == y else 1)


def rmse(pred, ground_truth):
import numpy as np
return np.sqrt(sum_squares(pred, ground_truth)/len(ground_truth))


def accuracy(pred, ground_truth):
import numpy as np
return np.mean([a == b for a, b in zip(pred, ground_truth)])


def rbf_k(D, sigma=1):
return np.exp(-(D**2)/sigma)
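
A few illustrative calls to the helpers above; the inputs are placeholders and the expected values follow directly from the definitions.

import numpy as np
from gklearn.ged.model.distances import euclid_d, man_d, rmse, accuracy, rbf_k

euclid_d(3.0, 1.0)                         # 2.0
man_d(3.0, 1.0)                            # 2.0
rmse([1.0, 2.0], [1.0, 4.0])               # sqrt((0 + 4) / 2) ~ 1.414
accuracy([0, 1, 1], [0, 1, 0])             # 2/3
rbf_k(np.array([0.0, 1.0, 2.0]), sigma=2)  # exp(-D**2 / 2), elementwise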

+97 -0  gklearn/ged/model/ged_com.py

@@ -0,0 +1,97 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 5 14:02:17 2022

@author: ljia
"""
import sys
from gklearn.ged.model.distances import euclid_d
from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
from gklearn.utils import get_iters


def compute_ged(Gi, Gj, edit_cost, method='BIPARTITE', **kwargs):
"""
Compute the GED between two graphs according to edit_cost
"""
ged_options = {'edit_cost': 'CONSTANT',
'method': method,
'edit_cost_constants': edit_cost}
node_labels = kwargs.get('node_labels', [])
edge_labels = kwargs.get('edge_labels', [])
dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10)
n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward, edit_cost='CONSTANT', node_labels=node_labels, edge_labels=edge_labels)
return dis, n_eo_tmp


def compute_ged_all_dataset(Gn, edit_cost, ed_method, **kwargs):
N = len(Gn)
G_pairs = []
for i in range(N):
for j in range(i, N):
G_pairs.append([i, j])
return compute_geds(G_pairs, Gn, edit_cost, ed_method, **kwargs)


def compute_geds(G_pairs, Gn, edit_cost, ed_method, verbose=True, **kwargs):
"""
Compute the GED for all index pairs in G_pairs, given edit_cost
:return: ged_vec : the list of computed distances, n_edit_operations : the list of edit operations
"""
ged_vec = []
n_edit_operations = []
for k in get_iters(range(len(G_pairs)), desc='Computing GED', file=sys.stdout, length=len(G_pairs), verbose=verbose):
[i, j] = G_pairs[k]
dis, n_eo_tmp = compute_ged(
Gn[i], Gn[j], edit_cost=edit_cost, method=ed_method, **kwargs)
ged_vec.append(dis)
n_edit_operations.append(n_eo_tmp)

return ged_vec, n_edit_operations


def compute_D(G_app, edit_cost, G_test=None, ed_method='BIPARTITE', **kwargs):
import numpy as np
N = len(G_app)
D_app = np.zeros((N, N))

for i, G1 in get_iters(enumerate(G_app), desc='Computing D - app', file=sys.stdout, length=N):
for j, G2 in enumerate(G_app[i+1:], i+1):
D_app[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs)
D_app[j, i] = D_app[i, j]
if (G_test is None):
return D_app, edit_cost
else:
D_test = np.zeros((len(G_test), N))
for i, G1 in get_iters(enumerate(G_test), desc='Computing D - test', file=sys.stdout, length=len(G_test)):
for j, G2 in enumerate(G_app):
D_test[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs)
return D_app, D_test, edit_cost


def compute_D_random(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
import numpy as np
edit_costs = np.random.rand(6)
return compute_D(G_app, edit_costs, G_test, ed_method=ed_method, **kwargs)


def compute_D_expert(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
edit_cost = [3, 3, 1, 3, 3, 1]
return compute_D(G_app, edit_cost, G_test, ed_method=ed_method, **kwargs)


def compute_D_fitted(G_app, y_app, G_test=None, y_distance=euclid_d,
mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs):
from gklearn.ged.model.optim_costs import compute_optimal_costs

costs_optim = compute_optimal_costs(
G_app, y_app, y_distance=y_distance,
mode=mode, unlabeled=unlabeled, ed_method=ed_method, **kwargs)
return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs)


def compute_D_GH2020(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
from gklearn.ged.model.optim_costs import get_optimal_costs_GH2020
costs_optim = get_optimal_costs_GH2020(**kwargs)
return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs)
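
An illustrative sketch of calling these helpers, assuming the GEDLIB bindings used by gklearn.ged.util are built; the two toy graphs and the 'atom' label name are placeholders.

import networkx as nx
from gklearn.ged.model.ged_com import compute_D_expert, compute_D_fitted

# Two toy node-labeled graphs.
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1)
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
g2.add_edges_from([(0, 1), (1, 2)])

# Pairwise GED matrix with the expert edit costs [3, 3, 1, 3, 3, 1].
D_app, edit_cost = compute_D_expert([g1, g2], ed_method='BIPARTITE',
                                    node_labels=['atom'])
# With targets y, the costs can be fitted first instead:
# D_app, edit_cost = compute_D_fitted([g1, g2], y, mode='reg')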

+724 -0  gklearn/ged/model/ged_model.py

@@ -0,0 +1,724 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 5 09:42:30 2022

@author: ljia
"""
import sys
import multiprocessing
import time
import numpy as np
import networkx as nx

# from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator # , TransformerMixin
from sklearn.utils.validation import check_is_fitted # check_X_y, check_array,
from sklearn.exceptions import NotFittedError

from gklearn.ged.model.distances import euclid_d
from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
# from gklearn.utils import normalize_gram_matrix
from gklearn.utils import get_iters


class GEDModel(BaseEstimator): #, ABC):
"""The graph edit distance model class compatible with `scikit-learn`.

Attributes
----------
_graphs : list
Stores the input graphs on fit input data.
Default format of the list objects is `NetworkX` graphs.
**We don't guarantee that the input graphs remain unchanged during the
computation.**

References
----------
https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
"""

def __init__(self,
ed_method='BIPARTITE',
edit_cost_fun='CONSTANT',
init_edit_cost_constants=[3, 3, 1, 3, 3, 1],
optim_method='init',
optim_options={'y_distance': euclid_d, 'mode': 'reg'},
node_labels=[],
edge_labels=[],
parallel=None,
n_jobs=None,
chunksize=None,
# normalize=True,
copy_graphs=True, # make sure it is a full deep copy. and faster!
verbose=2):
"""`__init__` for `GEDModel` object."""
# @todo: the default settings of the parameters are different from those in the self.compute method.
# self._graphs = None
self.ed_method = ed_method
self.edit_cost_fun = edit_cost_fun
self.init_edit_cost_constants = init_edit_cost_constants
self.optim_method=optim_method
self.optim_options=optim_options
self.node_labels=node_labels
self.edge_labels=edge_labels
self.parallel = parallel
self.n_jobs = n_jobs
self.chunksize = chunksize
# self.normalize = normalize
self.copy_graphs = copy_graphs
self.verbose = verbose
# self._run_time = 0
# self._gram_matrix = None
# self._gram_matrix_unnorm = None


##########################################################################
# The following is the 1st paradigm to compute GED distance matrix, which is
# compatible with `scikit-learn`.
##########################################################################


def fit(self, X, y=None):
"""Fit a graph dataset for a transformer.

Parameters
----------
X : iterable
DESCRIPTION.

y : None, optional
There is no need of a target in a transformer, yet the `scikit-learn`
pipeline API requires this parameter.

Returns
-------
object
Returns self.

"""
# self._is_tranformed = False

# Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used;
self.clear_attributes()

# Validate parameters for the transformer.
self.validate_parameters()

# Validate the input.
self._graphs = self.validate_input(X)
if y is not None:
self._targets = y
# self._targets = self.validate_input(y)

# self._X = X
# self._kernel = self._get_kernel_instance()

# Return the transformer.
return self


def transform(self, X=None, return_dm_train=False):
"""Compute the graph kernel matrix between given and fitted data.

Parameters
----------
X : TYPE
DESCRIPTION.

Raises
------
ValueError
DESCRIPTION.

Returns
-------
None.

"""
# If `return_dm_train`, return the fitted GED distance matrix of training data.
if return_dm_train:
check_is_fitted(self, '_dm_train')
self._is_transformed = True
return self._dm_train # @todo: copy or not?

# Check if method "fit" had been called.
check_is_fitted(self, '_graphs')

# Validate the input.
Y = self.validate_input(X)

# Transform: compute the distance matrix.
dis_matrix = self.compute_distance_matrix(Y)
self._Y = Y

# Self transform must appear before the diagonal call on normalization.
self._is_transformed = True
# if self.normalize:
# X_diag, Y_diag = self.diagonals()
# old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt.
# try:
# kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag))
# except:
# raise
# finally:
# np.seterr(**old_settings)

return dis_matrix


def fit_transform(self, X, y=None, save_dm_train=False):
"""Fit and transform: compute GED distance matrix on the same data.

Parameters
----------
X : list of graphs
Input graphs.

Returns
-------
dis_matrix : numpy array, shape = [len(X), len(X)]
The distance matrix of X.

"""
self.fit(X, y)

# Compute edit cost constants.
self.compute_edit_costs()

# Transform: compute the distance matrix.
dis_matrix = self.compute_distance_matrix()

# # Normalize.
# if self.normalize:
# self._X_diag = np.diagonal(gram_matrix).copy()
# old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt.
# try:
# gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag))
# except:
# raise
# finally:
# np.seterr(**old_settings)

if save_dm_train:
self._dm_train = dis_matrix

return dis_matrix


def get_params(self):
pass


def set_params(self):
pass


def clear_attributes(self): # @todo: update
# if hasattr(self, '_X_diag'):
# delattr(self, '_X_diag')
if hasattr(self, '_graphs'):
delattr(self, '_graphs')
if hasattr(self, '_Y'):
delattr(self, '_Y')
if hasattr(self, '_run_time'):
delattr(self, '_run_time')


def validate_parameters(self):
"""Validate all parameters for the transformer.

Returns
-------
None.

"""
if self.parallel is not None and self.parallel != 'imap_unordered':
raise ValueError('Parallel mode is not set correctly.')

if self.parallel == 'imap_unordered' and self.n_jobs is None:
self.n_jobs = multiprocessing.cpu_count()


def validate_input(self, X):
"""Validate the given input and raise errors if it is invalid.

Parameters
----------
X : list
The input to check. Should be a list of graph.

Raises
------
ValueError
Raise if the input is not correct.

Returns
-------
X : list
The input. A list of graph.

"""
if X is None:
raise ValueError('Please add graphs before computing.')
elif not isinstance(X, list):
raise ValueError('Cannot detect graphs. The input must be a list.')
elif len(X) == 0:
raise ValueError('The graph list given is empty. No computation will be performed.')

return X


def compute_distance_matrix(self, Y=None):
"""Compute the distance matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) or the distance matrix for the fitted
graphs (X / self._graphs).

Parameters
----------
Y : list of graphs, optional
The target graphs. The default is None. If None, the distance matrix is
computed on X itself.

Returns
-------
dis_matrix : numpy array, shape = [n_targets, n_inputs]
The computed distance matrix.

"""
if Y is None:
# Compute the distance matrix for self._graphs (X).
dis_matrix = self._compute_X_distance_matrix()
# self._gram_matrix_unnorm = np.copy(self._gram_matrix)

else:
# Compute the distance matrix between Y and self._graphs (X).
start_time = time.time()

if self.parallel == 'imap_unordered':
dis_matrix = self._compute_distance_matrix_imap_unordered(Y)

elif self.parallel is None:
Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y)
graphs_copy = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
dis_matrix = self._compute_distance_matrix_series(Y_copy, graphs_copy)

self._run_time = time.time() - start_time
if self.verbose:
print('Distance matrix of size (%d, %d) built in %s seconds.'
% (len(Y), len(self._graphs), self._run_time))

return dis_matrix


def _compute_distance_matrix_series(self, X, Y):
"""Compute the GED distance matrix between two sets of graphs (X and Y)
without parallelization.

Parameters
----------
X, Y : list of graphs
The input graphs.

Returns
-------
dis_matrix : numpy array, shape = [n_X, n_Y]
The computed distance matrix.

"""
dis_matrix = np.zeros((len(X), len(Y)))

for i_x, g_x in enumerate(X):
for i_y, g_y in enumerate(Y):
dis_matrix[i_x, i_y], _ = self.compute_ged(g_x, g_y)

return dis_matrix


def _compute_distance_matrix_imap_unordered(self, Y):
"""Compute the distance matrix between given target graphs (Y) and
the fitted graphs (X / self._graphs) using imap unordered parallelization.

Parameters
----------
Y : list of graphs, optional
The target graphs.

Returns
-------
dis_matrix : numpy array, shape = [n_targets, n_inputs]
The computed distance matrix.

"""
raise Exception('Parallelization of the distance matrix is not implemented.')


def diagonals(self):
"""Compute the kernel matrix diagonals of the fit/transformed data.

Returns
-------
X_diag : numpy array
The diagonal of the kernel matrix between the fitted data.
This consists of each element calculated with itself.

Y_diag : numpy array
The diagonal of the kernel matrix, of the transform.
This consists of each element calculated with itself.

"""
# Check if method "fit" had been called.
check_is_fitted(self, ['_graphs'])

# Check if the diagonals of X exist.
try:
check_is_fitted(self, ['_X_diag'])
except NotFittedError:
# Compute diagonals of X.
self._X_diag = np.empty(shape=(len(self._graphs),))
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
for i, x in enumerate(graphs):
self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel?

try:
# If transform has happened, return both diagonals.
check_is_fitted(self, ['_Y'])
self._Y_diag = np.empty(shape=(len(self._Y),))
Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y)
for (i, y) in enumerate(Y):
self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel?

return self._X_diag, self._Y_diag
except NotFittedError:
# Else just return both X_diag
return self._X_diag


# @abstractmethod
def pairwise_distance(self, x, y):
"""Compute pairwise kernel between two graphs.

Parameters
----------
x, y : NetworkX Graph.
Graphs between which the distance is computed.

Returns
-------
distance: float
The computed distance.

# Notes
# -----
# This method is abstract and must be implemented by a subclass.

"""
raise NotImplementedError('Pairwise distance computation is not implemented!')



def compute_edit_costs(self, Y=None, Y_targets=None):
"""Compute edit cost constants. When optimizing method is `fiited`,
apply Jia2021's metric learning method by using a given target graphs (Y)
the fitted graphs (X / self._graphs).

Parameters
----------
Y : TYPE, optional
DESCRIPTION. The default is None.

Returns
-------
None.

"""
# Get or compute.
if self.optim_method == 'random':
self._edit_cost_constants = np.random.rand(6)

elif self.optim_method == 'init':
self._edit_cost_constants = self.init_edit_cost_constants


elif self.optim_method == 'expert':
self._edit_cost_constants = [3, 3, 1, 3, 3, 1]


elif self.optim_method == 'fitted': # Jia2021 method
# Get proper inputs.
if Y is None:
check_is_fitted(self, ['_graphs'])
check_is_fitted(self, ['_targets'])
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
targets = self._targets
else:
graphs = ([g.copy() for g in Y] if self.copy_graphs else Y)
targets = Y_targets

# Get optimization options.
node_labels = self.node_labels
edge_labels = self.edge_labels
unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0)
from gklearn.ged.model.optim_costs import compute_optimal_costs
self._edit_cost_constants = compute_optimal_costs(
graphs, targets,
node_labels=node_labels, edge_labels=edge_labels,
unlabeled=unlabeled, ed_method=self.ed_method,
verbose=(self.verbose >= 2),
**self.optim_options)


##########################################################################
# The following is the 2nd paradigm to compute kernel matrix. It is
# simplified and not compatible with `scikit-learn`.
##########################################################################


# def compute(self, *graphs, **kwargs):
# self.parallel = kwargs.get('parallel', 'imap_unordered')
# self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
# self.normalize = kwargs.get('normalize', True)
# self.verbose = kwargs.get('verbose', 2)
# self.copy_graphs = kwargs.get('copy_graphs', True)
# self.save_unnormed = kwargs.get('save_unnormed', True)
# self.validate_parameters()

# # If the inputs is a list of graphs.
# if len(graphs) == 1:
# if not isinstance(graphs[0], list):
# raise Exception('Cannot detect graphs.')
# elif len(graphs[0]) == 0:
# raise Exception('The graph list given is empty. No computation was performed.')
# else:
# if self.copy_graphs:
# self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow.
# else:
# self._graphs = graphs
# self._gram_matrix = self._compute_gram_matrix()

# if self.save_unnormed:
# self._gram_matrix_unnorm = np.copy(self._gram_matrix)
# if self.normalize:
# self._gram_matrix = normalize_gram_matrix(self._gram_matrix)
# return self._gram_matrix, self._run_time

# elif len(graphs) == 2:
# # If the inputs are two graphs.
# if self.is_graph(graphs[0]) and self.is_graph(graphs[1]):
# if self.copy_graphs:
# G0, G1 = graphs[0].copy(), graphs[1].copy()
# else:
# G0, G1 = graphs[0], graphs[1]
# kernel = self._compute_single_kernel(G0, G1)
# return kernel, self._run_time

# # If the inputs are a graph and a list of graphs.
# elif self.is_graph(graphs[0]) and isinstance(graphs[1], list):
# if self.copy_graphs:
# g1 = graphs[0].copy()
# g_list = [g.copy() for g in graphs[1]]
# kernel_list = self._compute_kernel_list(g1, g_list)
# else:
# kernel_list = self._compute_kernel_list(graphs[0], graphs[1])
# return kernel_list, self._run_time

# elif isinstance(graphs[0], list) and self.is_graph(graphs[1]):
# if self.copy_graphs:
# g1 = graphs[1].copy()
# g_list = [g.copy() for g in graphs[0]]
# kernel_list = self._compute_kernel_list(g1, g_list)
# else:
# kernel_list = self._compute_kernel_list(graphs[1], graphs[0])
# return kernel_list, self._run_time

# else:
# raise Exception('Cannot detect graphs.')

# elif len(graphs) == 0 and self._graphs is None:
# raise Exception('Please add graphs before computing.')

# else:
# raise Exception('Cannot detect graphs.')


# def normalize_gm(self, gram_matrix):
# import warnings
# warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning)

# diag = gram_matrix.diagonal().copy()
# for i in range(len(gram_matrix)):
# for j in range(i, len(gram_matrix)):
# gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
# gram_matrix[j][i] = gram_matrix[i][j]
# return gram_matrix


# def compute_distance_matrix(self):
# if self._gram_matrix is None:
# raise Exception('Please compute the Gram matrix before computing distance matrix.')
# dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix)))
# for i in range(len(self._gram_matrix)):
# for j in range(i, len(self._gram_matrix)):
# dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j]
# if dis < 0:
# if dis > -1e-10:
# dis = 0
# else:
# raise ValueError('The distance is negative.')
# dis_mat[i, j] = np.sqrt(dis)
# dis_mat[j, i] = dis_mat[i, j]
# dis_max = np.max(np.max(dis_mat))
# dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
# dis_mean = np.mean(np.mean(dis_mat))
# return dis_mat, dis_max, dis_min, dis_mean


def _compute_X_distance_matrix(self):
start_time = time.time()

if self.parallel == 'imap_unordered':
dis_matrix = self._compute_X_dm_imap_unordered()
elif self.parallel is None:
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
dis_matrix = self._compute_X_dm_series(graphs)
else:
raise Exception('Parallel mode is not set correctly.')

self._run_time = time.time() - start_time
if self.verbose:
print('Distance matrix of size %d built in %s seconds.'
% (len(self._graphs), self._run_time))

return dis_matrix


def _compute_X_dm_series(self, graphs):
N = len(graphs)
dis_matrix = np.zeros((N, N))

for i, G1 in get_iters(enumerate(graphs), desc='Computing distance matrix', file=sys.stdout, verbose=(self.verbose >= 2)):
for j, G2 in enumerate(graphs[i+1:], i+1):
dis_matrix[i, j], _ = self.compute_ged(G1, G2)
dis_matrix[j, i] = dis_matrix[i, j]
return dis_matrix


def _compute_X_dm_imap_unordered(self, graphs):
pass


def compute_ged(self, Gi, Gj, **kwargs):
"""
Compute the GED between two graphs according to edit_cost.
"""
ged_options = {'edit_cost': self.edit_cost_fun,
'method': self.ed_method,
'edit_cost_constants': self._edit_cost_constants}
dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10)
n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward,
edit_cost=self.edit_cost_fun,
node_labels=self.node_labels,
edge_labels=self.edge_labels)
return dis, n_eo_tmp


# def _compute_kernel_list(self, g1, g_list):
# start_time = time.time()

# if self.parallel == 'imap_unordered':
# kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list)
# elif self.parallel is None:
# kernel_list = self._compute_kernel_list_series(g1, g_list)
# else:
# raise Exception('Parallel mode is not set correctly.')

# self._run_time = time.time() - start_time
# if self.verbose:
# print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.'
# % (len(g_list), self._run_time))

# return kernel_list


# def _compute_kernel_list_series(self, g1, g_list):
# pass


# def _compute_kernel_list_imap_unordered(self, g1, g_list):
# pass


# def _compute_single_kernel(self, g1, g2):
# start_time = time.time()

# kernel = self._compute_single_kernel_series(g1, g2)

# self._run_time = time.time() - start_time
# if self.verbose:
# print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time))

# return kernel


# def _compute_single_kernel_series(self, g1, g2):
# pass


def is_graph(self, graph):
if isinstance(graph, nx.Graph):
return True
if isinstance(graph, nx.DiGraph):
return True
if isinstance(graph, nx.MultiGraph):
return True
if isinstance(graph, nx.MultiDiGraph):
return True
return False


@property
def graphs(self):
return self._graphs


# @property
# def parallel(self):
# return self.parallel


# @property
# def n_jobs(self):
# return self.n_jobs


# @property
# def verbose(self):
# return self.verbose


# @property
# def normalize(self):
# return self.normalize


@property
def run_time(self):
return self._run_time


@property
def dis_matrix(self):
return self._dis_matrix

@dis_matrix.setter
def dis_matrix(self, value):
self._dis_matrix = value


# @property
# def gram_matrix_unnorm(self):
# return self._gram_matrix_unnorm

# @gram_matrix_unnorm.setter
# def gram_matrix_unnorm(self, value):
# self._gram_matrix_unnorm = value
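
An illustrative scikit-learn-style sketch of the new GEDModel class, assuming the GEDLIB bindings are available; G_train, G_test and the 'atom' label name are placeholders.

from gklearn.ged import GEDModel  # exported in gklearn/ged/__init__.py above

model = GEDModel(ed_method='BIPARTITE',
                 edit_cost_fun='CONSTANT',
                 optim_method='expert',   # or 'init', 'random', 'fitted'
                 node_labels=['atom'], edge_labels=[],
                 parallel=None, verbose=0)

D_train = model.fit_transform(G_train)    # (n_train, n_train) GED matrix
D_test = model.transform(G_test)          # (n_test, n_train) GED matrix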

+149 -0  gklearn/ged/model/optim_costs.py

@@ -0,0 +1,149 @@
import numpy as np

from gklearn.ged.model.distances import sum_squares, euclid_d
from gklearn.ged.model.ged_com import compute_geds


def optimize_costs_unlabeled(nb_cost_mat, dis_k_vec):
"""
Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat
! take care that nb_cost_mat does not contain zero rows
:param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graphs
:param dis_k_vec: The N distances to fit
"""
import cvxpy as cp
import numpy as np
MAX_SAMPLE = 1000
nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]] for x in nb_cost_mat])
dis_k_vec = np.array(dis_k_vec)
# dis_k_vec_norm = dis_k_vec/np.max(dis_k_vec)

# import pickle
# pickle.dump([nb_cost_mat, dis_k_vec], open('debug', 'wb'))
N = nb_cost_mat_m.shape[0]
sub_sample = np.random.permutation(np.arange(N))
sub_sample = sub_sample[:MAX_SAMPLE]

x = cp.Variable(nb_cost_mat_m.shape[1])
cost = cp.sum_squares((nb_cost_mat_m[sub_sample, :] @ x) - dis_k_vec[sub_sample])
prob = cp.Problem(cp.Minimize(cost), [x >= 0])
prob.solve()
edit_costs_new = [x.value[0], x.value[1], 0, x.value[2], x.value[3], 0]
edit_costs_new = [xi if xi > 0 else 0 for xi in edit_costs_new]
residual = prob.value
return edit_costs_new, residual


def optimize_costs_classif_unlabeled(nb_cost_mat, Y):
"""
Optimize edit costs to fit dis_k_vec according to edit operations in
nb_cost_mat
! take care that nb_cost_mat does not contain zero rows
:param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit
operations for each pair of graphs
:param Y: {-1,1}^N vector of common classes
"""
# import cvxpy as cp
from ml import reg_log
# import pickle
# pickle.dump([nb_cost_mat, Y], open('debug', 'wb'))
nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]]
for x in nb_cost_mat])
w, J, _ = reg_log(nb_cost_mat_m, Y, pos_contraint=True)
edit_costs_new = [w[0], w[1], 0, w[2], w[3], 0]
residual = J[-1]

return edit_costs_new, residual


def optimize_costs_classif(nb_cost_mat, Y):
"""
Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat
! take care that nb_cost_mat does not contain zero rows
:param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graphs
:param Y: {-1,1}^N vector of common classes
"""
#import pickle
# pickle.dump([nb_cost_mat, Y], open("test.pickle", "wb"))
from ml import reg_log
w, J, _ = reg_log(nb_cost_mat, Y, pos_contraint=True)
return w, J[-1]


def optimize_costs(nb_cost_mat, dis_k_vec):
"""
Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat
! take care that nb_cost_mat does not contain zero rows
:param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graphs
:param dis_k_vec: The N distances to fit
"""
import cvxpy as cp
x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares((nb_cost_mat @ x) - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost), constraints)
prob.solve()
edit_costs_new = x.value
residual = prob.value

return edit_costs_new, residual


def compute_optimal_costs(G, y, init_costs=[3, 3, 1, 3, 3, 1],
y_distance=euclid_d,
mode='reg', unlabeled=False,
ed_method='BIPARTITE',
verbose=True,
**kwargs):
N = len(y)

G_pairs = []
distances_vec = []

for i in range(N):
for j in range(i+1, N):
G_pairs.append([i, j])
distances_vec.append(y_distance(y[i], y[j]))
ged_vec_init, n_edit_operations = compute_geds(G_pairs, G, init_costs, ed_method,
verbose=verbose, **kwargs)

residual_list = [sum_squares(ged_vec_init, distances_vec)]

if (mode == 'reg'):
if unlabeled:
method_optim = optimize_costs_unlabeled
else:
method_optim = optimize_costs

elif (mode == 'classif'):
if unlabeled:
method_optim = optimize_costs_classif_unlabeled
else:
method_optim = optimize_costs_classif

ite_max = 5
for i in range(ite_max):
if verbose:
print('ite', i + 1, '/', ite_max, ':')
# compute GEDs and numbers of edit operations.
edit_costs_new, residual = method_optim(
np.array(n_edit_operations), distances_vec)
ged_vec, n_edit_operations = compute_geds(G_pairs, G, edit_costs_new, ed_method,
verbose=verbose, **kwargs)
residual_list.append(sum_squares(ged_vec, distances_vec))

return edit_costs_new


def get_optimal_costs_GH2020(**kwargs):
import pickle
import os
dir_root = 'cj/output/'
ds_name = kwargs.get('ds_name')
nb_trial = kwargs.get('nb_trial')
file_name = os.path.join(dir_root, 'costs.' + ds_name + '.' + str(nb_trial) + '.pkl')
with open(file_name, 'rb') as f:
edit_costs = pickle.load(f)
return edit_costs
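
A small self-contained sketch of optimize_costs on synthetic data (requires cvxpy); the operation counts and target distances below are fabricated for illustration only.

import numpy as np
from gklearn.ged.model.optim_costs import optimize_costs

rng = np.random.default_rng(0)
# Fake counts of the 6 edit operations for 50 graph pairs, plus noisy target
# distances generated from known costs.
n_edit_ops = rng.integers(0, 5, size=(50, 6)).astype(float)
true_costs = np.array([3., 3., 1., 3., 3., 1.])
target_dis = n_edit_ops @ true_costs + rng.normal(0.0, 0.1, 50)

costs, residual = optimize_costs(n_edit_ops, target_dis)
# `costs` should land close to `true_costs`; `residual` is the remaining
# sum of squared errors of the fit.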

+179 -31  gklearn/ged/util/util.py

@@ -64,10 +64,12 @@ def pairwise_ged(g1, g2, options={}, sort=True, repeats=1, parallel=False, verbo
g = listID[0]
h = listID[1]
dis_min = np.inf
# print('------------------------------------------')
for i in range(0, repeats):
ged_env.run_method(g, h)
upper = ged_env.get_upper_bound(g, h)
dis = upper
# print(dis)
if dis < dis_min:
dis_min = dis
pi_forward = ged_env.get_forward_map(g, h)
@@ -169,12 +171,100 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
return ged_vec, ged_mat, n_edit_operations


def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, n_jobs=None, verbose=True):
#%%


def compute_geds(graphs,
options={},
sort=True,
repeats=1,
permute_nodes=False,
random_state=None,
parallel=False,
n_jobs=None,
verbose=True):
"""Compute graph edit distance matrix using GEDLIB.
"""
if permute_nodes:
return _compute_geds_with_permutation(graphs,
options=options,
sort=sort,
repeats=repeats,
random_state=random_state,
parallel=parallel,
n_jobs=n_jobs,
verbose=verbose)
else:
return _compute_geds_without_permutation(graphs,
options=options,
sort=sort,
repeats=repeats,
parallel=parallel,
n_jobs=n_jobs,
verbose=verbose)


#%%


def _compute_geds_with_permutation(graphs,
options={},
sort=True,
repeats=1,
random_state=None,
parallel=False,
n_jobs=None,
verbose=True):

from gklearn.utils.utils import nx_permute_nodes

# Initialize variables.
ged_mat_optim = np.full((len(graphs), len(graphs)), np.inf)
np.fill_diagonal(ged_mat_optim, 0)
len_itr = int(len(graphs) * (len(graphs) - 1) / 2)
ged_vec = [0] * len_itr
n_edit_operations = [0] * len_itr

# For each repeat:
for i in range(0, repeats):
# Permute nodes.
graphs_pmut = [nx_permute_nodes(g, random_state=random_state) for g in graphs]

out = _compute_geds_without_permutation(graphs_pmut,
options=options,
sort=sort,
repeats=1,
parallel=parallel,
n_jobs=n_jobs,
verbose=verbose)

# Compare current results with the best one.
idx_cnt = 0
for i in range(len(graphs)):
for j in range(i + 1, len(graphs)):
if out[1][i, j] < ged_mat_optim[i ,j]:
ged_mat_optim[i, j] = out[1][i, j]
ged_mat_optim[j, i] = out[1][j, i]
ged_vec[idx_cnt] = out[0][idx_cnt]
n_edit_operations[idx_cnt] = out[2][idx_cnt]
idx_cnt += 1

return ged_vec, ged_mat_optim, n_edit_operations


def _compute_geds_without_permutation(graphs,
options={},
sort=True,
repeats=1,
parallel=False,
n_jobs=None,
verbose=True):
from gklearn.gedlib import librariesImport, gedlibpy

# initialize ged env.
ged_env = gedlibpy.GEDEnv()
ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants'])

for g in graphs:
ged_env.add_nx_graph(g, '')
listID = ged_env.get_all_graph_ids()
@@ -266,6 +356,11 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats):
dis = upper

# make the map label correct (label remove map as np.inf)
# Attention: using node indices instead of NetworkX node labels (as
# implemented here) may cause several issues:
# - Fail if NetworkX node labels are not consecutive integers;
# - Return wrong mappings if nodes are permuted (e.g., by using
# `gklearn.utils.utils.nx_permute_nodes()`).
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
@@ -278,46 +373,57 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats):
pi_forward_min = pi_forward
pi_backward_min = pi_backward

# print('-----')
# print(pi_forward_min)
# print(pi_backward_min)

return dis_min, pi_forward_min, pi_backward_min


def label_costs_to_matrix(costs, nb_labels):
"""Reform a label cost vector to a matrix.
#%%


def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
"""Calculate the numbers of the occurence of each edit operation in a given
edit path.

Parameters
----------
costs : numpy.array
The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs.
nb_labels : integer
Number of labels.
g1 : TYPE
DESCRIPTION.
g2 : TYPE
DESCRIPTION.
forward_map : TYPE
DESCRIPTION.
backward_map : TYPE
DESCRIPTION.
edit_cost : TYPE, optional
DESCRIPTION. The default is None.
is_cml : TYPE, optional
DESCRIPTION. The default is False.
**kwargs : TYPE
DESCRIPTION.

Raises
------
Exception
DESCRIPTION.

Returns
-------
cost_matrix : numpy.array.
The reformed label cost matrix of size (nb_labels + 1, nb_labels + 1). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
TYPE
DESCRIPTION.

Notes
-----
Attention: when implementing a function to get the numbers of edit
operations, make sure that:
- It does not fail if NetworkX node labels are not consecutive integers;
- It returns correct results if nodes are permuted (e.g., by using
`gklearn.utils.utils.nx_permute_nodes()`).
Generally speaking, it means you need to distinguish the NetworkX label of
a node from the position (index) of that node in the node list.
"""
# Initialize label cost matrix.
cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
i = 0
# Costs of insertions.
for col in range(1, nb_labels + 1):
cost_matrix[0, col] = costs[i]
i += 1
# Costs of deletions.
for row in range(1, nb_labels + 1):
cost_matrix[row, 0] = costs[i]
i += 1
# Costs of substitutions.
for row in range(1, nb_labels + 1):
for col in range(row + 1, nb_labels + 1):
cost_matrix[row, col] = costs[i]
cost_matrix[col, row] = costs[i]
i += 1

return cost_matrix


def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
if is_cml:
if edit_cost == 'CONSTANT':
node_labels = kwargs.get('node_labels', [])
@@ -611,6 +717,48 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es


#%%


def label_costs_to_matrix(costs, nb_labels):
"""Reform a label cost vector to a matrix.

Parameters
----------
costs : numpy.array
The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs.
nb_labels : integer
Number of labels.

Returns
-------
cost_matrix : numpy.array.
The reformed label cost matrix of size (nb_labels + 1, nb_labels + 1). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
"""
# Initialize label cost matrix.
cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
i = 0
# Costs of insertions.
for col in range(1, nb_labels + 1):
cost_matrix[0, col] = costs[i]
i += 1
# Costs of deletions.
for row in range(1, nb_labels + 1):
cost_matrix[row, 0] = costs[i]
i += 1
# Costs of substitutions.
for row in range(1, nb_labels + 1):
for col in range(row + 1, nb_labels + 1):
cost_matrix[row, col] = costs[i]
cost_matrix[col, row] = costs[i]
i += 1

return cost_matrix


#%%


def ged_options_to_string(options):
opt_str = ' '
for key, val in options.items():


+39 -20  gklearn/kernels/graph_kernel.py

@@ -32,7 +32,13 @@ class GraphKernel(BaseEstimator): #, ABC):
https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
"""

def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2):
def __init__(self,
parallel=None,
n_jobs=None,
chunksize=None,
normalize=True,
copy_graphs=True, # make sure it is a full deep copy. and faster!
verbose=2):
"""`__init__` for `GraphKernel` object."""
# @todo: the default settings of the parameters are different from those in the self.compute method.
# self._graphs = None
@@ -40,6 +46,7 @@ class GraphKernel(BaseEstimator): #, ABC):
self.n_jobs = n_jobs
self.chunksize = chunksize
self.normalize = normalize
self.copy_graphs = copy_graphs
self.verbose = verbose
# self._run_time = 0
# self._gram_matrix = None
@@ -90,7 +97,7 @@ class GraphKernel(BaseEstimator): #, ABC):
return self


def transform(self, X):
def transform(self, X=None, load_gm_train=False):
"""Compute the graph kernel matrix between given and fitted data.

Parameters
@@ -108,6 +115,12 @@ class GraphKernel(BaseEstimator): #, ABC):
None.

"""
# If `load_gm_train`, load Gram matrix of training data.
if load_gm_train:
check_is_fitted(self, '_gm_train')
self._is_transformed = True
return self._gm_train # @todo: copy or not?

# Check if method "fit" had been called.
check_is_fitted(self, '_graphs')

@@ -133,8 +146,7 @@ class GraphKernel(BaseEstimator): #, ABC):
return kernel_matrix



def fit_transform(self, X):
def fit_transform(self, X, save_gm_train=False):
"""Fit and transform: compute Gram matrix on the same data.

Parameters
@@ -164,6 +176,9 @@ class GraphKernel(BaseEstimator): #, ABC):
finally:
np.seterr(**old_settings)

if save_gm_train:
self._gm_train = gram_matrix

return gram_matrix


@@ -260,7 +275,9 @@ class GraphKernel(BaseEstimator): #, ABC):
kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y)

elif self.parallel is None:
kernel_matrix = self._compute_kernel_matrix_series(Y)
Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y)
graphs_copy = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
kernel_matrix = self._compute_kernel_matrix_series(Y_copy, graphs_copy)

self._run_time = time.time() - start_time
if self.verbose:
@@ -270,26 +287,25 @@ class GraphKernel(BaseEstimator): #, ABC):
return kernel_matrix


def _compute_kernel_matrix_series(self, Y):
"""Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) without parallelization.
def _compute_kernel_matrix_series(self, X, Y):
"""Compute the kernel matrix between two sets of graphs (X and Y) without parallelization.

Parameters
----------
Y : list of graphs, optional
The target graphs.
X, Y : list of graphs
The input graphs.

Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
kernel_matrix : numpy array, shape = [n_X, n_Y]
The computed kernel matrix.

"""
kernel_matrix = np.zeros((len(Y), len(self._graphs)))
kernel_matrix = np.zeros((len(X), len(Y)))

for i_y, g_y in enumerate(Y):
for i_x, g_x in enumerate(self._graphs):
kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x)
for i_x, g_x in enumerate(X):
for i_y, g_y in enumerate(Y):
kernel_matrix[i_x, i_y] = self.pairwise_kernel(g_x, g_y)

return kernel_matrix

@@ -335,14 +351,16 @@ class GraphKernel(BaseEstimator): #, ABC):
except NotFittedError:
# Compute diagonals of X.
self._X_diag = np.empty(shape=(len(self._graphs),))
for i, x in enumerate(self._graphs):
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
for i, x in enumerate(graphs):
self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel?

try:
# If transform has happened, return both diagonals.
check_is_fitted(self, ['_Y'])
self._Y_diag = np.empty(shape=(len(self._Y),))
for (i, y) in enumerate(self._Y):
Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y)
for (i, y) in enumerate(Y):
self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel?

return self._X_diag, self._Y_diag
@@ -484,7 +502,8 @@ class GraphKernel(BaseEstimator): #, ABC):
if self.parallel == 'imap_unordered':
gram_matrix = self._compute_gm_imap_unordered()
elif self.parallel is None:
gram_matrix = self._compute_gm_series()
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
gram_matrix = self._compute_gm_series(graphs)
else:
raise Exception('Parallel mode is not set correctly.')

@@ -496,11 +515,11 @@ class GraphKernel(BaseEstimator): #, ABC):
return gram_matrix


def _compute_gm_series(self):
def _compute_gm_series(self, graphs):
pass


def _compute_gm_imap_unordered(self):
def _compute_gm_imap_unordered(self, graphs):
pass
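
To illustrate the new signature, a hedged sketch of a subclass whose `_compute_gm_series` receives the (optionally deep-copied) graph list instead of reading `self._graphs`; the class and its trivial pairwise kernel are invented for illustration only.

import numpy as np
from gklearn.kernels import GraphKernel

class ToyKernel(GraphKernel):
    # Illustrative subclass only: the "kernel" is the smaller node count.
    def pairwise_kernel(self, g1, g2):
        return min(g1.number_of_nodes(), g2.number_of_nodes())

    def _compute_gm_series(self, graphs):
        # `graphs` is already a copy when copy_graphs=True, so in-place
        # preprocessing here no longer mutates the caller's graphs.
        n = len(graphs)
        gram = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                gram[i, j] = gram[j, i] = self.pairwise_kernel(graphs[i], graphs[j])
        return gram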




+ 39
- 27
gklearn/kernels/treelet.py View File

@@ -28,16 +28,16 @@ from gklearn.kernels import GraphKernel

class Treelet(GraphKernel):

def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs):
def __init__(self, **kwargs):
"""Initialise a treelet kernel.
"""
super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose)
GraphKernel.__init__(self, **{k: kwargs.get(k) for k in ['parallel', 'n_jobs', 'chunksize', 'normalize', 'copy_graphs', 'verbose'] if k in kwargs})
self.node_labels = kwargs.get('node_labels', [])
self.edge_labels = kwargs.get('edge_labels', [])
self.sub_kernel = kwargs.get('sub_kernel', None)
self.ds_infos = kwargs.get('ds_infos', {})
self.precompute_canonkeys = precompute_canonkeys
self.save_canonkeys = save_canonkeys
self.precompute_canonkeys = kwargs.get('precompute_canonkeys', True)
self.save_canonkeys = kwargs.get('save_canonkeys', True)


##########################################################################
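
With every option now routed through **kwargs, a construction call might look as follows; the label name and the partially applied sub-kernel are assumptions for the example, not library defaults.

from functools import partial
from gklearn.kernels import Treelet
from gklearn.utils.kernels import gaussian_kernel

treelet = Treelet(
    node_labels=['atom'],                            # hypothetical label name
    edge_labels=[],
    sub_kernel=partial(gaussian_kernel, gamma=1.0),  # assumed sub-kernel choice
    parallel=None,                                   # GraphKernel-level options pass through **kwargs too
    normalize=True,
    copy_graphs=True,
    verbose=0,
)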
@@ -71,7 +71,7 @@ class Treelet(GraphKernel):
raise ValueError('Sub-kernel not set.')


def _compute_kernel_matrix_series(self, Y):
def _compute_kernel_matrix_series(self, Y, X=None, load_canonkeys=True):
"""Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) without parallelization.

@@ -86,36 +86,45 @@ class Treelet(GraphKernel):
The computed kernel matrix.

"""
if_comp_X_canonkeys = True

# if load saved canonkeys of X from the instance:
if load_canonkeys:
# Canonical keys for self._graphs.
try:
check_is_fitted(self, ['_canonkeys'])
canonkeys_list1 = self._canonkeys
if_comp_X_canonkeys = False
except NotFittedError:
import warnings
warnings.warn('The canonkeys of self._graphs are not computed/saved. The canonkeys of `X` are computed instead.')
if_comp_X_canonkeys = True

# self._add_dummy_labels will modify the input in place.
self._add_dummy_labels() # For self._graphs
# Y = [g.copy() for g in Y] # @todo: ?
self._add_dummy_labels(Y)

# get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset.

# Canonical keys for self._graphs.
try:
check_is_fitted(self, ['_canonkeys'])
canonkeys_list1 = self._canonkeys
except NotFittedError:
# Compute the canonical keys of X.
if if_comp_X_canonkeys:
if X is None:
raise ValueError('X cannot be None.')
# self._add_dummy_labels will modify the input in place.
self._add_dummy_labels(X) # for X
canonkeys_list1 = []
iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2))
iterator = get_iters(X, desc='Getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2))
for g in iterator:
canonkeys_list1.append(self._get_canonkeys(g))

if self.save_canonkeys:
self._canonkeys = canonkeys_list1

# Canonical keys for Y.
# Y = [g.copy() for g in Y] # @todo: ?
self._add_dummy_labels(Y)
canonkeys_list2 = []
iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2))
iterator = get_iters(Y, desc='Getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2))
for g in iterator:
canonkeys_list2.append(self._get_canonkeys(g))

if self.save_canonkeys:
self._Y_canonkeys = canonkeys_list2
# if self.save_canonkeys:
# self._Y_canonkeys = canonkeys_list2

# compute kernel matrix.
kernel_matrix = np.zeros((len(Y), len(canonkeys_list1)))
@@ -235,13 +244,13 @@ class Treelet(GraphKernel):
##########################################################################


def _compute_gm_series(self):
self._add_dummy_labels(self._graphs)
def _compute_gm_series(self, graphs):
self._add_dummy_labels(graphs)

# get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset.
canonkeys = []
iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout,
iterator = get_iters(graphs, desc='getting canonkeys', file=sys.stdout,
verbose=(self.verbose >= 2))
for g in iterator:
canonkeys.append(self._get_canonkeys(g))
@@ -250,11 +259,11 @@ class Treelet(GraphKernel):
self._canonkeys = canonkeys

# compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
gram_matrix = np.zeros((len(graphs), len(graphs)))

from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
itr = combinations_with_replacement(range(0, len(graphs)), 2)
len_itr = int(len(graphs) * (len(graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self.verbose >= 2))
for i, j in iterator:
@@ -390,6 +399,9 @@ class Treelet(GraphKernel):
Treelet kernel between 2 graphs.
"""
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs
if len(keys) == 0: # There is nothing in common...
return 0

vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys])
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys])
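
The intersection-and-early-return logic above can be isolated in a standalone sketch: only canonical keys present in both graphs contribute, and the sub-kernel is applied to the aligned count vectors (the dot-product sub-kernel below is an arbitrary choice for the example, not the library default).

import numpy as np

def toy_treelet_pairwise(canonkey1, canonkey2, sub_kernel):
    # Sketch of the pairwise treelet computation on two canonical-key count dicts.
    keys = set(canonkey1) & set(canonkey2)  # canonical keys shared by both graphs
    if len(keys) == 0:                      # nothing in common -> kernel value is 0
        return 0
    v1 = np.array([canonkey1[k] for k in keys])
    v2 = np.array([canonkey2[k] for k in keys])
    return sub_kernel(v1, v2)

print(toy_treelet_pairwise({'a': 2, 'b': 1}, {'a': 1, 'c': 4}, np.dot))  # only 'a' is shared -> 2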



+ 52
- 103
gklearn/kernels/weisfeiler_lehman.py View File

@@ -28,7 +28,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


def __init__(self, **kwargs):
GraphKernel.__init__(self)
GraphKernel.__init__(self, **{k: kwargs.get(k) for k in ['parallel', 'n_jobs', 'chunksize', 'normalize', 'copy_graphs', 'verbose'] if k in kwargs})
self.node_labels = kwargs.get('node_labels', [])
self.edge_labels = kwargs.get('edge_labels', [])
self.height = int(kwargs.get('height', 0))
@@ -50,7 +50,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
##########################################################################


def _compute_gm_series(self):
def _compute_gm_series(self, graphs):
# if self.verbose >= 2:
# import warnings
# warnings.warn('A part of the computation is parallelized.')
@@ -59,19 +59,19 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.

# for WL subtree kernel
if self._base_kernel == 'subtree':
gram_matrix = self._subtree_kernel_do(self._graphs)
gram_matrix = self._subtree_kernel_do(graphs)

# for WL shortest path kernel
elif self._base_kernel == 'sp':
gram_matrix = self._sp_kernel_do(self._graphs)
gram_matrix = self._sp_kernel_do(graphs)

# for WL edge kernel
elif self._base_kernel == 'edge':
gram_matrix = self._edge_kernel_do(self._graphs)
gram_matrix = self._edge_kernel_do(graphs)

# for user defined base kernel
else:
gram_matrix = self._user_kernel_do(self._graphs)
gram_matrix = self._user_kernel_do(graphs)

return gram_matrix

@@ -204,70 +204,13 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


def pairwise_kernel(self, g1, g2):
Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster!
kernel = 0

# initial for height = 0
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration

# for each graph
for G in Gn:
# set all labels into a tuple.
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way.
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
# get the set of original labels
labels_ori = list(nx.get_node_attributes(G, 'lt').values())
# number of occurence of each label in G
all_num_of_each_label.append(dict(Counter(labels_ori)))

# Compute subtree kernel with the 0th iteration and add it to the final kernel.
kernel = self._compute_kernel_itr(kernel, all_num_of_each_label)

# iterate each height
for h in range(1, self.height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
# all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
all_num_of_each_label = [] # number of occurence of each label in G

# @todo: parallel this part.
for G in Gn:

all_multisets = []
for node, attrs in G.nodes(data=True):
# Multiset-label determination.
multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]]
# sorting each multiset
multiset.sort()
multiset = [attrs['lt']] + multiset # add the prefix
all_multisets.append(tuple(multiset))

# label compression
set_unique = list(set(all_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# if a label occured before, assign its former compressed label,
# else assign the number of labels occured + 1 as the compressed label.
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed[value] = all_set_compressed[value]
else:
set_compressed[value] = str(num_of_labels_occured + 1)
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)

# relabel nodes
for idx, node in enumerate(G.nodes()):
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

# get the set of compressed labels
labels_comp = list(nx.get_node_attributes(G, 'lt').values())
# all_labels_ori.update(labels_comp)
all_num_of_each_label.append(dict(Counter(labels_comp)))
# Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster!
Gn = [g1, g2]
# for WL subtree kernel
if self._base_kernel == 'subtree':
kernel = self._subtree_kernel_do(Gn, return_mat=False)

# Compute subtree kernel with h iterations and add it to the final kernel
kernel = self._compute_kernel_itr(kernel, all_num_of_each_label)
# @todo: other subkernels.

return kernel
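
A hedged usage sketch of the simplified pairwise computation; the 'atom' label, the 'base_kernel' keyword name (inferred from the `self._base_kernel` attribute) and the assumption that `pairwise_kernel` can be called on an un-fitted instance are all assumptions for this example.

import networkx as nx
from gklearn.kernels import WeisfeilerLehman

g1, g2 = nx.path_graph(4), nx.star_graph(3)
for g in (g1, g2):
    nx.set_node_attributes(g, {n: ('C' if n % 2 else 'O') for n in g.nodes}, 'atom')  # hypothetical labels

wl = WeisfeilerLehman(node_labels=['atom'], height=2, base_kernel='subtree', verbose=0)
k12 = wl.pairwise_kernel(g1, g2)  # now delegates to _subtree_kernel_do(Gn, return_mat=False)
print(k12)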

@@ -291,7 +234,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
return kernel


def _subtree_kernel_do_nl(self, Gn):
def _subtree_kernel_do_nl(self, Gn, return_mat=True):
"""Compute Weisfeiler-Lehman kernels between graphs with node labels.

Parameters
@@ -301,10 +244,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.

Return
------
gram_matrix : Numpy matrix
kernel_matrix : Numpy matrix / float
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))
kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)

# initial for height = 0
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
@@ -324,7 +268,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
all_num_of_each_label.append(dict(Counter(labels_ori)))

# Compute subtree kernel with the 0th iteration and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

# iterate each height
for h in range(1, self.height + 1):
@@ -342,12 +286,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

return gram_matrix
return kernel_matrix


def _subtree_kernel_do_el(self, Gn):
def _subtree_kernel_do_el(self, Gn, return_mat=True):
"""Compute Weisfeiler-Lehman kernels between graphs with edge labels.

Parameters
@@ -357,19 +301,20 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.

Return
------
gram_matrix : Numpy matrix
kernel_matrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))
kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)

# initial for height = 0
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration

# Compute subtree kernel with the 0th iteration and add it to the final kernel.
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2)
for i, j in iterator:
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
gram_matrix[j][i] = gram_matrix[i][j]
iterator = combinations_with_replacement(range(0, len(kernel_matrix)), 2)
for i, j in iterator: # @todo: not correct if return_mat == False.
kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
kernel_matrix[j][i] = kernel_matrix[i][j]


# if h >= 1.
@@ -393,7 +338,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)


# Iterate along heights (>= 2).
@@ -407,12 +352,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

return gram_matrix
return kernel_matrix


def _subtree_kernel_do_labeled(self, Gn):
def _subtree_kernel_do_labeled(self, Gn, return_mat=True):
"""Compute Weisfeiler-Lehman kernels between graphs with both node and
edge labels.

@@ -423,10 +368,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.

Return
------
gram_matrix : Numpy matrix
kernel_matrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))
kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)

# initial for height = 0
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
@@ -446,10 +392,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
all_num_of_each_label.append(dict(Counter(labels_ori)))

# Compute subtree kernel with the 0th iteration and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)


# if h >= 1.
# if h >= 1:
if self.height > 0:
# Set all edge labels into a tuple. # @todo: remove this original labels or not?
if self.verbose >= 2:
@@ -470,7 +416,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)


# Iterate along heights.
@@ -484,12 +430,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

return gram_matrix
return kernel_matrix


def _subtree_kernel_do_unlabeled(self, Gn):
def _subtree_kernel_do_unlabeled(self, Gn, return_mat=True):
"""Compute Weisfeiler-Lehman kernels between graphs without labels.

Parameters
@@ -499,19 +445,20 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.

Return
------
gram_matrix : Numpy matrix
kernel_matrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))
kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)

# initial for height = 0
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration

# Compute subtree kernel with the 0th iteration and add it to the final kernel.
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2)
for i, j in iterator:
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
gram_matrix[j][i] = gram_matrix[i][j]
iterator = combinations_with_replacement(range(0, len(kernel_matrix)), 2)
for i, j in iterator: # @todo: not correct if return_mat == False.
kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
kernel_matrix[j][i] = kernel_matrix[i][j]


# if h >= 1.
@@ -526,7 +473,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)


# Iterate along heights (>= 2).
@@ -540,9 +487,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)

return gram_matrix
return kernel_matrix


def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
@@ -717,6 +664,8 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
all_num_of_each_label[j])
gram_matrix[j][i] = gram_matrix[i][j]

return gram_matrix


def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2):
"""Compute the subtree kernel.


+ 24
- 0
gklearn/model_selection/__init__.py View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 24 14:25:57 2022

@author: ljia
"""
from ._split import BaseCrossValidatorWithValid
# from ._split import BaseShuffleSplit
from ._split import KFoldWithValid
# from ._split import GroupKFold
# from ._split import StratifiedKFoldWithValid
# from ._split import TimeSeriesSplit
# from ._split import LeaveOneGroupOut
# from ._split import LeaveOneOut
# from ._split import LeavePGroupsOut
# from ._split import LeavePOut
from ._split import RepeatedKFoldWithValid
# from ._split import RepeatedStratifiedKFold
# from ._split import ShuffleSplit
# from ._split import GroupShuffleSplit
# from ._split import StratifiedShuffleSplit
# from ._split import StratifiedGroupKFold
# from ._split import PredefinedSplit

+ 287
- 0
gklearn/model_selection/_split.py View File

@@ -0,0 +1,287 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 24 11:13:26 2022

@author: ljia

Reference: scikit-learn.
"""
from abc import abstractmethod
import numbers
import warnings
import numpy as np
from sklearn.utils import check_random_state, check_array, column_or_1d, indexable
from sklearn.utils.validation import _num_samples
from sklearn.utils.multiclass import type_of_target


class BaseCrossValidatorWithValid(object):
"""Base class for all cross-validators.
Implementations must define `_iter_valid_test_masks` or `_iter_valid_test_indices`.
"""

def split(self, X, y=None, groups=None):
"""Generate indices to split data into training, valid, and test set.

Parameters
----------

X : array-like of shape (n_samples, n_features)
Training data, where `n_samples` is the number of samples
and `n_features` is the number of features.

y : array-like of shape (n_samples,)
The target variable for supervised learning problems.

groups : array-like of shape (n_samples,), default=None
Group labels for the samples used while splitting the dataset into
train/valid/test set.

Yields
------
train : ndarray
The training set indices for that split.

valid : ndarray
The valid set indices for that split.

test : ndarray
The testing set indices for that split.
"""
X, y, groups = indexable(X, y, groups)
indices = np.arange(_num_samples(X))
for valid_index, test_index in self._iter_valid_test_masks(X, y, groups):
train_index = indices[np.logical_not(np.logical_or(valid_index, test_index))]
valid_index = indices[valid_index]
test_index = indices[test_index]
yield train_index, valid_index, test_index


# Since subclasses must implement either _iter_valid_test_masks or
# _iter_valid_test_indices, neither can be abstract.
def _iter_valid_test_masks(self, X=None, y=None, groups=None):
"""Generates boolean masks corresponding to valid and test sets.
By default, delegates to _iter_valid_test_indices(X, y, groups)
"""
for valid_index, test_index in self._iter_valid_test_indices(X, y, groups):
valid_mask = np.zeros(_num_samples(X), dtype=bool)
test_mask = np.zeros(_num_samples(X), dtype=bool)
valid_mask[valid_index] = True
test_mask[test_index] = True
yield valid_mask, test_mask


def _iter_valid_test_indices(self, X=None, y=None, groups=None):
"""Generates integer indices corresponding to valid and test sets."""
raise NotImplementedError


@abstractmethod
def get_n_splits(self, X=None, y=None, groups=None):
"""Returns the number of splitting iterations in the cross-validator"""


def __repr__(self):
return _build_repr(self)


class _BaseKFoldWithValid(BaseCrossValidatorWithValid):
"""Base class for KFoldWithValid, GroupKFoldWithValid, and StratifiedKFoldWithValid"""

@abstractmethod
def __init__(self, n_splits, *, stratify, shuffle, random_state):
if not isinstance(n_splits, numbers.Integral):
raise ValueError(
'The number of folds must be of Integral type. '
'%s of type %s was passed.' % (n_splits, type(n_splits))
)
n_splits = int(n_splits)

if n_splits <= 2:
raise ValueError(
'k-fold cross-validation requires at least one'
' train/valid/test split by setting n_splits=3 or more,'
' got n_splits={0}.'.format(n_splits)
)

if not isinstance(shuffle, bool):
raise TypeError('shuffle must be True or False; got {0}'.format(shuffle))

if not shuffle and random_state is not None: # None is the default
raise ValueError(
'Setting a random_state has no effect since shuffle is '
'False. You should leave '
'random_state to its default (None), or set shuffle=True.',
)

self.n_splits = n_splits
self.stratify = stratify
self.shuffle = shuffle
self.random_state = random_state


def split(self, X, y=None, groups=None):
"""Generate indices to split data into training, valid and test set."""
X, y, groups = indexable(X, y, groups)
n_samples = _num_samples(X)
if self.n_splits > n_samples:
raise ValueError(
(
'Cannot have number of splits n_splits={0} greater'
' than the number of samples: n_samples={1}.'
).format(self.n_splits, n_samples)
)

for train, valid, test in super().split(X, y, groups):
yield train, valid, test


class KFoldWithValid(_BaseKFoldWithValid):


def __init__(
self,
n_splits=5,
*,
stratify=False,
shuffle=False,
random_state=None
):
super().__init__(
n_splits=n_splits,
stratify=stratify,
shuffle=shuffle,
random_state=random_state
)


def _make_valid_test_folds(self, X, y=None):
rng = check_random_state(self.random_state)
y = np.asarray(y)
type_of_target_y = type_of_target(y)
allowed_target_types = ('binary', 'multiclass')
if type_of_target_y not in allowed_target_types:
raise ValueError(
'Supported target types are: {}. Got {!r} instead.'.format(
allowed_target_types, type_of_target_y
)
)

y = column_or_1d(y)

_, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
# y_inv encodes y according to lexicographic order. We invert y_idx to
# map the classes so that they are encoded by order of appearance:
# 0 represents the first label appearing in y, 1 the second, etc.
_, class_perm = np.unique(y_idx, return_inverse=True)
y_encoded = class_perm[y_inv]

n_classes = len(y_idx)
y_counts = np.bincount(y_encoded)
min_groups = np.min(y_counts)
if np.all(self.n_splits > y_counts):
raise ValueError(
"n_splits=%d cannot be greater than the"
" number of members in each class." % (self.n_splits)
)
if self.n_splits > min_groups:
warnings.warn(
"The least populated class in y has only %d"
" members, which is less than n_splits=%d."
% (min_groups, self.n_splits),
UserWarning,
)

# Determine the optimal number of samples from each class in each fold,
# using round robin over the sorted y. (This can be done direct from
# counts, but that code is unreadable.)
y_order = np.sort(y_encoded)
allocation = np.asarray(
[
np.bincount(y_order[i :: self.n_splits], minlength=n_classes)
for i in range(self.n_splits)
]
)

# To maintain the data order dependencies as best as possible within
# the stratification constraint, we assign samples from each class in
# blocks (and then mess that up when shuffle=True).
test_folds = np.empty(len(y), dtype='i')
for k in range(n_classes):
# since the kth column of allocation stores the number of samples
# of class k in each test set, this generates blocks of fold
# indices corresponding to the allocation for class k.
folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])
if self.shuffle:
rng.shuffle(folds_for_class)
test_folds[y_encoded == k] = folds_for_class
return test_folds


def _iter_valid_test_masks(self, X, y=None, groups=None):
test_folds = self._make_valid_test_folds(X, y)
for i in range(self.n_splits):
if i + 1 < self.n_splits:
j = i + 1
else:
j = 0
yield test_folds == i, test_folds == j


def split(self, X, y, groups=None):
y = check_array(y, input_name='y', ensure_2d=False, dtype=None)
return super().split(X, y, groups)
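
A usage sketch of the new three-way splitter. As implemented above, `_make_valid_test_folds` always stratifies on `y`, so `y` must hold class labels and `n_splits` must be at least 3.

import numpy as np
from gklearn.model_selection import KFoldWithValid

X = np.arange(12).reshape(6, 2)      # six toy samples
y = np.array([0, 0, 0, 1, 1, 1])     # class labels used for stratification

kf = KFoldWithValid(n_splits=3, shuffle=True, random_state=0)
for train_idx, valid_idx, test_idx in kf.split(X, y):
    print(train_idx, valid_idx, test_idx)  # three disjoint index arrays per split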


class _RepeatedSplitsWithValid(object):


def __init__(
self,
cv,
*,
n_repeats=10,
random_state=None,
**cvargs
):
if not isinstance(n_repeats, int):
raise ValueError('Number of repetitions must be of integer type.')

if n_repeats <= 0:
raise ValueError('Number of repetitions must be greater than 0.')

self.cv = cv
self.n_repeats = n_repeats
self.random_state = random_state
self.cvargs = cvargs


def split(self, X, y=None, groups=None):
n_repeats = self.n_repeats
rng = check_random_state(self.random_state)

for idx in range(n_repeats):
cv = self.cv(random_state=rng, shuffle=True, **self.cvargs)
for train_index, valid_index, test_index in cv.split(X, y, groups):
yield train_index, valid_index, test_index


class RepeatedKFoldWithValid(_RepeatedSplitsWithValid):


def __init__(
self,
*,
n_splits=5,
n_repeats=10,
stratify=False,
random_state=None
):
super().__init__(
KFoldWithValid,
n_repeats=n_repeats,
stratify=stratify,
random_state=random_state,
n_splits=n_splits,
)
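
The repeated variant simply re-instantiates KFoldWithValid with a fresh shuffled state for each repetition; a short sketch:

import numpy as np
from gklearn.model_selection import RepeatedKFoldWithValid

X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 0, 1, 1, 1])

rkf = RepeatedKFoldWithValid(n_splits=3, n_repeats=2, stratify=True, random_state=42)
splits = list(rkf.split(X, y))
print(len(splits))  # n_splits * n_repeats = 6 train/valid/test triples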

+ 12
- 3
gklearn/utils/kernels.py View File

@@ -4,7 +4,7 @@ These kernels are defined between pairs of vectors.
import numpy as np


def delta_kernel(x, y):
def kronecker_delta_kernel(x, y):
"""Delta kernel. Return 1 if x == y, 0 otherwise.

Parameters
@@ -23,6 +23,10 @@ def delta_kernel(x, y):
labeled graphs. In Proceedings of the 20th International Conference on
Machine Learning, Washington, DC, United States, 2003.
"""
return (1 if np.array_equal(x, y) else 0)


def delta_kernel(x, y):
return x == y #(1 if condition else 0)


@@ -64,6 +68,11 @@ def gaussian_kernel(x, y, gamma=None):
return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma)


def tanimoto_kernel(x, y):
xy = np.dot(x, y)
return xy / (np.dot(x, x) + np.dot(y, y) - xy)


def gaussiankernel(x, y, gamma=None):
return gaussian_kernel(x, y, gamma=gamma)

@@ -123,7 +132,7 @@ def linearkernel(x, y):


def cosine_kernel(x, y):
return np.dot(x, y) / (np.abs(x) * np.abs(y))
return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))


def sigmoid_kernel(x, y, gamma=None, coef0=1):
@@ -142,7 +151,7 @@ def laplacian_kernel(x, y, gamma=None):
if gamma is None:
gamma = 1.0 / len(x)

k = -gamma * np.abs(np.subtract(x, y))
k = -gamma * np.linalg.norm(np.subtract(x, y))
k = np.exp(k)
return k
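
A quick sanity check of the added Tanimoto kernel and the corrected cosine kernel; the expected values follow directly from the definitions in this file.

import numpy as np
from gklearn.utils.kernels import tanimoto_kernel, cosine_kernel, gaussian_kernel

x = np.array([1.0, 0.0, 1.0])
y = np.array([1.0, 1.0, 0.0])

print(tanimoto_kernel(x, y))             # <x,y> / (<x,x> + <y,y> - <x,y>) = 1 / 3
print(cosine_kernel(x, y))               # now uses vector norms: 1 / (sqrt(2) * sqrt(2)) = 0.5
print(gaussian_kernel(x, y, gamma=0.5))  # exp(-0.5 * ||x - y||_2^2) = exp(-1)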



+ 272
- 199
gklearn/utils/utils.py View File

@@ -7,6 +7,9 @@ from enum import Enum, unique
# from tqdm import tqdm


#%%


def getSPLengths(G1):
sp = nx.shortest_path(G1)
distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
@@ -286,81 +289,146 @@ def direct_product_graph(G1, G2, node_labels, edge_labels):
return gt


def graph_deepcopy(G):
"""Deep copy a graph, including deep copy of all nodes, edges and
attributes of the graph, nodes and edges.
def find_paths(G, source_node, length):
"""Find all paths with a certain length those start from a source node.
A recursive depth first search is applied.

Note
----
It is the same as the NetworkX function graph.copy(), as far as I know.
Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
source_node : integer
The node from which all paths start.
length : integer
The length of paths.

Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
# add graph attributes.
labels = {}
for k, v in G.graph.items():
labels[k] = deepcopy(v)
if G.is_directed():
G_copy = nx.DiGraph(**labels)
else:
G_copy = nx.Graph(**labels)
if length == 0:
return [[source_node]]
path = [[source_node] + path for neighbor in G[source_node] \
for path in find_paths(G, neighbor, length - 1) if source_node not in path]
return path
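
A small illustration of the recursive search above on a four-node path graph; the import path assumes these helpers stay in gklearn.utils.utils.

import networkx as nx
from gklearn.utils.utils import find_paths, find_all_paths

G = nx.path_graph(4)                       # edges 0-1, 1-2, 2-3
print(find_paths(G, 0, 2))                 # [[0, 1, 2]]
print(find_all_paths(G, 2, is_directed=False))
# each undirected length-2 path is kept in only one of its two orientations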

# add nodes
for nd, attrs in G.nodes(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_node(nd, **labels)

# add edges.
for nd1, nd2, attrs in G.edges(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_edge(nd1, nd2, **labels)
def find_all_paths(G, length, is_directed):
"""Find all paths with a certain length in a graph. A recursive depth first
search is applied.

return G_copy
Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
length : integer
The length of paths.

Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
all_paths = []
for node in G:
all_paths.extend(find_paths(G, node, length))

def graph_isIdentical(G1, G2):
"""Check if two graphs are identical, including: same nodes, edges, node
labels/attributes, edge labels/attributes.
if not is_directed:
# For each path, two representations are retrieved, one from each of its two extremities.
# Remove one of them.
all_paths_r = [path[::-1] for path in all_paths]
for idx, path in enumerate(all_paths[:-1]):
for path2 in all_paths_r[idx+1::]:
if path == path2:
all_paths[idx] = []
break
all_paths = list(filter(lambda a: a != [], all_paths))

Notes
-----
1. The type of graphs has to be the same.
return all_paths

2. Global/Graph attributes are neglected as they may contain names for graphs.
"""
# check nodes.
nlist1 = [n for n in G1.nodes(data=True)]
nlist2 = [n for n in G2.nodes(data=True)]
if not nlist1 == nlist2:
return False
# check edges.
elist1 = [n for n in G1.edges(data=True)]
elist2 = [n for n in G2.edges(data=True)]
if not elist1 == elist2:
return False
# check graph attributes.

return True
# @todo: use it in ShortestPath.
def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]):
"""Compute kernels between each pair of vertices in two graphs.

Parameters
----------
g1, g2 : NetworkX graph
The kernels between pairs of vertices in these two graphs are computed.
node_kernels : dict
A dictionary of kernel functions for nodes, including 3 items: 'symb'
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
for both labels. The first 2 functions take two node labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each of the two nodes. Each label is in the form of a 2-D
array (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when nodes are unlabeled. This argument
is designated to conjugate gradient method and fixed-point iterations.
node_labels : list, optional
The list of the name strings of the node labels. The default is [].
node_attrs : list, optional
The list of the name strings of the node attributes. The default is [].

def get_node_labels(Gn, node_label):
"""Get node labels of dataset Gn.
"""
nl = set()
for G in Gn:
nl = nl | set(nx.get_node_attributes(G, node_label).values())
return nl
Returns
-------
vk_dict : dict
Vertex kernels keyed by vertices.

Notes
-----
This function is used by ``gklearn.kernels.FixedPoint'' and
``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1].

def get_edge_labels(Gn, edge_label):
"""Get edge labels of dataset Gn.
References
----------
.. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang.
Parallelization of shortest path graph kernels on multi-core cpus and gpus.
Proceedings of the Programmability Issues for Heterogeneous Multicores
(MultiProg), Vienna, Austria, 2014.
"""
el = set()
for G in Gn:
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return el
vk_dict = {} # shortest path matrices dict
if len(node_labels) > 0:
# node symb and non-symb labeled
if len(node_attrs) > 0:
kn = node_kernels['mix']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in node_labels]
n2_labels = [n2[1][nl] for nl in node_labels]
n1_attrs = [n1[1][na] for na in node_attrs]
n2_attrs = [n2[1][na] for na in node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in node_labels]
n2_labels = [n2[1][nl] for nl in node_labels]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
else:
# node non-symb labeled
if len(node_attrs) > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_attrs = [n1[1][na] for na in node_attrs]
n2_attrs = [n2[1][na] for na in node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
# node unlabeled
else:
pass # @todo: add edge weights.
# for e1 in g1.edges(data=True):
# for e2 in g2.edges(data=True):
# if e1[2]['cost'] == e2[2]['cost']:
# kernel += 1
# return kernel

return vk_dict
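
A hedged sketch of calling this helper with the `node_kernels` dictionary format described in the docstring; the label name and the kernel choices (including the 'mix' lambda) are assumptions for the example.

import networkx as nx
from gklearn.utils.utils import compute_vertex_kernels
from gklearn.utils.kernels import kronecker_delta_kernel, gaussian_kernel

g1, g2 = nx.path_graph(2), nx.path_graph(3)
for g in (g1, g2):
    for n in g.nodes:
        g.nodes[n]['atom'] = 'C' if n % 2 else 'O'   # hypothetical symbolic label

node_kernels = {
    'symb': kronecker_delta_kernel,
    'nsymb': gaussian_kernel,
    'mix': lambda sl1, sl2, a1, a2: kronecker_delta_kernel(sl1, sl2) * gaussian_kernel(a1, a2),
}
vk = compute_vertex_kernels(g1, g2, node_kernels, node_labels=['atom'], node_attrs=[])
print(vk[(0, 0)])  # 1: both node-0 labels are 'O'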


#%%


def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs):
@@ -513,79 +581,6 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d
print('\ncomplete.')


def find_paths(G, source_node, length):
"""Find all paths with a certain length those start from a source node.
A recursive depth first search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
source_node : integer
The number of the node from where all paths start.
length : integer
The length of paths.

Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
if length == 0:
return [[source_node]]
path = [[source_node] + path for neighbor in G[source_node] \
for path in find_paths(G, neighbor, length - 1) if source_node not in path]
return path


def find_all_paths(G, length, is_directed):
"""Find all paths with a certain length in a graph. A recursive depth first
search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
length : integer
The length of paths.

Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
all_paths = []
for node in G:
all_paths.extend(find_paths(G, node, length))

if not is_directed:
# For each path, two presentations are retrieved from its two extremities.
# Remove one of them.
all_paths_r = [path[::-1] for path in all_paths]
for idx, path in enumerate(all_paths[:-1]):
for path2 in all_paths_r[idx+1::]:
if path == path2:
all_paths[idx] = []
break
all_paths = list(filter(lambda a: a != [], all_paths))

return all_paths


def get_mlti_dim_node_attrs(G, attr_names):
attributes = []
for nd, attrs in G.nodes(data=True):
attributes.append(tuple(attrs[aname] for aname in attr_names))
return attributes


def get_mlti_dim_edge_attrs(G, attr_names):
attributes = []
for ed, attrs in G.edges(data=True):
attributes.append(tuple(attrs[aname] for aname in attr_names))
return attributes


def normalize_gram_matrix(gram_matrix):
diag = gram_matrix.diagonal().copy()
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt.
@@ -621,84 +616,162 @@ def compute_distance_matrix(gram_matrix):
return dis_mat, dis_max, dis_min, dis_mean


# @todo: use it in ShortestPath.
def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]):
"""Compute kernels between each pair of vertices in two graphs.
#%%


def graph_deepcopy(G):
"""Deep copy a graph, including deep copy of all nodes, edges and
attributes of the graph, nodes and edges.

Note
----
- It is the same as the NetworkX function graph.copy(), as far as I know.

- This function only supports Networkx.Graph and Networkx.DiGraph.
"""
# add graph attributes.
labels = {}
for k, v in G.graph.items():
labels[k] = deepcopy(v)
if G.is_directed():
G_copy = nx.DiGraph(**labels)
else:
G_copy = nx.Graph(**labels)

# add nodes
for nd, attrs in G.nodes(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_node(nd, **labels)

# add edges.
for nd1, nd2, attrs in G.edges(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_edge(nd1, nd2, **labels)

return G_copy


def graph_isIdentical(G1, G2):
"""Check if two graphs are identical, including: same nodes, edges, node
labels/attributes, edge labels/attributes.

Notes
-----
1. The type of graphs has to be the same.

2. Global/Graph attributes are neglected as they may contain names for graphs.
"""
# check nodes.
nlist1 = [n for n in G1.nodes(data=True)]
nlist2 = [n for n in G2.nodes(data=True)]
if not nlist1 == nlist2:
return False
# check edges.
elist1 = [n for n in G1.edges(data=True)]
elist2 = [n for n in G2.edges(data=True)]
if not elist1 == elist2:
return False
# check graph attributes.

return True
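
A quick round trip with the two helpers relocated above: a deep copy compares as identical, and editing an attribute on the copy leaves the original untouched.

import networkx as nx
from gklearn.utils.utils import graph_deepcopy, graph_isIdentical

G = nx.Graph(name='toy')
G.add_node(0, atom='C')
G.add_node(1, atom='O')
G.add_edge(0, 1, bond='single')

H = graph_deepcopy(G)
print(graph_isIdentical(G, H))   # True: same nodes, edges and attributes

H.nodes[0]['atom'] = 'N'         # deep copy, so the original keeps its label
print(G.nodes[0]['atom'])        # 'C'
print(graph_isIdentical(G, H))   # False after the modification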


def get_node_labels(Gn, node_label):
"""Get node labels of dataset Gn.
"""
nl = set()
for G in Gn:
nl = nl | set(nx.get_node_attributes(G, node_label).values())
return nl


def get_edge_labels(Gn, edge_label):
"""Get edge labels of dataset Gn.
"""
el = set()
for G in Gn:
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return el


def get_mlti_dim_node_attrs(G, attr_names):
attributes = []
for nd, attrs in G.nodes(data=True):
attributes.append(tuple(attrs[aname] for aname in attr_names))
return attributes


def get_mlti_dim_edge_attrs(G, attr_names):
attributes = []
for ed, attrs in G.edges(data=True):
attributes.append(tuple(attrs[aname] for aname in attr_names))
return attributes


def nx_permute_nodes(G, random_state=None):
"""Permute node indices in a NetworkX graph.

Parameters
----------
g1, g2 : NetworkX graph
The kernels bewteen pairs of vertices in these two graphs are computed.
node_kernels : dict
A dictionary of kernel functions for nodes, including 3 items: 'symb'
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
for both labels. The first 2 functions take two node labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each the two nodes. Each label is in form of 2-D
dimension array (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when nodes are unlabeled. This argument
is designated to conjugate gradient method and fixed-point iterations.
node_labels : list, optional
The list of the name strings of the node labels. The default is [].
node_attrs : list, optional
The list of the name strings of the node attributes. The default is [].
G : NetworkX graph
The graph whose node order is to be permuted.
random_state : int or None, optional
Seed of the random permutation. The default is None.

Returns
-------
vk_dict : dict
Vertex kernels keyed by vertices.
G_new : NetworkX graph
A copy of G with its node order randomly permuted.

Notes
-----
This function is used by ``gklearn.kernels.FixedPoint'' and
``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1].

References
----------
.. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang.
Parallelization of shortest path graph kernels on multi-core cpus and gpus.
Proceedings of the Programmability Issues for Heterogeneous Multicores
(MultiProg), Vienna, Austria, 2014.
- This function only supports Networkx.Graph and Networkx.DiGraph.
"""
vk_dict = {} # shortest path matrices dict
if len(node_labels) > 0:
# node symb and non-synb labeled
if len(node_attrs) > 0:
kn = node_kernels['mix']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in node_labels]
n2_labels = [n2[1][nl] for nl in node_labels]
n1_attrs = [n1[1][na] for na in node_attrs]
n2_attrs = [n2[1][na] for na in node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in node_labels]
n2_labels = [n2[1][nl] for nl in node_labels]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
# @todo: relabel node with integers? (in case something went wrong...)
# Add graph attributes.
labels = {}
for k, v in G.graph.items():
labels[k] = deepcopy(v)
if G.is_directed():
G_new = nx.DiGraph(**labels)
else:
# node non-synb labeled
if len(node_attrs) > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_attrs = [n1[1][na] for na in node_attrs]
n2_attrs = [n2[1][na] for na in node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
# node unlabeled
else:
pass # @todo: add edge weights.
# for e1 in g1.edges(data=True):
# for e2 in g2.edges(data=True):
# if e1[2]['cost'] == e2[2]['cost']:
# kernel += 1
# return kernel
G_new = nx.Graph(**labels)

return vk_dict
# Create a random mapping old node indices <-> new indices.
nb_nodes = nx.number_of_nodes(G)
indices_orig = range(nb_nodes)
idx_mapping = np.random.RandomState(seed=random_state).permutation(indices_orig)

# Add nodes.
nodes_orig = list(G.nodes)
for i_orig in range(nb_nodes):
i_new = idx_mapping[i_orig]
labels = {}
for k, v in G.nodes[nodes_orig[i_new]].items():
labels[k] = deepcopy(v)
G_new.add_node(nodes_orig[i_new], **labels)

# Add edges.
for nd1, nd2, attrs in G.edges(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_new.add_edge(nd1, nd2, **labels)


# # create a random mapping old label -> new label
# node_mapping = dict(zip(G.nodes(), np.random.RandomState(seed=random_state).permutation(G.nodes())))
# # build a new graph
# G_new = nx.relabel_nodes(G, node_mapping)

return G_new
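
A usage sketch of the new permutation helper: node identifiers and attributes are kept, only the insertion order changes, so the permuted graph is isomorphic to the original.

import networkx as nx
from gklearn.utils.utils import nx_permute_nodes

G = nx.path_graph(5)
for n in G.nodes:
    G.nodes[n]['atom'] = str(n)

G_perm = nx_permute_nodes(G, random_state=0)
print(list(G.nodes))                 # [0, 1, 2, 3, 4]
print(list(G_perm.nodes))            # same node ids, permuted insertion order
print(nx.is_isomorphic(G, G_perm))   # True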


#%%


def dummy_node():


+ 1
- 1
requirements.txt View File

@@ -2,7 +2,7 @@ numpy>=1.16.2
scipy>=1.1.0
matplotlib>=3.1.0
networkx>=2.2
scikit-learn>=0.20.0
scikit-learn>=1.1.0
tabulate>=0.8.2
tqdm>=4.26.0
control>=0.8.2 # for generalized random walk kernels only.


+ 2
- 2
requirements_pypi.txt View File

@@ -1,8 +1,8 @@
numpy>=1.16.2
scipy>=1.1.0
matplotlib>=3.0.0
matplotlib>=3.1.0
networkx>=2.2
scikit-learn>=0.20.0
scikit-learn>=1.1.0
tabulate>=0.8.2
tqdm>=4.26.0
control>=0.8.2 # for generalized random walk kernels only.

