
Merge pull request #48 from jajupmochi/v0.2.x (V0.2.x) into master

linlin (via GitHub), 3 years ago. Parent commit: 8b853895fa
23 changed files with 2139 additions and 549 deletions
  1. .travis.yml (+2, -0)
  2. README.md (+2, -1)
  3. gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py (+0, -147)
  4. gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py (+6, -5)
  5. gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py (+172, -0)
  6. gklearn/experiments/ged/stability/group_results.py (+1, -0)
  7. gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py (+18, -9)
  8. gklearn/experiments/ged/stability/utils.py (+17, -1)
  9. gklearn/ged/__init__.py (+1, -0)
  10. gklearn/ged/model/distances.py (+43, -0)
  11. gklearn/ged/model/ged_com.py (+97, -0)
  12. gklearn/ged/model/ged_model.py (+724, -0)
  13. gklearn/ged/model/optim_costs.py (+149, -0)
  14. gklearn/ged/util/util.py (+179, -31)
  15. gklearn/kernels/graph_kernel.py (+39, -20)
  16. gklearn/kernels/treelet.py (+39, -27)
  17. gklearn/kernels/weisfeiler_lehman.py (+52, -103)
  18. gklearn/model_selection/__init__.py (+24, -0)
  19. gklearn/model_selection/_split.py (+287, -0)
  20. gklearn/utils/kernels.py (+12, -3)
  21. gklearn/utils/utils.py (+272, -199)
  22. requirements.txt (+1, -1)
  23. requirements_pypi.txt (+2, -2)

.travis.yml (+2, -0)

@@ -4,6 +4,8 @@ python:
- '3.6'
- '3.7'
- '3.8'
- '3.9'
#- '3.10'

before_install:
- python --version


README.md (+2, -1)

@@ -1,5 +1,6 @@
# graphkit-learn
[![Build Status](https://travis-ci.com/jajupmochi/graphkit-learn.svg?branch=master)](https://travis-ci.com/jajupmochi/graphkit-learn)
[![Build Status](https://app.travis-ci.com/jajupmochi/graphkit-learn.svg?branch=master)](https://app.travis-ci.com/jajupmochi/graphkit-learn)
[![Build status](https://ci.appveyor.com/api/projects/status/bdxsolk0t1uji9rd?svg=true)](https://ci.appveyor.com/project/jajupmochi/graphkit-learn)
[![codecov](https://codecov.io/gh/jajupmochi/graphkit-learn/branch/master/graph/badge.svg)](https://codecov.io/gh/jajupmochi/graphkit-learn)
[![Documentation Status](https://readthedocs.org/projects/graphkit-learn/badge/?version=master)](https://graphkit-learn.readthedocs.io/en/master/?badge=master)


gklearn/experiments/ged/stability/edit_costs.max_num_sols.ratios.bipartite.py (+0, -147)

@@ -1,147 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 2 16:17:01 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from the given numbers of repeats is computed.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset
import sys
from group_results import group_trials


def xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial):
save_file_suffix = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
# Return if the file exists.
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
return None, None

"""**2. Set parameters.**"""

# Parameters for GED computation.
ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic.
# 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
'lsape_model': 'ECBP', #
# ?? when bigger than 1, the method is considered mIPFP.
# the actual number of computed solutions might be smaller than the specified value
'max_num_solutions': max_num_solutions,
'edit_cost': 'CONSTANT', # use CONSTANT cost.
'greedy_method': 'BASIC', #
# the distance between non-symbolic node/edge labels is computed by euclidean distance.
'attr_distance': 'euclidean',
'optimal': True, # if TRUE, the option --greedy-method has no effect
# parallel threads. Does not work if mpg_options['parallel'] = False.
'threads': multiprocessing.cpu_count(),
'centrality_method': 'NONE',
'centrality_weight': 0.7,
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))

options = ged_options.copy()
options['edit_cost_constants'] = edit_cost_constants
options['node_labels'] = dataset.node_labels
options['edge_labels'] = dataset.edge_labels
options['node_attrs'] = dataset.node_attrs
options['edge_attrs'] = dataset.edge_attrs
parallel = True # if num_solutions == 1 else False
"""**5. Compute GED matrix.**"""
ged_mat = 'error'
runtime = 0
try:
time0 = time.time()
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs, options=options, repeats=1, parallel=parallel, verbose=True)
runtime = time.time() - time0
except Exception as exp:
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception(save_file_suffix)
print(repr(exp))
"""**6. Get results.**"""
with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(ged_mat, f)
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(runtime, f)

return ged_mat, runtime

def save_trials_as_group(dataset, ds_name, max_num_solutions, ratio):
# Return if the group file exists.
name_middle = '.' + ds_name + '.mnum_sols_' + str(max_num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
if os.path.isfile(name_group):
return
ged_mats = []
runtimes = []
for trial in range(1, 101):
print()
print('Trial:', trial)
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, max_num_solutions, ratio, trial)
ged_mats.append(ged_mat)
runtimes.append(runtime)
# Group trials and remove single files.
name_prefix = 'ged_matrix' + name_middle
group_trials(save_dir, name_prefix, True, True, False)
name_prefix = 'runtime' + name_middle
group_trials(save_dir, name_prefix, True, True, False)


def results_for_a_dataset(ds_name):
"""**1. Get dataset.**"""
dataset = get_dataset(ds_name)
for max_num_solutions in mnum_solutions_list:
print()
print('Max # of solutions:', max_num_solutions)
for ratio in ratio_list:
print()
print('Ratio:', ratio)
save_trials_as_group(dataset, ds_name, max_num_solutions, ratio)
def get_param_lists(ds_name):
if ds_name == 'AIDS_symb':
mnum_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
else:
mnum_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
return mnum_solutions_list, ratio_list

if __name__ == '__main__':
if len(sys.argv) > 1:
ds_name_list = sys.argv[1:]
else:
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
save_dir = 'outputs/edit_costs.max_num_sols.ratios.bipartite/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir + 'groups/', exist_ok=True)
for ds_name in ds_name_list:
print()
print('Dataset:', ds_name)
mnum_solutions_list, ratio_list = get_param_lists(ds_name)
results_for_a_dataset(ds_name)

gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py (+6, -5)

@@ -13,7 +13,7 @@ import pickle
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation
from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids
import sys
from group_results import group_trials, check_group_existence, update_group_marker


@@ -125,9 +125,10 @@ def get_param_lists(ds_name, mode='test'):


elif mode == 'simple':
from sklearn.model_selection import ParameterGrid
param_grid = ParameterGrid([
{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]},
{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}])
param_grid = mix_param_grids([list(ParameterGrid([
{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])),
list(ParameterGrid([
{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))])
# print(list(param_grid))


if ds_name == 'AIDS_symb':
@@ -148,7 +149,7 @@ if __name__ == '__main__':
# ds_name_list = ['MUTAG'] # 'Alkane_unlabeled']
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']


save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/'
save_dir = 'outputs/CRIANN/edit_costs.real_data.num_sols.ratios.IPFP/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir + 'groups/', exist_ok=True)




gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.bipartite.py (+172, -0)

@@ -0,0 +1,172 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 2 16:17:01 2020

@author: ljia
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1]. The minimum solution from the given numbers of repeats is computed.

import os
import multiprocessing
import pickle
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation, mix_param_grids
import sys
from group_results import group_trials, check_group_existence, update_group_marker


def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):

save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

# Return if the file exists.
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
return None, None

"""**2. Set parameters.**"""

# Parameters for GED computation.
ged_options = {'method': 'BIPARTITE', # use the BIPARTITE heuristic.
# 'initialization_method': 'RANDOM', # or 'NODE', etc. (for GEDEnv)
'lsape_model': 'ECBP', #
# ?? when bigger than 1, the method is considered mIPFP.
# the actual number of computed solutions might be smaller than the specified value
'max_num_solutions': 1, # @ max_num_solutions,
'edit_cost': 'CONSTANT', # use CONSTANT cost.
'greedy_method': 'BASIC', #
# the distance between non-symbolic node/edge labels is computed by euclidean distance.
'attr_distance': 'euclidean',
'optimal': True, # if TRUE, the option --greedy-method has no effect
# parallel threads. Does not work if mpg_options['parallel'] = False.
'threads': multiprocessing.cpu_count(),
'centrality_method': 'NONE',
'centrality_weight': 0.7,
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}

edit_cost_constants = set_edit_cost_consts(ratio,
node_labeled=len(dataset.node_labels),
edge_labeled=len(dataset.edge_labels),
mode='uniform')
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))


options = ged_options.copy()
options['edit_cost_constants'] = edit_cost_constants
options['node_labels'] = dataset.node_labels
options['edge_labels'] = dataset.edge_labels
options['node_attrs'] = dataset.node_attrs
options['edge_attrs'] = dataset.edge_attrs
parallel = True # if num_solutions == 1 else False

"""**5. Compute GED matrix.**"""
ged_mat = 'error'
runtime = 0
try:
time0 = time.time()
ged_vec_init, ged_mat, n_edit_operations = compute_geds(dataset.graphs,
options=options,
repeats=num_solutions,
permute_nodes=True,
random_state=None,
parallel=parallel,
verbose=True)
runtime = time.time() - time0
except Exception as exp:
print('An exception occurred when running this experiment:')
LOG_FILENAME = save_dir + 'error.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception(save_file_suffix)
print(repr(exp))

"""**6. Get results.**"""

with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(ged_mat, f)
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(runtime, f)

return ged_mat, runtime


def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
# Return if the group file exists.
name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
if check_group_existence(name_group):
return

ged_mats = []
runtimes = []
num_trials = 100
for trial in range(1, num_trials + 1):
print()
print('Trial:', trial)
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
ged_mats.append(ged_mat)
runtimes.append(runtime)

# Group trials and remove single files.
# @todo: if the program stops between the following lines, then there may be errors.
name_prefix = 'ged_matrix' + name_middle
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
name_prefix = 'runtime' + name_middle
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
update_group_marker(name_group)


def results_for_a_dataset(ds_name):
"""**1. Get dataset.**"""
dataset = get_dataset(ds_name)

for params in list(param_grid):
print()
print(params)
save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio'])


def get_param_lists(ds_name, mode='test'):
if mode == 'test':
num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
ratio_list = [10]
return num_solutions_list, ratio_list

elif mode == 'simple':
from sklearn.model_selection import ParameterGrid
param_grid = mix_param_grids([list(ParameterGrid([
{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 40, 50, 60, 70, 80, 90, 100]), 'ratio': [10]}])),
list(ParameterGrid([
{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}]))])
# print(list(param_grid))

if ds_name == 'AIDS_symb':
num_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
else:
num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1]

return param_grid


if __name__ == '__main__':
if len(sys.argv) > 1:
ds_name_list = sys.argv[1:]
else:
ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
# ds_name_list = ['MUTAG'] # 'Alkane_unlabeled']
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

save_dir = 'outputs/CRIANN/edit_costs.real_data.nums_sols.ratios.bipartite/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir + 'groups/', exist_ok=True)

for ds_name in ds_name_list:
print()
print('Dataset:', ds_name)
param_grid = get_param_lists(ds_name, mode='simple')
results_for_a_dataset(ds_name)

gklearn/experiments/ged/stability/group_results.py (+1, -0)

@@ -32,6 +32,7 @@ def check_group_existence(file_name):




def update_group_marker(file_name):
# @todo: possible error when several tasks are using this file at the same time.
path, name = os.path.split(file_name)
marker_fn = os.path.join(path, 'group_names_finished.pkl')
if os.path.isfile(marker_fn):


gklearn/experiments/ged/stability/run_job_edit_costs.max_nums_sols.ratios.bipartite.py → gklearn/experiments/ged/stability/run_job_edit_costs.real_data.nums_sols.ratios.bipartite.py (+18, -9)

@@ -9,36 +9,45 @@ import os
import re




cur_path = os.path.dirname(os.path.abspath(__file__))


def get_job_script(arg):
script = r"""
#!/bin/bash


#SBATCH --exclusive
#SBATCH --job-name="st.""" + arg + r""".bp"
#SBATCH --partition=tlong
#SBATCH --partition=court
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt"
#SBATCH --error="errors/error_edit_costs.max_num_sols.ratios.bipartite.""" + arg + """.txt"
#SBATCH --output="outputs/output_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt"
#SBATCH --error="errors/error_edit_costs.real_data.nums_sols.ratios.bipartite.""" + arg + """.txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=300:00:00
#SBATCH --time=48:00:00
#SBATCH --mem-per-cpu=4000


srun hostname
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/ged/stability
srun python3 edit_costs.max_nums_sols.ratios.bipartite.py """ + arg
cd """ + cur_path + r"""
echo Working directory : $PWD
echo Local work dir : $LOCAL_WORK_DIR
python3 edit_costs.real_data.nums_sols.ratios.bipartite.py """ + arg
script = script.strip()
script = re.sub('\n\t+', '\n', script)
script = re.sub('\n +', '\n', script)
return script


if __name__ == '__main__':
ds_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']
for ds_name in [ds_list[i] for i in [0, 1, 2, 3]]:

os.makedirs('outputs/', exist_ok=True)
os.makedirs('errors/', exist_ok=True)

ds_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
for ds_name in [ds_list[i] for i in [0, 1, 2, 3, 4]]:
job_script = get_job_script(ds_name)
command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# print(command)

gklearn/experiments/ged/stability/utils.py (+17, -1)

@@ -325,6 +325,22 @@ def dichotomous_permutation(arr, layer=0):
# return new_arr




def mix_param_grids(list_of_grids):
mixed_grids = []
not_finished = [True] * len(list_of_grids)
idx = 0
while sum(not_finished) > 0:
for g_idx, grid in enumerate(list_of_grids):
if idx < len(grid):
mixed_grids.append(grid[idx])
else:
not_finished[g_idx] = False
idx += 1

return mixed_grids
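
A quick illustration of what the `mix_param_grids` helper above produces: it interleaves the given grids round-robin until all of them are exhausted. The grid contents below are made up for illustration and are not the experiment's actual parameter grids:

grid_a = [{'num_solutions': 1, 'ratio': 10},
          {'num_solutions': 2, 'ratio': 10},
          {'num_solutions': 3, 'ratio': 10}]
grid_b = [{'num_solutions': 10, 'ratio': 0.1},
          {'num_solutions': 10, 'ratio': 0.3}]
print(mix_param_grids([grid_a, grid_b]))
# [{'num_solutions': 1, 'ratio': 10}, {'num_solutions': 10, 'ratio': 0.1},
#  {'num_solutions': 2, 'ratio': 10}, {'num_solutions': 10, 'ratio': 0.3},
#  {'num_solutions': 3, 'ratio': 10}]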



if __name__ == '__main__':
root_dir = 'outputs/CRIANN/'
# for dir_ in sorted(os.listdir(root_dir)):
@@ -337,4 +353,4 @@ if __name__ == '__main__':
# get_relative_errors(save_dir)
# except Exception as exp:
# print('An exception occurred when running this experiment:')
# print(repr(exp))
# print(repr(exp))

gklearn/ged/__init__.py (+1, -0)

@@ -0,0 +1 @@
from gklearn.ged.model.ged_model import GEDModel

gklearn/ged/model/distances.py (+43, -0)

@@ -0,0 +1,43 @@
import numpy as np


def sum_squares(a, b):
"""
Return the sum of squared differences between a and b (SSE; `rmse` below divides by the length)
"""
return np.sum([(a[i] - b[i])**2 for i in range(len(a))])


def euclid_d(x, y):
"""
1D euclidean distance
"""
return np.sqrt((x-y)**2)


def man_d(x, y):
"""
1D manhattan distance
"""
return np.abs((x-y))


def classif_d(x, y):
"""
Function adapted to classification problems
"""
return np.array(0 if x == y else 1)


def rmse(pred, ground_truth):
import numpy as np
return np.sqrt(sum_squares(pred, ground_truth)/len(ground_truth))


def accuracy(pred, ground_truth):
import numpy as np
return np.mean([a == b for a, b in zip(pred, ground_truth)])


def rbf_k(D, sigma=1):
return np.exp(-(D**2)/sigma)
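
These are the target-space distance and scoring helpers used by the new GED model code (`euclid_d` is the default `y_distance` when fitting edit costs). A small illustrative check, assuming the module is importable at the path added by this PR:

from gklearn.ged.model.distances import euclid_d, man_d, rmse, accuracy

pred = [1.0, 2.0, 4.0]
truth = [1.0, 2.0, 2.0]
print(rmse(pred, truth))                # sqrt((0 + 0 + 4) / 3) ~= 1.155
print(euclid_d(3.0, 1.0))               # 2.0
print(man_d(-1.0, 2.0))                 # 3.0
print(accuracy([0, 1, 1], [0, 1, 0]))   # ~= 0.667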

gklearn/ged/model/ged_com.py (+97, -0)

@@ -0,0 +1,97 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 5 14:02:17 2022

@author: ljia
"""
import sys
from gklearn.ged.model.distances import euclid_d
from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
from gklearn.utils import get_iters


def compute_ged(Gi, Gj, edit_cost, method='BIPARTITE', **kwargs):
"""
Compute the GED between two graphs according to edit_cost.
"""
ged_options = {'edit_cost': 'CONSTANT',
'method': method,
'edit_cost_constants': edit_cost}
node_labels = kwargs.get('node_labels', [])
edge_labels = kwargs.get('edge_labels', [])
dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10)
n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward, edit_cost='CONSTANT', node_labels=node_labels, edge_labels=edge_labels)
return dis, n_eo_tmp


def compute_ged_all_dataset(Gn, edit_cost, ed_method, **kwargs):
N = len(Gn)
G_pairs = []
for i in range(N):
for j in range(i, N):
G_pairs.append([i, j])
return compute_geds(G_pairs, Gn, edit_cost, ed_method, **kwargs)


def compute_geds(G_pairs, Gn, edit_cost, ed_method, verbose=True, **kwargs):
"""
Compute the GED for every index pair in G_pairs, given edit_cost.
:return: ged_vec: the list of computed distances; n_edit_operations: the list of edit-operation counts
"""
ged_vec = []
n_edit_operations = []
for k in get_iters(range(len(G_pairs)), desc='Computing GED', file=sys.stdout, length=len(G_pairs), verbose=verbose):
[i, j] = G_pairs[k]
dis, n_eo_tmp = compute_ged(
Gn[i], Gn[j], edit_cost=edit_cost, method=ed_method, **kwargs)
ged_vec.append(dis)
n_edit_operations.append(n_eo_tmp)

return ged_vec, n_edit_operations


def compute_D(G_app, edit_cost, G_test=None, ed_method='BIPARTITE', **kwargs):
import numpy as np
N = len(G_app)
D_app = np.zeros((N, N))

for i, G1 in get_iters(enumerate(G_app), desc='Computing D - app', file=sys.stdout, length=N):
for j, G2 in enumerate(G_app[i+1:], i+1):
D_app[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs)
D_app[j, i] = D_app[i, j]
if (G_test is None):
return D_app, edit_cost
else:
D_test = np.zeros((len(G_test), N))
for i, G1 in get_iters(enumerate(G_test), desc='Computing D - test', file=sys.stdout, length=len(G_test)):
for j, G2 in enumerate(G_app):
D_test[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs)
return D_app, D_test, edit_cost


def compute_D_random(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
import numpy as np
edit_costs = np.random.rand(6)
return compute_D(G_app, edit_costs, G_test, ed_method=ed_method, **kwargs)


def compute_D_expert(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
edit_cost = [3, 3, 1, 3, 3, 1]
return compute_D(G_app, edit_cost, G_test, ed_method=ed_method, **kwargs)


def compute_D_fitted(G_app, y_app, G_test=None, y_distance=euclid_d,
mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs):
from gklearn.ged.model.optim_costs import compute_optimal_costs

costs_optim = compute_optimal_costs(
G_app, y_app, y_distance=y_distance,
mode=mode, unlabeled=unlabeled, ed_method=ed_method, **kwargs)
return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs)


def compute_D_GH2020(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
from gklearn.ged.model.optim_costs import get_optimal_costs_GH2020
costs_optim = get_optimal_costs_GH2020(**kwargs)
return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs)
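
A hedged sketch of how these helpers are meant to be driven. It assumes the GEDLIB backend (`gklearn.gedlib`) is compiled and available; the two toy graphs and the 'atom' node label are purely illustrative, not taken from this PR:

import networkx as nx
from gklearn.ged.model.ged_com import compute_D_expert

# Two toy labeled graphs (real experiments use TUDataset-style graphs).
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
g1.add_edge(0, 1)
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
g2.add_edges_from([(0, 1), (1, 2)])

# Symmetric train/train GED matrix with the expert costs [3, 3, 1, 3, 3, 1].
D_app, costs = compute_D_expert([g1, g2], ed_method='BIPARTITE', node_labels=['atom'])
print(D_app)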

gklearn/ged/model/ged_model.py (+724, -0)

@@ -0,0 +1,724 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 5 09:42:30 2022

@author: ljia
"""
import sys
import multiprocessing
import time
import numpy as np
import networkx as nx

# from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator # , TransformerMixin
from sklearn.utils.validation import check_is_fitted # check_X_y, check_array,
from sklearn.exceptions import NotFittedError

from gklearn.ged.model.distances import euclid_d
from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
# from gklearn.utils import normalize_gram_matrix
from gklearn.utils import get_iters


class GEDModel(BaseEstimator): #, ABC):
"""The graph edit distance model class compatible with `scikit-learn`.

Attributes
----------
_graphs : list
Stores the graphs passed to `fit`.
Default format of the list objects is `NetworkX` graphs.
**We don't guarantee that the input graphs remain unchanged during the
computation.**

References
----------
https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
"""

def __init__(self,
ed_method='BIPARTITE',
edit_cost_fun='CONSTANT',
init_edit_cost_constants=[3, 3, 1, 3, 3, 1],
optim_method='init',
optim_options={'y_distance': euclid_d, 'mode': 'reg'},
node_labels=[],
edge_labels=[],
parallel=None,
n_jobs=None,
chunksize=None,
# normalize=True,
copy_graphs=True, # make sure it is a full deep copy. and faster!
verbose=2):
"""`__init__` for `GEDModel` object."""
# @todo: the default settings of the parameters are different from those in the self.compute method.
# self._graphs = None
self.ed_method = ed_method
self.edit_cost_fun = edit_cost_fun
self.init_edit_cost_constants = init_edit_cost_constants
self.optim_method=optim_method
self.optim_options=optim_options
self.node_labels=node_labels
self.edge_labels=edge_labels
self.parallel = parallel
self.n_jobs = n_jobs
self.chunksize = chunksize
# self.normalize = normalize
self.copy_graphs = copy_graphs
self.verbose = verbose
# self._run_time = 0
# self._gram_matrix = None
# self._gram_matrix_unnorm = None


##########################################################################
# The following is the 1st paradigm to compute GED distance matrix, which is
# compatible with `scikit-learn`.
##########################################################################


def fit(self, X, y=None):
"""Fit a graph dataset for a transformer.

Parameters
----------
X : iterable
DESCRIPTION.

y : None, optional
There is no need of a target in a transformer, yet the `scikit-learn`
pipeline API requires this parameter.

Returns
-------
object
Returns self.

"""
# self._is_tranformed = False

# Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used;
self.clear_attributes()

# Validate parameters for the transformer.
self.validate_parameters()

# Validate the input.
self._graphs = self.validate_input(X)
if y is not None:
self._targets = y
# self._targets = self.validate_input(y)

# self._X = X
# self._kernel = self._get_kernel_instance()

# Return the transformer.
return self


def transform(self, X=None, return_dm_train=False):
"""Compute the GED distance matrix between the given and the fitted data.

Parameters
----------
X : TYPE
DESCRIPTION.

Raises
------
ValueError
DESCRIPTION.

Returns
-------
None.

"""
# If `return_dm_train`, return the fitted GED distance matrix of training data.
if return_dm_train:
check_is_fitted(self, '_dm_train')
self._is_transformed = True
return self._dm_train # @todo: copy or not?

# Check if method "fit" had been called.
check_is_fitted(self, '_graphs')

# Validate the input.
Y = self.validate_input(X)

# Transform: compute the graph kernel matrix.
dis_matrix = self.compute_distance_matrix(Y)
self._Y = Y

# Self transform must appear before the diagonal call on normalization.
self._is_transformed = True
# if self.normalize:
# X_diag, Y_diag = self.diagonals()
# old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt.
# try:
# kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag))
# except:
# raise
# finally:
# np.seterr(**old_settings)

return dis_matrix


def fit_transform(self, X, y=None, save_dm_train=False):
"""Fit and transform: compute GED distance matrix on the same data.

Parameters
----------
X : list of graphs
Input graphs.

Returns
-------
dis_matrix : numpy array, shape = [len(X), len(X)]
The distance matrix of X.

"""
self.fit(X, y)

# Compute edit cost constants.
self.compute_edit_costs()

# Transform: compute Gram matrix.
dis_matrix = self.compute_distance_matrix()

# # Normalize.
# if self.normalize:
# self._X_diag = np.diagonal(gram_matrix).copy()
# old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt.
# try:
# gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag))
# except:
# raise
# finally:
# np.seterr(**old_settings)

if save_dm_train:
self._dm_train = dis_matrix

return dis_matrix


def get_params(self):
pass


def set_params(self):
pass


def clear_attributes(self): # @todo: update
# if hasattr(self, '_X_diag'):
# delattr(self, '_X_diag')
if hasattr(self, '_graphs'):
delattr(self, '_graphs')
if hasattr(self, '_Y'):
delattr(self, '_Y')
if hasattr(self, '_run_time'):
delattr(self, '_run_time')


def validate_parameters(self):
"""Validate all parameters for the transformer.

Returns
-------
None.

"""
if self.parallel is not None and self.parallel != 'imap_unordered':
raise ValueError('Parallel mode is not set correctly.')

if self.parallel == 'imap_unordered' and self.n_jobs is None:
self.n_jobs = multiprocessing.cpu_count()


def validate_input(self, X):
"""Validate the given input and raise errors if it is invalid.

Parameters
----------
X : list
The input to check. Should be a list of graph.

Raises
------
ValueError
Raise if the input is not correct.

Returns
-------
X : list
The input. A list of graph.

"""
if X is None:
raise ValueError('Please add graphs before computing.')
elif not isinstance(X, list):
raise ValueError('Cannot detect graphs. The input must be a list.')
elif len(X) == 0:
raise ValueError('The graph list given is empty. No computation will be performed.')

return X


def compute_distance_matrix(self, Y=None):
"""Compute the distance matrix between given target graphs (Y) and
the fitted graphs (X / self._graphs), or the distance matrix for the fitted
graphs (X / self._graphs).

Parameters
----------
Y : list of graphs, optional
The target graphs. The default is None. If None, the distance matrix is
computed between X and itself.

Returns
-------
dis_matrix : numpy array, shape = [n_targets, n_inputs]
The computed distance matrix.

"""
if Y is None:
# Compute Gram matrix for self._graphs (X).
dis_matrix = self._compute_X_distance_matrix()
# self._gram_matrix_unnorm = np.copy(self._gram_matrix)

else:
# Compute kernel matrix between Y and self._graphs (X).
start_time = time.time()

if self.parallel == 'imap_unordered':
dis_matrix = self._compute_distance_matrix_imap_unordered(Y)

elif self.parallel is None:
Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y)
graphs_copy = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
dis_matrix = self._compute_distance_matrix_series(Y_copy, graphs_copy)

self._run_time = time.time() - start_time
if self.verbose:
print('Distance matrix of size (%d, %d) built in %s seconds.'
% (len(Y), len(self._graphs), self._run_time))

return dis_matrix


def _compute_distance_matrix_series(self, X, Y):
"""Compute the GED distance matrix between two sets of graphs (X and Y)
without parallelization.

Parameters
----------
X, Y : list of graphs
The input graphs.

Returns
-------
dis_matrix : numpy array, shape = [n_X, n_Y]
The computed distance matrix.

"""
dis_matrix = np.zeros((len(X), len(Y)))

for i_x, g_x in enumerate(X):
for i_y, g_y in enumerate(Y):
dis_matrix[i_x, i_y], _ = self.compute_ged(g_x, g_y)

return dis_matrix


def _compute_kernel_matrix_imap_unordered(self, Y):
"""Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) using imap unordered parallelization.

Parameters
----------
Y : list of graphs, optional
The target graphs.

Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
The computed kernel matrix.

"""
raise Exception('Parallelization for kernel matrix is not implemented.')


def diagonals(self):
"""Compute the kernel matrix diagonals of the fit/transformed data.

Returns
-------
X_diag : numpy array
The diagonal of the kernel matrix between the fitted data.
This consists of each element calculated with itself.

Y_diag : numpy array
The diagonal of the kernel matrix, of the transform.
This consists of each element calculated with itself.

"""
# Check if method "fit" had been called.
check_is_fitted(self, ['_graphs'])

# Check if the diagonals of X exist.
try:
check_is_fitted(self, ['_X_diag'])
except NotFittedError:
# Compute diagonals of X.
self._X_diag = np.empty(shape=(len(self._graphs),))
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
for i, x in enumerate(graphs):
self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel?

try:
# If transform has happened, return both diagonals.
check_is_fitted(self, ['_Y'])
self._Y_diag = np.empty(shape=(len(self._Y),))
Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y)
for (i, y) in enumerate(Y):
self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel?

return self._X_diag, self._Y_diag
except NotFittedError:
# Else just return both X_diag
return self._X_diag


# @abstractmethod
def pairwise_distance(self, x, y):
"""Compute the pairwise distance between two graphs.

Parameters
----------
x, y : NetworkX Graph.
Graphs between which the distance is computed.

Returns
-------
distance: float
The computed distance.

# Notes
# -----
# This method is abstract and must be implemented by a subclass.

"""
raise NotImplementedError('Pairwise distance computation is not implemented!')



def compute_edit_costs(self, Y=None, Y_targets=None):
"""Compute edit cost constants. When the optimization method is `fitted`,
apply Jia2021's metric learning method using the given target graphs (Y) or
the fitted graphs (X / self._graphs).

Parameters
----------
Y : TYPE, optional
DESCRIPTION. The default is None.

Returns
-------
None.

"""
# Get or compute.
if self.optim_method == 'random':
self._edit_cost_constants = np.random.rand(6)

elif self.optim_method == 'init':
self._edit_cost_constants = self.init_edit_cost_constants


elif self.optim_method == 'expert':
self._edit_cost_constants = [3, 3, 1, 3, 3, 1]


elif self.optim_method == 'fitted': # Jia2021 method
# Get proper inputs.
if Y is None:
check_is_fitted(self, ['_graphs'])
check_is_fitted(self, ['_targets'])
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
targets = self._targets
else:
graphs = ([g.copy() for g in Y] if self.copy_graphs else Y)
targets = Y_targets

# Get optimization options.
node_labels = self.node_labels
edge_labels = self.edge_labels
unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0)
from gklearn.ged.model.optim_costs import compute_optimal_costs
self._edit_cost_constants = compute_optimal_costs(
graphs, targets,
node_labels=node_labels, edge_labels=edge_labels,
unlabeled=unlabeled, ed_method=self.ed_method,
verbose=(self.verbose >= 2),
**self.optim_options)


##########################################################################
# The following is the 2nd paradigm to compute kernel matrix. It is
# simplified and not compatible with `scikit-learn`.
##########################################################################


# def compute(self, *graphs, **kwargs):
# self.parallel = kwargs.get('parallel', 'imap_unordered')
# self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
# self.normalize = kwargs.get('normalize', True)
# self.verbose = kwargs.get('verbose', 2)
# self.copy_graphs = kwargs.get('copy_graphs', True)
# self.save_unnormed = kwargs.get('save_unnormed', True)
# self.validate_parameters()

# # If the inputs is a list of graphs.
# if len(graphs) == 1:
# if not isinstance(graphs[0], list):
# raise Exception('Cannot detect graphs.')
# elif len(graphs[0]) == 0:
# raise Exception('The graph list given is empty. No computation was performed.')
# else:
# if self.copy_graphs:
# self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow.
# else:
# self._graphs = graphs
# self._gram_matrix = self._compute_gram_matrix()

# if self.save_unnormed:
# self._gram_matrix_unnorm = np.copy(self._gram_matrix)
# if self.normalize:
# self._gram_matrix = normalize_gram_matrix(self._gram_matrix)
# return self._gram_matrix, self._run_time

# elif len(graphs) == 2:
# # If the inputs are two graphs.
# if self.is_graph(graphs[0]) and self.is_graph(graphs[1]):
# if self.copy_graphs:
# G0, G1 = graphs[0].copy(), graphs[1].copy()
# else:
# G0, G1 = graphs[0], graphs[1]
# kernel = self._compute_single_kernel(G0, G1)
# return kernel, self._run_time

# # If the inputs are a graph and a list of graphs.
# elif self.is_graph(graphs[0]) and isinstance(graphs[1], list):
# if self.copy_graphs:
# g1 = graphs[0].copy()
# g_list = [g.copy() for g in graphs[1]]
# kernel_list = self._compute_kernel_list(g1, g_list)
# else:
# kernel_list = self._compute_kernel_list(graphs[0], graphs[1])
# return kernel_list, self._run_time

# elif isinstance(graphs[0], list) and self.is_graph(graphs[1]):
# if self.copy_graphs:
# g1 = graphs[1].copy()
# g_list = [g.copy() for g in graphs[0]]
# kernel_list = self._compute_kernel_list(g1, g_list)
# else:
# kernel_list = self._compute_kernel_list(graphs[1], graphs[0])
# return kernel_list, self._run_time

# else:
# raise Exception('Cannot detect graphs.')

# elif len(graphs) == 0 and self._graphs is None:
# raise Exception('Please add graphs before computing.')

# else:
# raise Exception('Cannot detect graphs.')


# def normalize_gm(self, gram_matrix):
# import warnings
# warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning)

# diag = gram_matrix.diagonal().copy()
# for i in range(len(gram_matrix)):
# for j in range(i, len(gram_matrix)):
# gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
# gram_matrix[j][i] = gram_matrix[i][j]
# return gram_matrix


# def compute_distance_matrix(self):
# if self._gram_matrix is None:
# raise Exception('Please compute the Gram matrix before computing distance matrix.')
# dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix)))
# for i in range(len(self._gram_matrix)):
# for j in range(i, len(self._gram_matrix)):
# dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j]
# if dis < 0:
# if dis > -1e-10:
# dis = 0
# else:
# raise ValueError('The distance is negative.')
# dis_mat[i, j] = np.sqrt(dis)
# dis_mat[j, i] = dis_mat[i, j]
# dis_max = np.max(np.max(dis_mat))
# dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
# dis_mean = np.mean(np.mean(dis_mat))
# return dis_mat, dis_max, dis_min, dis_mean


def _compute_X_distance_matrix(self):
start_time = time.time()

if self.parallel == 'imap_unordered':
dis_matrix = self._compute_X_dm_imap_unordered()
elif self.parallel is None:
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
dis_matrix = self._compute_X_dm_series(graphs)
else:
raise Exception('Parallel mode is not set correctly.')

self._run_time = time.time() - start_time
if self.verbose:
print('Distance matrix of size %d built in %s seconds.'
% (len(self._graphs), self._run_time))

return dis_matrix


def _compute_X_dm_series(self, graphs):
N = len(graphs)
dis_matrix = np.zeros((N, N))

for i, G1 in get_iters(enumerate(graphs), desc='Computing distance matrix', file=sys.stdout, verbose=(self.verbose >= 2)):
for j, G2 in enumerate(graphs[i+1:], i+1):
dis_matrix[i, j], _ = self.compute_ged(G1, G2)
dis_matrix[j, i] = dis_matrix[i, j]
return dis_matrix


def _compute_X_dm_imap_unordered(self, graphs):
pass


def compute_ged(self, Gi, Gj, **kwargs):
"""
Compute the GED between two graphs according to the fitted edit costs.
"""
ged_options = {'edit_cost': self.edit_cost_fun,
'method': self.ed_method,
'edit_cost_constants': self._edit_cost_constants}
dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10)
n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward,
edit_cost=self.edit_cost_fun,
node_labels=self.node_labels,
edge_labels=self.edge_labels)
return dis, n_eo_tmp


# def _compute_kernel_list(self, g1, g_list):
# start_time = time.time()

# if self.parallel == 'imap_unordered':
# kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list)
# elif self.parallel is None:
# kernel_list = self._compute_kernel_list_series(g1, g_list)
# else:
# raise Exception('Parallel mode is not set correctly.')

# self._run_time = time.time() - start_time
# if self.verbose:
# print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.'
# % (len(g_list), self._run_time))

# return kernel_list


# def _compute_kernel_list_series(self, g1, g_list):
# pass


# def _compute_kernel_list_imap_unordered(self, g1, g_list):
# pass


# def _compute_single_kernel(self, g1, g2):
# start_time = time.time()

# kernel = self._compute_single_kernel_series(g1, g2)

# self._run_time = time.time() - start_time
# if self.verbose:
# print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time))

# return kernel


# def _compute_single_kernel_series(self, g1, g2):
# pass


def is_graph(self, graph):
if isinstance(graph, nx.Graph):
return True
if isinstance(graph, nx.DiGraph):
return True
if isinstance(graph, nx.MultiGraph):
return True
if isinstance(graph, nx.MultiDiGraph):
return True
return False


@property
def graphs(self):
return self._graphs


# @property
# def parallel(self):
# return self.parallel


# @property
# def n_jobs(self):
# return self.n_jobs


# @property
# def verbose(self):
# return self.verbose


# @property
# def normalize(self):
# return self.normalize


@property
def run_time(self):
return self._run_time


@property
def dis_matrix(self):
return self._dis_matrix

@dis_matrix.setter
def dis_matrix(self, value):
self._dis_matrix = value


# @property
# def gram_matrix_unnorm(self):
# return self._gram_matrix_unnorm

# @gram_matrix_unnorm.setter
# def gram_matrix_unnorm(self, value):
# self._gram_matrix_unnorm = value
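
To show how the pieces of `GEDModel` are intended to fit together, here is a hedged usage sketch. It assumes the GEDLIB backend is compiled, and `Gn_train`, `y_train`, `Gn_test` are placeholder names for lists of NetworkX graphs (with an illustrative 'atom' node label) and their regression targets:

from gklearn.ged import GEDModel
from gklearn.ged.model.distances import euclid_d

model = GEDModel(ed_method='BIPARTITE',
                 edit_cost_fun='CONSTANT',
                 optim_method='fitted',  # learn edit costs from the targets (Jia2021)
                 optim_options={'y_distance': euclid_d, 'mode': 'reg'},
                 node_labels=['atom'],
                 parallel=None,
                 verbose=2)

# Fit on the training graphs/targets and get the train-vs-train distance matrix.
D_train = model.fit_transform(Gn_train, y_train, save_dm_train=True)

# Distances between unseen graphs and the fitted training graphs.
D_test = model.transform(Gn_test)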

gklearn/ged/model/optim_costs.py (+149, -0)

@@ -0,0 +1,149 @@
import numpy as np

from gklearn.ged.model.distances import sum_squares, euclid_d
from gklearn.ged.model.ged_com import compute_geds


def optimize_costs_unlabeled(nb_cost_mat, dis_k_vec):
"""
Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat
! take care that nb_cost_mat does not contain zero rows
:param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graphs
:param dis_k_vec: The N distances to fit
"""
import cvxpy as cp
import numpy as np
MAX_SAMPLE = 1000
nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]] for x in nb_cost_mat])
dis_k_vec = np.array(dis_k_vec)
# dis_k_vec_norm = dis_k_vec/np.max(dis_k_vec)

# import pickle
# pickle.dump([nb_cost_mat, dis_k_vec], open('debug', 'wb'))
N = nb_cost_mat_m.shape[0]
sub_sample = np.random.permutation(np.arange(N))
sub_sample = sub_sample[:MAX_SAMPLE]

x = cp.Variable(nb_cost_mat_m.shape[1])
cost = cp.sum_squares((nb_cost_mat_m[sub_sample, :] @ x) - dis_k_vec[sub_sample])
prob = cp.Problem(cp.Minimize(cost), [x >= 0])
prob.solve()
edit_costs_new = [x.value[0], x.value[1], 0, x.value[2], x.value[3], 0]
edit_costs_new = [xi if xi > 0 else 0 for xi in edit_costs_new]
residual = prob.value
return edit_costs_new, residual


def optimize_costs_classif_unlabeled(nb_cost_mat, Y):
"""
Optimize edit costs to fit dis_k_vec according to edit operations in
nb_cost_mat
! take care that nb_cost_mat does not contain zero rows
:param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit
operations for each pair of graphs
:param dis_k_vec: {-1,1}^N vector of common classes
"""
# import cvxpy as cp
from ml import reg_log
# import pickle
# pickle.dump([nb_cost_mat, Y], open('debug', 'wb'))
nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]]
for x in nb_cost_mat])
w, J, _ = reg_log(nb_cost_mat_m, Y, pos_contraint=True)
edit_costs_new = [w[0], w[1], 0, w[2], w[3], 0]
residual = J[-1]

return edit_costs_new, residual


def optimize_costs_classif(nb_cost_mat, Y):
"""
Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat
! take care that nb_cost_mat does not contain zero rows
:param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graphs
:param dis_k_vec: {-1,1}^N vector of common classes
"""
#import pickle
# pickle.dump([nb_cost_mat, Y], open("test.pickle", "wb"))
from ml import reg_log
w, J, _ = reg_log(nb_cost_mat, Y, pos_contraint=True)
return w, J[-1]


def optimize_costs(nb_cost_mat, dis_k_vec):
"""
Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat
! take care that nb_cost_mat does not contain zero rows
:param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graphs
:param dis_k_vec: The N distances to fit
"""
import cvxpy as cp
x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares((nb_cost_mat @ x) - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost), constraints)
prob.solve()
edit_costs_new = x.value
residual = prob.value

return edit_costs_new, residual


def compute_optimal_costs(G, y, init_costs=[3, 3, 1, 3, 3, 1],
y_distance=euclid_d,
mode='reg', unlabeled=False,
ed_method='BIPARTITE',
verbose=True,
**kwargs):
N = len(y)

G_pairs = []
distances_vec = []

for i in range(N):
for j in range(i+1, N):
G_pairs.append([i, j])
distances_vec.append(y_distance(y[i], y[j]))
ged_vec_init, n_edit_operations = compute_geds(G_pairs, G, init_costs, ed_method,
verbose=verbose, **kwargs)

residual_list = [sum_squares(ged_vec_init, distances_vec)]

if (mode == 'reg'):
if unlabeled:
method_optim = optimize_costs_unlabeled
else:
method_optim = optimize_costs

elif (mode == 'classif'):
if unlabeled:
method_optim = optimize_costs_classif_unlabeled
else:
method_optim = optimize_costs_classif

ite_max = 5
for i in range(ite_max):
if verbose:
print('ite', i + 1, '/', ite_max, ':')
# compute GEDs and numbers of edit operations.
edit_costs_new, residual = method_optim(
np.array(n_edit_operations), distances_vec)
ged_vec, n_edit_operations = compute_geds(G_pairs, G, edit_costs_new, ed_method,
verbose=verbose, **kwargs)
residual_list.append(sum_squares(ged_vec, distances_vec))

return edit_costs_new


def get_optimal_costs_GH2020(**kwargs):
import pickle
import os
dir_root = 'cj/output/'
ds_name = kwargs.get('ds_name')
nb_trial = kwargs.get('nb_trial')
file_name = os.path.join(dir_root, 'costs.' + ds_name + '.' + str(nb_trial) + '.pkl')
with open(file_name, 'rb') as f:
edit_costs = pickle.load(f)
return edit_costs
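
The core of `optimize_costs` above is a small constrained least-squares problem: find non-negative edit costs x so that the per-pair edit-operation counts multiplied by x approximate the target distances, with each substitution cost bounded by the corresponding insertion plus deletion cost. A self-contained toy instance with synthetic data (cvxpy required; these are not the experiments' actual inputs):

import cvxpy as cp
import numpy as np

rng = np.random.default_rng(0)
n_pairs = 50
# Counts of the 6 edit operations for each graph pair (synthetic).
nb_cost_mat = rng.integers(0, 5, size=(n_pairs, 6)).astype(float)
true_costs = np.array([3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
dis_k_vec = nb_cost_mat @ true_costs + rng.normal(0, 0.1, n_pairs)  # noisy targets

x = cp.Variable(6)
objective = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= 0.01,
               np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]) @ x >= 0.0,  # c_vs <= c_vi + c_vr
               np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]) @ x >= 0.0]  # c_es <= c_ei + c_er
prob = cp.Problem(cp.Minimize(objective), constraints)
prob.solve()
print(x.value)    # fitted costs, close to true_costs
print(prob.value) # residual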

gklearn/ged/util/util.py (+179, -31)

@@ -64,10 +64,12 @@ def pairwise_ged(g1, g2, options={}, sort=True, repeats=1, parallel=False, verbo
g = listID[0]
h = listID[1]
dis_min = np.inf
# print('------------------------------------------')
for i in range(0, repeats):
ged_env.run_method(g, h)
upper = ged_env.get_upper_bound(g, h)
dis = upper
# print(dis)
if dis < dis_min:
dis_min = dis
pi_forward = ged_env.get_forward_map(g, h)
@@ -169,12 +171,100 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
return ged_vec, ged_mat, n_edit_operations




def compute_geds(graphs, options={}, sort=True, repeats=1, parallel=False, n_jobs=None, verbose=True):
#%%


def compute_geds(graphs,
options={},
sort=True,
repeats=1,
permute_nodes=False,
random_state=None,
parallel=False,
n_jobs=None,
verbose=True):
"""Compute graph edit distance matrix using GEDLIB.
"""
if permute_nodes:
return _compute_geds_with_permutation(graphs,
options=options,
sort=sort,
repeats=repeats,
random_state=random_state,
parallel=parallel,
n_jobs=n_jobs,
verbose=verbose)
else:
return _compute_geds_without_permutation(graphs,
options=options,
sort=sort,
repeats=repeats,
parallel=parallel,
n_jobs=n_jobs,
verbose=verbose)


#%%


def _compute_geds_with_permutation(graphs,
options={},
sort=True,
repeats=1,
random_state=None,
parallel=False,
n_jobs=None,
verbose=True):

from gklearn.utils.utils import nx_permute_nodes

# Initialize variables.
ged_mat_optim = np.full((len(graphs), len(graphs)), np.inf)
np.fill_diagonal(ged_mat_optim, 0)
len_itr = int(len(graphs) * (len(graphs) - 1) / 2)
ged_vec = [0] * len_itr
n_edit_operations = [0] * len_itr

# For each repeat:
for i in range(0, repeats):
# Permute nodes.
graphs_pmut = [nx_permute_nodes(g, random_state=random_state) for g in graphs]

out = _compute_geds_without_permutation(graphs_pmut,
options=options,
sort=sort,
repeats=1,
parallel=parallel,
n_jobs=n_jobs,
verbose=verbose)

# Compare current results with the best one.
idx_cnt = 0
for i in range(len(graphs)):
for j in range(i + 1, len(graphs)):
if out[1][i, j] < ged_mat_optim[i ,j]:
ged_mat_optim[i, j] = out[1][i, j]
ged_mat_optim[j, i] = out[1][j, i]
ged_vec[idx_cnt] = out[0][idx_cnt]
n_edit_operations[idx_cnt] = out[2][idx_cnt]
idx_cnt += 1

return ged_vec, ged_mat_optim, n_edit_operations


def _compute_geds_without_permutation(graphs,
options={},
sort=True,
repeats=1,
parallel=False,
n_jobs=None,
verbose=True):
from gklearn.gedlib import librariesImport, gedlibpy


# initialize ged env.
ged_env = gedlibpy.GEDEnv()
ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants'])

for g in graphs:
ged_env.add_nx_graph(g, '')
listID = ged_env.get_all_graph_ids()
@@ -266,6 +356,11 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats):
dis = upper


# make the map label correct (label remove map as np.inf)
# Attention: using node indices instead of NetworkX node labels (as
# implemented here) may cause several issues:
# - Fail if NetworkX node labels are not consecutive integers;
# - Return wrong mappings if nodes are permuted (e.g., by using
# `gklearn.utils.utils.nx_permute_nodes()`).
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
@@ -278,46 +373,57 @@ def _compute_ged(env, gid1, gid2, g1, g2, repeats):
pi_forward_min = pi_forward
pi_backward_min = pi_backward


# print('-----')
# print(pi_forward_min)
# print(pi_backward_min)

return dis_min, pi_forward_min, pi_backward_min




def label_costs_to_matrix(costs, nb_labels):
"""Reform a label cost vector to a matrix.
#%%


def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
"""Calculate the number of occurrences of each edit operation in a given
edit path.


Parameters
----------
costs : numpy.array
The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitition costs, edge insertion costs, edge deletion costs, edge substitition costs.
nb_labels : integer
Number of labels.
g1 : TYPE
DESCRIPTION.
g2 : TYPE
DESCRIPTION.
forward_map : TYPE
DESCRIPTION.
backward_map : TYPE
DESCRIPTION.
edit_cost : TYPE, optional
DESCRIPTION. The default is None.
is_cml : TYPE, optional
DESCRIPTION. The default is False.
**kwargs : TYPE
DESCRIPTION.

Raises
------
Exception
DESCRIPTION.


Returns
-------
cost_matrix : numpy.array.
The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
TYPE
DESCRIPTION.

Notes
-----
Attention: when implementing a function to get the numbers of edit
operations, make sure that:
- It does not fail if NetworkX node labels are not consecutive integers;
- It returns correct results if nodes are permuted (e.g., by using
`gklearn.utils.utils.nx_permute_nodes()`).
Generally speaking, it means you need to distinguish the NetworkX label of
a node from the position (index) of that node in the node list.
"""
# Initialize label cost matrix.
cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
i = 0
# Costs of insertions.
for col in range(1, nb_labels + 1):
cost_matrix[0, col] = costs[i]
i += 1
# Costs of deletions.
for row in range(1, nb_labels + 1):
cost_matrix[row, 0] = costs[i]
i += 1
# Costs of substitutions.
for row in range(1, nb_labels + 1):
for col in range(row + 1, nb_labels + 1):
cost_matrix[row, col] = costs[i]
cost_matrix[col, row] = costs[i]
i += 1

return cost_matrix


def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
if is_cml:
if edit_cost == 'CONSTANT':
node_labels = kwargs.get('node_labels', [])
@@ -611,6 +717,48 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es




#%%


def label_costs_to_matrix(costs, nb_labels):
"""Reform a label cost vector to a matrix.

Parameters
----------
costs : numpy.array
The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs.
nb_labels : integer
Number of labels.

Returns
-------
cost_matrix : numpy.array.
The reformed label cost matrix of size (nb_labels, nb_labels). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
"""
# Initialize label cost matrix.
cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
i = 0
# Costs of insertions.
for col in range(1, nb_labels + 1):
cost_matrix[0, col] = costs[i]
i += 1
# Costs of deletions.
for row in range(1, nb_labels + 1):
cost_matrix[row, 0] = costs[i]
i += 1
# Costs of substitutions.
for row in range(1, nb_labels + 1):
for col in range(row + 1, nb_labels + 1):
cost_matrix[row, col] = costs[i]
cost_matrix[col, row] = costs[i]
i += 1

return cost_matrix
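
For a concrete picture of the layout this produces: with `nb_labels = 2`, the cost vector is read as two insertion costs, two deletion costs, then one substitution cost, and row/column 0 is the dummy label. A small worked example (values are arbitrary):

import numpy as np
from gklearn.ged.util.util import label_costs_to_matrix

costs = np.array([1.0, 2.0, 3.0, 4.0, 5.0])  # [ins(l1), ins(l2), del(l1), del(l2), sub(l1, l2)]
print(label_costs_to_matrix(costs, 2))
# [[0. 1. 2.]
#  [3. 0. 5.]
#  [4. 5. 0.]]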


#%%


def ged_options_to_string(options):
opt_str = ' '
for key, val in options.items():


gklearn/kernels/graph_kernel.py (+39, -20)

@@ -32,7 +32,13 @@ class GraphKernel(BaseEstimator): #, ABC):
https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
"""


def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2):
def __init__(self,
parallel=None,
n_jobs=None,
chunksize=None,
normalize=True,
copy_graphs=True, # make sure it is a full deep copy. and faster!
verbose=2):
"""`__init__` for `GraphKernel` object."""
# @todo: the default settings of the parameters are different from those in the self.compute method.
# self._graphs = None
@@ -40,6 +46,7 @@ class GraphKernel(BaseEstimator): #, ABC):
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.chunksize = chunksize self.chunksize = chunksize
self.normalize = normalize self.normalize = normalize
self.copy_graphs = copy_graphs
self.verbose = verbose self.verbose = verbose
# self._run_time = 0 # self._run_time = 0
# self._gram_matrix = None # self._gram_matrix = None
@@ -90,7 +97,7 @@ class GraphKernel(BaseEstimator): #, ABC):
return self return self




def transform(self, X):
def transform(self, X=None, load_gm_train=False):
"""Compute the graph kernel matrix between given and fitted data.


Parameters
@@ -108,6 +115,12 @@ class GraphKernel(BaseEstimator): #, ABC):
None.


"""
# If `load_gm_train`, load Gram matrix of training data.
if load_gm_train:
check_is_fitted(self, '_gm_train')
self._is_transformed = True
return self._gm_train # @todo: copy or not?

# Check if method "fit" had been called.
check_is_fitted(self, '_graphs')


@@ -133,8 +146,7 @@ class GraphKernel(BaseEstimator): #, ABC):
return kernel_matrix





def fit_transform(self, X):
def fit_transform(self, X, save_gm_train=False):
"""Fit and transform: compute Gram matrix on the same data.


Parameters
@@ -164,6 +176,9 @@ class GraphKernel(BaseEstimator): #, ABC):
finally:
np.seterr(**old_settings)


if save_gm_train:
self._gm_train = gram_matrix

return gram_matrix
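
To show how the new save_gm_train / load_gm_train options fit together, a usage sketch (not an example from the library's docs; train_graphs / test_graphs are placeholder lists and 'atom' is a hypothetical node label name):

from gklearn.kernels import Treelet
from gklearn.utils.kernels import gaussian_kernel

kernel = Treelet(sub_kernel=gaussian_kernel, node_labels=['atom'])
gm_train = kernel.fit_transform(train_graphs, save_gm_train=True)  # Gram matrix on the training graphs, cached in _gm_train
gm_cached = kernel.transform(load_gm_train=True)                   # returns the cached training Gram matrix without recomputing
km_test = kernel.transform(test_graphs)                            # kernel matrix between the test and the fitted training graphs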




@@ -260,7 +275,9 @@ class GraphKernel(BaseEstimator): #, ABC):
kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y)


elif self.parallel is None:
kernel_matrix = self._compute_kernel_matrix_series(Y)
Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y)
graphs_copy = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
kernel_matrix = self._compute_kernel_matrix_series(Y_copy, graphs_copy)


self._run_time = time.time() - start_time
if self.verbose:
@@ -270,26 +287,25 @@ class GraphKernel(BaseEstimator): #, ABC):
return kernel_matrix




def _compute_kernel_matrix_series(self, Y):
"""Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) without parallelization.
def _compute_kernel_matrix_series(self, X, Y):
"""Compute the kernel matrix between two sets of graphs (X and Y) without parallelization.


Parameters
----------
Y : list of graphs, optional
The target graphs.
X, Y : list of graphs
The input graphs.


Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
kernel_matrix : numpy array, shape = [n_X, n_Y]
The computed kernel matrix.


"""
kernel_matrix = np.zeros((len(Y), len(self._graphs)))
kernel_matrix = np.zeros((len(X), len(Y)))


for i_y, g_y in enumerate(Y):
for i_x, g_x in enumerate(self._graphs):
kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x)
for i_x, g_x in enumerate(X):
for i_y, g_y in enumerate(Y):
kernel_matrix[i_x, i_y] = self.pairwise_kernel(g_x, g_y)


return kernel_matrix


@@ -335,14 +351,16 @@ class GraphKernel(BaseEstimator): #, ABC):
except NotFittedError:
# Compute diagonals of X.
self._X_diag = np.empty(shape=(len(self._graphs),))
for i, x in enumerate(self._graphs):
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
for i, x in enumerate(graphs):
self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel?


try:
# If transform has happened, return both diagonals.
check_is_fitted(self, ['_Y'])
self._Y_diag = np.empty(shape=(len(self._Y),))
for (i, y) in enumerate(self._Y):
Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y)
for (i, y) in enumerate(Y):
self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel?


return self._X_diag, self._Y_diag
@@ -484,7 +502,8 @@ class GraphKernel(BaseEstimator): #, ABC):
if self.parallel == 'imap_unordered':
gram_matrix = self._compute_gm_imap_unordered()
elif self.parallel is None:
gram_matrix = self._compute_gm_series()
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
gram_matrix = self._compute_gm_series(graphs)
else:
raise Exception('Parallel mode is not set correctly.')


@@ -496,11 +515,11 @@ class GraphKernel(BaseEstimator): #, ABC):
return gram_matrix




def _compute_gm_series(self):
def _compute_gm_series(self, graphs):
pass




def _compute_gm_imap_unordered(self):
def _compute_gm_imap_unordered(self, graphs):
pass






+ 39
- 27
gklearn/kernels/treelet.py View File

@@ -28,16 +28,16 @@ from gklearn.kernels import GraphKernel


class Treelet(GraphKernel): class Treelet(GraphKernel):


def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs):
def __init__(self, **kwargs):
"""Initialise a treelet kernel.
"""
super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose)
GraphKernel.__init__(self, **{k: kwargs.get(k) for k in ['parallel', 'n_jobs', 'chunksize', 'normalize', 'copy_graphs', 'verbose'] if k in kwargs})
self.node_labels = kwargs.get('node_labels', [])
self.edge_labels = kwargs.get('edge_labels', [])
self.sub_kernel = kwargs.get('sub_kernel', None)
self.ds_infos = kwargs.get('ds_infos', {})
self.precompute_canonkeys = precompute_canonkeys
self.save_canonkeys = save_canonkeys
self.precompute_canonkeys = kwargs.get('precompute_canonkeys', True)
self.save_canonkeys = kwargs.get('save_canonkeys', True)




##########################################################################
@@ -71,7 +71,7 @@ class Treelet(GraphKernel):
raise ValueError('Sub-kernel not set.')




def _compute_kernel_matrix_series(self, Y):
def _compute_kernel_matrix_series(self, Y, X=None, load_canonkeys=True):
"""Compute the kernel matrix between a given target graphs (Y) and """Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) without parallelization. the fitted graphs (X / self._graphs) without parallelization.


@@ -86,36 +86,45 @@ class Treelet(GraphKernel):
The computed kernel matrix.


"""
if_comp_X_canonkeys = True

# if load saved canonkeys of X from the instance:
if load_canonkeys:
# Canonical keys for self._graphs.
try:
check_is_fitted(self, ['_canonkeys'])
canonkeys_list1 = self._canonkeys
if_comp_X_canonkeys = False
except NotFittedError:
import warnings
warnings.warn('The canonkeys of self._graphs are not computed/saved. The keys of `X` are computed instead.')
if_comp_X_canonkeys = True


# self._add_dummy_labels will modify the input in place.
self._add_dummy_labels() # For self._graphs
# Y = [g.copy() for g in Y] # @todo: ?
self._add_dummy_labels(Y)


# get all canonical keys of all graphs before computing kernels to save # get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset. # time, but this may cost a lot of memory for large dataset.


# Canonical keys for self._graphs.
try:
check_is_fitted(self, ['_canonkeys'])
canonkeys_list1 = self._canonkeys
except NotFittedError:
# Compute the canonical keys of X.
if if_comp_X_canonkeys:
if X is None:
raise ValueError('X cannot be None.')
# self._add_dummy_labels will modify the input in place.
self._add_dummy_labels(X) # for X
canonkeys_list1 = []
iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2))
iterator = get_iters(self._graphs, desc='Getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2))
for g in iterator:
canonkeys_list1.append(self._get_canonkeys(g))


if self.save_canonkeys:
self._canonkeys = canonkeys_list1

# Canonical keys for Y. # Canonical keys for Y.
# Y = [g.copy() for g in Y] # @todo: ?
self._add_dummy_labels(Y)
canonkeys_list2 = []
iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2))
iterator = get_iters(Y, desc='Getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2))
for g in iterator:
canonkeys_list2.append(self._get_canonkeys(g))


if self.save_canonkeys:
self._Y_canonkeys = canonkeys_list2
# if self.save_canonkeys:
# self._Y_canonkeys = canonkeys_list2


# compute kernel matrix. # compute kernel matrix.
kernel_matrix = np.zeros((len(Y), len(canonkeys_list1))) kernel_matrix = np.zeros((len(Y), len(canonkeys_list1)))
@@ -235,13 +244,13 @@ class Treelet(GraphKernel):
########################################################################## ##########################################################################




def _compute_gm_series(self):
self._add_dummy_labels(self._graphs)
def _compute_gm_series(self, graphs):
self._add_dummy_labels(graphs)


# get all canonical keys of all graphs before computing kernels to save # get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset. # time, but this may cost a lot of memory for large dataset.
canonkeys = []
iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout,
iterator = get_iters(graphs, desc='getting canonkeys', file=sys.stdout,
verbose=(self.verbose >= 2))
for g in iterator:
canonkeys.append(self._get_canonkeys(g))
@@ -250,11 +259,11 @@ class Treelet(GraphKernel):
self._canonkeys = canonkeys


# compute Gram matrix. # compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
gram_matrix = np.zeros((len(graphs), len(graphs)))


from itertools import combinations_with_replacement from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
itr = combinations_with_replacement(range(0, len(graphs)), 2)
len_itr = int(len(graphs) * (len(graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self.verbose >= 2))
for i, j in iterator:
@@ -390,6 +399,9 @@ class Treelet(GraphKernel):
Treelet kernel between 2 graphs.
"""
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs
if len(keys) == 0: # There is nothing in common...
return 0

vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys])
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys])
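
A minimal illustration of why the early return was added (the canonical-key counts below are made up):

canonkey1 = {('C',): 3, ('C', 'O'): 1}   # hypothetical canonical keys -> counts
canonkey2 = {('N',): 2}
# set(canonkey1.keys()) & set(canonkey2.keys()) is empty, so the kernel is now 0
# instead of evaluating self.sub_kernel on two zero-length count vectors.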




+ 52
- 103
gklearn/kernels/weisfeiler_lehman.py View File

@@ -28,7 +28,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.




def __init__(self, **kwargs):
GraphKernel.__init__(self)
GraphKernel.__init__(self, **{k: kwargs.get(k) for k in ['parallel', 'n_jobs', 'chunksize', 'normalize', 'copy_graphs', 'verbose'] if k in kwargs})
self.node_labels = kwargs.get('node_labels', [])
self.edge_labels = kwargs.get('edge_labels', [])
self.height = int(kwargs.get('height', 0))
@@ -50,7 +50,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
##########################################################################




def _compute_gm_series(self):
def _compute_gm_series(self, graphs):
# if self.verbose >= 2:
# import warnings
# warnings.warn('A part of the computation is parallelized.')
@@ -59,19 +59,19 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


# for WL subtree kernel
if self._base_kernel == 'subtree':
gram_matrix = self._subtree_kernel_do(self._graphs)
gram_matrix = self._subtree_kernel_do(graphs)


# for WL shortest path kernel
elif self._base_kernel == 'sp':
gram_matrix = self._sp_kernel_do(self._graphs)
gram_matrix = self._sp_kernel_do(graphs)


# for WL edge kernel
elif self._base_kernel == 'edge':
gram_matrix = self._edge_kernel_do(self._graphs)
gram_matrix = self._edge_kernel_do(graphs)


# for user defined base kernel
else:
gram_matrix = self._user_kernel_do(self._graphs)
gram_matrix = self._user_kernel_do(graphs)


return gram_matrix


@@ -204,70 +204,13 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.




def pairwise_kernel(self, g1, g2): def pairwise_kernel(self, g1, g2):
Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster!
kernel = 0

# initial for height = 0
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration

# for each graph
for G in Gn:
# set all labels into a tuple.
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way.
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
# get the set of original labels
labels_ori = list(nx.get_node_attributes(G, 'lt').values())
# number of occurence of each label in G
all_num_of_each_label.append(dict(Counter(labels_ori)))

# Compute subtree kernel with the 0th iteration and add it to the final kernel.
kernel = self._compute_kernel_itr(kernel, all_num_of_each_label)

# iterate each height
for h in range(1, self.height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
# all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
all_num_of_each_label = [] # number of occurence of each label in G

# @todo: parallel this part.
for G in Gn:

all_multisets = []
for node, attrs in G.nodes(data=True):
# Multiset-label determination.
multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]]
# sorting each multiset
multiset.sort()
multiset = [attrs['lt']] + multiset # add the prefix
all_multisets.append(tuple(multiset))

# label compression
set_unique = list(set(all_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# if a label occured before, assign its former compressed label,
# else assign the number of labels occured + 1 as the compressed label.
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed[value] = all_set_compressed[value]
else:
set_compressed[value] = str(num_of_labels_occured + 1)
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)

# relabel nodes
for idx, node in enumerate(G.nodes()):
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

# get the set of compressed labels
labels_comp = list(nx.get_node_attributes(G, 'lt').values())
# all_labels_ori.update(labels_comp)
all_num_of_each_label.append(dict(Counter(labels_comp)))
# Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster!
Gn = [g1, g2]
# for WL subtree kernel
if self._base_kernel == 'subtree':
kernel = self._subtree_kernel_do(Gn, return_mat=False)


# Compute subtree kernel with h iterations and add it to the final kernel
kernel = self._compute_kernel_itr(kernel, all_num_of_each_label)
# @todo: other subkernels.


return kernel
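
The deleted block inlined one full WL relabelling sweep; for reference, a toy, self-contained sketch of that multiset-label compression step (the graph and labels are made up; this is not the class's API):

import networkx as nx
from collections import Counter

G = nx.path_graph(3)                                        # 0 - 1 - 2
nx.set_node_attributes(G, {0: 'C', 1: 'O', 2: 'C'}, 'lt')

all_set_compressed, num_occurred = {}, 0
multisets = []
for node, attrs in G.nodes(data=True):
    neighbours = sorted(G.nodes[n]['lt'] for n in G[node])
    multisets.append(tuple([attrs['lt']] + neighbours))     # own label as prefix, sorted neighbour labels
for idx, node in enumerate(G.nodes()):
    key = multisets[idx]
    if key not in all_set_compressed:                       # assign a fresh compressed label
        num_occurred += 1
        all_set_compressed[key] = str(num_occurred)
    G.nodes[node]['lt'] = all_set_compressed[key]

print(Counter(nx.get_node_attributes(G, 'lt').values()))    # Counter({'1': 2, '2': 1})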


@@ -291,7 +234,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
return kernel




def _subtree_kernel_do_nl(self, Gn):
def _subtree_kernel_do_nl(self, Gn, return_mat=True):
"""Compute Weisfeiler-Lehman kernels between graphs with node labels.


Parameters
@@ -301,10 +244,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


Return
------
gram_matrix : Numpy matrix
kernel_matrix : Numpy matrix / float
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))
kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)


# initial for height = 0
all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration
@@ -324,7 +268,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
all_num_of_each_label.append(dict(Counter(labels_ori)))


# Compute subtree kernel with the 0th iteration and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)


# iterate each height
for h in range(1, self.height + 1):
@@ -342,12 +286,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)


# Compute subtree kernel with h iterations and add it to the final kernel
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)


return gram_matrix
return kernel_matrix




def _subtree_kernel_do_el(self, Gn):
def _subtree_kernel_do_el(self, Gn, return_mat=True):
"""Compute Weisfeiler-Lehman kernels between graphs with edge labels. """Compute Weisfeiler-Lehman kernels between graphs with edge labels.


Parameters Parameters
@@ -357,19 +301,20 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


Return
------
gram_matrix : Numpy matrix
kernel_matrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))
kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)


# initial for height = 0 # initial for height = 0
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration


# Compute subtree kernel with the 0th iteration and add it to the final kernel. # Compute subtree kernel with the 0th iteration and add it to the final kernel.
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2)
for i, j in iterator:
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
gram_matrix[j][i] = gram_matrix[i][j]
iterator = combinations_with_replacement(range(0, len(kernel_matrix)), 2)
for i, j in iterator: # @todo: not correct if return_mat == False.
kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
kernel_matrix[j][i] = kernel_matrix[i][j]




# if h >= 1. # if h >= 1.
@@ -393,7 +338,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)


# Compute subtree kernel with h iterations and add it to the final kernel. # Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)




# Iterate along heights (>= 2). # Iterate along heights (>= 2).
@@ -407,12 +352,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)


# Compute subtree kernel with h iterations and add it to the final kernel. # Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)


return gram_matrix
return kernel_matrix




def _subtree_kernel_do_labeled(self, Gn):
def _subtree_kernel_do_labeled(self, Gn, return_mat=True):
"""Compute Weisfeiler-Lehman kernels between graphs with both node and """Compute Weisfeiler-Lehman kernels between graphs with both node and
edge labels. edge labels.


@@ -423,10 +368,11 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


Return
------
gram_matrix : Numpy matrix
kernel_matrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))
kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)


# initial for height = 0 # initial for height = 0
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
@@ -446,10 +392,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
all_num_of_each_label.append(dict(Counter(labels_ori))) all_num_of_each_label.append(dict(Counter(labels_ori)))


# Compute subtree kernel with the 0th iteration and add it to the final kernel. # Compute subtree kernel with the 0th iteration and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)




# if h >= 1.
# if h >= 1:
if self.height > 0: if self.height > 0:
# Set all edge labels into a tuple. # @todo: remove this original labels or not? # Set all edge labels into a tuple. # @todo: remove this original labels or not?
if self.verbose >= 2: if self.verbose >= 2:
@@ -470,7 +416,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)


# Compute subtree kernel with h iterations and add it to the final kernel. # Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)




# Iterate along heights. # Iterate along heights.
@@ -484,12 +430,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)


# Compute subtree kernel with h iterations and add it to the final kernel. # Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)


return gram_matrix
return kernel_matrix




def _subtree_kernel_do_unlabeled(self, Gn):
def _subtree_kernel_do_unlabeled(self, Gn, return_mat=True):
"""Compute Weisfeiler-Lehman kernels between graphs without labels. """Compute Weisfeiler-Lehman kernels between graphs without labels.


Parameters Parameters
@@ -499,19 +445,20 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


Return
------
gram_matrix : Numpy matrix
kernel_matrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))
kernel_matrix = (np.zeros((len(Gn), len(Gn))) if return_mat else 0)
gram_itr_fun = (self._compute_gram_itr if return_mat else self._compute_kernel_itr)


# initial for height = 0 # initial for height = 0
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration


# Compute subtree kernel with the 0th iteration and add it to the final kernel. # Compute subtree kernel with the 0th iteration and add it to the final kernel.
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2)
for i, j in iterator:
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
gram_matrix[j][i] = gram_matrix[i][j]
iterator = combinations_with_replacement(range(0, len(kernel_matrix)), 2)
for i, j in iterator: # @todo: not correct if return_mat == False.
kernel_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
kernel_matrix[j][i] = kernel_matrix[i][j]




# if h >= 1. # if h >= 1.
@@ -526,7 +473,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)


# Compute subtree kernel with h iterations and add it to the final kernel. # Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)




# Iterate along heights (>= 2). # Iterate along heights (>= 2).
@@ -540,9 +487,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured) num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)


# Compute subtree kernel with h iterations and add it to the final kernel. # Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)
kernel_matrix = gram_itr_fun(kernel_matrix, all_num_of_each_label)


return gram_matrix
return kernel_matrix




def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured): def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
@@ -717,6 +664,8 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
all_num_of_each_label[j]) all_num_of_each_label[j])
gram_matrix[j][i] = gram_matrix[i][j] gram_matrix[j][i] = gram_matrix[i][j]


return gram_matrix



def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2): def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2):
"""Compute the subtree kernel. """Compute the subtree kernel.


+ 24
- 0
gklearn/model_selection/__init__.py View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 24 14:25:57 2022

@author: ljia
"""
from ._split import BaseCrossValidatorWithValid
# from ._split import BaseShuffleSplit
from ._split import KFoldWithValid
# from ._split import GroupKFold
# from ._split import StratifiedKFoldWithValid
# from ._split import TimeSeriesSplit
# from ._split import LeaveOneGroupOut
# from ._split import LeaveOneOut
# from ._split import LeavePGroupsOut
# from ._split import LeavePOut
from ._split import RepeatedKFoldWithValid
# from ._split import RepeatedStratifiedKFold
# from ._split import ShuffleSplit
# from ._split import GroupShuffleSplit
# from ._split import StratifiedShuffleSplit
# from ._split import StratifiedGroupKFold
# from ._split import PredefinedSplit

+ 287
- 0
gklearn/model_selection/_split.py View File

@@ -0,0 +1,287 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 24 11:13:26 2022

@author: ljia

Reference: scikit-learn.
"""
from abc import abstractmethod
import numbers
import warnings
import numpy as np
from sklearn.utils import check_random_state, check_array, column_or_1d, indexable
from sklearn.utils.validation import _num_samples
from sklearn.utils.multiclass import type_of_target


class BaseCrossValidatorWithValid(object):
"""Base class for all cross-validators.
Implementations must define `_iter_valid_test_masks` or `_iter_valid_test_indices`.
"""

def split(self, X, y=None, groups=None):
"""Generate indices to split data into training, valid, and test set.

Parameters
----------

X : array-like of shape (n_samples, n_features)
Training data, where `n_samples` is the number of samples
and `n_features` is the number of features.

y : array-like of shape (n_samples,)
The target variable for supervised learning problems.

groups : array-like of shape (n_samples,), default=None
Group labels for the samples used while splitting the dataset into
train/test set.

Yields
------
train : ndarray
The training set indices for that split.

valid : ndarray
The valid set indices for that split.

test : ndarray
The testing set indices for that split.
"""
X, y, groups = indexable(X, y, groups)
indices = np.arange(_num_samples(X))
for valid_index, test_index in self._iter_valid_test_masks(X, y, groups):
train_index = indices[np.logical_not(np.logical_or(valid_index, test_index))]
valid_index = indices[valid_index]
test_index = indices[test_index]
yield train_index, valid_index, test_index


# Since subclasses must implement either _iter_valid_test_masks or
# _iter_valid_test_indices, neither can be abstract.
def _iter_valid_test_masks(self, X=None, y=None, groups=None):
"""Generates boolean masks corresponding to valid and test sets.
By default, delegates to _iter_valid_test_indices(X, y, groups)
"""
for valid_index, test_index in self._iter_valid_test_indices(X, y, groups):
valid_mask = np.zeros(_num_samples(X), dtype=bool)
test_mask = np.zeros(_num_samples(X), dtype=bool)
valid_mask[valid_index] = True
test_mask[test_index] = True
yield valid_mask, test_mask


def _iter_valid_test_indices(self, X=None, y=None, groups=None):
"""Generates integer indices corresponding to valid and test sets."""
raise NotImplementedError


@abstractmethod
def get_n_splits(self, X=None, y=None, groups=None):
"""Returns the number of splitting iterations in the cross-validator"""


def __repr__(self):
return _build_repr(self)


class _BaseKFoldWithValid(BaseCrossValidatorWithValid):
"""Base class for KFoldWithValid, GroupKFoldWithValid, and StratifiedKFoldWithValid"""

@abstractmethod
def __init__(self, n_splits, *, stratify, shuffle, random_state):
if not isinstance(n_splits, numbers.Integral):
raise ValueError(
'The number of folds must be of Integral type. '
'%s of type %s was passed.' % (n_splits, type(n_splits))
)
n_splits = int(n_splits)

if n_splits <= 2:
raise ValueError(
'k-fold cross-validation requires at least one'
' train/valid/test split by setting n_splits=3 or more,'
' got n_splits={0}.'.format(n_splits)
)

if not isinstance(shuffle, bool):
raise TypeError('shuffle must be True or False; got {0}'.format(shuffle))

if not shuffle and random_state is not None: # None is the default
raise ValueError(
'Setting a random_state has no effect since shuffle is '
'False. You should leave '
'random_state to its default (None), or set shuffle=True.',
)

self.n_splits = n_splits
self.stratify = stratify
self.shuffle = shuffle
self.random_state = random_state


def split(self, X, y=None, groups=None):
"""Generate indices to split data into training, valid and test set."""
X, y, groups = indexable(X, y, groups)
n_samples = _num_samples(X)
if self.n_splits > n_samples:
raise ValueError(
(
'Cannot have number of splits n_splits={0} greater'
' than the number of samples: n_samples={1}.'
).format(self.n_splits, n_samples)
)

for train, valid, test in super().split(X, y, groups):
yield train, valid, test


class KFoldWithValid(_BaseKFoldWithValid):


def __init__(
self,
n_splits=5,
*,
stratify=False,
shuffle=False,
random_state=None
):
super().__init__(
n_splits=n_splits,
stratify=stratify,
shuffle=shuffle,
random_state=random_state
)


def _make_valid_test_folds(self, X, y=None):
rng = check_random_state(self.random_state)
y = np.asarray(y)
type_of_target_y = type_of_target(y)
allowed_target_types = ('binary', 'multiclass')
if type_of_target_y not in allowed_target_types:
raise ValueError(
'Supported target types are: {}. Got {!r} instead.'.format(
allowed_target_types, type_of_target_y
)
)

y = column_or_1d(y)

_, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
# y_inv encodes y according to lexicographic order. We invert y_idx to
# map the classes so that they are encoded by order of appearance:
# 0 represents the first label appearing in y, 1 the second, etc.
_, class_perm = np.unique(y_idx, return_inverse=True)
y_encoded = class_perm[y_inv]

n_classes = len(y_idx)
y_counts = np.bincount(y_encoded)
min_groups = np.min(y_counts)
if np.all(self.n_splits > y_counts):
raise ValueError(
"n_splits=%d cannot be greater than the"
" number of members in each class." % (self.n_splits)
)
if self.n_splits > min_groups:
warnings.warn(
"The least populated class in y has only %d"
" members, which is less than n_splits=%d."
% (min_groups, self.n_splits),
UserWarning,
)

# Determine the optimal number of samples from each class in each fold,
# using round robin over the sorted y. (This can be done direct from
# counts, but that code is unreadable.)
y_order = np.sort(y_encoded)
allocation = np.asarray(
[
np.bincount(y_order[i :: self.n_splits], minlength=n_classes)
for i in range(self.n_splits)
]
)

# To maintain the data order dependencies as best as possible within
# the stratification constraint, we assign samples from each class in
# blocks (and then mess that up when shuffle=True).
test_folds = np.empty(len(y), dtype='i')
for k in range(n_classes):
# since the kth column of allocation stores the number of samples
# of class k in each test set, this generates blocks of fold
# indices corresponding to the allocation for class k.
folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])
if self.shuffle:
rng.shuffle(folds_for_class)
test_folds[y_encoded == k] = folds_for_class
return test_folds


def _iter_valid_test_masks(self, X, y=None, groups=None):
test_folds = self._make_valid_test_folds(X, y)
for i in range(self.n_splits):
if i + 1 < self.n_splits:
j = i + 1
else:
j = 0
yield test_folds == i, test_folds == j


def split(self, X, y, groups=None):
y = check_array(y, input_name='y', ensure_2d=False, dtype=None)
return super().split(X, y, groups)


class _RepeatedSplitsWithValid(object):


def __init__(
self,
cv,
*,
n_repeats=10,
random_state=None,
**cvargs
):
if not isinstance(n_repeats, int):
raise ValueError('Number of repetitions must be of integer type.')

if n_repeats <= 0:
raise ValueError('Number of repetitions must be greater than 0.')

self.cv = cv
self.n_repeats = n_repeats
self.random_state = random_state
self.cvargs = cvargs


def split(self, X, y=None, groups=None):
n_repeats = self.n_repeats
rng = check_random_state(self.random_state)

for idx in range(n_repeats):
cv = self.cv(random_state=rng, shuffle=True, **self.cvargs)
for train_index, valid_index, test_index in cv.split(X, y, groups):
yield train_index, valid_index, test_index


class RepeatedKFoldWithValid(_RepeatedSplitsWithValid):


def __init__(
self,
*,
n_splits=5,
n_repeats=10,
stratify=False,
random_state=None
):
super().__init__(
KFoldWithValid,
n_repeats=n_repeats,
stratify=stratify,
random_state=random_state,
n_splits=n_splits,
)
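
A usage sketch of the new splitter (toy data; as the code above shows, fold i serves as the validation set and fold i+1, cyclically, as the test set):

import numpy as np
from gklearn.model_selection import KFoldWithValid

X = np.arange(20).reshape(10, 2)
y = np.array([0, 1] * 5)                  # a classification target is required
kf = KFoldWithValid(n_splits=5, shuffle=True, random_state=0)
for train_idx, valid_idx, test_idx in kf.split(X, y):
    ...                                   # fit on train_idx, tune on valid_idx, evaluate on test_idx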

+ 12
- 3
gklearn/utils/kernels.py View File

@@ -4,7 +4,7 @@ These kernels are defined between pairs of vectors.
import numpy as np




def delta_kernel(x, y):
def kronecker_delta_kernel(x, y):
"""Delta kernel. Return 1 if x == y, 0 otherwise.


Parameters
@@ -23,6 +23,10 @@ def delta_kernel(x, y):
labeled graphs. In Proceedings of the 20th International Conference on
Machine Learning, Washington, DC, United States, 2003.
"""
return (1 if np.array_equal(x, y) else 0)


def delta_kernel(x, y):
return x == y #(1 if condition else 0)




@@ -64,6 +68,11 @@ def gaussian_kernel(x, y, gamma=None):
return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma)




def tanimoto_kernel(x, y):
xy = np.dot(x, y)
return xy / (np.dot(x, x) + np.dot(y, y) - xy)


def gaussiankernel(x, y, gamma=None):
return gaussian_kernel(x, y, gamma=gamma)


@@ -123,7 +132,7 @@ def linearkernel(x, y):




def cosine_kernel(x, y):
return np.dot(x, y) / (np.abs(x) * np.abs(y))
return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))




def sigmoid_kernel(x, y, gamma=None, coef0=1):
@@ -142,7 +151,7 @@ def laplacian_kernel(x, y, gamma=None):
if gamma is None:
gamma = 1.0 / len(x)


k = -gamma * np.abs(np.subtract(x, y))
k = -gamma * np.linalg.norm(np.subtract(x, y))
k = np.exp(k)
return k
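
Two hand-worked checks of the corrected cosine kernel and the new Tanimoto kernel:

import numpy as np
x, y = np.array([1., 0., 1.]), np.array([1., 1., 0.])
np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))   # cosine_kernel(x, y)  -> 0.5
xy = np.dot(x, y)
xy / (np.dot(x, x) + np.dot(y, y) - xy)                   # tanimoto_kernel(x, y) -> 1/3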




+ 272
- 199
gklearn/utils/utils.py View File

@@ -7,6 +7,9 @@ from enum import Enum, unique
# from tqdm import tqdm




#%%


def getSPLengths(G1):
sp = nx.shortest_path(G1)
distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
@@ -286,81 +289,146 @@ def direct_product_graph(G1, G2, node_labels, edge_labels):
return gt




def graph_deepcopy(G):
"""Deep copy a graph, including deep copy of all nodes, edges and
attributes of the graph, nodes and edges.
def find_paths(G, source_node, length):
"""Find all paths with a certain length that start from a source node.
A recursive depth first search is applied.


Note
----
It is the same as the NetworkX function graph.copy(), as far as I know.
Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
source_node : integer
The number of the node from where all paths start.
length : integer
The length of paths.

Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
""" """
# add graph attributes.
labels = {}
for k, v in G.graph.items():
labels[k] = deepcopy(v)
if G.is_directed():
G_copy = nx.DiGraph(**labels)
else:
G_copy = nx.Graph(**labels)
if length == 0:
return [[source_node]]
path = [[source_node] + path for neighbor in G[source_node] \
for path in find_paths(G, neighbor, length - 1) if source_node not in path]
return path
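
A hand-checked example of the recursion (find_all_paths below aggregates these over every source node and drops reversed duplicates for undirected graphs):

import networkx as nx
G = nx.cycle_graph(4)          # 0 - 1 - 2 - 3 - 0
find_paths(G, 0, 2)            # [[0, 1, 2], [0, 3, 2]]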


# add nodes
for nd, attrs in G.nodes(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_node(nd, **labels)


# add edges.
for nd1, nd2, attrs in G.edges(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_edge(nd1, nd2, **labels)
def find_all_paths(G, length, is_directed):
"""Find all paths with a certain length in a graph. A recursive depth first
search is applied.


return G_copy
Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
length : integer
The length of paths.


Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
all_paths = []
for node in G:
all_paths.extend(find_paths(G, node, length))


def graph_isIdentical(G1, G2):
"""Check if two graphs are identical, including: same nodes, edges, node
labels/attributes, edge labels/attributes.
if not is_directed:
# For each path, two presentations are retrieved from its two extremities.
# Remove one of them.
all_paths_r = [path[::-1] for path in all_paths]
for idx, path in enumerate(all_paths[:-1]):
for path2 in all_paths_r[idx+1::]:
if path == path2:
all_paths[idx] = []
break
all_paths = list(filter(lambda a: a != [], all_paths))


Notes
-----
1. The type of graphs has to be the same.
return all_paths


2. Global/Graph attributes are neglected as they may contain names for graphs.
"""
# check nodes.
nlist1 = [n for n in G1.nodes(data=True)]
nlist2 = [n for n in G2.nodes(data=True)]
if not nlist1 == nlist2:
return False
# check edges.
elist1 = [n for n in G1.edges(data=True)]
elist2 = [n for n in G2.edges(data=True)]
if not elist1 == elist2:
return False
# check graph attributes.


return True
# @todo: use it in ShortestPath.
def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]):
"""Compute kernels between each pair of vertices in two graphs.


Parameters
----------
g1, g2 : NetworkX graph
The kernels between pairs of vertices in these two graphs are computed.
node_kernels : dict
A dictionary of kernel functions for nodes, including 3 items: 'symb'
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
for both labels. The first 2 functions take two node labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each of the two nodes. Each label is in the form
of a 2-D array (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when nodes are unlabeled. This argument
is designed for the conjugate gradient method and fixed-point iterations.
node_labels : list, optional
The list of the name strings of the node labels. The default is [].
node_attrs : list, optional
The list of the name strings of the node attributes. The default is [].


def get_node_labels(Gn, node_label):
"""Get node labels of dataset Gn.
"""
nl = set()
for G in Gn:
nl = nl | set(nx.get_node_attributes(G, node_label).values())
return nl
Returns
-------
vk_dict : dict
Vertex kernels keyed by vertices.


Notes
-----
This function is used by ``gklearn.kernels.FixedPoint'' and
``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1].


def get_edge_labels(Gn, edge_label):
"""Get edge labels of dataset Gn.
References
----------
.. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang.
Parallelization of shortest path graph kernels on multi-core cpus and gpus.
Proceedings of the Programmability Issues for Heterogeneous Multicores
(MultiProg), Vienna, Austria, 2014.
""" """
el = set()
for G in Gn:
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return el
vk_dict = {} # shortest path matrices dict
if len(node_labels) > 0:
# node symb and non-symb labeled
if len(node_attrs) > 0:
kn = node_kernels['mix']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in node_labels]
n2_labels = [n2[1][nl] for nl in node_labels]
n1_attrs = [n1[1][na] for na in node_attrs]
n2_attrs = [n2[1][na] for na in node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in node_labels]
n2_labels = [n2[1][nl] for nl in node_labels]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
else:
# node non-symb labeled
if len(node_attrs) > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_attrs = [n1[1][na] for na in node_attrs]
n2_attrs = [n2[1][na] for na in node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
# node unlabeled
else:
pass # @todo: add edge weights.
# for e1 in g1.edges(data=True):
# for e2 in g2.edges(data=True):
# if e1[2]['cost'] == e2[2]['cost']:
# kernel += 1
# return kernel

return vk_dict
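
A sketch of how the node_kernels dict might be assembled, using kernels from gklearn.utils.kernels; g1, g2 and the label/attribute names are placeholders, and the product form of the 'mix' kernel is one common choice rather than a requirement:

from gklearn.utils.kernels import delta_kernel, gaussian_kernel

def mix_kernel(labels1, labels2, attrs1, attrs2):
    # product of a symbolic and a non-symbolic kernel
    return delta_kernel(labels1, labels2) * gaussian_kernel(attrs1, attrs2)

node_kernels = {'symb': delta_kernel, 'nsymb': gaussian_kernel, 'mix': mix_kernel}
vk = compute_vertex_kernels(g1, g2, node_kernels, node_labels=['atom'], node_attrs=['x', 'y'])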


#%%




def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs):
@@ -513,79 +581,6 @@ def compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, d
print('\ncomplete.')




def find_paths(G, source_node, length):
"""Find all paths with a certain length those start from a source node.
A recursive depth first search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
source_node : integer
The number of the node from where all paths start.
length : integer
The length of paths.

Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
if length == 0:
return [[source_node]]
path = [[source_node] + path for neighbor in G[source_node] \
for path in find_paths(G, neighbor, length - 1) if source_node not in path]
return path


def find_all_paths(G, length, is_directed):
"""Find all paths with a certain length in a graph. A recursive depth first
search is applied.

Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
length : integer
The length of paths.

Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
all_paths = []
for node in G:
all_paths.extend(find_paths(G, node, length))

if not is_directed:
# For each path, two presentations are retrieved from its two extremities.
# Remove one of them.
all_paths_r = [path[::-1] for path in all_paths]
for idx, path in enumerate(all_paths[:-1]):
for path2 in all_paths_r[idx+1::]:
if path == path2:
all_paths[idx] = []
break
all_paths = list(filter(lambda a: a != [], all_paths))

return all_paths


def get_mlti_dim_node_attrs(G, attr_names):
attributes = []
for nd, attrs in G.nodes(data=True):
attributes.append(tuple(attrs[aname] for aname in attr_names))
return attributes


def get_mlti_dim_edge_attrs(G, attr_names):
attributes = []
for ed, attrs in G.edges(data=True):
attributes.append(tuple(attrs[aname] for aname in attr_names))
return attributes


def normalize_gram_matrix(gram_matrix):
diag = gram_matrix.diagonal().copy()
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt.
@@ -621,84 +616,162 @@ def compute_distance_matrix(gram_matrix):
return dis_mat, dis_max, dis_min, dis_mean




# @todo: use it in ShortestPath.
def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]):
"""Compute kernels between each pair of vertices in two graphs.
#%%


def graph_deepcopy(G):
"""Deep copy a graph, including deep copy of all nodes, edges and
attributes of the graph, nodes and edges.

Note
----
- It is the same as the NetworkX function graph.copy(), as far as I know.

- This function only supports Networkx.Graph and Networkx.DiGraph.
"""
# add graph attributes.
labels = {}
for k, v in G.graph.items():
labels[k] = deepcopy(v)
if G.is_directed():
G_copy = nx.DiGraph(**labels)
else:
G_copy = nx.Graph(**labels)

# add nodes
for nd, attrs in G.nodes(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_node(nd, **labels)

# add edges.
for nd1, nd2, attrs in G.edges(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_edge(nd1, nd2, **labels)

return G_copy


def graph_isIdentical(G1, G2):
"""Check if two graphs are identical, including: same nodes, edges, node
labels/attributes, edge labels/attributes.

Notes
-----
1. The type of graphs has to be the same.

2. Global/Graph attributes are neglected as they may contain names for graphs.
"""
# check nodes.
nlist1 = [n for n in G1.nodes(data=True)]
nlist2 = [n for n in G2.nodes(data=True)]
if not nlist1 == nlist2:
return False
# check edges.
elist1 = [n for n in G1.edges(data=True)]
elist2 = [n for n in G2.edges(data=True)]
if not elist1 == elist2:
return False
# check graph attributes.

return True
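
A small sanity check of the two helpers above (attribute names are arbitrary):

import networkx as nx
G = nx.Graph(name='toy')
G.add_node(0, atom='C'); G.add_node(1, atom='O')
G.add_edge(0, 1, bond='single')
H = graph_deepcopy(G)
graph_isIdentical(G, H)        # True
H.nodes[0]['atom'] = 'N'       # the copy shares no attribute objects with G
graph_isIdentical(G, H)        # False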


def get_node_labels(Gn, node_label):
"""Get node labels of dataset Gn.
"""
nl = set()
for G in Gn:
nl = nl | set(nx.get_node_attributes(G, node_label).values())
return nl


def get_edge_labels(Gn, edge_label):
"""Get edge labels of dataset Gn.
"""
el = set()
for G in Gn:
el = el | set(nx.get_edge_attributes(G, edge_label).values())
return el


def get_mlti_dim_node_attrs(G, attr_names):
attributes = []
for nd, attrs in G.nodes(data=True):
attributes.append(tuple(attrs[aname] for aname in attr_names))
return attributes


def get_mlti_dim_edge_attrs(G, attr_names):
attributes = []
for ed, attrs in G.edges(data=True):
attributes.append(tuple(attrs[aname] for aname in attr_names))
return attributes


def nx_permute_nodes(G, random_state=None):
"""Permute node indices in a NetworkX graph.


Parameters
----------
g1, g2 : NetworkX graph
The kernels bewteen pairs of vertices in these two graphs are computed.
node_kernels : dict
A dictionary of kernel functions for nodes, including 3 items: 'symb'
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
for both labels. The first 2 functions take two node labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each the two nodes. Each label is in form of 2-D
dimension array (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when nodes are unlabeled. This argument
is designated to conjugate gradient method and fixed-point iterations.
node_labels : list, optional
The list of the name strings of the node labels. The default is [].
node_attrs : list, optional
The list of the name strings of the node attributes. The default is [].
G : NetworkX graph
The graph whose node order is to be permuted.
random_state : int or None, optional
Seed for the random permutation. The default is None.


Returns
-------
vk_dict : dict
Vertex kernels keyed by vertices.
G_new : NetworkX graph
A copy of G with its node order randomly permuted.


Notes
-----
This function is used by ``gklearn.kernels.FixedPoint'' and
``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1].

References
----------
.. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang.
Parallelization of shortest path graph kernels on multi-core cpus and gpus.
Proceedings of the Programmability Issues for Heterogeneous Multicores
(MultiProg), Vienna, Austria, 2014.
- This function only supports Networkx.Graph and Networkx.DiGraph.
"""
vk_dict = {} # shortest path matrices dict
if len(node_labels) > 0:
# node symb and non-synb labeled
if len(node_attrs) > 0:
kn = node_kernels['mix']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in node_labels]
n2_labels = [n2[1][nl] for nl in node_labels]
n1_attrs = [n1[1][na] for na in node_attrs]
n2_attrs = [n2[1][na] for na in node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_labels = [n1[1][nl] for nl in node_labels]
n2_labels = [n2[1][nl] for nl in node_labels]
vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
# @todo: relabel node with integers? (in case something went wrong...)
# Add graph attributes.
labels = {}
for k, v in G.graph.items():
labels[k] = deepcopy(v)
if G.is_directed():
G_new = nx.DiGraph(**labels)
else:
# node non-synb labeled
if len(node_attrs) > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
n1_attrs = [n1[1][na] for na in node_attrs]
n2_attrs = [n2[1][na] for na in node_attrs]
vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
# node unlabeled
else:
pass # @todo: add edge weights.
# for e1 in g1.edges(data=True):
# for e2 in g2.edges(data=True):
# if e1[2]['cost'] == e2[2]['cost']:
# kernel += 1
# return kernel
G_new = nx.Graph(**labels)


return vk_dict
# Create a random mapping old node indices <-> new indices.
nb_nodes = nx.number_of_nodes(G)
indices_orig = range(nb_nodes)
idx_mapping = np.random.RandomState(seed=random_state).permutation(indices_orig)

# Add nodes.
nodes_orig = list(G.nodes)
for i_orig in range(nb_nodes):
i_new = idx_mapping[i_orig]
labels = {}
for k, v in G.nodes[nodes_orig[i_new]].items():
labels[k] = deepcopy(v)
G_new.add_node(nodes_orig[i_new], **labels)

# Add edges.
for nd1, nd2, attrs in G.edges(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_new.add_edge(nd1, nd2, **labels)


# # create a random mapping old label -> new label
# node_mapping = dict(zip(G.nodes(), np.random.RandomState(seed=random_state).permutation(G.nodes())))
# # build a new graph
# G_new = nx.relabel_nodes(G, node_mapping)

return G_new
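
A quick property check: nodes keep their identifiers and edges are preserved; only their order in the node list changes.

import networkx as nx
G = nx.gnp_random_graph(6, 0.5, seed=1)
G_perm = nx_permute_nodes(G, random_state=42)
assert set(G.nodes()) == set(G_perm.nodes())
assert G.number_of_edges() == G_perm.number_of_edges()
assert all(G_perm.has_edge(u, v) for u, v in G.edges())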


#%%




def dummy_node():


+ 1
- 1
requirements.txt View File

@@ -2,7 +2,7 @@ numpy>=1.16.2
scipy>=1.1.0
matplotlib>=3.1.0
networkx>=2.2
scikit-learn>=0.20.0
scikit-learn>=1.1.0
tabulate>=0.8.2
tqdm>=4.26.0
control>=0.8.2 # for generalized random walk kernels only.


+ 2
- 2
requirements_pypi.txt View File

@@ -1,8 +1,8 @@
numpy>=1.16.2
scipy>=1.1.0
matplotlib>=3.0.0
matplotlib>=3.1.0
networkx>=2.2
scikit-learn>=0.20.0
scikit-learn>=1.1.0
tabulate>=0.8.2
tqdm>=4.26.0
control>=0.8.2 # for generalized random walk kernels only.

