
Merge pull request #46 from jajupmochi/v0.2.x

V0.2.x
Branch: master · linlin (GitHub), 3 years ago · commit e1fe40d894
29 changed files with 2822 additions and 571 deletions
  1. .gitignore (+6 -0)
  2. README.md (+2 -2)
  3. gklearn/dataset/dataset.py (+21 -2)
  4. gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py (+19 -15)
  5. gklearn/experiments/ged/stability/utils.py (+131 -46)
  6. gklearn/experiments/taskhub.py (+29 -0)
  7. gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py (+40 -1)
  8. gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py (+9 -5)
  9. gklearn/kernels/__init__.py (+3 -2)
  10. gklearn/kernels/common_walk.py (+5 -5)
  11. gklearn/kernels/conjugate_gradient.py (+13 -13)
  12. gklearn/kernels/fixed_point.py (+13 -13)
  13. gklearn/kernels/graph_kernel.py (+422 -44)
  14. gklearn/kernels/marginalized.py (+14 -14)
  15. gklearn/kernels/metadata.py (+23 -11)
  16. gklearn/kernels/path_up_to_h.py (+14 -14)
  17. gklearn/kernels/shortest_path.py (+14 -14)
  18. gklearn/kernels/spectral_decomposition.py (+19 -19)
  19. gklearn/kernels/structural_sp.py (+14 -14)
  20. gklearn/kernels/sylvester_equation.py (+19 -19)
  21. gklearn/kernels/treelet.py (+333 -90)
  22. gklearn/kernels/weisfeiler_lehman.py (+448 -78)
  23. gklearn/model_learning/__init__.py (+14 -0)
  24. gklearn/model_learning/nested_cv.py (+714 -0)
  25. gklearn/model_learning/parameters.py (+89 -0)
  26. gklearn/model_learning/workflow.py (+109 -0)
  27. gklearn/tests/test_graph_kernels.py (+36 -30)
  28. gklearn/utils/kernels.py (+185 -111)
  29. gklearn/utils/utils.py (+64 -9)

.gitignore (+6 -0)

@@ -79,3 +79,9 @@ outputs/

# pyCharm.
.idea/

# tests.
gklearn/tests/datasets/

# Experiments.
gklearn/experiments/datasets/

README.md (+2 -2)

@@ -1,5 +1,5 @@
# graphkit-learn
[![Build Status](https://travis-ci.org/jajupmochi/graphkit-learn.svg?branch=master)](https://travis-ci.org/jajupmochi/graphkit-learn)
[![Build Status](https://travis-ci.com/jajupmochi/graphkit-learn.svg?branch=master)](https://travis-ci.com/jajupmochi/graphkit-learn)
[![Build status](https://ci.appveyor.com/api/projects/status/bdxsolk0t1uji9rd?svg=true)](https://ci.appveyor.com/project/jajupmochi/graphkit-learn)
[![codecov](https://codecov.io/gh/jajupmochi/graphkit-learn/branch/master/graph/badge.svg)](https://codecov.io/gh/jajupmochi/graphkit-learn)
[![Documentation Status](https://readthedocs.org/projects/graphkit-learn/badge/?version=master)](https://graphkit-learn.readthedocs.io/en/master/?badge=master)
@@ -68,7 +68,7 @@ The docs of the library can be found [here](https://graphkit-learn.readthedocs.i
* [The common walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/common_walk.py) [1]
* Exponential
* Geometric
* [The marginalized kenrel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py)
* [The marginalized kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/marginalized.py)
* With tottering [2]
* Without tottering [7]
* [The generalized random walk kernel](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/kernels/random_walk.py) [3]


gklearn/dataset/dataset.py (+21 -2)

@@ -40,6 +40,7 @@ class Dataset(object):
self._edge_attr_dim = None
self._class_number = None
self._ds_name = None
self._task_type = None

if inputs is None:
self._graphs = None
@@ -117,11 +118,16 @@ class Dataset(object):
ds_file = [os.path.join(path, fn) for fn in load_files[0]]
fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None

# Get extra_params.
if 'extra_params' in DATASET_META[ds_name]:
kwargs = DATASET_META[ds_name]['extra_params']
else:
kwargs = {}

# Get the task type that is associated with the dataset. If it is classification, get the number of classes.
self._get_task_type(ds_name)


self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets, **kwargs).data

self._node_labels = label_names['node_labels']
@@ -276,7 +282,8 @@ class Dataset(object):
'edge_attr_dim',
'class_number',
'all_degree_entropy',
'ave_degree_entropy'
'ave_degree_entropy',
'class_type'
]

# dataset size
@@ -408,7 +415,7 @@ class Dataset(object):

if 'class_number' in keys:
if self._class_number is None:
self._class_number = self._get_class_number()
self._class_number = self._get_class_num()
infos['class_number'] = self._class_number

if 'node_attr_dim' in keys:
@@ -437,6 +444,11 @@ class Dataset(object):
base = None
infos['ave_degree_entropy'] = np.mean(self._compute_all_degree_entropy(base=base))

if 'task_type' in keys:
if self._task_type is None:
self._task_type = self._get_task_type()
infos['task_type'] = self._task_type

return infos


@@ -790,6 +802,13 @@ class Dataset(object):
return degree_entropy


def _get_task_type(self, ds_name):
if 'task_type' in DATASET_META[ds_name]:
self._task_type = DATASET_META[ds_name]['task_type']
if self._task_type == 'classification' and self._class_number is None and 'class_number' in DATASET_META[ds_name]:
self._class_number = DATASET_META[ds_name]['class_number']


@property
def graphs(self):
return self._graphs
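
Editor's note: the new task-type handling keys off DATASET_META. A minimal stand-alone sketch of the same lookup pattern, using a stand-in metadata dict (the real DATASET_META lives in gklearn.dataset; the entries below are illustrative only):

# Stand-in for gklearn's DATASET_META; the real dict has the same shape.
DATASET_META = {
    'MUTAG': {'task_type': 'classification', 'class_number': 2},
    'Alkane': {'task_type': 'regression'},
}

def get_task_type(ds_name, meta=DATASET_META):
    # Mirrors Dataset._get_task_type: read the task type and, for
    # classification datasets, the number of classes from the metadata.
    info = meta.get(ds_name, {})
    task_type = info.get('task_type')
    class_number = info.get('class_number') if task_type == 'classification' else None
    return task_type, class_number

print(get_task_type('MUTAG'))   # ('classification', 2)
print(get_task_type('Alkane'))  # ('regression', None)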


gklearn/experiments/ged/stability/edit_costs.real_data.nums_sols.ratios.IPFP.py (+19 -15)

@@ -13,7 +13,7 @@ import pickle
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset, set_edit_cost_consts
from utils import get_dataset, set_edit_cost_consts, dichotomous_permutation
import sys
from group_results import group_trials, check_group_existence, update_group_marker

@@ -37,7 +37,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
# the distance between non-symbolic node/edge labels is computed by euclidean distance.
'attr_distance': 'euclidean',
'ratio_runs_from_initial_solutions': 0.25,
# parallel threads. Do not work if mpg_options['parallel'] = False.
# parallel threads. Set to 1 automatically if parallel=True in compute_geds().
'threads': multiprocessing.cpu_count(),
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}
@@ -98,7 +98,7 @@ def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
ged_mats.append(ged_mat)
runtimes.append(runtime)

# Group trials and Remove single files.
# Group trials and remove single files.
# @todo: if the program stops between the following lines, then there may be errors.
name_prefix = 'ged_matrix' + name_middle
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
@@ -111,21 +111,25 @@ def results_for_a_dataset(ds_name):
"""**1. Get dataset.**"""
dataset = get_dataset(ds_name)

for ratio in ratio_list:
for params in list(param_grid):
print()
print('Ratio:', ratio)
for num_solutions in num_solutions_list:
print()
print('# of solutions:', num_solutions)
save_trials_as_group(dataset, ds_name, num_solutions, ratio)
print(params)
save_trials_as_group(dataset, ds_name, params['num_solutions'], params['ratio'])


def get_param_lists(ds_name, test=False):
if test:
num_solutions_list = [1, 10, 20, 30, 40, 50]
def get_param_lists(ds_name, mode='test'):
if mode == 'test':
num_solutions_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
ratio_list = [10]
return num_solutions_list, ratio_list

elif mode == 'simple':
from sklearn.model_selection import ParameterGrid
param_grid = ParameterGrid([
{'num_solutions': dichotomous_permutation([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]), 'ratio': [10]},
{'num_solutions': [10], 'ratio': dichotomous_permutation([0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10])}])
# print(list(param_grid))

if ds_name == 'AIDS_symb':
num_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
@@ -133,7 +137,7 @@ def get_param_lists(ds_name, test=False):
num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1]

return num_solutions_list, ratio_list
return param_grid


if __name__ == '__main__':
@@ -141,7 +145,7 @@ if __name__ == '__main__':
ds_name_list = sys.argv[1:]
else:
ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled']
# ds_name_list = ['MUTAG'] # 'Alkane_unlabeled']
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/'
@@ -151,5 +155,5 @@ if __name__ == '__main__':
for ds_name in ds_name_list:
print()
print('Dataset:', ds_name)
num_solutions_list, ratio_list = get_param_lists(ds_name, test=False)
param_grid = get_param_lists(ds_name, mode='simple')
results_for_a_dataset(ds_name)
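
Editor's note on the refactor above: sklearn's ParameterGrid over a list of dicts yields one parameter dict per combination, so the experiment loop reads params['num_solutions'] and params['ratio'] directly. A small sketch with shortened value lists (dichotomous_permutation is the helper added in utils.py below):

from sklearn.model_selection import ParameterGrid

# Two sub-grids: vary num_solutions at a fixed ratio, then vary ratio at a
# fixed num_solutions (values shortened here for readability).
param_grid = ParameterGrid([
    {'num_solutions': [1, 2, 3], 'ratio': [10]},
    {'num_solutions': [10], 'ratio': [0.1, 0.5, 1]},
])

for params in param_grid:
    print(params['num_solutions'], params['ratio'])
# Prints one combination per line: 1 10, 2 10, 3 10, 10 0.1, 10 0.5, 10 1.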

gklearn/experiments/ged/stability/utils.py (+131 -46)

@@ -16,12 +16,12 @@ from gklearn.experiments import DATASET_ROOT

def get_dataset(ds_name):
# The node/edge labels that will not be used in the computation.
# if ds_name == 'MAO':
# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
# if ds_name == 'Monoterpenoides':
# irrelevant_labels = {'edge_labels': ['valence']}
# elif ds_name == 'MUTAG':
# irrelevant_labels = {'edge_labels': ['label_0']}
# if ds_name == 'MAO':
# irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
# if ds_name == 'Monoterpenoides':
# irrelevant_labels = {'edge_labels': ['valence']}
# elif ds_name == 'MUTAG':
# irrelevant_labels = {'edge_labels': ['label_0']}
if ds_name == 'AIDS_symb':
irrelevant_labels = {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']}
ds_name = 'AIDS'
@@ -49,34 +49,36 @@ def set_edit_cost_consts(ratio, node_labeled=True, edge_labeled=True, mode='unif


def nested_keys_exists(element, *keys):
'''
Check if *keys (nested) exists in `element` (dict).
'''
if not isinstance(element, dict):
raise AttributeError('keys_exists() expects dict as first argument.')
if len(keys) == 0:
raise AttributeError('keys_exists() expects at least two arguments, one given.')

_element = element
for key in keys:
try:
_element = _element[key]
except KeyError:
return False
return True

'''
Check if *keys (nested) exists in `element` (dict).
'''
if not isinstance(element, dict):
raise AttributeError('keys_exists() expects dict as first argument.')
if len(keys) == 0:
raise AttributeError('keys_exists() expects at least two arguments, one given.')

_element = element
for key in keys:
try:
_element = _element[key]
except KeyError:
return False
return True


# Check average relative error along elements in two ged matrices.
def matrices_ave_relative_error(m1, m2):
error = 0
base = 0
for i in range(m1.shape[0]):
for j in range(m1.shape[1]):
error += np.abs(m1[i, j] - m2[i, j])
base += (np.abs(m1[i, j]) + np.abs(m2[i, j])) / 2
error = 0
base = 0
for i in range(m1.shape[0]):
for j in range(m1.shape[1]):
error += np.abs(m1[i, j] - m2[i, j])
# base += (np.abs(m1[i, j]) + np.abs(m2[i, j]))
base += (m1[i, j] + m2[i, j]) # Require only 25% of the time of "base += (np.abs(m1[i, j]) + np.abs(m2[i, j]))".

return error / base
base = base / 2

return error / base
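
Editor's aside: since the loop now assumes non-negative entries (as GED values are), the same quantity can also be computed with NumPy array operations; a possible vectorized equivalent, not part of this commit:

import numpy as np

def matrices_ave_relative_error_vec(m1, m2):
    # Same value as the loop above for non-negative entries:
    # sum of |m1 - m2| divided by half the sum of (m1 + m2).
    return np.abs(m1 - m2).sum() / ((m1 + m2).sum() / 2)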


def compute_relative_error(ged_mats):
@@ -92,9 +94,9 @@ def compute_relative_error(ged_mats):
errors = []
for i, mat in enumerate(ged_mats):
err = matrices_ave_relative_error(mat, ged_mat_s)
# if not per_correct:
# print('matrix # ', str(i))
# pass
# if not per_correct:
# print('matrix # ', str(i))
# pass
errors.append(err)
else:
errors = [0]
@@ -107,11 +109,11 @@ def parse_group_file_name(fn):
key1 = splits_all[1]

pos2 = splits_all[2].rfind('_')
# key2 = splits_all[2][:pos2]
# key2 = splits_all[2][:pos2]
val2 = splits_all[2][pos2+1:]

pos3 = splits_all[3].rfind('_')
# key3 = splits_all[3][:pos3]
# key3 = splits_all[3][:pos3]
val3 = splits_all[3][pos3+1:] + '.' + splits_all[4]

return key1, val2, val3
@@ -232,7 +234,7 @@ def set_axis_style(ax):
ax.tick_params(labelsize=8, color='w', pad=1, grid_color='w')
ax.tick_params(axis='x', pad=-2)
ax.tick_params(axis='y', labelrotation=-40, pad=-2)
# ax.zaxis._axinfo['juggled'] = (1, 2, 0)
# ax.zaxis._axinfo['juggled'] = (1, 2, 0)
ax.set_xlabel(ax.get_xlabel(), fontsize=10, labelpad=-3)
ax.set_ylabel(ax.get_ylabel(), fontsize=10, labelpad=-2, rotation=50)
ax.set_zlabel(ax.get_zlabel(), fontsize=10, labelpad=-2)
@@ -240,16 +242,99 @@ def set_axis_style(ax):
return


def dichotomous_permutation(arr, layer=0):
import math

# def seperate_arr(arr, new_arr):
# if (length % 2) == 0:
# half = int(length / 2)
# new_arr += [arr[half - 1], arr[half]]
# subarr1 = [arr[i] for i in range(1, half - 1)]
# else:
# half = math.floor(length / 2)
# new_arr.append(arr[half])
# subarr1 = [arr[i] for i in range(1, half)]
# subarr2 = [arr[i] for i in range(half + 1, length - 1)]
# subarrs = [subarr1, subarr2]
# return subarrs


if layer == 0:
length = len(arr)
if length <= 2:
return arr

new_arr = [arr[0], arr[-1]]
if (length % 2) == 0:
half = int(length / 2)
new_arr += [arr[half - 1], arr[half]]
subarr1 = [arr[i] for i in range(1, half - 1)]
else:
half = math.floor(length / 2)
new_arr.append(arr[half])
subarr1 = [arr[i] for i in range(1, half)]
subarr2 = [arr[i] for i in range(half + 1, length - 1)]
subarrs = [subarr1, subarr2]
# subarrs = seperate_arr(arr, new_arr)
new_arr += dichotomous_permutation(subarrs, layer=layer+1)

else:
new_arr = []
subarrs = []
for a in arr:
length = len(a)
if length <= 2:
new_arr += a
else:
# subarrs += seperate_arr(a, new_arr)
if (length % 2) == 0:
half = int(length / 2)
new_arr += [a[half - 1], a[half]]
subarr1 = [a[i] for i in range(0, half - 1)]
else:
half = math.floor(length / 2)
new_arr.append(a[half])
subarr1 = [a[i] for i in range(0, half)]
subarr2 = [a[i] for i in range(half + 1, length)]
subarrs += [subarr1, subarr2]

if len(subarrs) > 0:
new_arr += dichotomous_permutation(subarrs, layer=layer+1)

return new_arr

# length = len(arr)
# if length <= 2:
# return arr

# new_arr = [arr[0], arr[-1]]
# if (length % 2) == 0:
# half = int(length / 2)
# new_arr += [arr[half - 1], arr[half]]
# subarr1 = [arr[i] for i in range(1, half - 1)]
# else:
# half = math.floor(length / 2)
# new_arr.append(arr[half])
# subarr1 = [arr[i] for i in range(1, half)]
# subarr2 = [arr[i] for i in range(half + 1, length - 1)]
# if len(subarr1) > 0:
# new_arr += dichotomous_permutation(subarr1)
# if len(subarr2) > 0:
# new_arr += dichotomous_permutation(subarr2)

# return new_arr


if __name__ == '__main__':
root_dir = 'outputs/CRIANN/'
# for dir_ in sorted(os.listdir(root_dir)):
# if os.path.isdir(root_dir):
# full_dir = os.path.join(root_dir, dir_)
# print('---', full_dir,':')
# save_dir = os.path.join(full_dir, 'groups/')
# if os.path.exists(save_dir):
# try:
# get_relative_errors(save_dir)
# except Exception as exp:
# print('An exception occured when running this experiment:')
# print(repr(exp))
# for dir_ in sorted(os.listdir(root_dir)):
# if os.path.isdir(root_dir):
# full_dir = os.path.join(root_dir, dir_)
# print('---', full_dir,':')
# save_dir = os.path.join(full_dir, 'groups/')
# if os.path.exists(save_dir):
# try:
# get_relative_errors(save_dir)
# except Exception as exp:
# print('An exception occured when running this experiment:')
# print(repr(exp))
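
Editor's note on dichotomous_permutation above: it reorders a parameter list so that the endpoints and midpoints come first, which lets an interrupted experiment run still cover the whole parameter range. Hand-tracing the recursion on a small list (the expected output below is derived by reading the code above, not by running the commit):

# Expected, by tracing the function above on 1..10:
# [1, 10, 5, 6, 3, 8, 2, 4, 7, 9]
print(dichotomous_permutation(list(range(1, 11))))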

gklearn/experiments/taskhub.py (+29 -0)

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 26 09:53:33 2021

@author: ljia
"""

if __name__ == '__main__':
tasks = [
{'path': 'thesis/graph_kernels/fcsp',
'file': 'run_jobs_compare_fcsp.py'
},
{'path': 'thesis/graph_kernels/fcsp',
'file': 'run_jobs_compare_fcsp_space.py'
},
{'path': 'ged/stability',
'file': 'run_job_edit_costs.real_data.nums_sols.ratios.IPFP.py'
},
]

import os
for t in tasks:
print(t['file'])
command = ''
command += 'cd ' + t['path'] + '\n'
command += 'python3 ' + t['file'] + '\n'
# command += 'cd ' + '/'.join(['..'] * len(t['path'].split('/'))) + '\n'
os.system(command)
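
Editor's aside: the same dispatch could be written without composing a shell string, by running each script with subprocess and cwd; a sketch reusing the tasks list defined above (not part of the commit):

import subprocess

for t in tasks:
    print(t['file'])
    # Run the script from its own directory, mirroring the cd + python3 above.
    subprocess.run(['python3', t['file']], cwd=t['path'], check=False)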

gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py (+40 -1)

@@ -19,7 +19,15 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'),
('StructuralSP', 'NCI1', 'False'),
('ShortestPath', 'NCI109', 'False'),
('StructuralSP', 'NCI109', 'True'),
('ShortestPath', 'NCI-H23', 'True'),
('ShortestPath', 'NCI-H23', 'False'),
('StructuralSP', 'NCI-H23', 'True'),
('StructuralSP', 'NCI-H23', 'False'),
('StructuralSP', 'NCI109', 'False'),
('ShortestPath', 'NCI-H23H', 'True'),
('ShortestPath', 'NCI-H23H', 'False'),
('StructuralSP', 'NCI-H23H', 'True'),
('StructuralSP', 'NCI-H23H', 'False'),
('ShortestPath', 'DD', 'True'),
('ShortestPath', 'DD', 'False'),
('StructuralSP', 'BZR', 'False'),
@@ -27,9 +35,37 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'),
('StructuralSP', 'COX2', 'False'),
('ShortestPath', 'DHFR', 'False'),
('StructuralSP', 'DHFR', 'False'),
('ShortestPath', 'MCF-7', 'True'),
('ShortestPath', 'MCF-7', 'False'),
('StructuralSP', 'MCF-7', 'True'),
('StructuralSP', 'MCF-7', 'False'),
('ShortestPath', 'MCF-7H', 'True'),
('ShortestPath', 'MCF-7H', 'False'),
('StructuralSP', 'MCF-7H', 'True'),
('StructuralSP', 'MCF-7H', 'False'),
('ShortestPath', 'MOLT-4', 'True'),
('ShortestPath', 'MOLT-4', 'False'),
('StructuralSP', 'MOLT-4', 'True'),
('StructuralSP', 'MOLT-4', 'False'),
('ShortestPath', 'MOLT-4H', 'True'),
('ShortestPath', 'MOLT-4H', 'False'),
('StructuralSP', 'MOLT-4H', 'True'),
('StructuralSP', 'MOLT-4H', 'False'),
('StructuralSP', 'OHSU', 'True'),
('StructuralSP', 'OHSU', 'False'),
('StructuralSP', 'SYNTHETIC', 'False'),
('ShortestPath', 'OVCAR-8', 'True'),
('ShortestPath', 'OVCAR-8', 'False'),
('StructuralSP', 'OVCAR-8', 'True'),
('StructuralSP', 'OVCAR-8', 'False'),
('ShortestPath', 'OVCAR-8H', 'True'),
('ShortestPath', 'OVCAR-8H', 'False'),
('StructuralSP', 'OVCAR-8H', 'True'),
('StructuralSP', 'OVCAR-8H', 'False'),
('ShortestPath', 'P388', 'False'),
('ShortestPath', 'P388', 'True'),
('StructuralSP', 'P388', 'True'),
('StructuralSP', 'Steroid', 'False'),
('ShortestPath', 'SYNTHETIC', 'False'),
('StructuralSP', 'SYNTHETIC', 'True'),
('StructuralSP', 'SYNTHETIC', 'False'),
('ShortestPath', 'SYNTHETICnew', 'False'),
@@ -47,6 +83,9 @@ OUT_TIME_LIST = set({('ShortestPath', 'ENZYMES', 'False'),
('StructuralSP', 'Mutagenicity', 'False'),
('StructuralSP', 'REDDIT-BINARY', 'True'),
('StructuralSP', 'REDDIT-BINARY', 'False'),
('StructuralSP', 'Vitamin_D', 'False'),
('ShortestPath', 'Web', 'True'),
('ShortestPath', 'Web', 'False'),
})

OUT_MEM_LIST = set({('StructuralSP', 'DD', 'True'),


gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp_space.py (+9 -5)

@@ -17,6 +17,7 @@ OUT_TIME_LIST = []
OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'),
('ShortestPath', 'REDDIT-BINARY', 'False'),
('StructuralSP', 'ENZYMES', 'False'),
('StructuralSP', 'AIDS', 'False'),
('ShortestPath', 'DD', 'True'),
('ShortestPath', 'DD', 'False'),
('StructuralSP', 'DD', 'True'),
@@ -55,6 +56,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'),
('ShortestPath', 'P388H', 'False'),
('StructuralSP', 'P388H', 'True'),
('StructuralSP', 'P388H', 'False'),
('StructuralSP', 'NCI1', 'False'),
('ShortestPath', 'NCI-H23', 'True'),
('ShortestPath', 'NCI-H23', 'False'),
('StructuralSP', 'NCI-H23', 'True'),
@@ -63,6 +65,7 @@ OUT_MEM_LIST = set({('ShortestPath', 'REDDIT-BINARY', 'True'),
('ShortestPath', 'NCI-H23H', 'False'),
('StructuralSP', 'NCI-H23H', 'True'),
('StructuralSP', 'NCI-H23H', 'False'),
('StructuralSP', 'OHSU', 'False'),
('ShortestPath', 'OVCAR-8', 'True'),
('ShortestPath', 'OVCAR-8', 'False'),
('StructuralSP', 'OVCAR-8', 'True'),
@@ -208,11 +211,12 @@ def check_task_status(save_dir, *params):

# Check if the task is already computed.
file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl')
if os.path.isfile(file_name):
with open(file_name, 'rb') as f:
data = pickle.load(f)
if data['completed']:
return True
if os.path.getsize(file_name) > 0:
if os.path.isfile(file_name):
with open(file_name, 'rb') as f:
data = pickle.load(f)
if data['completed']:
return True

return False
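
Editor's note on the guarded load above: checking the file size skips zero-byte pickles left behind by interrupted runs. A stand-alone sketch of the same guard (written with isfile checked before getsize, so a missing file does not raise); an editor's illustration, not part of the commit:

import os
import pickle

def load_completed_result(file_name):
    # Return the unpickled dict only if the file exists, is non-empty,
    # and is marked as completed; otherwise return None.
    if os.path.isfile(file_name) and os.path.getsize(file_name) > 0:
        with open(file_name, 'rb') as f:
            data = pickle.load(f)
        if data.get('completed'):
            return data
    return None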



gklearn/kernels/__init__.py (+3 -2)

@@ -7,7 +7,6 @@ __version__ = "0.1"
__author__ = "Linlin Jia"
__date__ = "November 2018"

from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels

from gklearn.kernels.graph_kernel import GraphKernel
from gklearn.kernels.common_walk import CommonWalk
@@ -24,6 +23,8 @@ from gklearn.kernels.path_up_to_h import PathUpToH
from gklearn.kernels.treelet import Treelet
from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree

from gklearn.kernels.metadata import GRAPH_KERNELS, list_of_graph_kernels

# old version.
from gklearn.kernels.commonWalkKernel import commonwalkkernel
from gklearn.kernels.marginalizedKernel import marginalizedkernel
@@ -32,4 +33,4 @@ from gklearn.kernels.spKernel import spkernel
from gklearn.kernels.structuralspKernel import structuralspkernel
from gklearn.kernels.untilHPathKernel import untilhpathkernel
from gklearn.kernels.treeletKernel import treeletkernel
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel

gklearn/kernels/common_walk.py (+5 -5)

@@ -47,7 +47,7 @@ class CommonWalk(GraphKernel):
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self._verbose >= 2))
length=len_itr, verbose=(self.verbose >= 2))

# direct product graph method - exponential
if self._compute_method == 'exp':
@@ -86,7 +86,7 @@ class CommonWalk(GraphKernel):
do_fun = self._wrapper_kernel_do_geo

parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm,
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose)

return gram_matrix

@@ -100,9 +100,9 @@ class CommonWalk(GraphKernel):

# compute kernel list.
kernel_list = [None] * len(g_list)
if self._verbose >= 2:
if self.verbose >= 2:
iterator = get_iters(range(len(g_list)), desc='Computing kernels',
file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))
else:
iterator = range(len(g_list))

@@ -148,7 +148,7 @@ class CommonWalk(GraphKernel):
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

return kernel_list



gklearn/kernels/conjugate_gradient.py (+13 -13)

@@ -35,7 +35,7 @@ class ConjugateGradient(RandomWalkMeta):


def _compute_gm_series(self):
self._check_edge_weight(self._graphs, self._verbose)
self._check_edge_weight(self._graphs, self.verbose)
self._check_graphs(self._graphs)

lmda = self._weight
@@ -44,7 +44,7 @@ class ConjugateGradient(RandomWalkMeta):
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

# Reindex nodes using consecutive integers for the convenience of kernel computation.
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2))
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

if self._p is None and self._q is None: # p and q are uniform distributions as default.
@@ -52,7 +52,7 @@ class ConjugateGradient(RandomWalkMeta):
from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2))

for i, j in iterator:
kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda)
@@ -66,7 +66,7 @@ class ConjugateGradient(RandomWalkMeta):


def _compute_gm_imap_unordered(self):
self._check_edge_weight(self._graphs, self._verbose)
self._check_edge_weight(self._graphs, self.verbose)
self._check_graphs(self._graphs)

# Compute Gram matrix.
@@ -74,7 +74,7 @@ class ConjugateGradient(RandomWalkMeta):

# @todo: parallel this.
# Reindex nodes using consecutive integers for the convenience of kernel computation.
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2))
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

if self._p is None and self._q is None: # p and q are uniform distributions as default.
@@ -86,7 +86,7 @@ class ConjugateGradient(RandomWalkMeta):
do_fun = self._wrapper_kernel_do

parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose)

else: # @todo
pass
@@ -95,7 +95,7 @@ class ConjugateGradient(RandomWalkMeta):


def _compute_kernel_list_series(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose)
self._check_edge_weight(g_list + [g1], self.verbose)
self._check_graphs(g_list + [g1])

lmda = self._weight
@@ -105,11 +105,11 @@ class ConjugateGradient(RandomWalkMeta):

# Reindex nodes using consecutive integers for the convenience of kernel computation.
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2))
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

if self._p is None and self._q is None: # p and q are uniform distributions as default.
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))

for i in iterator:
kernel = self._kernel_do(g1, g_list[i], lmda)
@@ -122,7 +122,7 @@ class ConjugateGradient(RandomWalkMeta):


def _compute_kernel_list_imap_unordered(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose)
self._check_edge_weight(g_list + [g1], self.verbose)
self._check_graphs(g_list + [g1])

# compute kernel list.
@@ -131,7 +131,7 @@ class ConjugateGradient(RandomWalkMeta):
# Reindex nodes using consecutive integers for the convenience of kernel computation.
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
# @todo: parallel this.
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2))
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

if self._p is None and self._q is None: # p and q are uniform distributions as default.
@@ -149,7 +149,7 @@ class ConjugateGradient(RandomWalkMeta):
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

else: # @todo
pass
@@ -162,7 +162,7 @@ class ConjugateGradient(RandomWalkMeta):


def _compute_single_kernel_series(self, g1, g2):
self._check_edge_weight([g1] + [g2], self._verbose)
self._check_edge_weight([g1] + [g2], self.verbose)
self._check_graphs([g1] + [g2])

lmda = self._weight


gklearn/kernels/fixed_point.py (+13 -13)

@@ -35,7 +35,7 @@ class FixedPoint(RandomWalkMeta):


def _compute_gm_series(self):
self._check_edge_weight(self._graphs, self._verbose)
self._check_edge_weight(self._graphs, self.verbose)
self._check_graphs(self._graphs)

lmda = self._weight
@@ -44,7 +44,7 @@ class FixedPoint(RandomWalkMeta):
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

# Reindex nodes using consecutive integers for the convenience of kernel computation.
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self.verbose >= 2))
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

if self._p is None and self._q is None: # p and q are uniform distributions as default.
@@ -52,7 +52,7 @@ class FixedPoint(RandomWalkMeta):
from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2))

for i, j in iterator:
kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda)
@@ -66,7 +66,7 @@ class FixedPoint(RandomWalkMeta):


def _compute_gm_imap_unordered(self):
self._check_edge_weight(self._graphs, self._verbose)
self._check_edge_weight(self._graphs, self.verbose)
self._check_graphs(self._graphs)

# Compute Gram matrix.
@@ -74,7 +74,7 @@ class FixedPoint(RandomWalkMeta):

# @todo: parallel this.
# Reindex nodes using consecutive integers for the convenience of kernel computation.
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2))
self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

if self._p is None and self._q is None: # p and q are uniform distributions as default.
@@ -86,7 +86,7 @@ class FixedPoint(RandomWalkMeta):
do_fun = self._wrapper_kernel_do

parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose)

else: # @todo
pass
@@ -95,7 +95,7 @@ class FixedPoint(RandomWalkMeta):


def _compute_kernel_list_series(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose)
self._check_edge_weight(g_list + [g1], self.verbose)
self._check_graphs(g_list + [g1])

lmda = self._weight
@@ -105,12 +105,12 @@ class FixedPoint(RandomWalkMeta):

# Reindex nodes using consecutive integers for the convenience of kernel computation.
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2))
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

if self._p is None and self._q is None: # p and q are uniform distributions as default.

iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))

for i in iterator:
kernel = self._kernel_do(g1, g_list[i], lmda)
@@ -123,7 +123,7 @@ class FixedPoint(RandomWalkMeta):


def _compute_kernel_list_imap_unordered(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose)
self._check_edge_weight(g_list + [g1], self.verbose)
self._check_graphs(g_list + [g1])

# compute kernel list.
@@ -132,7 +132,7 @@ class FixedPoint(RandomWalkMeta):
# Reindex nodes using consecutive integers for the convenience of kernel computation.
g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
# @todo: parallel this.
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2))
g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]

if self._p is None and self._q is None: # p and q are uniform distributions as default.
@@ -150,7 +150,7 @@ class FixedPoint(RandomWalkMeta):
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

else: # @todo
pass
@@ -163,7 +163,7 @@ class FixedPoint(RandomWalkMeta):


def _compute_single_kernel_series(self, g1, g2):
self._check_edge_weight([g1] + [g2], self._verbose)
self._check_edge_weight([g1] + [g2], self.verbose)
self._check_graphs([g1] + [g2])

lmda = self._weight


gklearn/kernels/graph_kernel.py (+422 -44)

@@ -9,55 +9,433 @@ import numpy as np
import networkx as nx
import multiprocessing
import time
# from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator # , TransformerMixin
from sklearn.utils.validation import check_is_fitted # check_X_y, check_array,
from sklearn.exceptions import NotFittedError
from gklearn.utils import normalize_gram_matrix


class GraphKernel(object):
class GraphKernel(BaseEstimator): #, ABC):
"""The basic graph kernel class.

def __init__(self):
self._graphs = None
self._parallel = ''
self._n_jobs = 0
self._verbose = None
self._normalize = True
self._run_time = 0
self._gram_matrix = None
self._gram_matrix_unnorm = None
Attributes
----------
_graphs : list
Stores the input graphs on fit input data.
Default format of the list objects is `NetworkX` graphs.
**We don't guarantee that the input graphs remain unchanged during the
computation.**

References
----------
https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
"""

def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2):
"""`__init__` for `GraphKernel` object."""
# @todo: the default settings of the parameters are different from those in the self.compute method.
# self._graphs = None
self.parallel = parallel
self.n_jobs = n_jobs
self.chunksize = chunksize
self.normalize = normalize
self.verbose = verbose
# self._run_time = 0
# self._gram_matrix = None
# self._gram_matrix_unnorm = None

def compute(self, *graphs, **kwargs):
self._parallel = kwargs.get('parallel', 'imap_unordered')
self._n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
self._normalize = kwargs.get('normalize', True)
self._verbose = kwargs.get('verbose', 2)

##########################################################################
# The following is the 1st paradigm to compute kernel matrix, which is
# compatible with `scikit-learn`.
# -------------------------------------------------------------------
# Special thanks to the "GraKeL" library for providing an excellent template!
##########################################################################


def fit(self, X, y=None):
"""Fit a graph dataset for a transformer.

Parameters
----------
X : iterable
DESCRIPTION.

y : None, optional
There is no need of a target in a transformer, yet the `scikit-learn`
pipeline API requires this parameter.

Returns
-------
object
Returns self.

"""
# self._is_tranformed = False

# Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used;
self.clear_attributes()

# Validate parameters for the transformer.
self.validate_parameters()

# Validate the input.
self._graphs = self.validate_input(X)

# self._X = X
# self._kernel = self._get_kernel_instance()

# Return the transformer.
return self


def transform(self, X):
"""Compute the graph kernel matrix between given and fitted data.

Parameters
----------
X : TYPE
DESCRIPTION.

Raises
------
ValueError
DESCRIPTION.

Returns
-------
None.

"""
# Check if method "fit" had been called.
check_is_fitted(self, '_graphs')

# Validate the input.
Y = self.validate_input(X)

# Transform: compute the graph kernel matrix.
kernel_matrix = self.compute_kernel_matrix(Y)
self._Y = Y

# Self transform must appear before the diagonal call on normalization.
self._is_transformed = True
if self.normalize:
X_diag, Y_diag = self.diagonals()
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt.
try:
kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag))
except:
raise
finally:
np.seterr(**old_settings)

return kernel_matrix



def fit_transform(self, X):
"""Fit and transform: compute Gram matrix on the same data.

Parameters
----------
X : list of graphs
Input graphs.

Returns
-------
gram_matrix : numpy array, shape = [len(X), len(X)]
The Gram matrix of X.

"""
self.fit(X)

# Transform: compute Gram matrix.
gram_matrix = self.compute_kernel_matrix()

# Normalize.
if self.normalize:
self._X_diag = np.diagonal(gram_matrix).copy()
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt.
try:
gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag))
except:
raise
finally:
np.seterr(**old_settings)

return gram_matrix


def get_params(self):
pass


def set_params(self):
pass


def clear_attributes(self):
if hasattr(self, '_X_diag'):
delattr(self, '_X_diag')
if hasattr(self, '_graphs'):
delattr(self, '_graphs')
if hasattr(self, '_Y'):
delattr(self, '_Y')
if hasattr(self, '_run_time'):
delattr(self, '_run_time')


def validate_parameters(self):
"""Validate all parameters for the transformer.

Returns
-------
None.

"""
if self.parallel is not None and self.parallel != 'imap_unordered':
raise ValueError('Parallel mode is not set correctly.')

if self.parallel == 'imap_unordered' and self.n_jobs is None:
self.n_jobs = multiprocessing.cpu_count()


def validate_input(self, X):
"""Validate the given input and raise errors if it is invalid.

Parameters
----------
X : list
The input to check. Should be a list of graph.

Raises
------
ValueError
Raise if the input is not correct.

Returns
-------
X : list
The input. A list of graph.

"""
if X is None:
raise ValueError('Please add graphs before computing.')
elif not isinstance(X, list):
raise ValueError('Cannot detect graphs.')
elif len(X) == 0:
raise ValueError('The graph list given is empty. No computation will be performed.')

return X


def compute_kernel_matrix(self, Y=None):
"""Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) or the Gram matrix for the fitted
graphs (X / self._graphs).

Parameters
----------
Y : list of graphs, optional
The target graphs. The default is None. If None kernel is computed
between X and itself.

Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
The computed kernel matrix.

"""
if Y is None:
# Compute Gram matrix for self._graphs (X).
kernel_matrix = self._compute_gram_matrix()
# self._gram_matrix_unnorm = np.copy(self._gram_matrix)

else:
# Compute kernel matrix between Y and self._graphs (X).
start_time = time.time()

if self.parallel == 'imap_unordered':
kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y)

elif self.parallel is None:
kernel_matrix = self._compute_kernel_matrix_series(Y)

self._run_time = time.time() - start_time
if self.verbose:
print('Kernel matrix of size (%d, %d) built in %s seconds.'
% (len(Y), len(self._graphs), self._run_time))

return kernel_matrix


def _compute_kernel_matrix_series(self, Y):
"""Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) without parallelization.

Parameters
----------
Y : list of graphs, optional
The target graphs.

Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
The computed kernel matrix.

"""
kernel_matrix = np.zeros((len(Y), len(self._graphs)))

for i_y, g_y in enumerate(Y):
for i_x, g_x in enumerate(self._graphs):
kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x)

return kernel_matrix


def _compute_kernel_matrix_imap_unordered(self, Y):
"""Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) using imap unordered parallelization.

Parameters
----------
Y : list of graphs, optional
The target graphs.

Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
The computed kernel matrix.

"""
raise Exception('Parallelization for kernel matrix is not implemented.')


def diagonals(self):
"""Compute the kernel matrix diagonals of the fit/transformed data.

Returns
-------
X_diag : numpy array
The diagonal of the kernel matrix between the fitted data.
This consists of each element calculated with itself.

Y_diag : numpy array
The diagonal of the kernel matrix, of the transform.
This consists of each element calculated with itself.

"""
# Check if method "fit" had been called.
check_is_fitted(self, ['_graphs'])

# Check if the diagonals of X exist.
try:
check_is_fitted(self, ['_X_diag'])
except NotFittedError:
# Compute diagonals of X.
self._X_diag = np.empty(shape=(len(self._graphs),))
for i, x in enumerate(self._graphs):
self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel?

try:
# If transform has happened, return both diagonals.
check_is_fitted(self, ['_Y'])
self._Y_diag = np.empty(shape=(len(self._Y),))
for (i, y) in enumerate(self._Y):
self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel?

return self._X_diag, self._Y_diag
except NotFittedError:
# Else just return both X_diag
return self._X_diag


# @abstractmethod
def pairwise_kernel(self, x, y):
"""Compute pairwise kernel between two graphs.

Parameters
----------
x, y : NetworkX Graph.
Graphs between which the kernel is computed.

Returns
-------
kernel: float
The computed kernel.

# Notes
# -----
# This method is abstract and must be implemented by a subclass.

"""
raise NotImplementedError('Pairwise kernel computation is not implemented!')


##########################################################################
# The following is the 2nd paradigm to compute kernel matrix. It is
# simplified and not compatible with `scikit-learn`.
##########################################################################


def compute(self, *graphs, **kwargs):
self.parallel = kwargs.get('parallel', 'imap_unordered')
self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
self.normalize = kwargs.get('normalize', True)
self.verbose = kwargs.get('verbose', 2)
self.copy_graphs = kwargs.get('copy_graphs', True)
self.save_unnormed = kwargs.get('save_unnormed', True)
self.validate_parameters()

# If the inputs is a list of graphs.
if len(graphs) == 1:
if not isinstance(graphs[0], list):
raise Exception('Cannot detect graphs.')
elif len(graphs[0]) == 0:
raise Exception('The graph list given is empty. No computation was performed.')
else:
self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow.
if self.copy_graphs:
self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow.
else:
self._graphs = graphs
self._gram_matrix = self._compute_gram_matrix()
self._gram_matrix_unnorm = np.copy(self._gram_matrix)
if self._normalize:

if self.save_unnormed:
self._gram_matrix_unnorm = np.copy(self._gram_matrix)
if self.normalize:
self._gram_matrix = normalize_gram_matrix(self._gram_matrix)
return self._gram_matrix, self._run_time

elif len(graphs) == 2:
# If the inputs are two graphs.
if self.is_graph(graphs[0]) and self.is_graph(graphs[1]):
kernel = self._compute_single_kernel(graphs[0].copy(), graphs[1].copy())
if self.copy_graphs:
G0, G1 = graphs[0].copy(), graphs[1].copy()
else:
G0, G1 = graphs[0], graphs[1]
kernel = self._compute_single_kernel(G0, G1)
return kernel, self._run_time

# If the inputs are a graph and a list of graphs.
elif self.is_graph(graphs[0]) and isinstance(graphs[1], list):
g1 = graphs[0].copy()
g_list = [g.copy() for g in graphs[1]]
kernel_list = self._compute_kernel_list(g1, g_list)
if self.copy_graphs:
g1 = graphs[0].copy()
g_list = [g.copy() for g in graphs[1]]
kernel_list = self._compute_kernel_list(g1, g_list)
else:
kernel_list = self._compute_kernel_list(graphs[0], graphs[1])
return kernel_list, self._run_time

elif isinstance(graphs[0], list) and self.is_graph(graphs[1]):
g1 = graphs[1].copy()
g_list = [g.copy() for g in graphs[0]]
kernel_list = self._compute_kernel_list(g1, g_list)
if self.copy_graphs:
g1 = graphs[1].copy()
g_list = [g.copy() for g in graphs[0]]
kernel_list = self._compute_kernel_list(g1, g_list)
else:
kernel_list = self._compute_kernel_list(graphs[1], graphs[0])
return kernel_list, self._run_time

else:
raise Exception('Cannot detect graphs.')

@@ -103,15 +481,15 @@ class GraphKernel(object):
def _compute_gram_matrix(self):
start_time = time.time()

if self._parallel == 'imap_unordered':
if self.parallel == 'imap_unordered':
gram_matrix = self._compute_gm_imap_unordered()
elif self._parallel is None:
elif self.parallel is None:
gram_matrix = self._compute_gm_series()
else:
raise Exception('Parallel mode is not set correctly.')

self._run_time = time.time() - start_time
if self._verbose:
if self.verbose:
print('Gram matrix of size %d built in %s seconds.'
% (len(self._graphs), self._run_time))

@@ -129,15 +507,15 @@ class GraphKernel(object):
def _compute_kernel_list(self, g1, g_list):
start_time = time.time()

if self._parallel == 'imap_unordered':
if self.parallel == 'imap_unordered':
kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list)
elif self._parallel is None:
elif self.parallel is None:
kernel_list = self._compute_kernel_list_series(g1, g_list)
else:
raise Exception('Parallel mode is not set correctly.')

self._run_time = time.time() - start_time
if self._verbose:
if self.verbose:
print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.'
% (len(g_list), self._run_time))

@@ -158,7 +536,7 @@ class GraphKernel(object):
kernel = self._compute_single_kernel_series(g1, g2)

self._run_time = time.time() - start_time
if self._verbose:
if self.verbose:
print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time))

return kernel
@@ -185,24 +563,24 @@ class GraphKernel(object):
return self._graphs


@property
def parallel(self):
return self._parallel
# @property
# def parallel(self):
# return self.parallel


@property
def n_jobs(self):
return self._n_jobs
# @property
# def n_jobs(self):
# return self.n_jobs


@property
def verbose(self):
return self._verbose
# @property
# def verbose(self):
# return self.verbose


@property
def normalize(self):
return self._normalize
# @property
# def normalize(self):
# return self.normalize


@property
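
Editor's note on the rewritten GraphKernel: the class now supports both the scikit-learn style fit/transform/fit_transform paradigm and the original compute() call, and normalization divides k(x, y) by sqrt(k(x, x) * k(y, y)). A minimal runnable sketch of the first paradigm with a toy subclass (NodeCountKernel is hypothetical, purely for illustration; concrete gklearn kernels implement pairwise_kernel and the _compute_gm_* methods themselves):

import networkx as nx
import numpy as np
from gklearn.kernels import GraphKernel

class NodeCountKernel(GraphKernel):
    # Toy kernel: product of node counts; just enough to exercise fit/transform.
    def pairwise_kernel(self, x, y):
        return float(nx.number_of_nodes(x) * nx.number_of_nodes(y))

    def _compute_gm_series(self):
        n = len(self._graphs)
        gm = np.zeros((n, n))
        for i in range(n):
            for j in range(i, n):
                gm[i, j] = gm[j, i] = self.pairwise_kernel(self._graphs[i], self._graphs[j])
        return gm

graphs = [nx.path_graph(k) for k in (2, 3, 4)]
kernel = NodeCountKernel(parallel=None, normalize=True, verbose=0)
K_train = kernel.fit_transform(graphs)          # Gram matrix of the fitted graphs
K_test = kernel.transform([nx.path_graph(5)])   # kernel between new and fitted graphs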


gklearn/kernels/marginalized.py (+14 -14)

@@ -46,7 +46,7 @@ class Marginalized(GraphKernel):
self._add_dummy_labels(self._graphs)

if self._remove_totters:
iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2))
# @todo: this may not work.
self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator]

@@ -57,7 +57,7 @@ class Marginalized(GraphKernel):
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self._verbose >= 2))
length=len_itr, verbose=(self.verbose >= 2))
for i, j in iterator:
kernel = self._kernel_do(self._graphs[i], self._graphs[j])
gram_matrix[i][j] = kernel
@@ -70,16 +70,16 @@ class Marginalized(GraphKernel):
self._add_dummy_labels(self._graphs)

if self._remove_totters:
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
itr = range(0, len(self._graphs))
if len(self._graphs) < 100 * self._n_jobs:
chunksize = int(len(self._graphs) / self._n_jobs) + 1
if len(self._graphs) < 100 * self.n_jobs:
chunksize = int(len(self._graphs) / self.n_jobs) + 1
else:
chunksize = 100
remove_fun = self._wrapper_untotter
iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize),
desc='removing tottering', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
length=len(self._graphs), verbose=(self.verbose >= 2))
for i, g in iterator:
self._graphs[i] = g
pool.close()
@@ -93,7 +93,7 @@ class Marginalized(GraphKernel):
G_gn = gn_toshare
do_fun = self._wrapper_kernel_do
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose)

return gram_matrix

@@ -103,13 +103,13 @@ class Marginalized(GraphKernel):

if self._remove_totters:
g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work.
iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2))
# @todo: this may not work.
g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator]

# compute kernel list.
kernel_list = [None] * len(g_list)
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))
for i in iterator:
kernel = self._kernel_do(g1, g_list[i])
kernel_list[i] = kernel
@@ -122,16 +122,16 @@ class Marginalized(GraphKernel):

if self._remove_totters:
g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work.
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
itr = range(0, len(g_list))
if len(g_list) < 100 * self._n_jobs:
chunksize = int(len(g_list) / self._n_jobs) + 1
if len(g_list) < 100 * self.n_jobs:
chunksize = int(len(g_list) / self.n_jobs) + 1
else:
chunksize = 100
remove_fun = self._wrapper_untotter
iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize),
desc='removing tottering', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
length=len(g_list), verbose=(self.verbose >= 2))
for i, g in iterator:
g_list[i] = g
pool.close()
@@ -151,7 +151,7 @@ class Marginalized(GraphKernel):
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

return kernel_list



gklearn/kernels/metadata.py (+23 -11)

@@ -5,23 +5,35 @@ Created on Fri Nov 6 10:11:08 2020

@author: ljia
"""
from gklearn.kernels.common_walk import CommonWalk
from gklearn.kernels.marginalized import Marginalized
from gklearn.kernels.sylvester_equation import SylvesterEquation
from gklearn.kernels.conjugate_gradient import ConjugateGradient
from gklearn.kernels.fixed_point import FixedPoint
from gklearn.kernels.spectral_decomposition import SpectralDecomposition
from gklearn.kernels.shortest_path import ShortestPath
from gklearn.kernels.structural_sp import StructuralSP
from gklearn.kernels.path_up_to_h import PathUpToH
from gklearn.kernels.treelet import Treelet
from gklearn.kernels.weisfeiler_lehman import WLSubtree


# The metadata of all graph kernels.
GRAPH_KERNELS = {
### based on walks.
'common walk': '',
'marginalized': '',
'sylvester equation': '',
'fixed point': '',
'conjugate gradient': '',
'spectral decomposition': '',
'common walk': CommonWalk,
'marginalized': Marginalized,
'sylvester equation': SylvesterEquation,
'fixed point': FixedPoint,
'conjugate gradient': ConjugateGradient,
'spectral decomposition': SpectralDecomposition,
### based on paths.
'shortest path': '',
'structural shortest path': '',
'path up to length h': '',
'shortest path': ShortestPath,
'structural shortest path': StructuralSP,
'path up to length h': PathUpToH,
### based on non-linear patterns.
'weisfeiler-lehman subtree': '',
'treelet': '',
'weisfeiler-lehman subtree': WLSubtree,
'treelet': Treelet,
}
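
Editor's note: with the metadata now mapping kernel names to classes instead of empty strings, a kernel class can be looked up by name; a small sketch (instantiation is omitted because the constructors' required arguments are not shown in this diff):

from gklearn.kernels import GRAPH_KERNELS

KernelClass = GRAPH_KERNELS['shortest path']
print(KernelClass.__name__)          # 'ShortestPath'
for name, cls in GRAPH_KERNELS.items():
    print(name, '->', cls.__name__)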




gklearn/kernels/path_up_to_h.py (+14 -14)

@@ -41,10 +41,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None

from itertools import combinations_with_replacement
itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2)
iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self._verbose >= 2))
iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self.verbose >= 2))
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator_kernel = get_iters(itr_kernel, desc='Computing kernels',
file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))
file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2))

gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

@@ -69,10 +69,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None

# get all paths of all graphs before computing kernels to save time,
# but this may cost a lot of memory for large datasets.
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
itr = zip(self._graphs, range(0, len(self._graphs)))
if len(self._graphs) < 100 * self._n_jobs:
chunksize = int(len(self._graphs) / self._n_jobs) + 1
if len(self._graphs) < 100 * self.n_jobs:
chunksize = int(len(self._graphs) / self.n_jobs) + 1
else:
chunksize = 100
all_paths = [[] for _ in range(len(self._graphs))]
@@ -84,7 +84,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize),
desc='getting paths', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
length=len(self._graphs), verbose=(self.verbose >= 2))
for i, ps in iterator:
all_paths[i] = ps
pool.close()
@@ -109,7 +109,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
G_plist = plist_toshare
do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this?
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(all_paths,), n_jobs=self.n_jobs, verbose=self.verbose)

return gram_matrix

@@ -117,8 +117,8 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
def _compute_kernel_list_series(self, g1, g_list):
self._add_dummy_labels(g_list + [g1])

iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self._verbose >= 2))
iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self.verbose >= 2))
iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))

kernel_list = [None] * len(g_list)

@@ -143,10 +143,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None

# get all paths of all graphs before computing kernels to save time,
# but this may cost a lot of memory for large datasets.
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
itr = zip(g_list, range(0, len(g_list)))
if len(g_list) < 100 * self._n_jobs:
chunksize = int(len(g_list) / self._n_jobs) + 1
if len(g_list) < 100 * self.n_jobs:
chunksize = int(len(g_list) / self.n_jobs) + 1
else:
chunksize = 100
paths_g_list = [[] for _ in range(len(g_list))]
@@ -161,7 +161,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize),
desc='getting paths', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
length=len(g_list), verbose=(self.verbose >= 2))
for i, ps in iterator:
paths_g_list[i] = ps
pool.close()
@@ -180,7 +180,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None
itr = range(len(g_list))
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

return kernel_list
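
The parallel code paths above (and in the kernels that follow) pick the `chunksize` for `Pool.imap_unordered` with the same rule of thumb. A minimal sketch of that heuristic; the helper name `pick_chunksize` is hypothetical and only illustrates the logic repeated in the diffs:

import multiprocessing

def pick_chunksize(n_items, n_jobs):
    # For small inputs, split the work roughly evenly across the workers;
    # otherwise cap the chunk size at 100 to keep scheduling responsive.
    if n_items < 100 * n_jobs:
        return int(n_items / n_jobs) + 1
    return 100

print(pick_chunksize(50, 4))     # 13 -> roughly even split over 4 workers
print(pick_chunksize(10000, 4))  # 100 -> capped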



+ 14
- 14
gklearn/kernels/shortest_path.py View File

@@ -38,7 +38,7 @@ class ShortestPath(GraphKernel):
def _compute_gm_series(self):
self._all_graphs_have_edges(self._graphs)
# get shortest path graph of each graph.
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2))
self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

# compute Gram matrix.
@@ -48,7 +48,7 @@ class ShortestPath(GraphKernel):
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels',
length=len_itr, file=sys.stdout,verbose=(self._verbose >= 2))
length=len_itr, file=sys.stdout,verbose=(self.verbose >= 2))
for i, j in iterator:
kernel = self._sp_do(self._graphs[i], self._graphs[j])
gram_matrix[i][j] = kernel
@@ -60,16 +60,16 @@ class ShortestPath(GraphKernel):
def _compute_gm_imap_unordered(self):
self._all_graphs_have_edges(self._graphs)
# get shortest path graph of each graph.
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
get_sp_graphs_fun = self._wrapper_get_sp_graphs
itr = zip(self._graphs, range(0, len(self._graphs)))
if len(self._graphs) < 100 * self._n_jobs:
chunksize = int(len(self._graphs) / self._n_jobs) + 1
if len(self._graphs) < 100 * self.n_jobs:
chunksize = int(len(self._graphs) / self.n_jobs) + 1
else:
chunksize = 100
iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize),
desc='getting sp graphs', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
length=len(self._graphs), verbose=(self.verbose >= 2))
for i, g in iterator:
self._graphs[i] = g
pool.close()
@@ -83,7 +83,7 @@ class ShortestPath(GraphKernel):
G_gs = gs_toshare
do_fun = self._wrapper_sp_do
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose)

return gram_matrix

@@ -92,12 +92,12 @@ class ShortestPath(GraphKernel):
self._all_graphs_have_edges([g1] + g_list)
# get shortest path graphs of g1 and each graph in g_list.
g1 = getSPGraph(g1, edge_weight=self._edge_weight)
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2))
g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

# compute kernel list.
kernel_list = [None] * len(g_list)
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))
for i in iterator:
kernel = self._sp_do(g1, g_list[i])
kernel_list[i] = kernel
@@ -109,16 +109,16 @@ class ShortestPath(GraphKernel):
self._all_graphs_have_edges([g1] + g_list)
# get shortest path graphs of g1 and each graph in g_list.
g1 = getSPGraph(g1, edge_weight=self._edge_weight)
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
get_sp_graphs_fun = self._wrapper_get_sp_graphs
itr = zip(g_list, range(0, len(g_list)))
if len(g_list) < 100 * self._n_jobs:
chunksize = int(len(g_list) / self._n_jobs) + 1
if len(g_list) < 100 * self.n_jobs:
chunksize = int(len(g_list) / self.n_jobs) + 1
else:
chunksize = 100
iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize),
desc='getting sp graphs', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
length=len(g_list), verbose=(self.verbose >= 2))
for i, g in iterator:
g_list[i] = g
pool.close()
@@ -137,7 +137,7 @@ class ShortestPath(GraphKernel):
itr = range(len(g_list))
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

return kernel_list
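
The "sp graphs" built by `getSPGraph` above connect every reachable node pair by an edge carrying their shortest-path distance in the original graph. A conceptual sketch only, not gklearn's actual implementation; the `cost` edge attribute name is an assumption:

import networkx as nx

def sp_graph(G):
    # Connect every reachable pair of nodes with an edge whose 'cost' is the
    # shortest-path distance between them in the original graph.
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    for u, dists in nx.all_pairs_shortest_path_length(G):
        for v, d in dists.items():
            if u != v:
                S.add_edge(u, v, cost=d)
    return S

S = sp_graph(nx.path_graph(4))
print(S.number_of_edges())  # 6: all pairs of the 4 nodes are connected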



+ 19
- 19
gklearn/kernels/spectral_decomposition.py View File

@@ -28,9 +28,9 @@ class SpectralDecomposition(RandomWalkMeta):


def _compute_gm_series(self):
self._check_edge_weight(self._graphs, self._verbose)
self._check_edge_weight(self._graphs, self.verbose)
self._check_graphs(self._graphs)
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.')

@@ -41,7 +41,7 @@ class SpectralDecomposition(RandomWalkMeta):
# precompute the spectral decomposition of each graph.
P_list = []
D_list = []
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2))
for G in iterator:
# don't normalize adjacency matrices if q is a uniform vector. Note
# A actually is the transpose of the adjacency matrix.
@@ -58,7 +58,7 @@ class SpectralDecomposition(RandomWalkMeta):
from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2))

for i, j in iterator:
kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel)
@@ -74,9 +74,9 @@ class SpectralDecomposition(RandomWalkMeta):


def _compute_gm_imap_unordered(self):
self._check_edge_weight(self._graphs, self._verbose)
self._check_edge_weight(self._graphs, self.verbose)
self._check_graphs(self._graphs)
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.')

@@ -87,7 +87,7 @@ class SpectralDecomposition(RandomWalkMeta):
# precompute the spectral decomposition of each graph.
P_list = []
D_list = []
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2))
for G in iterator:
# don't normalize adjacency matrices if q is a uniform vector. Note
# A actually is the transpose of the adjacency matrix.
@@ -107,7 +107,7 @@ class SpectralDecomposition(RandomWalkMeta):

do_fun = self._wrapper_kernel_do
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(q_T_list, P_list, D_list), n_jobs=self.n_jobs, verbose=self.verbose)

else: # @todo
pass
@@ -118,9 +118,9 @@ class SpectralDecomposition(RandomWalkMeta):


def _compute_kernel_list_series(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose)
self._check_edge_weight(g_list + [g1], self.verbose)
self._check_graphs(g_list + [g1])
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.')

@@ -133,7 +133,7 @@ class SpectralDecomposition(RandomWalkMeta):
D1, P1 = np.linalg.eig(A1)
P_list = []
D_list = []
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2))
for G in iterator:
# don't normalize adjacency matrices if q is a uniform vector. Note
# A actually is the transpose of the adjacency matrix.
@@ -145,7 +145,7 @@ class SpectralDecomposition(RandomWalkMeta):
if self._p is None: # p is uniform distribution as default.
q_T1 = 1 / nx.number_of_nodes(g1)
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list]
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))

for i in iterator:
kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel)
@@ -160,9 +160,9 @@ class SpectralDecomposition(RandomWalkMeta):


def _compute_kernel_list_imap_unordered(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose)
self._check_edge_weight(g_list + [g1], self.verbose)
self._check_graphs(g_list + [g1])
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.')

@@ -175,8 +175,8 @@ class SpectralDecomposition(RandomWalkMeta):
D1, P1 = np.linalg.eig(A1)
P_list = []
D_list = []
if self._verbose >= 2:
iterator = tqdm(g_list, desc='spectral decompose', file=sys.stdout)
if self.verbose >= 2:
iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout)
else:
iterator = g_list
for G in iterator:
@@ -207,7 +207,7 @@ class SpectralDecomposition(RandomWalkMeta):
itr = range(len(g_list))
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

else: # @todo
pass
@@ -222,9 +222,9 @@ class SpectralDecomposition(RandomWalkMeta):


def _compute_single_kernel_series(self, g1, g2):
self._check_edge_weight([g1] + [g2], self._verbose)
self._check_edge_weight([g1] + [g2], self.verbose)
self._check_graphs([g1] + [g2])
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.')
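
The series and parallel paths above share the same per-graph precomputation: the transposed (un-normalized) adjacency matrix, its eigendecomposition, and a uniform start distribution. A small sketch mirroring those lines for a single graph, illustrative rather than the library's exact code:

import numpy as np
import networkx as nx

G = nx.cycle_graph(4)

# Transpose of the adjacency matrix; it is left un-normalized because the
# start distribution q is uniform in the default case.
A = nx.adjacency_matrix(G).todense().transpose()

# Spectral decomposition, precomputed once per graph.
D, P = np.linalg.eig(A)

# Uniform start distribution q^T, one entry per node.
n = nx.number_of_nodes(G)
q_T = np.full((1, n), 1 / n)

print(D.shape, P.shape, q_T.shape)  # (4,) (4, 4) (1, 4)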



+ 14
- 14
gklearn/kernels/structural_sp.py View File

@@ -41,7 +41,7 @@ class StructuralSP(GraphKernel):
def _compute_gm_series(self):
# get shortest paths of each graph in the graphs.
splist = []
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2))
if self._compute_method == 'trie':
for g in iterator:
splist.append(self._get_sps_as_trie(g))
@@ -56,7 +56,7 @@ class StructuralSP(GraphKernel):
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self._verbose >= 2))
length=len_itr, verbose=(self.verbose >= 2))
if self._compute_method == 'trie':
for i, j in iterator:
kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
@@ -76,10 +76,10 @@ class StructuralSP(GraphKernel):
def _compute_gm_imap_unordered(self):
# get shortest paths of each graph in the graphs.
splist = [None] * len(self._graphs)
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
itr = zip(self._graphs, range(0, len(self._graphs)))
if len(self._graphs) < 100 * self._n_jobs:
chunksize = int(len(self._graphs) / self._n_jobs) + 1
if len(self._graphs) < 100 * self.n_jobs:
chunksize = int(len(self._graphs) / self.n_jobs) + 1
else:
chunksize = 100
# get shortest path graphs of self._graphs
@@ -89,7 +89,7 @@ class StructuralSP(GraphKernel):
get_sps_fun = self._wrapper_get_sps_naive
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize),
desc='getting shortest paths', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
length=len(self._graphs), verbose=(self.verbose >= 2))
for i, sp in iterator:
splist[i] = sp
pool.close()
@@ -107,7 +107,7 @@ class StructuralSP(GraphKernel):
else:
do_fun = self._wrapper_ssp_do_naive
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(splist, self._graphs), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(splist, self._graphs), n_jobs=self.n_jobs, verbose=self.verbose)

return gram_matrix

@@ -117,7 +117,7 @@ class StructuralSP(GraphKernel):
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed'])
splist = []
iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout,
verbose=(self._verbose >= 2))
verbose=(self.verbose >= 2))
if self._compute_method == 'trie':
for g in iterator:
splist.append(self._get_sps_as_trie(g))
@@ -128,7 +128,7 @@ class StructuralSP(GraphKernel):
# compute kernel list.
kernel_list = [None] * len(g_list)
iterator = get_iters(range(len(g_list)), desc='Computing kernels',
file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))
if self._compute_method == 'trie':
for i in iterator:
kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i])
@@ -145,10 +145,10 @@ class StructuralSP(GraphKernel):
# get shortest paths of g1 and each graph in g_list.
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed'])
splist = [None] * len(g_list)
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
itr = zip(g_list, range(0, len(g_list)))
if len(g_list) < 100 * self._n_jobs:
chunksize = int(len(g_list) / self._n_jobs) + 1
if len(g_list) < 100 * self.n_jobs:
chunksize = int(len(g_list) / self.n_jobs) + 1
else:
chunksize = 100
# get shortest path graphs of g_list
@@ -158,7 +158,7 @@ class StructuralSP(GraphKernel):
get_sps_fun = self._wrapper_get_sps_naive
iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize),
desc='getting shortest paths', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
length=len(g_list), verbose=(self.verbose >= 2))
for i, sp in iterator:
splist[i] = sp
pool.close()
@@ -182,7 +182,7 @@ class StructuralSP(GraphKernel):
itr = range(len(g_list))
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

return kernel_list



+ 19
- 19
gklearn/kernels/sylvester_equation.py View File

@@ -14,6 +14,7 @@ import sys
from gklearn.utils import get_iters
import numpy as np
import networkx as nx
from control import dlyap
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.kernels import RandomWalkMeta

@@ -22,14 +23,13 @@ class SylvesterEquation(RandomWalkMeta):


def __init__(self, **kwargs):
from control import dlyap
super().__init__(**kwargs)


def _compute_gm_series(self):
self._check_edge_weight(self._graphs, self._verbose)
self._check_edge_weight(self._graphs, self.verbose)
self._check_graphs(self._graphs)
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored.')

@@ -41,7 +41,7 @@ class SylvesterEquation(RandomWalkMeta):
if self._q is None:
# don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices.
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2))
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator]
# # normalized adjacency matrices
# A_wave_list = []
@@ -55,7 +55,7 @@ class SylvesterEquation(RandomWalkMeta):
from itertools import combinations_with_replacement
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2))

for i, j in iterator:
kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda)
@@ -71,9 +71,9 @@ class SylvesterEquation(RandomWalkMeta):


def _compute_gm_imap_unordered(self):
self._check_edge_weight(self._graphs, self._verbose)
self._check_edge_weight(self._graphs, self.verbose)
self._check_graphs(self._graphs)
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored.')

@@ -83,7 +83,7 @@ class SylvesterEquation(RandomWalkMeta):
if self._q is None:
# don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices.
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2))
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel?

if self._p is None: # p is uniform distribution as default.
@@ -94,7 +94,7 @@ class SylvesterEquation(RandomWalkMeta):
do_fun = self._wrapper_kernel_do

parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(A_wave_list,), n_jobs=self.n_jobs, verbose=self.verbose)

else: # @todo
pass
@@ -105,9 +105,9 @@ class SylvesterEquation(RandomWalkMeta):


def _compute_kernel_list_series(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose)
self._check_edge_weight(g_list + [g1], self.verbose)
self._check_graphs(g_list + [g1])
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored.')

@@ -120,11 +120,11 @@ class SylvesterEquation(RandomWalkMeta):
# don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices.
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2))
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator]

if self._p is None: # p is uniform distribution as default.
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))

for i in iterator:
kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda)
@@ -139,9 +139,9 @@ class SylvesterEquation(RandomWalkMeta):


def _compute_kernel_list_imap_unordered(self, g1, g_list):
self._check_edge_weight(g_list + [g1], self._verbose)
self._check_edge_weight(g_list + [g1], self.verbose)
self._check_graphs(g_list + [g1])
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored.')

@@ -152,7 +152,7 @@ class SylvesterEquation(RandomWalkMeta):
# don't normalize adjacency matrices if q is a uniform vector. Note
# A_wave_list actually contains the transposes of the adjacency matrices.
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2))
A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel?

if self._p is None: # p is uniform distribution as default.
@@ -169,7 +169,7 @@ class SylvesterEquation(RandomWalkMeta):
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

else: # @todo
pass
@@ -184,9 +184,9 @@ class SylvesterEquation(RandomWalkMeta):


def _compute_single_kernel_series(self, g1, g2):
self._check_edge_weight([g1] + [g2], self._verbose)
self._check_edge_weight([g1] + [g2], self.verbose)
self._check_graphs([g1] + [g2])
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('All labels are ignored.')
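
With `from control import dlyap` now at module level instead of inside `__init__`, a missing `control` dependency fails when the module is imported rather than when the kernel is instantiated. For reference, a minimal sketch of what `dlyap` solves, assuming python-control's discrete Lyapunov convention A·X·Aᵀ − X + Q = 0:

import numpy as np
from control import dlyap

A = np.array([[0.0, 0.5],
              [0.5, 0.0]])   # toy (transposed) adjacency matrix
Q = np.eye(2)
X = dlyap(A, Q)              # solve A @ X @ A.T - X + Q = 0
print(np.allclose(A @ X @ A.T - X + Q, 0))  # True if the solver converged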



+ 333
- 90
gklearn/kernels/treelet.py View File

@@ -18,6 +18,8 @@ import numpy as np
import networkx as nx
from collections import Counter
from itertools import chain
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError
from gklearn.utils import SpecialLabel
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.utils.utils import find_all_paths, get_mlti_dim_node_attrs
@@ -26,14 +28,211 @@ from gklearn.kernels import GraphKernel

class Treelet(GraphKernel):

def __init__(self, **kwargs):
GraphKernel.__init__(self)
self._node_labels = kwargs.get('node_labels', [])
self._edge_labels = kwargs.get('edge_labels', [])
self._sub_kernel = kwargs.get('sub_kernel', None)
self._ds_infos = kwargs.get('ds_infos', {})
if self._sub_kernel is None:
raise Exception('Sub kernel not set.')
def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs):
"""Initialise a treelet kernel.
"""
super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose)
self.node_labels = kwargs.get('node_labels', [])
self.edge_labels = kwargs.get('edge_labels', [])
self.sub_kernel = kwargs.get('sub_kernel', None)
self.ds_infos = kwargs.get('ds_infos', {})
self.precompute_canonkeys = precompute_canonkeys
self.save_canonkeys = save_canonkeys


##########################################################################
# The following is the 1st paradigm to compute kernel matrix, which is
# compatible with `scikit-learn`.
# -------------------------------------------------------------------
# Special thanks to the "GraKeL" library for providing an excellent template!
##########################################################################


def clear_attributes(self):
super().clear_attributes()
if hasattr(self, '_canonkeys'):
delattr(self, '_canonkeys')
if hasattr(self, '_Y_canonkeys'):
delattr(self, '_Y_canonkeys')
if hasattr(self, '_dummy_labels_considered'):
delattr(self, '_dummy_labels_considered')


def validate_parameters(self):
"""Validate all parameters for the transformer.

Returns
-------
None.

"""
super().validate_parameters()
if self.sub_kernel is None:
raise ValueError('Sub-kernel not set.')


def _compute_kernel_matrix_series(self, Y):
"""Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) without parallelization.

Parameters
----------
Y : list of graphs, optional
The target graphs.

Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
The computed kernel matrix.

"""

# self._add_dummy_labels will modify the input in place.
self._add_dummy_labels() # For self._graphs
# Y = [g.copy() for g in Y] # @todo: ?
self._add_dummy_labels(Y)

# get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large datasets.

# Canonical keys for self._graphs.
try:
check_is_fitted(self, ['_canonkeys'])
canonkeys_list1 = self._canonkeys
except NotFittedError:
canonkeys_list1 = []
iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2))
for g in iterator:
canonkeys_list1.append(self._get_canonkeys(g))

if self.save_canonkeys:
self._canonkeys = canonkeys_list1

# Canonical keys for Y.
canonkeys_list2 = []
iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2))
for g in iterator:
canonkeys_list2.append(self._get_canonkeys(g))

if self.save_canonkeys:
self._Y_canonkeys = canonkeys_list2

# compute kernel matrix.
kernel_matrix = np.zeros((len(Y), len(canonkeys_list1)))

from itertools import product
itr = product(range(len(Y)), range(len(canonkeys_list1)))
len_itr = int(len(Y) * len(canonkeys_list1))
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self.verbose >= 2))
for i_y, i_x in iterator:
kernel = self._kernel_do(canonkeys_list2[i_y], canonkeys_list1[i_x])
kernel_matrix[i_y][i_x] = kernel

return kernel_matrix


def _compute_kernel_matrix_imap_unordered(self, Y):
"""Compute the kernel matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) using imap unordered parallelization.

Parameters
----------
Y : list of graphs, optional
The target graphs.

Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
The computed kernel matrix.

"""
raise Exception('Parallelization for kernel matrix is not implemented.')


def pairwise_kernel(self, x, y, are_keys=False):
"""Compute pairwise kernel between two graphs.

Parameters
----------
x, y : NetworkX Graph.
Graphs between which the kernel is computed.

are_keys : boolean, optional
If `True`, `x` and `y` are canonical keys; otherwise they are graphs.
The default is False.

Returns
-------
kernel: float
The computed kernel.

"""
if are_keys:
# x, y are canonical keys.
kernel = self._kernel_do(x, y)

else:
# x, y are graphs.
kernel = self._compute_single_kernel_series(x, y)

return kernel


def diagonals(self):
"""Compute the kernel matrix diagonals of the fit/transformed data.

Returns
-------
X_diag : numpy array
The diagonal of the kernel matrix between the fitted data.
This consists of each element calculated with itself.

Y_diag : numpy array
The diagonal of the kernel matrix between the transformed data.
This consists of each element calculated with itself.

"""
# Check if method "fit" had been called.
check_is_fitted(self, ['_graphs'])

# Check if the diagonals of X exist.
try:
check_is_fitted(self, ['_X_diag'])
except NotFittedError:
# Compute diagonals of X.
self._X_diag = np.empty(shape=(len(self._graphs),))
try:
check_is_fitted(self, ['_canonkeys'])
for i, x in enumerate(self._canonkeys):
self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=True) # @todo: parallel?
except NotFittedError:
for i, x in enumerate(self._graphs):
self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=False) # @todo: parallel?

try:
# If transform has happened, return both diagonals.
check_is_fitted(self, ['_Y'])
self._Y_diag = np.empty(shape=(len(self._Y),))
try:
check_is_fitted(self, ['_Y_canonkeys'])
for (i, y) in enumerate(self._Y_canonkeys):
self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=True) # @todo: parallel?
except NotFittedError:
for (i, y) in enumerate(self._Y):
self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=False) # @todo: parallel?

return self._X_diag, self._Y_diag

except NotFittedError:
# Else just return X_diag.
return self._X_diag


##########################################################################
# The following is the 2nd paradigm to compute kernel matrix. It is
# simplified and not compatible with `scikit-learn`.
##########################################################################


def _compute_gm_series(self):
@@ -43,10 +242,13 @@ class Treelet(GraphKernel):
# time, but this may cost a lot of memory for large dataset.
canonkeys = []
iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout,
verbose=(self._verbose >= 2))
verbose=(self.verbose >= 2))
for g in iterator:
canonkeys.append(self._get_canonkeys(g))

if self.save_canonkeys:
self._canonkeys = canonkeys

# compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

@@ -54,7 +256,7 @@ class Treelet(GraphKernel):
itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
length=len_itr, verbose=(self._verbose >= 2))
length=len_itr, verbose=(self.verbose >= 2))
for i, j in iterator:
kernel = self._kernel_do(canonkeys[i], canonkeys[j])
gram_matrix[i][j] = kernel
@@ -68,22 +270,25 @@ class Treelet(GraphKernel):

# get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset.
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
itr = zip(self._graphs, range(0, len(self._graphs)))
if len(self._graphs) < 100 * self._n_jobs:
chunksize = int(len(self._graphs) / self._n_jobs) + 1
if len(self._graphs) < 100 * self.n_jobs:
chunksize = int(len(self._graphs) / self.n_jobs) + 1
else:
chunksize = 100
canonkeys = [[] for _ in range(len(self._graphs))]
get_fun = self._wrapper_get_canonkeys
iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize),
desc='getting canonkeys', file=sys.stdout,
length=len(self._graphs), verbose=(self._verbose >= 2))
length=len(self._graphs), verbose=(self.verbose >= 2))
for i, ck in iterator:
canonkeys[i] = ck
pool.close()
pool.join()

if self.save_canonkeys:
self._canonkeys = canonkeys

# compute Gram matrix.
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

@@ -92,25 +297,25 @@ class Treelet(GraphKernel):
G_canonkeys = canonkeys_toshare
do_fun = self._wrapper_kernel_do
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(canonkeys,), n_jobs=self.n_jobs, verbose=self.verbose)

return gram_matrix


def _compute_kernel_list_series(self, g1, g_list):
self._add_dummy_labels(g_list + [g1])
# self._add_dummy_labels(g_list + [g1])

# get all canonical keys of all graphs before computing kernels to save
# time, but this may cost a lot of memory for large dataset.
canonkeys_1 = self._get_canonkeys(g1)
canonkeys_list = []
iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self._verbose >= 2))
iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self.verbose >= 2))
for g in iterator:
canonkeys_list.append(self._get_canonkeys(g))

# compute kernel list.
kernel_list = [None] * len(g_list)
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))
for i in iterator:
kernel = self._kernel_do(canonkeys_1, canonkeys_list[i])
kernel_list[i] = kernel
@@ -125,16 +330,16 @@ class Treelet(GraphKernel):
# time, but this may cost a lot of memory for large dataset.
canonkeys_1 = self._get_canonkeys(g1)
canonkeys_list = [[] for _ in range(len(g_list))]
pool = Pool(self._n_jobs)
pool = Pool(self.n_jobs)
itr = zip(g_list, range(0, len(g_list)))
if len(g_list) < 100 * self._n_jobs:
chunksize = int(len(g_list) / self._n_jobs) + 1
if len(g_list) < 100 * self.n_jobs:
chunksize = int(len(g_list) / self.n_jobs) + 1
else:
chunksize = 100
get_fun = self._wrapper_get_canonkeys
iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize),
desc='getting canonkeys', file=sys.stdout,
length=len(g_list), verbose=(self._verbose >= 2))
length=len(g_list), verbose=(self.verbose >= 2))
for i, ck in iterator:
canonkeys_list[i] = ck
pool.close()
@@ -154,7 +359,7 @@ class Treelet(GraphKernel):
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

return kernel_list

@@ -164,13 +369,13 @@ class Treelet(GraphKernel):


def _compute_single_kernel_series(self, g1, g2):
self._add_dummy_labels([g1] + [g2])
# self._add_dummy_labels([g1] + [g2])
canonkeys_1 = self._get_canonkeys(g1)
canonkeys_2 = self._get_canonkeys(g2)
kernel = self._kernel_do(canonkeys_1, canonkeys_2)
return kernel

# @profile
def _kernel_do(self, canonkey1, canonkey2):
"""Compute treelet graph kernel between 2 graphs.

@@ -187,7 +392,24 @@ class Treelet(GraphKernel):
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys])
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys])
kernel = self._sub_kernel(vector1, vector2)

# vector1, vector2 = [], []
# keys1, keys2 = canonkey1, canonkey2
# keys_searched = {}
# for k, v in canonkey1.items():
# if k in keys2:
# vector1.append(v)
# vector2.append(canonkey2[k])
# keys_searched[k] = v

# for k, v in canonkey2.items():
# if k in keys1 and k not in keys_searched:
# vector1.append(canonkey1[k])
# vector2.append(v)

# vector1, vector2 = np.array(vector1), np.array(vector2)

kernel = self.sub_kernel(vector1, vector2)
return kernel


@@ -223,7 +445,7 @@ class Treelet(GraphKernel):
patterns['0'] = list(G.nodes())
canonkey['0'] = nx.number_of_nodes(G)
for i in range(1, 6): # for i in range(1, 6):
patterns[str(i)] = find_all_paths(G, i, self._ds_infos['directed'])
patterns[str(i)] = find_all_paths(G, i, self.ds_infos['directed'])
canonkey[str(i)] = len(patterns[str(i)])

# n-star patterns
@@ -317,11 +539,11 @@ class Treelet(GraphKernel):
### pattern obtained in the structural analysis section above, which is a
### string corresponding to a unique treelet. A dictionary is built to keep
### track of the amount of every treelet.
if len(self._node_labels) > 0 or len(self._edge_labels) > 0:
if len(self.node_labels) > 0 or len(self.edge_labels) > 0:
canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet.

# linear patterns
canonkey_t = Counter(get_mlti_dim_node_attrs(G, self._node_labels))
canonkey_t = Counter(get_mlti_dim_node_attrs(G, self.node_labels))
for key in canonkey_t:
canonkey_l[('0', key)] = canonkey_t[key]

@@ -330,9 +552,9 @@ class Treelet(GraphKernel):
for pattern in patterns[str(i)]:
canonlist = []
for idx, node in enumerate(pattern[:-1]):
canonlist.append(tuple(G.nodes[node][nl] for nl in self._node_labels))
canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self._edge_labels))
canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self._node_labels))
canonlist.append(tuple(G.nodes[node][nl] for nl in self.node_labels))
canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self.edge_labels))
canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self.node_labels))
canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1]
treelet.append(tuple([str(i)] + canonkey_t))
canonkey_l.update(Counter(treelet))
@@ -343,13 +565,13 @@ class Treelet(GraphKernel):
for pattern in patterns[str(i) + 'star']:
canonlist = []
for leaf in pattern[1:]:
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
canonlist.append(tuple((nlabels, elabels)))
canonlist.sort()
canonlist = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['d' if i == 5 else str(i * 2)] +
[tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
[tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)]
+ canonlist)
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))
@@ -359,17 +581,17 @@ class Treelet(GraphKernel):
for pattern in patterns['7']:
canonlist = []
for leaf in pattern[1:3]:
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
canonlist.append(tuple((nlabels, elabels)))
canonlist.sort()
canonlist = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['7']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)])
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)]
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)]
+ [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)]
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)])
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

@@ -378,38 +600,38 @@ class Treelet(GraphKernel):
for pattern in patterns['11']:
canonlist = []
for leaf in pattern[1:4]:
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
canonlist.append(tuple((nlabels, elabels)))
canonlist.sort()
canonlist = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['b']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[4]][pattern[0]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)])
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist
+ [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)]
+ [tuple(G[pattern[4]][pattern[0]][el] for el in self.edge_labels)]
+ [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels)]
+ [tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)])
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

# pattern 10
treelet = []
for pattern in patterns['10']:
canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels),
tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)]
canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels),
tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)]
canonlist = []
for leaf in pattern[1:3]:
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
canonlist.append(tuple((nlabels, elabels)))
canonlist.sort()
canonkey0 = list(chain.from_iterable(canonlist))
canonkey_t = tuple(['a']
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
+ [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)]
+ [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)]
+ [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)]
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)]
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)]
+ canonkey4 + canonkey0)
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))
@@ -419,15 +641,15 @@ class Treelet(GraphKernel):
for pattern in patterns['12']:
canonlist0 = []
for leaf in pattern[1:3]:
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
canonlist0.append(tuple((nlabels, elabels)))
canonlist0.sort()
canonlist0 = list(chain.from_iterable(canonlist0))
canonlist3 = []
for leaf in pattern[4:6]:
nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
elabels = tuple(G[leaf][pattern[3]][el] for el in self._edge_labels)
nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
elabels = tuple(G[leaf][pattern[3]][el] for el in self.edge_labels)
canonlist3.append(tuple((nlabels, elabels)))
canonlist3.sort()
canonlist3 = list(chain.from_iterable(canonlist3))
@@ -435,14 +657,14 @@ class Treelet(GraphKernel):
# 2 possible key can be generated from 2 nodes with extended label 3,
# select the one with lower lexicographic order.
canonkey_t1 = tuple(['c']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist0
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)]
+ [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)]
+ canonlist3)
canonkey_t2 = tuple(['c']
+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)]
+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] + canonlist3
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)]
+ [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)]
+ canonlist0)
treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2)
canonkey_l.update(Counter(treelet))
@@ -450,24 +672,24 @@ class Treelet(GraphKernel):
# pattern 9
treelet = []
for pattern in patterns['9']:
canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels),
tuple(G[pattern[4]][pattern[2]][el] for el in self._edge_labels)]
canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels),
tuple(G[pattern[5]][pattern[3]][el] for el in self._edge_labels)]
prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self._node_labels),
tuple(G[pattern[2]][pattern[0]][el] for el in self._edge_labels)]
prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels),
tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels),
tuple(G[pattern[4]][pattern[2]][el] for el in self.edge_labels)]
canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels),
tuple(G[pattern[5]][pattern[3]][el] for el in self.edge_labels)]
prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self.node_labels),
tuple(G[pattern[2]][pattern[0]][el] for el in self.edge_labels)]
prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels),
tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)]
if prekey2 + canonkey2 < prekey3 + canonkey3:
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \
+ prekey2 + prekey3 + canonkey2 + canonkey3
else:
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \
canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \
+ [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \
+ prekey3 + prekey2 + canonkey3 + canonkey2
treelet.append(tuple(['9']
+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)]
+ canonkey_t))
canonkey_l.update(Counter(treelet))

@@ -482,12 +704,33 @@ class Treelet(GraphKernel):
return i, self._get_canonkeys(g)


def _add_dummy_labels(self, Gn):
if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self._node_labels = [SpecialLabel.DUMMY]
if len(self._edge_labels) == 0 or (len(self._edge_labels) == 1 and self._edge_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self._edge_labels = [SpecialLabel.DUMMY]
def _add_dummy_labels(self, Gn=None):
def _add_dummy(Gn):
if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self.node_labels = [SpecialLabel.DUMMY]
if len(self.edge_labels) == 0 or (len(self.edge_labels) == 1 and self.edge_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self.edge_labels = [SpecialLabel.DUMMY]

if Gn is None or Gn is self._graphs:
# Add dummy labels for the copy of self._graphs.
try:
check_is_fitted(self, ['_dummy_labels_considered'])
if not self._dummy_labels_considered:
Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs]
_add_dummy(Gn)
self._graphs = Gn
self._dummy_labels_considered = True
except NotFittedError:
Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs]
_add_dummy(Gn)
self._graphs = Gn
self._dummy_labels_considered = True

else:
# Add dummy labels for the input.
_add_dummy(Gn)
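
A hypothetical usage sketch of the scikit-learn-style paradigm added above. It assumes the base `GraphKernel` exposes `fit`/`transform` as the docstrings suggest; the dot-product `sub_kernel`, the toy graphs, and the shown output shape are illustrative stand-ins only:

import numpy as np
import networkx as nx
from gklearn.kernels import Treelet

G1, G2, G3 = nx.path_graph(4), nx.star_graph(3), nx.cycle_graph(5)

tk = Treelet(sub_kernel=lambda x, y: float(np.dot(x, y)),  # stand-in sub-kernel
             node_labels=[], edge_labels=[],
             ds_infos={'directed': False},
             normalize=False, verbose=0)
tk.fit([G1, G2])           # fitted graphs (X)
K = tk.transform([G3])     # kernel matrix, shape = [n_targets, n_inputs]
print(K.shape)             # expected: (1, 2)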


+ 448
- 78
gklearn/kernels/weisfeiler_lehman.py View File

@@ -14,30 +14,48 @@ Created on Tue Apr 14 15:16:34 2020

import numpy as np
import networkx as nx
import sys
from collections import Counter
# from functools import partial
from itertools import combinations_with_replacement
from gklearn.utils import SpecialLabel
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.kernels import GraphKernel
from gklearn.utils.iters import get_iters


class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


def __init__(self, **kwargs):
GraphKernel.__init__(self)
self._node_labels = kwargs.get('node_labels', [])
self._edge_labels = kwargs.get('edge_labels', [])
self._height = int(kwargs.get('height', 0))
self.node_labels = kwargs.get('node_labels', [])
self.edge_labels = kwargs.get('edge_labels', [])
self.height = int(kwargs.get('height', 0))
self._base_kernel = kwargs.get('base_kernel', 'subtree')
self._ds_infos = kwargs.get('ds_infos', {})


##########################################################################
# The following is the 1st paradigm to compute kernel matrix, which is
# compatible with `scikit-learn`.
# -------------------------------------------------------------------
# Special thanks to the "GraKeL" library for providing an excellent template!
##########################################################################


##########################################################################
# The following is the 2nd paradigm to compute kernel matrix. It is
# simplified and not compatible with `scikit-learn`.
##########################################################################


def _compute_gm_series(self):
# if self._verbose >= 2:
# if self.verbose >= 2:
# import warnings
# warnings.warn('A part of the computation is parallelized.')

self._add_dummy_node_labels(self._graphs)
# self._add_dummy_node_labels(self._graphs)

# for WL subtree kernel
if self._base_kernel == 'subtree':
@@ -59,7 +77,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


def _compute_gm_imap_unordered(self):
self._add_dummy_node_labels(self._graphs)
# self._add_dummy_node_labels(self._graphs)

if self._base_kernel == 'subtree':
gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
@@ -74,17 +92,17 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
G_gn = gn_toshare
do_fun = self._wrapper_pairwise
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose)
return gram_matrix
else:
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('This base kernel is not parallelized. The serial computation is used instead.')
return self._compute_gm_series()


def _compute_kernel_list_series(self, g1, g_list): # @todo: this should be better.
# if self._verbose >= 2:
# if self.verbose >= 2:
# import warnings
# warnings.warn('A part of the computation is parallelized.')

@@ -126,10 +144,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
len_itr = len(g_list)
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)
return kernel_list
else:
if self._verbose >= 2:
if self.verbose >= 2:
import warnings
warnings.warn('This base kernel is not parallelized. The serial computation is used instead.')
return self._compute_kernel_list_series(g1, g_list)
@@ -160,6 +178,30 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.

return gram_matrix[0][1]

##########################################################################
# The following are the methods used by both paradigms.
##########################################################################

def validate_parameters(self):
"""Validate all parameters for the transformer.

Returns
-------
None.

"""
super().validate_parameters()
if len(self.node_labels) == 0:
if len(self.edge_labels) == 0:
self._subtree_kernel_do = self._subtree_kernel_do_unlabeled
else:
self._subtree_kernel_do = self._subtree_kernel_do_el
else:
if len(self.edge_labels) == 0:
self._subtree_kernel_do = self._subtree_kernel_do_nl
else:
self._subtree_kernel_do = self._subtree_kernel_do_labeled


def pairwise_kernel(self, g1, g2):
Gn = [g1.copy(), g2.copy()] # @todo: make sure it is a full deep copy. and faster!
@@ -172,9 +214,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
for G in Gn:
# set all labels into a tuple.
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way.
G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels)
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
# get the set of original labels
labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values())
labels_ori = list(nx.get_node_attributes(G, 'lt').values())
# number of occurence of each label in G
all_num_of_each_label.append(dict(Counter(labels_ori)))

@@ -182,22 +224,22 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
kernel = self._compute_kernel_itr(kernel, all_num_of_each_label)

# iterate each height
for h in range(1, self._height + 1):
for h in range(1, self.height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
# all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
all_num_of_each_label = [] # number of occurence of each label in G

# @todo: parallel this part.
for idx, G in enumerate(Gn):
for G in Gn:

all_multisets = []
for node, attrs in G.nodes(data=True):
# Multiset-label determination.
multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]]
multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]]
# sorting each multiset
multiset.sort()
multiset = [attrs['label_tuple']] + multiset # add the prefix
multiset = [attrs['lt']] + multiset # add the prefix
all_multisets.append(tuple(multiset))

# label compression
@@ -208,19 +250,19 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
# else assign the number of labels occured + 1 as the compressed label.
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed.update({value: all_set_compressed[value]})
set_compressed[value] = all_set_compressed[value]
else:
set_compressed.update({value: str(num_of_labels_occured + 1)})
set_compressed[value] = str(num_of_labels_occured + 1)
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)

# relabel nodes
for idx, node in enumerate(G.nodes()):
G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]]
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

# get the set of compressed labels
labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values())
labels_comp = list(nx.get_node_attributes(G, 'lt').values())
# all_labels_ori.update(labels_comp)
all_num_of_each_label.append(dict(Counter(labels_comp)))

@@ -249,8 +291,8 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
return kernel


def _subtree_kernel_do(self, Gn):
"""Compute Weisfeiler-Lehman kernels between graphs.
def _subtree_kernel_do_nl(self, Gn):
"""Compute Weisfeiler-Lehman kernels between graphs with node labels.

Parameters
----------
@@ -268,12 +310,16 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration

# for each graph
for G in Gn:
# set all labels into a tuple.
if self.verbose >= 2:
iterator = get_iters(Gn, desc='Setting all labels into a tuple')
else:
iterator = Gn
for G in iterator:
# set all labels into a tuple. # @todo: remove this original labels or not?
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way.
G.nodes[nd]['label_tuple'] = tuple(attrs[name] for name in self._node_labels)
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
# get the set of original labels
labels_ori = list(nx.get_node_attributes(G, 'label_tuple').values())
labels_ori = list(nx.get_node_attributes(G, 'lt').values())
# number of occurence of each label in G
all_num_of_each_label.append(dict(Counter(labels_ori)))

@@ -281,74 +327,398 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)

# iterate each height
for h in range(1, self._height + 1):
for h in range(1, self.height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
# all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
all_num_of_each_label = [] # number of occurence of each label in G

# @todo: parallel this part.
for idx, G in enumerate(Gn):
# if self.verbose >= 2:
# iterator = get_iters(enumerate(Gn), desc='Going through iteration ' + str(h), length=len(Gn))
# else:
# iterator = enumerate(Gn)
for G in Gn:
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

all_multisets = []
for node, attrs in G.nodes(data=True):
# Multiset-label determination.
multiset = [G.nodes[neighbors]['label_tuple'] for neighbors in G[node]]
# sorting each multiset
multiset.sort()
multiset = [attrs['label_tuple']] + multiset # add the prefix
all_multisets.append(tuple(multiset))
# Compute subtree kernel with h iterations and add it to the final kernel
self._compute_gram_itr(gram_matrix, all_num_of_each_label)

# label compression
set_unique = list(set(all_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# if a label occurred before, assign its former compressed label,
# else assign the number of labels occurred + 1 as the compressed label.
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed.update({value: all_set_compressed[value]})
else:
set_compressed.update({value: str(num_of_labels_occured + 1)})
num_of_labels_occured += 1
return gram_matrix

all_set_compressed.update(set_compressed)

# relabel nodes
for idx, node in enumerate(G.nodes()):
G.nodes[node]['label_tuple'] = set_compressed[all_multisets[idx]]
def _subtree_kernel_do_el(self, Gn):
"""Compute Weisfeiler-Lehman kernels between graphs with edge labels.

# get the set of compressed labels
labels_comp = list(nx.get_node_attributes(G, 'label_tuple').values())
# all_labels_ori.update(labels_comp)
all_num_of_each_label.append(dict(Counter(labels_comp)))
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are computed.

# Compute subtree kernel with h iterations and add it to the final kernel
Return
------
gram_matrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))

# initial for height = 0
all_num_of_each_label = [] # number of occurrence of each label in each graph in this iteration

# Compute subtree kernel with the 0th iteration and add it to the final kernel.
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2)
for i, j in iterator:
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
gram_matrix[j][i] = gram_matrix[i][j]

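# --- Note (not part of the library code): at h = 0 every node carries the
# same implicit label, so the label-count vectors are just the node counts
# and the kernel term reduces to |V(G_i)| * |V(G_j)|, which is exactly what
# the loop above adds for each pair (i, j).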

# if h >= 1.
if self.height > 0:
# Set all edge labels into a tuple. # @todo: remove the original labels or not?
if self.verbose >= 2:
iterator = get_iters(Gn, desc='Setting all labels into a tuple')
else:
iterator = Gn
for G in iterator:
for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way.
G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels)

# When h == 1, compute the kernel.
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
all_num_of_each_label = [] # number of occurrence of each label in G

# @todo: parallel this part.
for G in Gn:
num_of_labels_occured = self._subtree_1graph_el(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)


# Iterate along heights (>= 2).
for h in range(2, self.height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
all_num_of_each_label = [] # number of occurrence of each label in G

# @todo: parallel this part.
for G in Gn:
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)

return gram_matrix


def _subtree_kernel_do_labeled(self, Gn):
"""Compute Weisfeiler-Lehman kernels between graphs with both node and
edge labels.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are computed.

Return
------
gram_matrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))

# initial for height = 0
all_num_of_each_label = [] # number of occurrence of each label in each graph in this iteration

# Set all node labels into a tuple and get # of occurrence of each label.
if self.verbose >= 2:
iterator = get_iters(Gn, desc='Setting all node labels into a tuple')
else:
iterator = Gn
for G in iterator:
# Set all node labels into a tuple. # @todo: remove the original labels or not?
for nd, attrs in G.nodes(data=True): # @todo: there may be a better way.
G.nodes[nd]['lt'] = tuple(attrs[name] for name in self.node_labels)
# Get the set of original labels.
labels_ori = list(nx.get_node_attributes(G, 'lt').values())
# number of occurrence of each label in G
all_num_of_each_label.append(dict(Counter(labels_ori)))

# Compute subtree kernel with the 0th iteration and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)


# if h >= 1.
if self.height > 0:
# Set all edge labels into a tuple. # @todo: remove the original labels or not?
if self.verbose >= 2:
iterator = get_iters(Gn, desc='Setting all edge labels into a tuple')
else:
iterator = Gn
for G in iterator:
for n1, n2, attrs in G.edges(data=True): # @todo: there may be a better way.
G.edges[(n1, n2)]['lt'] = tuple(attrs[name] for name in self.edge_labels)

# When h == 1, compute the kernel.
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
all_num_of_each_label = [] # number of occurrence of each label in G

# @todo: parallel this part.
for G in Gn:
num_of_labels_occured = self._subtree_1graph_labeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)


# Iterate along heights.
for h in range(2, self.height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
all_num_of_each_label = [] # number of occurrence of each label in G

# @todo: parallel this part.
for G in Gn:
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)

return gram_matrix


def _subtree_kernel_do_unlabeled(self, Gn):
"""Compute Weisfeiler-Lehman kernels between graphs without labels.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are computed.

Return
------
gram_matrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
gram_matrix = np.zeros((len(Gn), len(Gn)))

# initial for height = 0
all_num_of_each_label = [] # number of occurrence of each label in each graph in this iteration

# Compute subtree kernel with the 0th iteration and add it to the final kernel.
iterator = combinations_with_replacement(range(0, len(gram_matrix)), 2)
for i, j in iterator:
gram_matrix[i][j] += nx.number_of_nodes(Gn[i]) * nx.number_of_nodes(Gn[j])
gram_matrix[j][i] = gram_matrix[i][j]


# if h >= 1.
if self.height > 0:
# When h == 1, compute the kernel.
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
all_num_of_each_label = [] # number of occurrence of each label in G

# @todo: parallel this part.
for G in Gn:
num_of_labels_occured = self._subtree_1graph_unlabeled(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)


# Iterate along heights (>= 2).
for h in range(2, self.height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
all_num_of_each_label = [] # number of occurrence of each label in G

# @todo: parallel this part.
for G in Gn:
num_of_labels_occured = self._subtree_1graph_nl(G, all_set_compressed, all_num_of_each_label, num_of_labels_occured)

# Compute subtree kernel with h iterations and add it to the final kernel.
self._compute_gram_itr(gram_matrix, all_num_of_each_label)

return gram_matrix


def _subtree_1graph_nl(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
all_multisets = []
for node, attrs in G.nodes(data=True):
# Multiset-label determination.
multiset = [G.nodes[neighbors]['lt'] for neighbors in G[node]]
# sorting each multiset
multiset.sort()
multiset = [attrs['lt']] + multiset # add the prefix
all_multisets.append(tuple(multiset))

# label compression
set_unique = list(set(all_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# If a label occurred before, assign its former compressed label;
# otherwise assign the number of labels occurred + 1 as the
# compressed label.
for value in set_unique:
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop?
set_compressed[value] = all_set_compressed[value]
else:
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str? and what if num_of_labels_occured is extremely big.
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)

# Relabel nodes.
for idx, node in enumerate(G.nodes()):
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

# Get the set of compressed labels.
labels_comp = list(nx.get_node_attributes(G, 'lt').values())
all_num_of_each_label.append(dict(Counter(labels_comp)))

return num_of_labels_occured

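# --- Illustration (not part of the library code): a hand-traced run of the
# relabeling step above on a 3-node path graph a-b-a with single-valued node
# labels, so 'lt' is ('a',), ('b',), ('a',).
#   multisets:   node 0 -> (('a',), ('b',))
#                node 1 -> (('b',), ('a',), ('a',))
#                node 2 -> (('a',), ('b',))
#   compression: the two distinct multisets receive the compressed labels
#                '1' and '2' (which one gets which depends on set order),
#   relabeling:  e.g. ['1', '2', '1'], giving Counter({'1': 2, '2': 1}),
#                which is what gets appended to all_num_of_each_label.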

def _subtree_1graph_el(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
all_multisets = []
# for node, attrs in G.nodes(data=True):
for node in G.nodes():
# Multiset-label determination.
multiset = [G.edges[(node, neighbors)]['lt'] for neighbors in G[node]] # @todo: check reference for this.
# sorting each multiset
multiset.sort()
# multiset = [attrs['lt']] + multiset # add the prefix
all_multisets.append(tuple(multiset))

# label compression
set_unique = list(set(all_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# If a label occurred before, assign its former compressed label;
# otherwise assign the number of labels occurred + 1 as the
# compressed label.
for value in set_unique:
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop?
set_compressed[value] = all_set_compressed[value]
else:
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str?
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)

# Relabel nodes.
for idx, node in enumerate(G.nodes()):
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

# Get the set of compressed labels.
labels_comp = list(nx.get_node_attributes(G, 'lt').values()) # @todo: maybe can be faster.
all_num_of_each_label.append(dict(Counter(labels_comp)))

return num_of_labels_occured


def _subtree_1graph_labeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
all_multisets = []
for node, attrs in G.nodes(data=True):
# Multiset-label determination.
multiset = [tuple((G.edges[(node, neighbors)]['lt'], G.nodes[neighbors]['lt'])) for neighbors in G[node]] # @todo: check reference for this.
# sorting each multiset
multiset.sort()
multiset = [attrs['lt']] + multiset # add the prefix
all_multisets.append(tuple(multiset))

# label compression
set_unique = list(set(all_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# If a label occurred before, assign its former compressed label;
# otherwise assign the number of labels occurred + 1 as the
# compressed label.
for value in set_unique:
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop?
set_compressed[value] = all_set_compressed[value]
else:
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str?
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)

# Relabel nodes.
for idx, node in enumerate(G.nodes()):
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

# Get the set of compressed labels.
labels_comp = list(nx.get_node_attributes(G, 'lt').values())
all_num_of_each_label.append(dict(Counter(labels_comp)))

return num_of_labels_occured

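# --- Illustration (not part of the library code): in the labeled variant
# above, each neighbour contributes the pair (edge label, neighbour node
# label). For an edge u-v with edge 'lt' ('s',) and node 'lt' ('a',) on v,
# node u's multiset gets the entry (('s',), ('a',)); the prefix is still
# u's own node label, so subtrees that differ only in their edge labels
# compress to different new labels.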

def _subtree_1graph_unlabeled(self, G, all_set_compressed, all_num_of_each_label, num_of_labels_occured):
# all_multisets = []
# for node, attrs in G.nodes(data=True): # @todo: it can be better.
# # Multiset-label determination.
# multiset = [0 for neighbors in G[node]]
# # sorting each multiset
# multiset.sort()
# multiset = [0] + multiset # add the prefix
# all_multisets.append(tuple(multiset))
all_multisets = [len(G[node]) for node in G.nodes()]

# label compression
set_unique = list(set(all_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# If a label occurred before, assign its former compressed label;
# otherwise assign the number of labels occurred + 1 as the
# compressed label.
for value in set_unique:
if value in all_set_compressed.keys(): # @todo: put keys() function out of for loop?
set_compressed[value] = all_set_compressed[value]
else:
set_compressed[value] = str(num_of_labels_occured + 1) # @todo: remove str?
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)

# Relabel nodes.
for idx, node in enumerate(G.nodes()):
G.nodes[node]['lt'] = set_compressed[all_multisets[idx]]

# Get the set of compressed labels.
labels_comp = list(nx.get_node_attributes(G, 'lt').values())
all_num_of_each_label.append(dict(Counter(labels_comp)))

return num_of_labels_occured

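# --- Note (not part of the library code): in the unlabeled case a
# neighbourhood of identical labels is fully determined by its size, so the
# first relabeling simply groups nodes by degree; later iterations then
# refine these degree classes exactly as in the node-labeled case.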

def _compute_gram_itr(self, gram_matrix, all_num_of_each_label):
"""Compute Gram matrix using the base kernel.
"""
# if self._parallel == 'imap_unordered':
# if self.parallel == 'imap_unordered':
# # compute kernels.
# def init_worker(alllabels_toshare):
# global G_alllabels
# G_alllabels = alllabels_toshare
# do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
# parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
# glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose)
# elif self._parallel is None:
for i in range(len(gram_matrix)):
for j in range(i, len(gram_matrix)):
gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i],
all_num_of_each_label[j], gram_matrix[i][j])
gram_matrix[j][i] = gram_matrix[i][j]


def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2, kernel):
# glbv=(all_num_of_each_label,), n_jobs=self.n_jobs, verbose=self.verbose)
# elif self.parallel is None:
itr = combinations_with_replacement(range(0, len(gram_matrix)), 2)
len_itr = int(len(gram_matrix) * (len(gram_matrix) + 1) / 2)
iterator = get_iters(itr, desc='Computing Gram matrix for this iteration', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2))
for i, j in iterator:
# for i in iterator:
# for j in range(i, len(gram_matrix)):
gram_matrix[i][j] += self._compute_subtree_kernel(all_num_of_each_label[i],
all_num_of_each_label[j])
gram_matrix[j][i] = gram_matrix[i][j]


def _compute_subtree_kernel(self, num_of_each_label1, num_of_each_label2):
"""Compute the subtree kernel.
"""
labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
@@ -358,7 +728,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
vector2 = np.array([(num_of_each_label2[label]
if (label in num_of_each_label2.keys()) else 0)
for label in labels])
kernel += np.dot(vector1, vector2)
kernel = np.dot(vector1, vector2)
return kernel

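# --- Illustration (not part of the library code): with label counts
# num_of_each_label1 = {'1': 2, '2': 1} and num_of_each_label2 = {'1': 1, '2': 3},
# the count vectors over the labels {'1', '2'} are [2, 1] and [1, 3], so the
# kernel value for this iteration is 2*1 + 1*3 = 5; _compute_gram_itr adds
# one such term to gram_matrix[i][j] per WL iteration.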

@@ -426,9 +796,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
# if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed.update({ value : all_set_compressed[value] })
set_compressed[value] = all_set_compressed[value]
else:
set_compressed.update({ value : str(num_of_labels_occured + 1) })
set_compressed[value] = str(num_of_labels_occured + 1)
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)
@@ -504,9 +874,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
# if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed.update({ value : all_set_compressed[value] })
set_compressed[value] = all_set_compressed[value]
else:
set_compressed.update({ value : str(num_of_labels_occured + 1) })
set_compressed[value] = str(num_of_labels_occured + 1)
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)
@@ -577,9 +947,9 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.
# if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed.update({ value : all_set_compressed[value] })
set_compressed[value] = all_set_compressed[value]
else:
set_compressed.update({ value : str(num_of_labels_occured + 1) })
set_compressed[value] = str(num_of_labels_occured + 1)
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)
@@ -595,10 +965,10 @@ class WeisfeilerLehman(GraphKernel): # @todo: sp, edge user kernel.


def _add_dummy_node_labels(self, Gn):
if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY):
for i in range(len(Gn)):
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
self._node_labels = [SpecialLabel.DUMMY]
self.node_labels = [SpecialLabel.DUMMY]


class WLSubtree(WeisfeilerLehman):


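For reference, a minimal, self-contained sketch of what the subtree variants above compute for a pair of graphs, written directly with NetworkX and collections.Counter rather than through the class (so no gklearn API beyond what is shown in this diff is assumed); the graphs, the 'atom' label name and the helper name wl_subtree_kernel_pair are illustrative only:

import networkx as nx
import numpy as np
from collections import Counter

def wl_subtree_kernel_pair(G1, G2, height=2, label='atom'):
    """Sum over h = 0..height of the dot products of label-count vectors."""
    graphs = [G1.copy(), G2.copy()]
    for G in graphs:
        for n, attrs in G.nodes(data=True):
            G.nodes[n]['lt'] = (attrs[label],)
    kernel = 0
    for h in range(height + 1):
        counts = [Counter(lt for _, lt in G.nodes(data='lt')) for G in graphs]
        labels = set(counts[0]) | set(counts[1])
        v1 = np.array([counts[0].get(l, 0) for l in labels])
        v2 = np.array([counts[1].get(l, 0) for l in labels])
        kernel += np.dot(v1, v2)
        if h == height:
            break
        # Relabel: each node's new label is (own label, sorted neighbour labels),
        # compressed through one dictionary shared by both graphs.
        compressed = {}
        for G in graphs:
            new_lt = {}
            for n in G.nodes():
                key = (G.nodes[n]['lt'], tuple(sorted(G.nodes[m]['lt'] for m in G[n])))
                new_lt[n] = compressed.setdefault(key, str(len(compressed) + 1))
            nx.set_node_attributes(G, new_lt, 'lt')
    return kernel

G1 = nx.path_graph(3); nx.set_node_attributes(G1, {0: 'C', 1: 'O', 2: 'C'}, 'atom')
G2 = nx.path_graph(2); nx.set_node_attributes(G2, {0: 'C', 1: 'O'}, 'atom')
print(wl_subtree_kernel_pair(G1, G2))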
+ 14
- 0
gklearn/model_learning/__init__.py View File

@@ -0,0 +1,14 @@
# -*-coding:utf-8 -*-
"""
model learning.
"""

# info
__version__ = "0.2"
__author__ = "Linlin Jia"
__date__ = "November 2020"


from gklearn.model_learning.nested_cv import NestedCV
from gklearn.model_learning.workflow import Workflow
from gklearn.model_learning.parameters import dichotomous_permutation

+ 714
- 0
gklearn/model_learning/nested_cv.py View File

@@ -0,0 +1,714 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 18:59:28 2020

@author: ljia
"""
import os
import datetime
import time
import sys
from tqdm import tqdm
from multiprocessing import Pool, Array
from functools import partial
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, train_test_split, ParameterGrid
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error


class NestedCV(object):
"""Perform model selection, fitting and testing for precomputed kernels
using nested CV. Print out neccessary data during the process then finally
the results.

Parameters
----------
datafile : string
Path of dataset file.
estimator : function
kernel function used to estimate. This function needs to return a gram matrix.
param_grid_precomputed : dictionary
Dictionary with names (string) of parameters used to calculate gram
matrices as keys and lists of parameter settings to try as values. This
enables searching over any sequence of parameter settings. Params with
length 1 will be omitted.
param_grid : dictionary
Dictionary with names (string) of parameters used as penalties as keys
and lists of parameter settings to try as values. This enables
searching over any sequence of parameter settings. Params with length 1
will be omitted.
model_type : string
Type of the problem, can be 'regression' or 'classification'.
NUM_TRIALS : integer
Number of random trials of the outer CV loop. The default is 30.
datafile_y : string
Path of file storing y data. This parameter is optional depending on
the given dataset file.
extra_params : dict
Extra parameters for loading dataset. See function gklearn.utils.
graphfiles.loadDataset for detail.
ds_name : string
Name of the dataset.
n_jobs : int
Number of jobs for parallelization.
read_gm_from_file : boolean
Whether gram matrices are loaded from a file.

Examples
--------
>>> import numpy as np
>>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
>>> from gklearn.kernels.untilHPathKernel import untilhpathkernel
>>>
>>> datafile = '../datasets/MUTAG/MUTAG_A.txt'
>>> estimator = untilhpathkernel
>>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func':
['MinMax', 'tanimoto'], 'compute_method': ['trie']}
>>> # 'C' for classification problems and 'alpha' for regression problems.
>>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha':
np.logspace(-10, 10, num=41, base=10)}]
>>>
>>> model_selection_for_precomputed_kernel(datafile, estimator,
param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG')
"""
def __init__(self, dataset, estimator, param_grid_precomputed=None, param_grid=None, model_type=None, num_trials=30, output_dir=None, n_jobs=1, save_gms=True, save_gm_figs=False, logging=True, verbose=True, **kwargs):
tqdm.monitor_interval = 0
self._ds = dataset
self._estimator = estimator
self._num_trials = num_trials
self._n_jobs = n_jobs
self._save_gms = save_gms
self._save_gm_figs = save_gm_figs
self._logging = logging
self._verbose = verbose
self._kwargs = kwargs

# Set dataset name.
if self._ds._ds_name is None:
self._ds_name = 'ds-unknown'
else:
self._ds_name = self._ds._ds_name

# The output directory.
if output_dir is None:
self._output_dir = os.path.join('outputs/', estimator.__name__)
else:
self._output_dir = output_dir
os.makedirs(self._output_dir, exist_ok=True)

# Setup the model type.
if model_type is None:
self._model_type = dataset._task_type
else:
self._model_type = model_type.lower()
if self._model_type != 'regression' and self._model_type != 'classification':
raise Exception('The model type is incorrect! Please choose from regression or classification.')

# @todo: Set param_grid_precomputed and param_grid.
self._param_grid_precomputed = param_grid_precomputed
self._param_grid = param_grid

if self._verbose:
print()
print('--- This is a %s problem ---' % self._model_type)
# A string to save all the results.
if self._logging:
self._str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
self._str_fw += '# This file contains results of ' + self._estimator.__name__ + ' on dataset ' + self._ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
self._str_fw += 'This is a %s problem.\n' % self._model_type

self.run()


def run(self):
self.fit()
self.compute_gram_matrices()
if len(self._gram_matrices) == 0:
if self._verbose:
print('All gram matrices are ignored, no results obtained.')
if self._logging:
self._str_fw += '\nAll gram matrices are ignored, no results obtained.\n\n'
else:
self.do_cv()

# print out results as table.
if self._logging:
self._str_fw += self.printResultsInTable(self._param_list, self._param_list_pre_revised, self._average_val_scores, self._std_val_scores, self._average_perf_scores, self._std_perf_scores, self._average_train_scores, self._std_train_scores, self._gram_matrix_time, self._model_type, self._verbose)

# open file to save all results for this dataset.
if not os.path.exists(self._output_dir + '/' + self._ds_name + '.output.txt'):
with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'w') as f:
f.write(self._str_fw)
else:
with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'r+') as f:
content = f.read()
f.seek(0, 0)
f.write(self._str_fw + '\n\n\n' + content)

return self._final_performance, self._final_confidence


def fit(self):
return


def compute_gram_matrices(self):
"""Compute all gram matrices.

Returns
-------
None.

"""
# Grid of parameters with a discrete number of values for each.
self._param_list_precomputed = list(ParameterGrid(self._param_grid_precomputed))
self._param_list = list(ParameterGrid(self._param_grid))

self._gram_matrices = [
] # a list to store gram matrices for all param_grid_precomputed
self._gram_matrix_time = [
] # a list to store time to calculate gram matrices
self._param_list_pre_revised = [
] # list to store param grids precomputed ignoring the useless ones

if self._verbose:
print()
print('\n1. Computing gram matrices. This could take a while...')
if self._logging:
self._str_fw += '\nI. Gram matrices.\n\n'
self._tts = time.time() # start training time
nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(self._param_list_precomputed):
y = self._ds.targets[:]
params_out['n_jobs'] = self._n_jobs
params_out['verbose'] = self._verbose
# print(dataset)
# import networkx as nx
# nx.draw_networkx(dataset[1])
# plt.show()
rtn_data = self._estimator(self._ds.graphs[:], **params_out) # @todo: Attention! this will not copy the graphs.
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
# for some kernels, some graphs in datasets may not meet the
# kernels' requirements for graph structure. These graphs are trimmed.
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idxt] for idxt in idx_trim] # trim y accordingly
# Kmatrix = np.random.rand(2250, 2250)
# current_run_time = 0.1

# remove graphs whose kernels with themselves are zeros
# @todo: y not changed accordingly?
Kmatrix_diag = Kmatrix.diagonal().copy()
nb_g_ignore = 0
for idxk, diag in enumerate(Kmatrix_diag):
if diag == 0:
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
nb_g_ignore += 1

# normalization
# @todo: works only for undirected graph?
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
if self._verbose:
print()

if params_out == {}:
if self._verbose:
print('the gram matrix is: ')
if self._logging:
self._str_fw += 'the gram matrix is:\n\n'
else:
if self._verbose:
print('the gram matrix with parameters', params_out, 'is: \n\n')
if self._logging:
self._str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out

if len(Kmatrix) < 2:
nb_gm_ignore += 1
if self._verbose:
print('ignored, as at most one of its diagonal values is non-zero.')
if self._logging:
self._str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
else:
if np.isnan(Kmatrix).any(
): # if the matrix contains elements that are not numbers
nb_gm_ignore += 1
if self._verbose:
print('ignored, as it contains elements that are not numbers.')
if self._logging:
self._str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
else:
# print(Kmatrix)
if self._logging:
self._str_fw += np.array2string(
Kmatrix,
separator=',') + '\n\n'
# separator=',',
# threshold=np.inf,
# floatmode='unique') + '\n\n'

# Draw and save Gram matrix figures.
if self._save_gm_figs:
fig_file_name = self._output_dir + '/GM[ds]' + self._ds_name
if params_out != {}:
fig_file_name += '[params]' + str(idx)
plt.imshow(Kmatrix)
plt.colorbar()
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
# plt.show()
plt.clf()

self._gram_matrices.append(Kmatrix)
self._gram_matrix_time.append(current_run_time)
self._param_list_pre_revised.append(params_out)

if nb_g_ignore > 0:
if self._verbose:
print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
if self._logging:
self._str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore

if self._verbose:
print()
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(self._param_list_precomputed), nb_gm_ignore))
if self._logging:
self._str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(self._param_list_precomputed), nb_gm_ignore)
self._str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
self._str_fw += ''.join(['{}: {}\n'.format(idx, params_out) for idx, params_out in enumerate(self._param_list_precomputed)])


def do_cv(self):
# save gram matrices to file.
# np.savez(output_dir + '/' + ds_name + '.gm',
# gms=gram_matrices, params=param_list_pre_revised, y=y,
# gmtime=gram_matrix_time)
if self._verbose:
print('2. Fitting and predicting using nested cross validation. This could really take a while...')

# ---- use pool.imap_unordered to parallel and track progress. ----
# train_pref = []
# val_pref = []
# test_pref = []
# def func_assign(result, var_to_assign):
# for idx, itm in enumerate(var_to_assign):
# itm.append(result[idx])
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
#
# parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign,
# [train_pref, val_pref, test_pref], glbv=gram_matrices,
# method='imap_unordered', n_jobs=n_jobs, chunksize=1,
# itr_desc='cross validation')

def init_worker(gms_toshare):
global G_gms
G_gms = gms_toshare

# gram_matrices = np.array(gram_matrices)
# gms_shape = gram_matrices.shape
# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))
pool = Pool(processes=self._n_jobs, initializer=init_worker, initargs=(self._gram_matrices,))
trial_do_partial = partial(self._parallel_trial_do, self._param_list_pre_revised, self._param_list, self._ds.targets[:], self._model_type) # @todo: maybe self._ds.targets[:] should be y.
train_pref = []
val_pref = []
test_pref = []
# if NUM_TRIALS < 1000 * n_jobs:
# chunksize = int(NUM_TRIALS / n_jobs) + 1
# else:
# chunksize = 1000
chunksize = 1
if self._verbose:
iterator = tqdm(pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize), desc='cross validation', file=sys.stdout)
else:
iterator = pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize)
for o1, o2, o3 in iterator:
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()

# # ---- use pool.map to parallel. ----
# pool = Pool(n_jobs)
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type)
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]

# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# print()

if self._verbose:
print()
print('3. Getting final performance...')
if self._logging:
self._str_fw += '\nII. Performance.\n\n'

# averages and confidences of performances on outer trials for each combination of parameters
self._average_train_scores = np.mean(train_pref, axis=0)
# print('val_pref: ', val_pref[0][0])
self._average_val_scores = np.mean(val_pref, axis=0)
# print('test_pref: ', test_pref[0][0])
self._average_perf_scores = np.mean(test_pref, axis=0)
# sample std is used here
self._std_train_scores = np.std(train_pref, axis=0, ddof=1)
self._std_val_scores = np.std(val_pref, axis=0, ddof=1)
self._std_perf_scores = np.std(test_pref, axis=0, ddof=1)

if self._model_type == 'regression':
best_val_perf = np.amin(self._average_val_scores)
else:
best_val_perf = np.amax(self._average_val_scores)
# print('average_val_scores: ', self._average_val_scores)
# print('best_val_perf: ', best_val_perf)
# print()
best_params_index = np.where(self._average_val_scores == best_val_perf)
# find smallest val std with best val perf.
best_val_stds = [
self._std_val_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
min_val_std = np.amin(best_val_stds)
best_params_index = np.where(self._std_val_scores == min_val_std)
best_params_out = [self._param_list_pre_revised[i] for i in best_params_index[0]]
best_params_in = [self._param_list[i] for i in best_params_index[1]]

if self._verbose:
print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in)
print()
print('best_val_perf: ', best_val_perf)
print('best_val_std: ', min_val_std)
if self._logging:
self._str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
self._str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
self._str_fw += 'best_val_perf: %s\n' % best_val_perf
self._str_fw += 'best_val_std: %s\n' % min_val_std

# print(best_params_index)
# print(best_params_index[0])
# print(self._average_perf_scores)
self._final_performance = [
self._average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
self._final_confidence = [
self._std_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]

if self._verbose:
print('final_performance: ', self._final_performance)
print('final_confidence: ', self._final_confidence)
if self._logging:
self._str_fw += 'final_performance: %s\n' % self._final_performance
self._str_fw += 'final_confidence: %s\n' % self._final_confidence

train_performance = [
self._average_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
train_std = [
self._std_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]

if self._verbose:
print('train_performance: %s' % train_performance)
print('train_std: ', train_std)
if self._logging:
self._str_fw += 'train_performance: %s\n' % train_performance
self._str_fw += 'train_std: %s\n\n' % train_std

if self._verbose:
print()

tt_total = time.time() - self._tts # training time for all hyper-parameters
average_gram_matrix_time = np.mean(self._gram_matrix_time)
std_gram_matrix_time = np.std(self._gram_matrix_time, ddof=1) if len(self._gram_matrix_time) > 1 else 0
best_gram_matrix_time = [self._gram_matrix_time[i] for i in best_params_index[0]]
ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0

if self._verbose:
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
ave_bgmt, std_bgmt))
print('total training time with all hyper-param choices: {:.2f}s'.format(
tt_total))
if self._logging:
self._str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
self._str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
self._str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)

# # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt',
# average_train_scores)
# np.savetxt(results_name_pre + 'average_val_scores', self._average_val_scores)
# np.savetxt(results_name_pre + 'average_perf_scores.dt',
# average_perf_scores)
# np.savetxt(results_name_pre + 'std_train_scores.dt', self._std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', self._std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', self._std_perf_scores)

# np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in)
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
# np.save(results_name_pre + 'best_val_std.dt', best_val_std)
# np.save(results_name_pre + 'final_performance.dt', self._final_performance)
# np.save(results_name_pre + 'final_confidence.dt', self._final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std)

# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt',
# average_gram_matrix_time)
# np.save(results_name_pre + 'std_gram_matrix_time.dt',
# std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
# best_gram_matrix_time)


def _trial_do(self, param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level

# # get gram matrices from global variables.
# gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')

# Arrays to store scores
train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))

# Randomness is added to the seeds of the split function below. "high" is
# "size" times 10 so that at least 10 different random outputs will be
# yielded. Remove these lines if identical outputs are required.
rdm_out = np.random.RandomState(seed=None)
rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
size=len(param_list_pre_revised))
# print(trial, rdm_seed_out_l)
# print()
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# get gram matrices from global variables.
# gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
# gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
gm_now = gram_matrices[index_out].copy()

# split gram matrix and y to app and test sets.
indices = range(len(y))
# The argument "random_state" in function "train_test_split" can not be
# set to None, because it will use RandomState instance used by
# np.random, which is possible for multiple subprocesses to inherit the
# same seed if they forked at the same time, leading to identical
# random variates for different subprocesses. Instead, we use "trial"
# and "index_out" parameters to generate different seeds for different
# trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add
# randomness into seeds, so that it yields a different output every
# time the program is run. To yield identical outputs every time,
# remove the second line below. Same method is used to the "KFold"
# function in the inner loop.
rdm_seed_out = (trial + 1) * (index_out + 1)
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
# print(trial, rdm_seed_out)
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
gm_now, y, indices, test_size=0.1,
random_state=rdm_seed_out, shuffle=True)
# print(trial, idx_app, idx_test)
# print()
X_app = X_app[:, idx_app]
X_test = X_test[:, idx_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
size=len(param_list))
# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
# if trial == 0:
# print(index_out, index_in)
# print('params_in: ', params_in)
# st = time.time()
rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
current_train_perf = []
current_valid_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
# try:
if self._model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])

# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
# if trial == 0:
# print('y_pred_valid: ', y_pred_valid)
# print()
y_pred_test = kr.predict(
X_test[:, train_index])

# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
# if trial == 0:
# print(mean_squared_error(
# y_app[valid_index], y_pred_valid))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For classification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
# validation set level
for train_index, valid_index in inner_cv.split(X_app):
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index])
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])

# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])

# accuracy scores
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
# except ValueError:
# print(sys.exc_info()[0])
# print(params_out, params_in)

# average performance on inner splits
train_pref[index_out][index_in] = np.mean(
current_train_perf)
val_pref[index_out][index_in] = np.mean(
current_valid_perf)
test_pref[index_out][index_in] = np.mean(
current_test_perf)
# print(time.time() - st)
# if trial == 0:
# print('val_pref: ', val_pref)
# print('test_pref: ', test_pref)

return train_pref, val_pref, test_pref


def _parallel_trial_do(self, param_list_pre_revised, param_list, y, model_type, trial):
train_pref, val_pref, test_pref = self._trial_do(param_list_pre_revised,
param_list, G_gms, y,
model_type, trial)
return train_pref, val_pref, test_pref


def printResultsInTable(self, param_list, param_list_pre_revised, average_val_scores,
std_val_scores, average_perf_scores, std_perf_scores,
average_train_scores, std_train_scores, gram_matrix_time,
model_type, verbose):
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]

keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
]
if verbose:
print()
tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))), headers='keys')
# print(tb_print)
return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print

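The selection logic that _trial_do implements for each trial can be summarised, independently of this class, as a plain scikit-learn loop over a precomputed Gram matrix. A minimal sketch follows, in which the Gram matrix K, the targets y and the alpha grid are placeholders rather than anything produced by the library:

import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(0)
K = rng.rand(50, 50); K = (K + K.T) / 2 + 50 * np.eye(50)   # placeholder Gram matrix
y = rng.rand(50)                                            # placeholder targets
idx_app, idx_test = train_test_split(np.arange(len(y)), test_size=0.1, random_state=0)

best_alpha, best_rmse = None, np.inf
for alpha in np.logspace(-5, 5, 11):                        # inner model selection
    rmses = []
    for tr, va in KFold(n_splits=10, shuffle=True, random_state=0).split(idx_app):
        tr, va = idx_app[tr], idx_app[va]
        model = KernelRidge(kernel='precomputed', alpha=alpha)
        model.fit(K[np.ix_(tr, tr)], y[tr])
        rmses.append(np.sqrt(mean_squared_error(y[va], model.predict(K[np.ix_(va, tr)]))))
    if np.mean(rmses) < best_rmse:
        best_alpha, best_rmse = alpha, np.mean(rmses)

print('selected alpha:', best_alpha, 'validation RMSE:', best_rmse)
# The held-out graphs idx_test would then be scored once with best_alpha (using their
# kernel values against the fitting samples), and the whole procedure repeated over
# num_trials outer splits, as do_cv does above.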
+ 89
- 0
gklearn/model_learning/parameters.py View File

@@ -0,0 +1,89 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 21 12:18:02 2021

@author: ljia
"""

def dichotomous_permutation(arr, layer=0):
import math

# def seperate_arr(arr, new_arr):
# if (length % 2) == 0:
# half = int(length / 2)
# new_arr += [arr[half - 1], arr[half]]
# subarr1 = [arr[i] for i in range(1, half - 1)]
# else:
# half = math.floor(length / 2)
# new_arr.append(arr[half])
# subarr1 = [arr[i] for i in range(1, half)]
# subarr2 = [arr[i] for i in range(half + 1, length - 1)]
# subarrs = [subarr1, subarr2]
# return subarrs


if layer == 0:
length = len(arr)
if length <= 2:
return arr

new_arr = [arr[0], arr[-1]]
if (length % 2) == 0:
half = int(length / 2)
new_arr += [arr[half - 1], arr[half]]
subarr1 = [arr[i] for i in range(1, half - 1)]
else:
half = math.floor(length / 2)
new_arr.append(arr[half])
subarr1 = [arr[i] for i in range(1, half)]
subarr2 = [arr[i] for i in range(half + 1, length - 1)]
subarrs = [subarr1, subarr2]
# subarrs = seperate_arr(arr, new_arr)
new_arr += dichotomous_permutation(subarrs, layer=layer+1)

else:
new_arr = []
subarrs = []
for a in arr:
length = len(a)
if length <= 2:
new_arr += a
else:
# subarrs += seperate_arr(a, new_arr)
if (length % 2) == 0:
half = int(length / 2)
new_arr += [a[half - 1], a[half]]
subarr1 = [a[i] for i in range(0, half - 1)]
else:
half = math.floor(length / 2)
new_arr.append(a[half])
subarr1 = [a[i] for i in range(0, half)]
subarr2 = [a[i] for i in range(half + 1, length)]
subarrs += [subarr1, subarr2]

if len(subarrs) > 0:
new_arr += dichotomous_permutation(subarrs, layer=layer+1)

return new_arr

# length = len(arr)
# if length <= 2:
# return arr

# new_arr = [arr[0], arr[-1]]
# if (length % 2) == 0:
# half = int(length / 2)
# new_arr += [arr[half - 1], arr[half]]
# subarr1 = [arr[i] for i in range(1, half - 1)]
# else:
# half = math.floor(length / 2)
# new_arr.append(arr[half])
# subarr1 = [arr[i] for i in range(1, half)]
# subarr2 = [arr[i] for i in range(half + 1, length - 1)]
# if len(subarr1) > 0:
# new_arr += dichotomous_permutation(subarr1)
# if len(subarr2) > 0:
# new_arr += dichotomous_permutation(subarr2)

# return new_arr

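dichotomous_permutation appears to reorder a list of parameter values so that the endpoints and midpoints are visited first and the remaining values are filled in by repeated bisection, which is convenient for scanning a hyper-parameter range coarse-to-fine. A small check (the input is an arbitrary example; the expected order was traced by hand from the code above):

from gklearn.model_learning import dichotomous_permutation

print(dichotomous_permutation(list(range(1, 10))))
# hand-traced expectation: [1, 9, 5, 3, 7, 2, 4, 6, 8]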
+ 109
- 0
gklearn/model_learning/workflow.py View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 19:33:51 2020

@author: ljia
"""
import os
import numpy as np
import pickle
from gklearn.dataset import Dataset
from gklearn.model_learning import NestedCV
from gklearn.kernels import GRAPH_KERNELS

class Workflow(object):


def __init__(self, **kwargs):
self._job_prefix = kwargs.get('job_prefix', 'gktask')
self._max_num_running_tasks = kwargs.get('max_num_running_tasks', np.inf)
self._root_dir = kwargs.get('root_dir', 'outputs/')


def run(self, tasks):
### Check inputs.
if self._check_inputs(tasks):
self._tasks = tasks
else:
raise ValueError('The input "tasks" is not correct.')


### Sort tasks.
self.sort_tasks_by_complexity()


### The main process.
complete = False
while not complete:

self.get_running_tasks()

if self._num_running_tasks < self._max_num_running_tasks:

### Load results from table.
self.load_results_from_table()

for task in self._tasks:
state = self.get_task_state(task)
if state != 'complete' and state != 'running':
self.run_task(task)

if self._num_running_tasks >= self._max_num_running_tasks:
break

### Save results.
self.save_results()

complete = self.check_completeness()

# sleep()


def _check_inputs(self, tasks):
if not isinstance(tasks, list):
return False
else:
for i in tasks:
if not 'kernel' in i or not 'dataset' in i:
return False
return True


def sort_tasks_by_complexity(self):
return


def get_running_tasks(self):
command = 'squeue --user $USER --format "%.50j" --noheader'
stream = os.popen(command)
output = stream.readlines()
running_tasks = [o for o in output if o.strip().startswith(self._job_prefix)]
self._num_running_tasks = len(running_tasks)


def load_results_from_table(self):
pass


def get_task_state(self, task):
task_dir = os.path.join(self._root_dir, task['kernel'] + '.' + task['dataset'] + '/')
fn_summary = os.path.join(task_dir, 'results_summary.pkl')
if os.path.isfile(fn_summary):
with open(fn_summary, 'rb') as f:
    output = pickle.load(f)
state = output['state']
return state
else:
return 'unstarted'


def run_task(self, task):
ds_name = task['dataset']
k_name = task['kernel']

# Get dataset.
ds = Dataset(ds_name)
graph_kernel = GRAPH_KERNELS[k_name]

# Start CV.
results = NestedCV(ds, graph_kernel)

+ 36
- 30
gklearn/tests/test_graph_kernels.py View File

@@ -25,34 +25,40 @@ def chooseDataset(ds_name):
current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
root = current_path + '../../datasets/'

# no node labels (and no edge labels).
if ds_name == 'Alkane':
# no labels at all.
if ds_name == 'Alkane_unlabeled':
dataset = Dataset('Alkane_unlabeled', root=root)
dataset.trim_dataset(edge_required=False)
dataset.cut_graphs(range(1, 10))
# node symbolic labels.
# node symbolic labels only.
elif ds_name == 'Acyclic':
dataset = Dataset('Acyclic', root=root)
dataset.trim_dataset(edge_required=False)
# node non-symbolic labels.
# node non-symbolic labels only.
elif ds_name == 'Letter-med':
dataset = Dataset('Letter-med', root=root)
dataset.trim_dataset(edge_required=False)
# node symbolic and non-symbolic labels (and edge symbolic labels).
# node symbolic + non-symbolic labels + edge symbolic labels.
elif ds_name == 'AIDS':
dataset = Dataset('AIDS', root=root)
dataset.trim_dataset(edge_required=False)
# edge non-symbolic labels (no node labels).
elif ds_name == 'Fingerprint_edge':
# node non-symbolic labels + edge non-symbolic labels.
elif ds_name == 'Fingerprint':
dataset = Dataset('Fingerprint', root=root)
dataset.trim_dataset(edge_required=True)
irrelevant_labels = {'edge_attrs': ['orient', 'angle']}
# edge symbolic only.
elif ds_name == 'MAO':
dataset = Dataset('MAO', root=root)
dataset.trim_dataset(edge_required=True)
irrelevant_labels = {'node_labels': ['atom_symbol'], 'node_attrs': ['x', 'y']}
dataset.remove_labels(**irrelevant_labels)
# edge non-symbolic labels (and node non-symbolic labels).
elif ds_name == 'Fingerprint':
# edge non-symbolic labels only.
elif ds_name == 'Fingerprint_edge':
dataset = Dataset('Fingerprint', root=root)
dataset.trim_dataset(edge_required=True)
# edge symbolic and non-symbolic labels (and node symbolic and non-symbolic labels).
irrelevant_labels = {'edge_attrs': ['orient', 'angle']}
dataset.remove_labels(**irrelevant_labels)
# node symbolic and non-symbolic labels + edge symbolic and non-symbolic labels.
elif ds_name == 'Cuneiform':
dataset = Dataset('Cuneiform', root=root)
dataset.trim_dataset(edge_required=True)
@@ -91,7 +97,7 @@ def assert_equality(compute_fun, **kwargs):
assert np.array_equal(lst[i], lst[i + 1])


@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS'])
@pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')])
# @pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_CommonWalk(ds_name, weight, compute_method):
@@ -126,7 +132,7 @@ def test_CommonWalk(ds_name, weight, compute_method):
assert_equality(compute, parallel=['imap_unordered', None])


@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS'])
@pytest.mark.parametrize('remove_totters', [False]) #[True, False])
# @pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_Marginalized(ds_name, remove_totters):
@@ -319,13 +325,13 @@ def test_SpectralDecomposition(ds_name, sub_kernel):
# @pytest.mark.parametrize(
# 'compute_method,ds_name,sub_kernel',
# [
# ('sylvester', 'Alkane', None),
# ('conjugate', 'Alkane', None),
# ('sylvester', 'Alkane_unlabeled', None),
# ('conjugate', 'Alkane_unlabeled', None),
# ('conjugate', 'AIDS', None),
# ('fp', 'Alkane', None),
# ('fp', 'Alkane_unlabeled', None),
# ('fp', 'AIDS', None),
# ('spectral', 'Alkane', 'exp'),
# ('spectral', 'Alkane', 'geo'),
# ('spectral', 'Alkane_unlabeled', 'exp'),
# ('spectral', 'Alkane_unlabeled', 'geo'),
# ]
# )
# @pytest.mark.parametrize('parallel', ['imap_unordered', None])
@@ -365,7 +371,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel):
# assert False, exception


@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint'])
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint'])
# @pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_ShortestPath(ds_name):
"""Test shortest path kernel.
@@ -401,8 +407,8 @@ def test_ShortestPath(ds_name):
assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False])


#@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint'])
@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform'])
#@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint'])
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform'])
# @pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_StructuralSP(ds_name):
"""Test structural shortest path kernel.
@@ -441,7 +447,7 @@ def test_StructuralSP(ds_name):
assert_equality(compute, parallel=['imap_unordered', None], fcsp=[True, False])


@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS'])
# @pytest.mark.parametrize('parallel', ['imap_unordered', None])
#@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto', None])
@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto'])
@@ -476,7 +482,7 @@ def test_PathUpToH(ds_name, k_func):
compute_method=['trie', 'naive'])


@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'AIDS'])
# @pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_Treelet(ds_name):
"""Test treelet kernel.
@@ -510,7 +516,7 @@ def test_Treelet(ds_name):
assert_equality(compute, parallel=['imap_unordered', None])


@pytest.mark.parametrize('ds_name', ['Acyclic'])
@pytest.mark.parametrize('ds_name', ['Alkane_unlabeled', 'Acyclic', 'MAO', 'AIDS'])
#@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge'])
# @pytest.mark.parametrize('base_kernel', ['subtree'])
# @pytest.mark.parametrize('parallel', ['imap_unordered', None])
@@ -540,17 +546,17 @@ def test_WLSubtree(ds_name):
else:
return gram_matrix, kernel_list, kernel

assert_equality(compute, parallel=['imap_unordered', None])
assert_equality(compute, parallel=[None, 'imap_unordered'])


if __name__ == "__main__":
test_list_graph_kernels()
# test_spkernel('Alkane', 'imap_unordered')
# test_ShortestPath('Alkane')
# test_list_graph_kernels()
# test_spkernel('Alkane_unlabeled', 'imap_unordered')
# test_ShortestPath('Alkane_unlabeled')
# test_StructuralSP('Fingerprint_edge', 'imap_unordered')
# test_StructuralSP('Acyclic')
# test_StructuralSP('Cuneiform', None)
# test_WLSubtree('Acyclic')
test_WLSubtree('MAO') # 'Alkane_unlabeled', 'Acyclic', 'AIDS'
# test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered')
# test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered')
# test_RandomWalk('Acyclic', 'fp', None, None)
@@ -559,7 +565,7 @@ if __name__ == "__main__":
# test_Marginalized('Acyclic', False)
# test_ShortestPath('Acyclic')
# test_PathUpToH('Acyclic', 'MinMax')
# test_Treelet('Acyclic')
# test_Treelet('AIDS')
# test_SylvesterEquation('Acyclic')
# test_ConjugateGradient('Acyclic')
# test_FixedPoint('Acyclic')
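For reference, the parametrized tests above all funnel into an assert_equality helper that recomputes the kernel once per combination of the keyword lists it receives and checks that the resulting Gram matrices agree. A minimal sketch of that idea (this helper is hypothetical, not the repository's implementation; the compute functions return gram_matrix, kernel_list, kernel as shown above):

import itertools
import numpy as np

def assert_equality_sketch(compute, **param_lists):
    # Evaluate `compute` for every combination of the given parameter lists
    # and check that all Gram matrices are numerically identical.
    keys = list(param_lists)
    grams = []
    for values in itertools.product(*(param_lists[k] for k in keys)):
        gram_matrix, _, _ = compute(**dict(zip(keys, values)))
        grams.append(np.asarray(gram_matrix))
    for gram in grams[1:]:
        assert np.allclose(grams[0], gram)

# e.g. assert_equality_sketch(compute, parallel=['imap_unordered', None], fcsp=[True, False])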

+ 185
- 111
gklearn/utils/kernels.py View File

@@ -3,156 +3,230 @@ These kernels are defined between pairs of vectors.
"""
import numpy as np


def delta_kernel(x, y):
"""Delta kernel. Return 1 if x == y, 0 otherwise.

Parameters
----------
x, y : any
Two parts to compare.

Return
------
kernel : integer
Delta kernel.

References
----------
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
labeled graphs. In Proceedings of the 20th International Conference on
Machine Learning, Washington, DC, United States, 2003.
"""
return x == y #(1 if condition else 0)


def deltakernel(x, y):
"""Delta kernel. Return 1 if x == y, 0 otherwise.
return delta_kernel(x, y)
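A quick usage note for the delta kernel above and its backward-compatible alias (illustrative values only):

from gklearn.utils.kernels import delta_kernel, deltakernel

delta_kernel('C', 'C')   # True, which counts as 1 in kernel computations
deltakernel('C', 'O')    # False; the old name now just forwards to delta_kernel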


def gaussian_kernel(x, y, gamma=None):
"""Gaussian kernel.
Compute the rbf (gaussian) kernel between x and y:

Parameters
----------
x, y : any
Two parts to compare.
K(x, y) = exp(-gamma ||x-y||^2).

Return
------
kernel : integer
Delta kernel.
Read more in the `User Guide of scikit-learn library <https://scikit-learn.org/stable/modules/metrics.html#rbf-kernel>`__.

References
----------
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
labeled graphs. In Proceedings of the 20th International Conference on
Machine Learning, Washington, DC, United States, 2003.
"""
return x == y #(1 if condition else 0)
Parameters
----------
x, y : array

gamma : float, default None
If None, defaults to 1.0 / n_features

Returns
-------
kernel : float
"""
if gamma is None:
gamma = 1.0 / len(x)

# xt = np.array([float(itm) for itm in x]) # @todo: move this to dataset or datafile to speed up.
# yt = np.array([float(itm) for itm in y])
# kernel = xt - yt
# kernel = kernel ** 2
# kernel = np.sum(kernel)
# kernel *= -gamma
# kernel = np.exp(kernel)
# return kernel

return np.exp((np.sum(np.subtract(x, y) ** 2)) * -gamma)
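A short sketch of the Gaussian kernel in use (numbers are illustrative; with gamma=None the function falls back to 1.0 / len(x)):

import numpy as np
from gklearn.utils.kernels import gaussian_kernel

x, y = np.array([1.0, 2.0, 3.0]), np.array([1.5, 2.0, 2.5])
gaussian_kernel(x, y, gamma=0.5)   # exp(-0.5 * ||x - y||^2) = exp(-0.25) ≈ 0.779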


def gaussiankernel(x, y, gamma=None):
"""Gaussian kernel.
Compute the rbf (gaussian) kernel between x and y:
return gaussian_kernel(x, y, gamma=gamma)

K(x, y) = exp(-gamma ||x-y||^2).

Read more in the `User Guide of scikit-learn library <https://scikit-learn.org/stable/modules/metrics.html#rbf-kernel>`__.
def polynomial_kernel(x, y, gamma=1, coef0=0, d=1):
return (np.dot(x, y) * gamma + coef0) ** d

Parameters
----------
x, y : array

gamma : float, default None
If None, defaults to 1.0 / n_features
def highest_polynomial_kernel(x, y, d=1, c=0):
"""Polynomial kernel.
Compute the polynomial kernel between x and y:

Returns
-------
kernel : float
"""
if gamma is None:
gamma = 1.0 / len(x)
K(x, y) = <x, y> ^d + c.

xt = np.array([float(itm) for itm in x]) # @todo: move this to dataset or datafile to speed up.
yt = np.array([float(itm) for itm in y])
kernel = xt - yt
kernel = kernel ** 2
kernel = np.sum(kernel)
kernel *= -gamma
kernel = np.exp(kernel)
return kernel
Parameters
----------
x, y : array

d : integer, default 1

c : float, default 0

Returns
-------
kernel : float
"""
return np.dot(x, y) ** d + c


def polynomialkernel(x, y, d=1, c=0):
"""Polynomial kernel.
Compute the polynomial kernel between x and y:
return highest_polynomial_kernel(x, y, d=d, c=c)
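The two polynomial variants above differ in their parametrization; a minimal comparison on toy vectors (illustrative only):

from gklearn.utils.kernels import polynomial_kernel, highest_polynomial_kernel

x, y = [1, 2], [3, 4]                              # <x, y> = 11
highest_polynomial_kernel(x, y, d=2, c=3)          # 11**2 + 3 = 124
polynomial_kernel(x, y, gamma=1, coef0=3, d=2)     # (11 * 1 + 3)**2 = 196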

K(x, y) = <x, y> ^d + c.
def linear_kernel(x, y):
"""Polynomial kernel.
Compute the polynomial kernel between x and y:

Parameters
----------
x, y : array
K(x, y) = <x, y>.

d : integer, default 1
Parameters
----------
x, y : array

c : float, default 0
d : integer, default 1

Returns
-------
kernel : float
"""
return np.dot(x, y) ** d + c
c : float, default 0

Returns
-------
kernel : float
"""
return np.dot(x, y)


def linearkernel(x, y):
"""Polynomial kernel.
Compute the polynomial kernel between x and y:
return linear_kernel(x, y)


def cosine_kernel(x, y):
return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))  # vector norms, so the result is a scalar


def sigmoid_kernel(x, y, gamma=None, coef0=1):
if gamma is None:
gamma = 1.0 / len(x)

k = np.dot(x, y)
k *= gamma
k += coef0
k = np.tanh(k)
# k = np.tanh(k, k) # compute tanh in-place
return k


def laplacian_kernel(x, y, gamma=None):
if gamma is None:
gamma = 1.0 / len(x)

k = -gamma * np.sum(np.abs(np.subtract(x, y)))  # L1 distance, so the result is a scalar
k = np.exp(k)
return k


def chi2_kernel(x, y, gamma=1.0):
k = np.divide(np.subtract(x, y) ** 2, np.add(x, y))
k = np.sum(k)
k *= -gamma
return np.exp(k)


def exponential_kernel(x, y, gamma=None):
if gamma is None:
gamma = 1.0 / len(x)

return np.exp(np.dot(x, y) * gamma)


K(x, y) = <x, y>.
def intersection_kernel(x, y):
return np.sum(np.minimum(x, y))

Parameters
----------
x, y : array

d : integer, default 1
def multiquadratic_kernel(x, y, c=0):
return np.sqrt((np.sum(np.subtract(x, y) ** 2)) + c)

c : float, default 0

Returns
-------
kernel : float
"""
return np.dot(x, y)
def inverse_multiquadratic_kernel(x, y, c=0):
return 1 / multiquadratic_kernel(x, y, c=c)
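A few of the newly added vector kernels on the same toy input, to make the conventions concrete (values rounded; illustrative only):

import numpy as np
from gklearn.utils.kernels import (sigmoid_kernel, chi2_kernel,
                                   intersection_kernel, multiquadratic_kernel)

x, y = np.array([1.0, 2.0]), np.array([2.0, 3.0])
sigmoid_kernel(x, y, gamma=0.1, coef0=1)   # tanh(0.1 * 8 + 1) ≈ 0.947
chi2_kernel(x, y, gamma=1.0)               # exp(-(1/3 + 1/5)) ≈ 0.587
intersection_kernel(x, y)                  # min(1, 2) + min(2, 3) = 3.0
multiquadratic_kernel(x, y, c=1.0)         # sqrt(2 + 1) ≈ 1.732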


def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1):
"""Sum of a pair of kernels.
"""Sum of a pair of kernels.

k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22)
k = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22)

Parameters
----------
k1, k2 : function
A pair of kernel functions.
d11, d12:
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2.
d21, d22:
Inputs of k2.
lamda1, lamda2: float
Coefficients of the product.
Parameters
----------
k1, k2 : function
A pair of kernel functions.
d11, d12:
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2.
d21, d22:
Inputs of k2.
lamda1, lamda2: float
Coefficients of the product.

Return
------
kernel : integer
Return
------
kernel : integer

"""
if d21 == None or d22 == None:
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12)
else:
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22)
return kernel
"""
if d21 == None or d22 == None:
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d11, d12)
else:
kernel = lamda1 * k1(d11, d12) + lamda2 * k2(d21, d22)
return kernel


def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1):
"""Product of a pair of kernels.
k = lamda * k1(d11, d12) * k2(d21, d22)
Parameters
----------
k1, k2 : function
A pair of kernel functions.
d11, d12:
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2.
d21, d22:
Inputs of k2.
lamda: float
Coefficient of the product.
Return
------
kernel : integer
"""
if d21 == None or d22 == None:
kernel = lamda * k1(d11, d12) * k2(d11, d12)
else:
kernel = lamda * k1(d11, d12) * k2(d21, d22)
return kernel
"""Product of a pair of kernels.
k = lamda * k1(d11, d12) * k2(d21, d22)
Parameters
----------
k1, k2 : function
A pair of kernel functions.
d11, d12:
Inputs of k1. If d21 or d22 is None, apply d11, d12 to both k1 and k2.
d21, d22:
Inputs of k2.
lamda: float
Coefficient of the product.
Return
------
kernel : integer
"""
if d21 == None or d22 == None:
kernel = lamda * k1(d11, d12) * k2(d11, d12)
else:
kernel = lamda * k1(d11, d12) * k2(d21, d22)
return kernel
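A minimal usage sketch of the two combination helpers above (assuming the base kernels are imported from gklearn.utils.kernels; when d21/d22 are omitted, both base kernels are evaluated on the same pair):

from gklearn.utils.kernels import (kernelsum, kernelproduct,
                                   gaussian_kernel, linear_kernel)

x, y = [1.0, 2.0], [2.0, 3.0]
kernelsum(gaussian_kernel, linear_kernel, x, y, lamda1=0.5, lamda2=2)   # 0.5 * gaussian(x, y) + 2 * linear(x, y)
kernelproduct(gaussian_kernel, linear_kernel, x, y, lamda=1)            # gaussian(x, y) * linear(x, y)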


if __name__ == '__main__':
o = polynomialkernel([1, 2], [3, 4], 2, 3)
o = polynomialkernel([1, 2], [3, 4], 2, 3)

+ 64
- 9
gklearn/utils/utils.py View File

@@ -366,19 +366,62 @@ def get_edge_labels(Gn, edge_label):
def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}, **kwargs):
if len(kwargs) != 0:
kernel_options = kwargs
if name == 'Marginalized':

if name == 'CommonWalk' or name == 'common walk':
from gklearn.kernels import CommonWalk
graph_kernel = CommonWalk(node_labels=node_labels,
edge_labels=edge_labels,
ds_infos=ds_infos,
**kernel_options)

elif name == 'Marginalized' or name == 'marginalized':
from gklearn.kernels import Marginalized
graph_kernel = Marginalized(node_labels=node_labels,
edge_labels=edge_labels,
ds_infos=ds_infos,
**kernel_options)
elif name == 'ShortestPath':

elif name == 'SylvesterEquation' or name == 'sylvester equation':
from gklearn.kernels import SylvesterEquation
graph_kernel = SylvesterEquation(
ds_infos=ds_infos,
**kernel_options)

elif name == 'FixedPoint' or name == 'fixed point':
from gklearn.kernels import FixedPoint
graph_kernel = FixedPoint(node_labels=node_labels,
edge_labels=edge_labels,
node_attrs=node_attrs,
edge_attrs=edge_attrs,
ds_infos=ds_infos,
**kernel_options)

elif name == 'ConjugateGradient' or name == 'conjugate gradient':
from gklearn.kernels import ConjugateGradient
graph_kernel = ConjugateGradient(node_labels=node_labels,
edge_labels=edge_labels,
node_attrs=node_attrs,
edge_attrs=edge_attrs,
ds_infos=ds_infos,
**kernel_options)

elif name == 'SpectralDecomposition' or name == 'spectral decomposition':
from gklearn.kernels import SpectralDecomposition
graph_kernel = SpectralDecomposition(node_labels=node_labels,
edge_labels=edge_labels,
node_attrs=node_attrs,
edge_attrs=edge_attrs,
ds_infos=ds_infos,
**kernel_options)

elif name == 'ShortestPath' or name == 'shortest path':
from gklearn.kernels import ShortestPath
graph_kernel = ShortestPath(node_labels=node_labels,
node_attrs=node_attrs,
ds_infos=ds_infos,
**kernel_options)
elif name == 'StructuralSP':

elif name == 'StructuralSP' or name == 'structural shortest path':
from gklearn.kernels import StructuralSP
graph_kernel = StructuralSP(node_labels=node_labels,
edge_labels=edge_labels,
@@ -386,25 +429,29 @@ def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attr
edge_attrs=edge_attrs,
ds_infos=ds_infos,
**kernel_options)
elif name == 'PathUpToH':

elif name == 'PathUpToH' or name == 'path up to length h':
from gklearn.kernels import PathUpToH
graph_kernel = PathUpToH(node_labels=node_labels,
edge_labels=edge_labels,
ds_infos=ds_infos,
**kernel_options)
elif name == 'Treelet':

elif name == 'Treelet' or name == 'treelet':
from gklearn.kernels import Treelet
graph_kernel = Treelet(node_labels=node_labels,
edge_labels=edge_labels,
ds_infos=ds_infos,
**kernel_options)
elif name == 'WLSubtree':

elif name == 'WLSubtree' or name == 'weisfeiler-lehman subtree':
from gklearn.kernels import WLSubtree
graph_kernel = WLSubtree(node_labels=node_labels,
edge_labels=edge_labels,
ds_infos=ds_infos,
**kernel_options)
elif name == 'WeisfeilerLehman':

elif name == 'WeisfeilerLehman' or name == 'weisfeiler-lehman':
from gklearn.kernels import WeisfeilerLehman
graph_kernel = WeisfeilerLehman(node_labels=node_labels,
edge_labels=edge_labels,
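With the new lowercase aliases, the factory can be called by either spelling; a hypothetical example (the ds_infos and kernel_options contents shown are assumptions and depend on the dataset and the chosen kernel):

from gklearn.utils.utils import get_graph_kernel_by_name

graph_kernel = get_graph_kernel_by_name(
    'shortest path',                 # equivalent to 'ShortestPath'
    node_labels=['atom'],            # hypothetical label name
    node_attrs=[],
    ds_infos={'directed': False},    # assumed key; taken from the dataset in practice
    kernel_options={})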
@@ -541,10 +588,18 @@ def get_mlti_dim_edge_attrs(G, attr_names):

def normalize_gram_matrix(gram_matrix):
diag = gram_matrix.diagonal().copy()
old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt.
for i in range(len(gram_matrix)):
for j in range(i, len(gram_matrix)):
gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
gram_matrix[j][i] = gram_matrix[i][j]
try:
gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
except:
# rollback()
np.seterr(**old_settings)
raise
else:
gram_matrix[j][i] = gram_matrix[i][j]
np.seterr(**old_settings)
return gram_matrix
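The normalization divides each entry by the square root of the product of the corresponding diagonal entries, so the returned matrix has unit diagonal; the np.seterr guard turns a non-positive product under the square root into an exception instead of silent NaNs. A tiny sketch (the matrix is modified in place):

import numpy as np
from gklearn.utils.utils import normalize_gram_matrix

K = np.array([[4.0, 2.0],
              [2.0, 9.0]])
normalize_gram_matrix(K)   # [[1.0, 0.333...], [0.333..., 1.0]]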



