
Merge pull request #39 from jajupmochi/v0.2.x

V0.2.x
linlin committed (via GitHub) 4 years ago
parent commit 50ab7aa370
10 changed files with 655 additions and 400 deletions
1. gklearn/dataset/data_fetcher.py (+4, -2)
2. gklearn/dataset/dataset.py (+80, -87)
3. gklearn/dataset/file_managers.py (+20, -5)
4. gklearn/dataset/metadata.py (+6, -6)
5. gklearn/kernels/graph_kernel.py (+54, -53)
6. gklearn/kernels/metadata.py (+1, -1)
7. gklearn/kernels/shortest_path.py (+121, -43)
8. gklearn/kernels/structural_sp.py (+182, -54)
9. gklearn/kernels/structuralspKernel.py (+97, -97)
10. gklearn/tests/test_graph_kernels.py (+90, -52)

gklearn/dataset/data_fetcher.py (+4, -2)

@@ -74,6 +74,8 @@ class DataFetcher():
             message = 'Invalid Dataset name "' + self._name + '".'
             message += '\nAvailable datasets are as follows: \n\n'
             message += '\n'.join(ds for ds in sorted(DATASET_META))
+            message += '\n\nFollowing special suffices can be added to the name:'
+            message += '\n\n' + '\n'.join(['_unlabeled'])
             raise ValueError(message)
         else:
             self.write_archive_file(self._name)
@@ -127,9 +129,9 @@ class DataFetcher():
     def write_archive_file(self, ds_name):
         path = osp.join(self._root, ds_name)
-        url = DATASET_META[ds_name]['url']
#        filename_dir = osp.join(path,filename)
         if not osp.exists(path) or self._reload:
+            url = DATASET_META[ds_name]['url']
             response = self.download_file(url)
             if response is None:
                 return False
@@ -152,7 +154,7 @@ class DataFetcher():
         with tarfile.open(filename_archive, 'r:gz') as tar:
             if self._reload and self._verbose:
                 print(filename + ' Downloaded.')
-            subpath = os.path.join(path, tar.getnames()[0])
+            subpath = os.path.join(path, tar.getnames()[0].split('/')[0])
             if not osp.exists(subpath) or self._reload:
                 tar.extractall(path = path)
             return subpath
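
Why the `split('/')[0]` fix matters: `tar.getnames()[0]` returns the archive's first member, which may be a nested file path rather than the top-level directory, so the existence check on `subpath` could misfire and skip or redo extraction. A minimal sketch of the corrected behavior, assuming a hypothetical archive whose first member is 'MUTAG/MUTAG_A.txt':

    import os
    import tarfile

    with tarfile.open('MUTAG.tar.gz', 'r:gz') as tar:
        first = tar.getnames()[0]        # e.g. 'MUTAG/MUTAG_A.txt', a file, not a directory
        top_level = first.split('/')[0]  # 'MUTAG', the directory extraction actually creates
        subpath = os.path.join('datasets', top_level)
        if not os.path.exists(subpath):
            tar.extractall(path='datasets')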


gklearn/dataset/dataset.py (+80, -87)

@@ -7,24 +7,14 @@ Created on Thu Mar 26 18:48:27 2020
 """
 import numpy as np
 import networkx as nx
-from gklearn.utils.graph_files import load_dataset
 import os
+from gklearn.dataset import DATASET_META, DataFetcher, DataLoader


 class Dataset(object):

-    def __init__(self, filename=None, filename_targets=None, **kwargs):
-        if filename is None:
-            self._graphs = None
-            self._targets = None
-            self._node_labels = None
-            self._edge_labels = None
-            self._node_attrs = None
-            self._edge_attrs = None
-        else:
-            self.load_dataset(filename, filename_targets=filename_targets, **kwargs)
+    def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs):
         self._substructures = None
         self._node_label_dim = None
         self._edge_label_dim = None
@@ -49,15 +39,61 @@ class Dataset(object):
         self._node_attr_dim = None
         self._edge_attr_dim = None
         self._class_number = None
+        self._ds_name = None
+
+        if inputs is None:
+            self._graphs = None
+            self._targets = None
+            self._node_labels = None
+            self._edge_labels = None
+            self._node_attrs = None
+            self._edge_attrs = None
+
+        # If inputs is a list of graphs.
+        elif isinstance(inputs, list):
+            node_labels = kwargs.get('node_labels', None)
+            node_attrs = kwargs.get('node_attrs', None)
+            edge_labels = kwargs.get('edge_labels', None)
+            edge_attrs = kwargs.get('edge_attrs', None)
+            self.load_graphs(inputs, targets=targets)
+            self.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
+            if clean_labels:
+                self.clean_labels()
+
+        elif isinstance(inputs, str):
+            # If inputs is predefined dataset name.
+            if inputs in DATASET_META:
+                self.load_predefined_dataset(inputs, root=root, clean_labels=clean_labels, reload=reload, verbose=verbose)
+                self._ds_name = inputs
+
+            elif inputs.endswith('_unlabeled'):
+                self.load_predefined_dataset(inputs[:len(inputs) - 10], root=root, clean_labels=clean_labels, reload=reload, verbose=verbose)
+                self._ds_name = inputs
+
+                # Deal with special suffices.
+                self.check_special_suffices()
+
+            # If inputs is a file name.
+            elif os.path.isfile(inputs):
+                self.load_dataset(inputs, filename_targets=filename_targets, clean_labels=clean_labels, **kwargs)
+
+            # If inputs is a file name.
+            else:
+                raise ValueError('The "inputs" argument "' + inputs + '" is not a valid dataset name or file name.')
+
+        else:
+            raise TypeError('The "inputs" argument cannot be recognized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.')


-    def load_dataset(self, filename, filename_targets=None, **kwargs):
-        self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs)
+    def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs):
+        self._graphs, self._targets, label_names = DataLoader(filename, filename_targets=filename_targets, **kwargs).data
         self._node_labels = label_names['node_labels']
         self._node_attrs = label_names['node_attrs']
         self._edge_labels = label_names['edge_labels']
         self._edge_attrs = label_names['edge_attrs']
-        self.clean_labels()
+        if clean_labels:
+            self.clean_labels()


     def load_graphs(self, graphs, targets=None):
@@ -67,84 +103,33 @@ class Dataset(object):
 #        self.set_labels_attrs() # @todo

-    def load_predefined_dataset(self, ds_name):
-        current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
-        if ds_name == 'Acyclic':
-            ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'AIDS':
-            ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'Alkane':
-            ds_file = current_path + '../../datasets/Alkane/dataset.ds'
-            fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets)
-        elif ds_name == 'COIL-DEL':
-            ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'COIL-RAG':
-            ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'COLORS-3':
-            ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'Cuneiform':
-            ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'DD':
-            ds_file = current_path + '../../datasets/DD/DD_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'ENZYMES':
-            ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'Fingerprint':
-            ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'FRANKENSTEIN':
-            ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'Letter-high': # node non-symb
-            ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'Letter-low': # node non-symb
-            ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'Letter-med': # node non-symb
-            ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'MAO':
-            ds_file = current_path + '../../datasets/MAO/dataset.ds'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'Monoterpenoides':
-            ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'MUTAG':
-            ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'NCI1':
-            ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'NCI109':
-            ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'PAH':
-            ds_file = current_path + '../../datasets/PAH/dataset.ds'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'SYNTHETIC':
-            pass
-        elif ds_name == 'SYNTHETICnew':
-            ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
-            self._graphs, self._targets, label_names = load_dataset(ds_file)
-        elif ds_name == 'Synthie':
-            pass
+    def load_predefined_dataset(self, ds_name, root='datasets', clean_labels=True, reload=False, verbose=False):
+        path = DataFetcher(name=ds_name, root=root, reload=reload, verbose=verbose).path
+
+        if DATASET_META[ds_name]['database'] == 'tudataset':
+            ds_file = os.path.join(path, ds_name + '_A.txt')
+            fn_targets = None
         else:
-            raise Exception('The dataset name "', ds_name, '" is not pre-defined.')
+            load_files = DATASET_META[ds_name]['load_files']
+            if isinstance(load_files[0], str):
+                ds_file = os.path.join(path, load_files[0])
+            else: # load_files[0] is a list of files.
+                ds_file = [os.path.join(path, fn) for fn in load_files[0]]
+            fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None
+        self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets).data

         self._node_labels = label_names['node_labels']
         self._node_attrs = label_names['node_attrs']
         self._edge_labels = label_names['edge_labels']
         self._edge_attrs = label_names['edge_attrs']
-        self.clean_labels()
+        if clean_labels:
+            self.clean_labels()
+
+        # Deal with specific datasets.
+        if ds_name == 'Alkane':
+            self.trim_dataset(edge_required=True)
+            self.remove_labels(node_labels=['atom_symbol'])


     def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]):
@@ -573,6 +558,14 @@ class Dataset(object):
         return dataset

+    def check_special_suffices(self):
+        if self._ds_name.endswith('_unlabeled'):
+            self.remove_labels(node_labels=self._node_labels,
+                               edge_labels=self._edge_labels,
+                               node_attrs=self._node_attrs,
+                               edge_attrs=self._edge_attrs)
+

     def get_all_node_labels(self):
         node_labels = []
         for g in self._graphs:
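
Taken together, the refactored constructor accepts a predefined dataset name, a list of graphs, or a dataset file name. A usage sketch; the dataset and label names are illustrative ('MUTAG' and 'atom_symbol' both appear elsewhere in this changeset):

    from gklearn.dataset import Dataset

    # A predefined dataset name, fetched into `root` by DataFetcher on first use.
    ds = Dataset('MUTAG', root='datasets', verbose=True)

    # The '_unlabeled' suffix loads the same dataset, then strips all labels and
    # attributes via check_special_suffices().
    ds_unlabeled = Dataset('MUTAG_unlabeled', root='datasets')

    # An in-memory list of networkx graphs with explicit label names.
    # ds = Dataset(graphs, targets=targets, node_labels=['atom_symbol'])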


gklearn/dataset/file_managers.py (+20, -5)

@@ -38,7 +38,11 @@ class DataLoader():
         for details. Note here filename is the name of either .txt file in
         the dataset directory.
         """
-        extension = splitext(filename)[1][1:]
+        if isinstance(filename, str):
+            extension = splitext(filename)[1][1:]
+        else: # filename is a list of files.
+            extension = splitext(filename[0])[1][1:]
         if extension == "ds":
             self._graphs, self._targets, self._label_names = self.load_from_ds(filename, filename_targets)
         elif extension == "cxl":
@@ -67,13 +71,24 @@ class DataLoader():
         Note these graph formats are checked automatically by the extensions of
         graph files.
-        """
-        dirname_dataset = dirname(filename)
+        """
+        if isinstance(filename, str):
+            dirname_dataset = dirname(filename)
+            with open(filename) as f:
+                content = f.read().splitlines()
+        else: # filename is a list of files.
+            dirname_dataset = dirname(filename[0])
+            content = []
+            for fn in filename:
+                with open(fn) as f:
+                    content += f.read().splitlines()
+            # to remove duplicate file names.

         data = []
         y = []
         label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
-        with open(filename) as fn:
-            content = fn.read().splitlines()
+        content = [line for line in content if not line.endswith('.ds')] # Alkane
+        content = [line for line in content if not line.startswith('#')] # Acyclic
         extension = splitext(content[0].split(' ')[0])[1][1:]
         if extension == 'ct':
             load_file_fun = self.load_ct
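
With this change, DataLoader also accepts a list of `.ds` files whose line contents are concatenated before parsing; this is what the corrected `load_files` entries in metadata.py (next file) rely on for datasets shipped as separate train/test splits. A sketch, with illustrative paths; the `.data` property mirrors the call sites in dataset.py above:

    from gklearn.dataset import DataLoader

    # Both split files are read and their graph lists merged before parsing.
    graphs, targets, label_names = DataLoader(
        ['path/to/trainset_0.ds', 'path/to/testset_0.ds']).data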


gklearn/dataset/metadata.py (+6, -6)

@@ -32,7 +32,7 @@ GREYC_META = {
         'domain': 'small molecules',
         'train_valid_test': [],
         'stereoisomerism': True,
-        'load_files': [],
+        'load_files': ['data.ds'],
     },
     'Acyclic': {
         'database': 'greyc',
@@ -165,7 +165,7 @@ GREYC_META = {
         'domain': 'small molecules',
         'train_valid_test': ['trainset_0.ds', None, 'testset_0.ds'],
         'stereoisomerism': False,
-        'load_files': [],
+        'load_files': [['trainset_0.ds', 'testset_0.ds']],
     },
     'PTC': {
         'database': 'greyc',
@@ -654,7 +654,7 @@ TUDataset_META = {
         'node_attr_dim': 0,
         'geometry': None,
         'edge_attr_dim': 0,
-        'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23.zip-H23',
+        'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23.zip',
         'domain': 'small molecules',
     },
     'NCI-H23H': {
@@ -670,7 +670,7 @@ TUDataset_META = {
         'node_attr_dim': 0,
         'geometry': None,
         'edge_attr_dim': 0,
-        'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23H.zip-H23H',
+        'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23H.zip',
         'domain': 'small molecules',
     },
     'OVCAR-8': {
@@ -686,7 +686,7 @@ TUDataset_META = {
         'node_attr_dim': 0,
         'geometry': None,
         'edge_attr_dim': 0,
-        'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8.zip-8',
+        'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8.zip',
         'domain': 'small molecules',
     },
     'OVCAR-8H': {
@@ -702,7 +702,7 @@ TUDataset_META = {
         'node_attr_dim': 0,
         'geometry': None,
         'edge_attr_dim': 0,
-        'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8H.zip-8H',
+        'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8H.zip',
         'domain': 'small molecules',
     },
     'P388': {
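
For context, these `load_files` entries are consumed by Dataset.load_predefined_dataset shown earlier: a plain string names a single dataset file, while a nested list names several files to load together, and the corrected shapes above are what make each branch work. Condensed from dataset.py; `path` is the fetched dataset directory:

    load_files = [['trainset_0.ds', 'testset_0.ds']]  # the fixed GREYC entry
    if isinstance(load_files[0], str):
        ds_file = os.path.join(path, load_files[0])
    else:  # load_files[0] is a list of files.
        ds_file = [os.path.join(path, fn) for fn in load_files[0]]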


gklearn/kernels/graph_kernel.py (+54, -53)

@@ -9,10 +9,11 @@ import numpy as np
 import networkx as nx
 import multiprocessing
 import time
+from gklearn.utils import normalize_gram_matrix


 class GraphKernel(object):

     def __init__(self):
         self._graphs = None
         self._parallel = ''
@@ -22,14 +23,14 @@ class GraphKernel(object):
         self._run_time = 0
         self._gram_matrix = None
         self._gram_matrix_unnorm = None


     def compute(self, *graphs, **kwargs):
         self._parallel = kwargs.get('parallel', 'imap_unordered')
         self._n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
         self._normalize = kwargs.get('normalize', True)
         self._verbose = kwargs.get('verbose', 2)

         if len(graphs) == 1:
             if not isinstance(graphs[0], list):
                 raise Exception('Cannot detect graphs.')
@@ -40,9 +41,9 @@ class GraphKernel(object):
             self._gram_matrix = self._compute_gram_matrix()
             self._gram_matrix_unnorm = np.copy(self._gram_matrix)
             if self._normalize:
-                self._gram_matrix = self.normalize_gm(self._gram_matrix)
+                self._gram_matrix = normalize_gram_matrix(self._gram_matrix)
             return self._gram_matrix, self._run_time
         elif len(graphs) == 2:
             if self.is_graph(graphs[0]) and self.is_graph(graphs[1]):
                 kernel = self._compute_single_kernel(graphs[0].copy(), graphs[1].copy())
@@ -59,14 +60,14 @@ class GraphKernel(object):
                 return kernel_list, self._run_time
             else:
                 raise Exception('Cannot detect graphs.')
         elif len(graphs) == 0 and self._graphs is None:
             raise Exception('Please add graphs before computing.')
         else:
             raise Exception('Cannot detect graphs.')


     def normalize_gm(self, gram_matrix):
         import warnings
         warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning)
@@ -77,8 +78,8 @@ class GraphKernel(object):
                 gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
                 gram_matrix[j][i] = gram_matrix[i][j]
         return gram_matrix


     def compute_distance_matrix(self):
         if self._gram_matrix is None:
             raise Exception('Please compute the Gram matrix before computing distance matrix.')
@@ -97,98 +98,98 @@ class GraphKernel(object):
         dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
         dis_mean = np.mean(np.mean(dis_mat))
         return dis_mat, dis_max, dis_min, dis_mean


     def _compute_gram_matrix(self):
         start_time = time.time()

         if self._parallel == 'imap_unordered':
             gram_matrix = self._compute_gm_imap_unordered()
         elif self._parallel is None:
             gram_matrix = self._compute_gm_series()
         else:
             raise Exception('Parallel mode is not set correctly.')

         self._run_time = time.time() - start_time
         if self._verbose:
             print('Gram matrix of size %d built in %s seconds.'
                   % (len(self._graphs), self._run_time))

         return gram_matrix


     def _compute_gm_series(self):
         pass


     def _compute_gm_imap_unordered(self):
         pass


     def _compute_kernel_list(self, g1, g_list):
         start_time = time.time()

         if self._parallel == 'imap_unordered':
             kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list)
         elif self._parallel is None:
             kernel_list = self._compute_kernel_list_series(g1, g_list)
         else:
             raise Exception('Parallel mode is not set correctly.')

         self._run_time = time.time() - start_time
         if self._verbose:
             print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.'
                   % (len(g_list), self._run_time))

         return kernel_list


     def _compute_kernel_list_series(self, g1, g_list):
         pass


     def _compute_kernel_list_imap_unordered(self, g1, g_list):
         pass


     def _compute_single_kernel(self, g1, g2):
         start_time = time.time()

         kernel = self._compute_single_kernel_series(g1, g2)

         self._run_time = time.time() - start_time
         if self._verbose:
             print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time))

         return kernel


     def _compute_single_kernel_series(self, g1, g2):
         pass


     def is_graph(self, graph):
         if isinstance(graph, nx.Graph):
             return True
         if isinstance(graph, nx.DiGraph):
-            return True
+            return True
         if isinstance(graph, nx.MultiGraph):
-            return True
+            return True
         if isinstance(graph, nx.MultiDiGraph):
-            return True
+            return True
         return False


     @property
     def graphs(self):
         return self._graphs


     @property
     def parallel(self):
         return self._parallel


     @property
     def n_jobs(self):
         return self._n_jobs
@@ -197,30 +198,30 @@ class GraphKernel(object):
     @property
     def verbose(self):
         return self._verbose


     @property
     def normalize(self):
         return self._normalize


     @property
     def run_time(self):
         return self._run_time


     @property
     def gram_matrix(self):
         return self._gram_matrix

     @gram_matrix.setter
     def gram_matrix(self, value):
         self._gram_matrix = value


     @property
     def gram_matrix_unnorm(self):
-        return self._gram_matrix_unnorm
+        return self._gram_matrix_unnorm

     @gram_matrix_unnorm.setter
     def gram_matrix_unnorm(self, value):
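
The new module-level import replaces the in-class `normalize_gm`, which is kept above only behind a deprecation warning. The utility itself is not shown in this diff; a sketch consistent with the deprecated implementation, which cosine-normalizes each entry by the diagonal so that self-kernels become 1:

    import numpy as np

    def normalize_gram_matrix(gram_matrix):
        # K'[i][j] = K[i][j] / sqrt(K[i][i] * K[j][j])
        diag = gram_matrix.diagonal().copy()
        for i in range(len(gram_matrix)):
            for j in range(i, len(gram_matrix)):
                gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
                gram_matrix[j][i] = gram_matrix[i][j]
        return gram_matrix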

gklearn/kernels/metadata.py (+1, -1)

@@ -12,7 +12,7 @@ GRAPH_KERNELS = {
     'common walk': '',
     'marginalized': '',
     'sylvester equation': '',
-    'fixed_point': '',
+    'fixed point': '',
     'conjugate gradient': '',
     'spectral decomposition': '',
     ### based on paths.


gklearn/kernels/shortest_path.py (+121, -43)

@@ -5,9 +5,9 @@ Created on Tue Apr 7 15:24:58 2020

 @author: ljia

-@references:
-    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData
+@references:
+    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData
     Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
 """

@@ -17,33 +17,36 @@ from itertools import product
 from multiprocessing import Pool
 from tqdm import tqdm
 import numpy as np
+import networkx as nx
 from gklearn.utils.parallel import parallel_gm, parallel_me
 from gklearn.utils.utils import getSPGraph
 from gklearn.kernels import GraphKernel


 class ShortestPath(GraphKernel):

     def __init__(self, **kwargs):
         GraphKernel.__init__(self)
         self._node_labels = kwargs.get('node_labels', [])
         self._node_attrs = kwargs.get('node_attrs', [])
         self._edge_weight = kwargs.get('edge_weight', None)
         self._node_kernels = kwargs.get('node_kernels', None)
+        self._fcsp = kwargs.get('fcsp', True)
         self._ds_infos = kwargs.get('ds_infos', {})


     def _compute_gm_series(self):
+        self._all_graphs_have_edges(self._graphs)
         # get shortest path graph of each graph.
         if self._verbose >= 2:
             iterator = tqdm(self._graphs, desc='getting sp graphs', file=sys.stdout)
         else:
             iterator = self._graphs
         self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

         # compute Gram matrix.
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

         from itertools import combinations_with_replacement
         itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
         if self._verbose >= 2:
@@ -54,11 +57,12 @@ class ShortestPath(GraphKernel):
             kernel = self._sp_do(self._graphs[i], self._graphs[j])
             gram_matrix[i][j] = kernel
             gram_matrix[j][i] = kernel

         return gram_matrix


     def _compute_gm_imap_unordered(self):
+        self._all_graphs_have_edges(self._graphs)
         # get shortest path graph of each graph.
         pool = Pool(self._n_jobs)
         get_sp_graphs_fun = self._wrapper_get_sp_graphs
@@ -76,21 +80,22 @@ class ShortestPath(GraphKernel):
             self._graphs[i] = g
         pool.close()
         pool.join()

         # compute Gram matrix.
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

         def init_worker(gs_toshare):
             global G_gs
             G_gs = gs_toshare
         do_fun = self._wrapper_sp_do
-        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
+        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                     glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

         return gram_matrix


     def _compute_kernel_list_series(self, g1, g_list):
+        self._all_graphs_have_edges([g1] + g_list)
         # get shortest path graphs of g1 and each graph in g_list.
         g1 = getSPGraph(g1, edge_weight=self._edge_weight)
         if self._verbose >= 2:
@@ -98,7 +103,7 @@ class ShortestPath(GraphKernel):
         else:
             iterator = g_list
         g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

         # compute kernel list.
         kernel_list = [None] * len(g_list)
         if self._verbose >= 2:
@@ -108,11 +113,12 @@ class ShortestPath(GraphKernel):
         for i in iterator:
             kernel = self._sp_do(g1, g_list[i])
             kernel_list[i] = kernel

         return kernel_list


     def _compute_kernel_list_imap_unordered(self, g1, g_list):
+        self._all_graphs_have_edges([g1] + g_list)
         # get shortest path graphs of g1 and each graph in g_list.
         g1 = getSPGraph(g1, edge_weight=self._edge_weight)
         pool = Pool(self._n_jobs)
@@ -131,49 +137,58 @@ class ShortestPath(GraphKernel):
             g_list[i] = g
         pool.close()
         pool.join()

         # compute Gram matrix.
         kernel_list = [None] * len(g_list)

         def init_worker(g1_toshare, gl_toshare):
             global G_g1, G_gl
-            G_g1 = g1_toshare
-            G_gl = gl_toshare
+            G_g1 = g1_toshare
+            G_gl = gl_toshare
         do_fun = self._wrapper_kernel_list_do
-        def func_assign(result, var_to_assign):
+        def func_assign(result, var_to_assign):
             var_to_assign[result[0]] = result[1]
         itr = range(len(g_list))
         len_itr = len(g_list)
         parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                     init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)

         return kernel_list


     def _wrapper_kernel_list_do(self, itr):
         return itr, self._sp_do(G_g1, G_gl[itr])


     def _compute_single_kernel_series(self, g1, g2):
+        self._all_graphs_have_edges([g1] + [g2])
         g1 = getSPGraph(g1, edge_weight=self._edge_weight)
         g2 = getSPGraph(g2, edge_weight=self._edge_weight)
         kernel = self._sp_do(g1, g2)
-        return kernel
+        return kernel


     def _wrapper_get_sp_graphs(self, itr_item):
         g = itr_item[0]
         i = itr_item[1]
         return i, getSPGraph(g, edge_weight=self._edge_weight)


     def _sp_do(self, g1, g2):
+        if self._fcsp: # @todo: it may be put outside the _sp_do().
+            return self._sp_do_fcsp(g1, g2)
+        else:
+            return self._sp_do_naive(g1, g2)
+
+
+    def _sp_do_fcsp(self, g1, g2):
+
         kernel = 0

         # compute shortest path matrices first, method borrowed from FCSP.
         vk_dict = {}  # shortest path matrices dict
-        if len(self._node_labels) > 0:
+        if len(self._node_labels) > 0: # @todo: it may be put outside the _sp_do().
             # node symb and non-synb labeled
             if len(self._node_attrs) > 0:
                 kn = self._node_kernels['mix']
@@ -208,7 +223,7 @@ class ShortestPath(GraphKernel):
                 if e1[2]['cost'] == e2[2]['cost']:
                     kernel += 1
             return kernel

         # compute graph kernels
         if self._ds_infos['directed']:
             for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
@@ -225,7 +240,7 @@ class ShortestPath(GraphKernel):
                     kn1 = nk11 * nk22
                     kn2 = nk12 * nk21
                     kernel += kn1 + kn2

 #        # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
 #        # compute vertex kernels
 #        try:
@@ -238,7 +253,7 @@ class ShortestPath(GraphKernel):
 #                    vk_mat[i1][i2] = kn(
 #                        n1[1][node_label], n2[1][node_label],
 #                        [n1[1]['attributes']], [n2[1]['attributes']])

 #        range1 = range(0, len(edge_w_g[i]))
 #        range2 = range(0, len(edge_w_g[j]))
 #        for i1 in range1:
@@ -254,11 +269,74 @@ class ShortestPath(GraphKernel):
 #                    kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
 #                    kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
 #                    kernel += kn1 + kn2

         return kernel


+    def _sp_do_naive(self, g1, g2):
+
+        kernel = 0
+
+        # Define the function to compute kernels between vertices in each condition.
+        if len(self._node_labels) > 0:
+            # node symb and non-synb labeled
+            if len(self._node_attrs) > 0:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['mix']
+                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
+            # node symb labeled
+            else:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['symb']
+                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+                    return kn(n1_labels, n2_labels)
+        else:
+            # node non-synb labeled
+            if len(self._node_attrs) > 0:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['nsymb']
+                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+                    return kn(n1_attrs, n2_attrs)
+            # node unlabeled
+            else:
+                for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+                    if e1[2]['cost'] == e2[2]['cost']:
+                        kernel += 1
+                return kernel
+
+        # compute graph kernels
+        if self._ds_infos['directed']:
+            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+                if e1[2]['cost'] == e2[2]['cost']:
+                    nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
+                    kn1 = nk11 * nk22
+                    kernel += kn1
+        else:
+            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+                if e1[2]['cost'] == e2[2]['cost']:
+                    # each edge walk is counted twice, starting from both its extreme nodes.
+                    nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
+                        e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
+                    kn1 = nk11 * nk22
+                    kn2 = nk12 * nk21
+                    kernel += kn1 + kn2
+
+        return kernel
+
+
     def _wrapper_sp_do(self, itr):
         i = itr[0]
         j = itr[1]
-        return i, j, self._sp_do(G_gs[i], G_gs[j])
+        return i, j, self._sp_do(G_gs[i], G_gs[j])
+
+
+    def _all_graphs_have_edges(self, graphs):
+        for G in graphs:
+            if nx.number_of_edges(G) == 0:
+                raise ValueError('Not all graphs have edges!!!')
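
The new `fcsp` flag switches between `_sp_do_fcsp` (vertex kernels precomputed into a dictionary, the FCSP trick) and the added `_sp_do_naive` (vertex kernels computed on demand for each matching edge pair). A hedged usage sketch; the toy `node_kernels` dict only mimics the 'symb'/'nsymb'/'mix' contract the class expects, and `graphs` stands for a prepared list of networkx graphs that all have edges, per the new `_all_graphs_have_edges` check:

    import numpy as np
    from gklearn.kernels import ShortestPath

    # Toy kernel functions over label lists / attribute vectors (illustrative only).
    node_kernels = {
        'symb': lambda l1, l2: float(l1 == l2),
        'nsymb': lambda a1, a2: float(np.dot(a1, a2)),
        'mix': lambda l1, l2, a1, a2: float(l1 == l2) * float(np.dot(a1, a2)),
    }

    sp = ShortestPath(node_labels=['atom_symbol'], node_kernels=node_kernels,
                      fcsp=False, ds_infos={'directed': False})
    # gram_matrix, run_time = sp.compute(graphs, parallel=None, verbose=2)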

gklearn/kernels/structural_sp.py (+182, -54)

@@ -5,9 +5,9 @@ Created on Mon Mar 30 11:59:57 2020

 @author: ljia

-@references:
+@references:

-    [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
+    [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
     Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360).
 """
 import sys
@@ -23,7 +23,7 @@ from gklearn.kernels import GraphKernel


 class StructuralSP(GraphKernel):

     def __init__(self, **kwargs):
         GraphKernel.__init__(self)
         self._node_labels = kwargs.get('node_labels', [])
@@ -34,6 +34,7 @@ class StructuralSP(GraphKernel):
         self._node_kernels = kwargs.get('node_kernels', None)
         self._edge_kernels = kwargs.get('edge_kernels', None)
         self._compute_method = kwargs.get('compute_method', 'naive')
+        self._fcsp = kwargs.get('fcsp', True)
         self._ds_infos = kwargs.get('ds_infos', {})


@@ -50,10 +51,10 @@ class StructuralSP(GraphKernel):
         else:
             for g in iterator:
                 splist.append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))

         # compute Gram matrix.
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

         from itertools import combinations_with_replacement
         itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
         if self._verbose >= 2:
@@ -72,10 +73,10 @@ class StructuralSP(GraphKernel):
 #                print("error here ")
             gram_matrix[i][j] = kernel
             gram_matrix[j][i] = kernel

         return gram_matrix


     def _compute_gm_imap_unordered(self):
         # get shortest paths of each graph in the graphs.
         splist = [None] * len(self._graphs)
@@ -87,9 +88,9 @@ class StructuralSP(GraphKernel):
         chunksize = 100
         # get shortest path graphs of self._graphs
         if self._compute_method == 'trie':
-            get_sps_fun = self._wrapper_get_sps_trie
+            get_sps_fun = self._wrapper_get_sps_trie
         else:
-            get_sps_fun = self._wrapper_get_sps_naive
+            get_sps_fun = self._wrapper_get_sps_naive
         if self.verbose >= 2:
             iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize),
                             desc='getting shortest paths', file=sys.stdout)
@@ -99,24 +100,24 @@ class StructuralSP(GraphKernel):
             splist[i] = sp
         pool.close()
         pool.join()

         # compute Gram matrix.
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

         def init_worker(spl_toshare, gs_toshare):
             global G_spl, G_gs
             G_spl = spl_toshare
-            G_gs = gs_toshare
-        if self._compute_method == 'trie':
+            G_gs = gs_toshare
+        if self._compute_method == 'trie':
             do_fun = self._wrapper_ssp_do_trie
-        else:
-            do_fun = self._wrapper_ssp_do_naive
-        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
+        else:
+            do_fun = self._wrapper_ssp_do_naive
+        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                     glbv=(splist, self._graphs), n_jobs=self._n_jobs, verbose=self._verbose)

         return gram_matrix


     def _compute_kernel_list_series(self, g1, g_list):
         # get shortest paths of g1 and each graph in g_list.
         sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed'])
@@ -131,7 +132,7 @@ class StructuralSP(GraphKernel):
         else:
             for g in iterator:
                 splist.append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))

         # compute kernel list.
         kernel_list = [None] * len(g_list)
         if self._verbose >= 2:
@@ -146,10 +147,10 @@ class StructuralSP(GraphKernel):
         for i in iterator:
             kernel = self._ssp_do_naive(g1, g_list[i], sp1, splist[i])
             kernel_list[i] = kernel

         return kernel_list


     def _compute_kernel_list_imap_unordered(self, g1, g_list):
         # get shortest paths of g1 and each graph in g_list.
         sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed'])
@@ -162,9 +163,9 @@ class StructuralSP(GraphKernel):
         chunksize = 100
         # get shortest path graphs of g_list
         if self._compute_method == 'trie':
-            get_sps_fun = self._wrapper_get_sps_trie
+            get_sps_fun = self._wrapper_get_sps_trie
         else:
-            get_sps_fun = self._wrapper_get_sps_naive
+            get_sps_fun = self._wrapper_get_sps_naive
         if self.verbose >= 2:
             iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize),
                             desc='getting shortest paths', file=sys.stdout)
@@ -174,7 +175,7 @@ class StructuralSP(GraphKernel):
             splist[i] = sp
         pool.close()
         pool.join()

         # compute Gram matrix.
         kernel_list = [None] * len(g_list)

@@ -182,27 +183,27 @@ class StructuralSP(GraphKernel):
             global G_sp1, G_spl, G_g1, G_gl
             G_sp1 = sp1_toshare
             G_spl = spl_toshare
-            G_g1 = g1_toshare
-            G_gl = gl_toshare
-        if self._compute_method == 'trie':
+            G_g1 = g1_toshare
+            G_gl = gl_toshare
+        if self._compute_method == 'trie':
             do_fun = self._wrapper_ssp_do_trie
-        else:
+        else:
             do_fun = self._wrapper_kernel_list_do
-        def func_assign(result, var_to_assign):
+        def func_assign(result, var_to_assign):
             var_to_assign[result[0]] = result[1]
         itr = range(len(g_list))
         len_itr = len(g_list)
         parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                     init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)

         return kernel_list


     def _wrapper_kernel_list_do(self, itr):
         return itr, self._ssp_do_naive(G_g1, G_gl[itr], G_sp1, G_spl[itr])


     def _compute_single_kernel_series(self, g1, g2):
         sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed'])
         sp2 = get_shortest_paths(g2, self._edge_weight, self._ds_infos['directed'])
@@ -210,26 +211,33 @@ class StructuralSP(GraphKernel):
             kernel = self._ssp_do_trie(g1, g2, sp1, sp2)
         else:
             kernel = self._ssp_do_naive(g1, g2, sp1, sp2)
-        return kernel
+        return kernel


     def _wrapper_get_sps_naive(self, itr_item):
         g = itr_item[0]
         i = itr_item[1]
         return i, get_shortest_paths(g, self._edge_weight, self._ds_infos['directed'])


     def _ssp_do_naive(self, g1, g2, spl1, spl2):
+        if self._fcsp: # @todo: it may be put outside the _sp_do().
+            return self._sp_do_naive_fcsp(g1, g2, spl1, spl2)
+        else:
+            return self._sp_do_naive_naive(g1, g2, spl1, spl2)
+
+
+    def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2):
+
         kernel = 0

         # First, compute shortest path matrices, method borrowed from FCSP.
         vk_dict = self._get_all_node_kernels(g1, g2)

         # Then, compute kernels between all pairs of edges, which is an idea of
         # extension of FCSP. It suits sparse graphs, which is the most case we
         # went though. For dense graphs, this would be slow.
         ek_dict = self._get_all_edge_kernels(g1, g2)

         # compute graph kernels
         if vk_dict:
             if ek_dict:
@@ -244,6 +252,7 @@ class StructuralSP(GraphKernel):
                             if not kpath:
                                 break
                         kernel += kpath # add up kernels of all paths
+#                    print(kernel, ',', p1, ',', p2)
             else:
                 for p1, p2 in product(spl1, spl2):
                     if len(p1) == len(p2):
@@ -279,7 +288,7 @@ class StructuralSP(GraphKernel):
             print(g1.nodes(data=True))
             print(g1.edges(data=True))
             raise Exception

 #        # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
 #        # compute vertex kernel matrix
 #        try:
@@ -292,7 +301,7 @@ class StructuralSP(GraphKernel):
 #                    vk_mat[i1][i2] = kn(
 #                        n1[1][node_label], n2[1][node_label],
 #                        [n1[1]['attributes']], [n2[1]['attributes']])

 #        range1 = range(0, len(edge_w_g[i]))
 #        range2 = range(0, len(edge_w_g[j]))
 #        for i1 in range1:
@@ -309,18 +318,137 @@ class StructuralSP(GraphKernel):
 #                kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
 #                Kmatrix += kn1 + kn2

         return kernel


+    def _sp_do_naive_naive(self, g1, g2, spl1, spl2):
+
+        kernel = 0
+
+        # Define the function to compute kernels between vertices in each condition.
+        if len(self._node_labels) > 0:
+            # node symb and non-synb labeled
+            if len(self._node_attrs) > 0:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['mix']
+                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
+            # node symb labeled
+            else:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['symb']
+                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+                    return kn(n1_labels, n2_labels)
+        else:
+            # node non-synb labeled
+            if len(self._node_attrs) > 0:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['nsymb']
+                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+                    return kn(n1_attrs, n2_attrs)
+            # # node unlabeled
+            # else:
+            #     for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+            #         if e1[2]['cost'] == e2[2]['cost']:
+            #             kernel += 1
+            #     return kernel
+
+        # Define the function to compute kernels between edges in each condition.
+        if len(self._edge_labels) > 0:
+            # edge symb and non-synb labeled
+            if len(self._edge_attrs) > 0:
+                def compute_ek(e1, e2):
+                    ke = self._edge_kernels['mix']
+                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
+                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
+                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
+                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
+                    return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
+            # edge symb labeled
+            else:
+                def compute_ek(e1, e2):
+                    ke = self._edge_kernels['symb']
+                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
+                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
+                    return ke(e1_labels, e2_labels)
+        else:
+            # edge non-synb labeled
+            if len(self._edge_attrs) > 0:
+                def compute_ek(e1, e2):
+                    ke = self._edge_kernels['nsymb']
+                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
+                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
+                    return ke(e1_attrs, e2_attrs)
+
+        # compute graph kernels
+        if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
+            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
+                for p1, p2 in product(spl1, spl2):
+                    if len(p1) == len(p2):
+                        kpath = compute_vk(p1[0], p2[0])
+                        if kpath:
+                            for idx in range(1, len(p1)):
+                                kpath *= compute_vk(p1[idx], p2[idx]) * \
+                                    compute_ek((p1[idx-1], p1[idx]),
+                                               (p2[idx-1], p2[idx]))
+                                if not kpath:
+                                    break
+                            kernel += kpath # add up kernels of all paths
+#                        print(kernel, ',', p1, ',', p2)
+            else:
+                for p1, p2 in product(spl1, spl2):
+                    if len(p1) == len(p2):
+                        kpath = compute_vk(p1[0], p2[0])
+                        if kpath:
+                            for idx in range(1, len(p1)):
+                                kpath *= compute_vk(p1[idx], p2[idx])
+                                if not kpath:
+                                    break
+                            kernel += kpath # add up kernels of all paths
+        else:
+            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
+                for p1, p2 in product(spl1, spl2):
+                    if len(p1) == len(p2):
+                        if len(p1) == 0:
+                            kernel += 1
+                        else:
+                            kpath = 1
+                            for idx in range(0, len(p1) - 1):
+                                kpath *= compute_ek((p1[idx], p1[idx+1]),
+                                                    (p2[idx], p2[idx+1]))
+                                if not kpath:
+                                    break
+                            kernel += kpath # add up kernels of all paths
+            else:
+                for p1, p2 in product(spl1, spl2):
+                    if len(p1) == len(p2):
+                        kernel += 1
+
+        try:
+            kernel = kernel / (len(spl1) * len(spl2)) # Compute mean average
+        except ZeroDivisionError:
+            print(spl1, spl2)
+            print(g1.nodes(data=True))
+            print(g1.edges(data=True))
+            raise Exception
+
+        return kernel


     def _wrapper_ssp_do_naive(self, itr):
         i = itr[0]
         j = itr[1]
         return i, j, self._ssp_do_naive(G_gs[i], G_gs[j], G_spl[i], G_spl[j])


     def _get_all_node_kernels(self, g1, g2):
         return compute_vertex_kernels(g1, g2, self._node_kernels, node_labels=self._node_labels, node_attrs=self._node_attrs)


     def _get_all_edge_kernels(self, g1, g2):
         # compute kernels between all pairs of edges, which is an idea of
         # extension of FCSP. It suits sparse graphs, which is the most case we
@@ -368,5 +496,5 @@ class StructuralSP(GraphKernel):
         # edge unlabeled
         else:
             pass

-        return ek_dict
+        return ek_dict
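
StructuralSP gets the same `fcsp` switch: `_ssp_do_naive` now dispatches to `_sp_do_naive_fcsp` (precomputed vertex and edge kernel dictionaries) or the added `_sp_do_naive_naive`, which builds `compute_vk`/`compute_ek` closures and evaluates them on demand along each pair of equal-length shortest paths. A sketch, reusing the toy `node_kernels` dict from the previous example; `edge_kernels` follows the same 'symb'/'nsymb'/'mix' contract (see the docstring in structuralspKernel.py below):

    from gklearn.kernels import StructuralSP

    ssp = StructuralSP(node_labels=['atom_symbol'], edge_labels=['bond_type'],
                       node_kernels=node_kernels, edge_kernels=node_kernels,
                       compute_method='naive', fcsp=False,
                       ds_infos={'directed': False})
    # kernel, run_time = ssp.compute(g1, g2, verbose=0)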

+ 97
- 97
gklearn/kernels/structuralspKernel.py View File

@@ -5,9 +5,9 @@ Created on Thu Sep 27 10:56:23 2018


@author: linlin @author: linlin


@references:
@references:


[1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
[1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360).
""" """


@@ -43,7 +43,7 @@ def structuralspkernel(*args,
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.

@@ -51,25 +51,25 @@ def structuralspkernel(*args,
        Node attribute used as label. The default node label is atom.

    edge_weight : string
        Edge attribute name corresponding to the edge weight. Applied for the
        computation of the shortest paths.

    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.

    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
        'mix' for both. The first two functions take two node labels as
        parameters, and the 'mix' function takes four: a symbolic and a
        non-symbolic label for each of the two nodes. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.

    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, and
        'mix' for both. The first two functions take two edge labels as
        parameters, and the 'mix' function takes four: a symbolic and a
        non-symbolic label for each of the two edges. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when edges are unlabeled.

@@ -89,7 +89,7 @@ def structuralspkernel(*args,
    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the average structural
        shortest path kernel between two graphs.
    """
    # pre-process
@@ -135,9 +135,9 @@ def structuralspkernel(*args,
            chunksize = 100
        # get shortest path graphs of Gn
        if compute_method == 'trie':
            getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed'])
        else:
            getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting shortest paths', file=sys.stdout)
@@ -161,17 +161,17 @@ def structuralspkernel(*args,
    else:
        for g in iterator:
            splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))
#    ss = 0
#    ss += sys.getsizeof(splist)
#    for spss in splist:
#        ss += sys.getsizeof(spss)
#        for spp in spss:
#            ss += sys.getsizeof(spp)
#    time.sleep(20)


# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
@@ -194,21 +194,21 @@ def structuralspkernel(*args,

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    if parallel == 'imap_unordered':
        def init_worker(spl_toshare, gs_toshare):
            global G_spl, G_gs
            G_spl = spl_toshare
            G_gs = gs_toshare
        if compute_method == 'trie':
            do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label,
                                 node_kernels, edge_kernels)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
        else:
            do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
                                 node_kernels, edge_kernels)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
    # ---- direct running, normally use single CPU core. ----
    elif parallel is None:
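The init_worker pattern above is how the Gram-matrix computation shares large read-only objects (the graphs and their shortest-path lists) with worker processes without re-pickling them for every pair. A stripped-down sketch of the same idea using only the standard library (all names here are illustrative, not the parallel_gm API):

    from itertools import combinations_with_replacement
    from multiprocessing import Pool

    def _init_worker(spl_toshare, gs_toshare):
        # store the shared data in module-level globals of each worker process
        global G_spl, G_gs
        G_spl = spl_toshare
        G_gs = gs_toshare

    def _do_pair(ij):
        i, j = ij
        # a real worker would compute the kernel between G_gs[i] and G_gs[j]
        # using the cached shortest paths in G_spl; 0.0 is a placeholder.
        return i, j, 0.0

    def gram(Gn, splist, n_jobs=4):
        itr = combinations_with_replacement(range(len(Gn)), 2)
        with Pool(n_jobs, initializer=_init_worker, initargs=(splist, Gn)) as pool:
            return list(pool.imap_unordered(_do_pair, itr))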
@@ -232,10 +232,10 @@ def structuralspkernel(*args,
#                print("error here ")
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel

    # # ---- use pool.map to parallelize. ----
    # pool = Pool(n_jobs)
    # do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
    #                      node_kernels, edge_kernels)
    # itr = zip(combinations_with_replacement(Gn, 2),
    #           combinations_with_replacement(splist, 2),
@@ -249,7 +249,7 @@ def structuralspkernel(*args,
    # pool.join()

    # # ---- use pool.imap_unordered to parallelize and track progress. ----
    # do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
    #                      node_kernels, edge_kernels)
    # itr = zip(combinations_with_replacement(Gn, 2),
    #           combinations_with_replacement(splist, 2),
@@ -282,7 +282,7 @@ def structuralspkernel(*args,

def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
                          node_kernels, edge_kernels):

    kernel = 0

    # First, compute shortest path matrices, method borrowed from FCSP.
@@ -373,25 +373,25 @@ def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
    return kernel


def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
                   edge_kernels, itr):
    i = itr[0]
    j = itr[1]
    return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
                                       ds_attrs, node_label, edge_label,
                                       node_kernels, edge_kernels)


def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
                node_kernels, edge_kernels):
#    # traverse all paths in graph1. Depth-first search is applied.
#    def traverseBothTrie(root, trie2, kernel, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                traverseTrie2(trie2.root, pcurrent, kernel,
#                              pcurrent=[])
#            if node['children'] != {}:
#                traverseBothTrie(node, trie2, kernel, pcurrent)
@@ -399,14 +399,14 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
#
#
#    # traverse all paths in graph2 and find out those that are not in
#    # graph1. Depth-first search is applied.
#    def traverseTrie2(root, p1, kernel, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
#            if node['children'] != {}:
@@ -415,8 +415,8 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
#
#
#    kernel = [0]
#
#    # First, compute shortest path matrices, method borrowed from FCSP.
@@ -437,7 +437,7 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
#                              pcurrent=[])
#            if node['children'] != {}:
#                traverseBothTrie(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
@@ -445,14 +445,14 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
#
#
#    # traverse all paths in graph2 and find out those that are not in
#    # graph1. Depth-first search is applied.
#    def traverseTrie2(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
#        for key, node in root['children'].items():
#            pcurrent.append(key)
#            if node['isEndOfWord']:
#                # print(node['count'])
#                kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
#            if node['children'] != {}:
@@ -461,8 +461,8 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
#                del pcurrent[-1]
#        if pcurrent != []:
#            del pcurrent[-1]
    kernel = [0]

    # First, compute shortest path matrices, method borrowed from FCSP.
@@ -483,20 +483,20 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
            if ek_dict:
                traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
            else:
                traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)

    kernel = kernel[0] / (trie1[1] * trie2[1])  # compute the average over all pairs of paths

    return kernel


def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels,
                        edge_kernels, itr):
    i = itr[0]
    j = itr[1]
    return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs,
                             node_label, edge_label, node_kernels, edge_kernels)
def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):
    # compute shortest path matrices, method borrowed from FCSP.
@@ -528,7 +528,7 @@ def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):
        # node unlabeled
        else:
            pass

    return vk_dict




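The vk_dict returned above is the node-side FCSP table: kernel values between every pair of nodes of the two graphs, keyed by node pairs. A hypothetical sketch for the 'mix' case (symbolic labels plus continuous attributes; the argument order follows the docstring earlier in this file, but the helper name and defaults are illustrative):

    import numpy as np

    def node_kernel_table(g1, g2, kn_mix, node_label='atom', node_attrs=('x', 'y')):
        vk_dict = {}  # (n1, n2) -> kernel value
        for n1, d1 in g1.nodes(data=True):
            for n2, d2 in g2.nodes(data=True):
                # attributes are passed as 2-D arrays (n_samples, n_features)
                a1 = np.array([d1[a] for a in node_attrs], ndmin=2)
                a2 = np.array([d2[a] for a in node_attrs], ndmin=2)
                vk_dict[(n1, n2)] = kn_mix(d1[node_label], d2[node_label], a1, a2)
        return vk_dict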
@@ -573,17 +573,17 @@ def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):
        # edge unlabeled
        else:
            pass

    return ek_dict
# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
@@ -591,14 +591,14 @@ def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and find out those that are not in
# graph1. Depth-first search is applied.
def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                kpath = vk_dict[(p1[0], pcurrent[0])]
@@ -616,7 +616,7 @@ def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
@@ -624,7 +624,7 @@ def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
@@ -632,14 +632,14 @@ def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and find out those that are not in
# graph1. Depth-first search is applied.
def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                kpath = vk_dict[(p1[0], pcurrent[0])]
@@ -655,15 +655,15 @@ def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
@@ -671,14 +671,14 @@ def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and find out those that are not in
# graph1. Depth-first search is applied.
def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                if len(p1) == 0:
@@ -697,15 +697,15 @@ def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
@@ -713,14 +713,14 @@ def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2 and find out those that are not in
# graph1. Depth-first search is applied.
def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            # print(node['count'])
            if len(p1) == len(pcurrent):
                kernel[0] += 1
@@ -730,8 +730,8 @@ def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]
#def computePathKernel(p1, p2, vk_dict, ek_dict):
#    kernel = 0
#    if vk_dict:
@@ -771,7 +771,7 @@ def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
#        else:
#            if len(p1) == len(p2):
#                kernel += 1
#
#    return kernel




@@ -804,7 +804,7 @@ def get_shortest_paths(G, weight, directed):
    # each edge walk is counted twice, starting from both its extreme nodes.
    if not directed:
        sp += [sptemp[::-1] for sptemp in spltemp]

    # add single nodes as length-0 paths.
    sp += [[n] for n in G.nodes()]
    return sp
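The behaviour described in this hunk can be sketched in isolation as follows. This is a minimal, undirected-only sketch using NetworkX; gklearn's real get_shortest_paths additionally handles edge weights, directed graphs, and parallel computation:

    from itertools import combinations
    import networkx as nx

    def get_shortest_paths_sketch(G, weight=None):
        sp = []
        for n1, n2 in combinations(G.nodes(), 2):
            try:
                p = nx.shortest_path(G, n1, n2, weight=weight)
            except nx.NetworkXNoPath:
                continue
            sp.append(p)
            # each walk is counted twice, once from each of its end nodes.
            sp.append(p[::-1])
        # add single nodes as length-0 paths.
        sp += [[n] for n in G.nodes()]
        return sp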
@@ -849,7 +849,7 @@ def get_sps_as_trie(G, weight, directed):
    # each edge walk is counted twice, starting from both its extreme nodes.
    if not directed:
        sptrie.insertWord(sp[::-1])

    # add single nodes as length-0 paths.
    for n in G.nodes():
        sptrie.insertWord([n])
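The trie variant stores the same path sets compactly. The traversal code earlier in this file accesses trie nodes as plain dicts with 'children' and 'isEndOfWord' keys, and ssp_do_trie indexes trie1[0]/trie1[1], suggesting a (trie, path_count) tuple. A hypothetical minimal Trie consistent with that usage:

    class Trie:
        def __init__(self):
            self.root = {'children': {}, 'isEndOfWord': False}

        def insertWord(self, word):
            # walk down the children dicts, creating nodes as needed,
            # and mark the last node as the end of a stored path.
            node = self.root
            for key in word:
                node = node['children'].setdefault(
                    key, {'children': {}, 'isEndOfWord': False})
            node['isEndOfWord'] = True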


+ 90
- 52
gklearn/tests/test_graph_kernels.py View File

@@ -3,13 +3,14 @@

import pytest
import multiprocessing
import numpy as np


def chooseDataset(ds_name):
    """Choose dataset according to name.
    """
    from gklearn.utils import Dataset

    dataset = Dataset()

    # no node labels (and no edge labels).
@@ -18,6 +19,7 @@ def chooseDataset(ds_name):
        dataset.trim_dataset(edge_required=False)
        irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
        dataset.remove_labels(**irrelevant_labels)
        dataset.cut_graphs(range(1, 10))
    # node symbolic labels.
    elif ds_name == 'Acyclic':
        dataset.load_predefined_dataset(ds_name)
@@ -46,9 +48,9 @@ def chooseDataset(ds_name):
    elif ds_name == 'Cuneiform':
        dataset.load_predefined_dataset(ds_name)
        dataset.trim_dataset(edge_required=True)
        dataset.cut_graphs(range(0, 3))

    return dataset




@@ -57,7 +59,7 @@ def test_list_graph_kernels():
    """
    from gklearn.kernels import GRAPH_KERNELS, list_of_graph_kernels
    assert list_of_graph_kernels() == [i for i in GRAPH_KERNELS]


@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@@ -68,10 +70,10 @@ def test_CommonWalk(ds_name, parallel, weight, compute_method):
    """
    from gklearn.kernels import CommonWalk
    import networkx as nx

    dataset = chooseDataset(ds_name)
    dataset.load_graphs([g for g in dataset.graphs if nx.number_of_nodes(g) > 1])

    try:
        graph_kernel = CommonWalk(node_labels=dataset.node_labels,
                                  edge_labels=dataset.edge_labels,
@@ -87,8 +89,8 @@ def test_CommonWalk(ds_name, parallel, weight, compute_method):

    except Exception as exception:
        assert False, exception


@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@pytest.mark.parametrize('remove_totters', [False]) #[True, False])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
@@ -96,9 +98,9 @@ def test_Marginalized(ds_name, parallel, remove_totters):
    """Test marginalized kernel.
    """
    from gklearn.kernels import Marginalized

    dataset = chooseDataset(ds_name)

    try:
        graph_kernel = Marginalized(node_labels=dataset.node_labels,
                                    edge_labels=dataset.edge_labels,
@@ -115,15 +117,15 @@ def test_Marginalized(ds_name, parallel, remove_totters):

    except Exception as exception:
        assert False, exception


@pytest.mark.parametrize('ds_name', ['Acyclic'])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_SylvesterEquation(ds_name, parallel):
    """Test sylvester equation kernel.
    """
    from gklearn.kernels import SylvesterEquation

    dataset = chooseDataset(ds_name)

    try:
@@ -139,11 +141,11 @@ def test_SylvesterEquation(ds_name, parallel):
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
        kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

    except Exception as exception:
        assert False, exception


@pytest.mark.parametrize('ds_name', ['Acyclic', 'AIDS'])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_ConjugateGradient(ds_name, parallel):
@@ -152,9 +154,9 @@ def test_ConjugateGradient(ds_name, parallel):
    from gklearn.kernels import ConjugateGradient
    from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
    import functools

    dataset = chooseDataset(ds_name)

    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
@@ -177,11 +179,11 @@ def test_ConjugateGradient(ds_name, parallel):
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
        kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

    except Exception as exception:
        assert False, exception


@pytest.mark.parametrize('ds_name', ['Acyclic', 'AIDS'])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_FixedPoint(ds_name, parallel):
@@ -190,9 +192,9 @@ def test_FixedPoint(ds_name, parallel):
    from gklearn.kernels import FixedPoint
    from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
    import functools

    dataset = chooseDataset(ds_name)

    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
@@ -215,11 +217,11 @@ def test_FixedPoint(ds_name, parallel):
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
        kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

    except Exception as exception:
        assert False, exception


@pytest.mark.parametrize('ds_name', ['Acyclic'])
@pytest.mark.parametrize('sub_kernel', ['exp', 'geo'])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
@@ -227,7 +229,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel, parallel):
    """Test spectral decomposition kernel.
    """
    from gklearn.kernels import SpectralDecomposition

    dataset = chooseDataset(ds_name)

    try:
@@ -244,11 +246,11 @@ def test_SpectralDecomposition(ds_name, sub_kernel, parallel):
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
        kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

    except Exception as exception:
        assert False, exception


# @pytest.mark.parametrize(
#     'compute_method,ds_name,sub_kernel',
#     [
@@ -268,7 +270,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel, parallel):
#     from gklearn.kernels import RandomWalk
#     from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
#     import functools
#
#     dataset = chooseDataset(ds_name)

#     mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
@@ -297,7 +299,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel, parallel):
#     except Exception as exception:
#         assert False, exception

@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint'])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_ShortestPath(ds_name, parallel):
@@ -306,17 +308,30 @@ def test_ShortestPath(ds_name, parallel):
    from gklearn.kernels import ShortestPath
    from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
    import functools

    dataset = chooseDataset(ds_name)

    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
    try:
        graph_kernel = ShortestPath(node_labels=dataset.node_labels,
                                    node_attrs=dataset.node_attrs,
                                    ds_infos=dataset.get_dataset_infos(keys=['directed']),
                                    fcsp=True,
                                    node_kernels=sub_kernels)
        gram_matrix1, run_time = graph_kernel.compute(dataset.graphs,
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
        kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
        kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

        graph_kernel = ShortestPath(node_labels=dataset.node_labels,
                                    node_attrs=dataset.node_attrs,
                                    ds_infos=dataset.get_dataset_infos(keys=['directed']),
                                    fcsp=False,
                                    node_kernels=sub_kernels)
        gram_matrix2, run_time = graph_kernel.compute(dataset.graphs,
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
        kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
@@ -326,6 +341,8 @@ def test_ShortestPath(ds_name, parallel):
    except Exception as exception:
        assert False, exception

    assert np.array_equal(gram_matrix1, gram_matrix2)
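    # (fcsp only changes how node-kernel values are cached, not what is
    # computed, so the fcsp=True and fcsp=False Gram matrices must match
    # exactly.)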



#@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint'])
@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform'])
@@ -336,29 +353,47 @@ def test_StructuralSP(ds_name, parallel):
    from gklearn.kernels import StructuralSP
    from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
    import functools

    dataset = chooseDataset(ds_name)

    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
    try:
        graph_kernel = StructuralSP(node_labels=dataset.node_labels,
                                    edge_labels=dataset.edge_labels,
                                    node_attrs=dataset.node_attrs,
                                    edge_attrs=dataset.edge_attrs,
                                    ds_infos=dataset.get_dataset_infos(keys=['directed']),
                                    fcsp=True,
                                    node_kernels=sub_kernels,
                                    edge_kernels=sub_kernels)
        gram_matrix1, run_time = graph_kernel.compute(dataset.graphs,
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True, normalize=False)
        kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
        kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

        graph_kernel = StructuralSP(node_labels=dataset.node_labels,
                                    edge_labels=dataset.edge_labels,
                                    node_attrs=dataset.node_attrs,
                                    edge_attrs=dataset.edge_attrs,
                                    ds_infos=dataset.get_dataset_infos(keys=['directed']),
                                    fcsp=False,
                                    node_kernels=sub_kernels,
                                    edge_kernels=sub_kernels)
        gram_matrix2, run_time = graph_kernel.compute(dataset.graphs,
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True, normalize=False)
        kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
        kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

    except Exception as exception:
        assert False, exception

    assert np.array_equal(gram_matrix1, gram_matrix2)



@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
@@ -369,9 +404,9 @@ def test_PathUpToH(ds_name, parallel, k_func, compute_method):
    """Test path kernel up to length $h$.
    """
    from gklearn.kernels import PathUpToH

    dataset = chooseDataset(ds_name)

    try:
        graph_kernel = PathUpToH(node_labels=dataset.node_labels,
                                 edge_labels=dataset.edge_labels,
@@ -385,8 +420,8 @@ def test_PathUpToH(ds_name, parallel, k_func, compute_method):
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
    except Exception as exception:
        assert False, exception


@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
@pytest.mark.parametrize('parallel', ['imap_unordered', None])
def test_Treelet(ds_name, parallel):
@@ -395,10 +430,10 @@ def test_Treelet(ds_name, parallel):
    from gklearn.kernels import Treelet
    from gklearn.utils.kernels import polynomialkernel
    import functools

    dataset = chooseDataset(ds_name)

    pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
    try:
        graph_kernel = Treelet(node_labels=dataset.node_labels,
                               edge_labels=dataset.edge_labels,
@@ -412,8 +447,8 @@ def test_Treelet(ds_name, parallel):
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
    except Exception as exception:
        assert False, exception
@pytest.mark.parametrize('ds_name', ['Acyclic'])
#@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge'])
# @pytest.mark.parametrize('base_kernel', ['subtree'])
@@ -422,7 +457,7 @@ def test_WLSubtree(ds_name, parallel):
    """Test Weisfeiler-Lehman subtree kernel.
    """
    from gklearn.kernels import WLSubtree

    dataset = chooseDataset(ds_name)

    try:
@@ -438,12 +473,15 @@ def test_WLSubtree(ds_name, parallel):
            parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
    except Exception as exception:
        assert False, exception


if __name__ == "__main__":
    test_list_graph_kernels()
#    test_spkernel('Alkane', 'imap_unordered')
#    test_ShortestPath('Alkane', 'imap_unordered')
#    test_StructuralSP('Fingerprint_edge', 'imap_unordered')
#    test_StructuralSP('Alkane', None)
#    test_StructuralSP('Cuneiform', None)
#    test_WLSubtree('Acyclic', 'imap_unordered')
#    test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered')
#    test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered')

