diff --git a/gklearn/kernels/common_walk.py b/gklearn/kernels/common_walk.py index ac3363e..ea0e59f 100644 --- a/gklearn/kernels/common_walk.py +++ b/gklearn/kernels/common_walk.py @@ -47,7 +47,7 @@ class CommonWalk(GraphKernel): itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, - length=len_itr, verbose=(self._verbose >= 2)) + length=len_itr, verbose=(self.verbose >= 2)) # direct product graph method - exponential if self._compute_method == 'exp': @@ -86,7 +86,7 @@ class CommonWalk(GraphKernel): do_fun = self._wrapper_kernel_do_geo parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -100,9 +100,9 @@ class CommonWalk(GraphKernel): # compute kernel list. kernel_list = [None] * len(g_list) - if self._verbose >= 2: + if self.verbose >= 2: iterator = get_iters(range(len(g_list)), desc='Computing kernels', - file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) else: iterator = range(len(g_list)) @@ -148,7 +148,7 @@ class CommonWalk(GraphKernel): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/conjugate_gradient.py b/gklearn/kernels/conjugate_gradient.py index b162f20..eb5e428 100644 --- a/gklearn/kernels/conjugate_gradient.py +++ b/gklearn/kernels/conjugate_gradient.py @@ -35,7 +35,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_gm_series(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) lmda = self._weight @@ -44,7 +44,7 @@ class ConjugateGradient(RandomWalkMeta): gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) # Reindex nodes using consecutive integers for the convenience of kernel computation. - iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. 
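Every serial `_compute_gm_series` in this patch fills the Gram matrix the same way: iterate `combinations_with_replacement` over the upper triangle (hence `len_itr = n * (n + 1) / 2` evaluations) and mirror each value across the diagonal. A standalone sketch of that pattern, with a placeholder `pairwise` function standing in for the per-kernel `_kernel_do`:

import numpy as np
from itertools import combinations_with_replacement

def fill_gram(graphs, pairwise):
    # Only n*(n+1)/2 kernel evaluations; symmetry fills the lower triangle.
    n = len(graphs)
    gram = np.zeros((n, n))
    for i, j in combinations_with_replacement(range(n), 2):
        gram[i, j] = gram[j, i] = pairwise(graphs[i], graphs[j])
    return gram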
@@ -52,7 +52,7 @@ class ConjugateGradient(RandomWalkMeta): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) - iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) @@ -66,7 +66,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) # Compute Gram matrix. @@ -74,7 +74,7 @@ class ConjugateGradient(RandomWalkMeta): # @todo: parallel this. # Reindex nodes using consecutive integers for the convenience of kernel computation. - iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. @@ -86,7 +86,7 @@ class ConjugateGradient(RandomWalkMeta): do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) else: # @todo pass @@ -95,7 +95,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) lmda = self._weight @@ -105,11 +105,11 @@ class ConjugateGradient(RandomWalkMeta): # Reindex nodes using consecutive integers for the convenience of kernel computation. g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') - iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(g1, g_list[i], lmda) @@ -122,7 +122,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) # compute kernel list. @@ -131,7 +131,7 @@ class ConjugateGradient(RandomWalkMeta): # Reindex nodes using consecutive integers for the convenience of kernel computation. g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') # @todo: parallel this. 
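For background on this file: the conjugate-gradient variant computes the generalized random-walk kernel k(G1, G2) = q^T (I - lambda * W)^{-1} p on the direct product graph by solving the linear system iteratively rather than inverting it (Vishwanathan et al., "Graph Kernels", JMLR 2010). A hedged dense sketch with uniform p and q; `w_prod`, the product-graph weight matrix, is an assumed input and taken to be symmetric, and this is not the library's own `_kernel_do`:

import numpy as np
from scipy.sparse.linalg import cg

def rw_kernel_cg(w_prod, lmda):
    # Solve (I - lmda * W) x = p by conjugate gradient, then return q^T x.
    n = w_prod.shape[0]
    p = np.full(n, 1.0 / n)            # uniform starting distribution
    x, info = cg(np.eye(n) - lmda * w_prod, p)
    assert info == 0, 'CG did not converge'
    return np.full(n, 1.0 / n) @ x     # uniform stopping distribution q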
- iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. @@ -149,7 +149,7 @@ class ConjugateGradient(RandomWalkMeta): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) else: # @todo pass @@ -162,7 +162,7 @@ class ConjugateGradient(RandomWalkMeta): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2], self._verbose) + self._check_edge_weight([g1] + [g2], self.verbose) self._check_graphs([g1] + [g2]) lmda = self._weight diff --git a/gklearn/kernels/fixed_point.py b/gklearn/kernels/fixed_point.py index 12d8fe7..ced5430 100644 --- a/gklearn/kernels/fixed_point.py +++ b/gklearn/kernels/fixed_point.py @@ -35,7 +35,7 @@ class FixedPoint(RandomWalkMeta): def _compute_gm_series(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) lmda = self._weight @@ -44,7 +44,7 @@ class FixedPoint(RandomWalkMeta): gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) # Reindex nodes using consecutive integers for the convenience of kernel computation. - iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,verbose=(self.verbose >= 2)) self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. @@ -52,7 +52,7 @@ class FixedPoint(RandomWalkMeta): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) - iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda) @@ -66,7 +66,7 @@ class FixedPoint(RandomWalkMeta): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) # Compute Gram matrix. @@ -74,7 +74,7 @@ class FixedPoint(RandomWalkMeta): # @todo: parallel this. # Reindex nodes using consecutive integers for the convenience of kernel computation. - iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. 
@@ -86,7 +86,7 @@ class FixedPoint(RandomWalkMeta): do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) else: # @todo pass @@ -95,7 +95,7 @@ class FixedPoint(RandomWalkMeta): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) lmda = self._weight @@ -105,12 +105,12 @@ class FixedPoint(RandomWalkMeta): # Reindex nodes using consecutive integers for the convenience of kernel computation. g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') - iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(g1, g_list[i], lmda) @@ -123,7 +123,7 @@ class FixedPoint(RandomWalkMeta): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) # compute kernel list. @@ -132,7 +132,7 @@ class FixedPoint(RandomWalkMeta): # Reindex nodes using consecutive integers for the convenience of kernel computation. g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') # @todo: parallel this. - iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] if self._p is None and self._q is None: # p and q are uniform distributions as default. 
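The fixed-point file being refactored here solves the same linear system as the conjugate-gradient one, but by direct iteration: x <- p + lambda * W x until the update stalls, which converges when lambda * ||W|| < 1. A hedged sketch, illustrative rather than the library's `_kernel_do`:

import numpy as np

def rw_kernel_fixed_point(w_prod, lmda, tol=1e-9, max_iter=1000):
    n = w_prod.shape[0]
    p = np.full(n, 1.0 / n)
    x = p.copy()
    for _ in range(max_iter):
        x_next = p + lmda * (w_prod @ x)        # fixed-point update
        if np.linalg.norm(x_next - x, np.inf) < tol:
            x = x_next
            break
        x = x_next
    return np.full(n, 1.0 / n) @ x              # q^T x with uniform q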
@@ -150,7 +150,7 @@ class FixedPoint(RandomWalkMeta): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) else: # @todo pass @@ -163,7 +163,7 @@ class FixedPoint(RandomWalkMeta): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2], self._verbose) + self._check_edge_weight([g1] + [g2], self.verbose) self._check_graphs([g1] + [g2]) lmda = self._weight diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index 2692713..90a0906 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -9,27 +9,372 @@ import numpy as np import networkx as nx import multiprocessing import time +# from abc import ABC, abstractmethod +from sklearn.base import BaseEstimator # , TransformerMixin +from sklearn.utils.validation import check_is_fitted # check_X_y, check_array, +from sklearn.exceptions import NotFittedError from gklearn.utils import normalize_gram_matrix -class GraphKernel(object): +class GraphKernel(BaseEstimator): #, ABC): + """The basic graph kernel class. - def __init__(self): - self._graphs = None - self._parallel = '' - self._n_jobs = 0 - self._verbose = None - self._normalize = True - self._run_time = 0 - self._gram_matrix = None - self._gram_matrix_unnorm = None + Attributes + ---------- + _graphs : list + Stores the input graphs passed to `fit`. + Default format of the list objects is `NetworkX` graphs. + **We don't guarantee that the input graphs remain unchanged during the + computation.** + + References + ---------- + https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. + """ + + def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2): + """`__init__` for `GraphKernel` object.""" + # @todo: the default settings of the parameters are different from those in the self.compute method. +# self._graphs = None + self.parallel = parallel + self.n_jobs = n_jobs + self.chunksize = chunksize + self.normalize = normalize + self.verbose = verbose +# self._run_time = 0 +# self._gram_matrix = None +# self._gram_matrix_unnorm = None + + + ########################################################################## + # The following is the 1st paradigm to compute kernel matrix, which is + # compatible with `scikit-learn`. + # ------------------------------------------------------------------- + # Special thanks to the "GraKeL" library for providing an excellent template! + ########################################################################## + + + def fit(self, X, y=None): + """Fit a graph dataset for a transformer. + + Parameters + ---------- + X : iterable + The input graphs. + + y : None, optional + There is no need for a target in a transformer, yet the `scikit-learn` + pipeline API requires this parameter. + + Returns + ------- + object + Returns self. + + """ +# self._is_transformed = False + + # Clear any prior attributes stored on the estimator. # @todo: unless warm_start is used. + self.clear_attributes() + +# X = check_array(X, accept_sparse=True) + + # Validate parameters for the transformer. + self.validate_parameters() + + # Validate the input. + self._graphs = self.validate_input(X) + +# self._X = X +# self._kernel = self._get_kernel_instance() + + # Return the transformer.
+ return self + + + def transform(self, X): + """Compute the graph kernel matrix between given and fitted data. + + Parameters + ---------- + X : list of graphs + The target graphs. + + Raises + ------ + ValueError + If the input graphs are invalid. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix between X and the fitted graphs. + + """ + # Check if method "fit" had been called. + check_is_fitted(self, '_graphs') + + # Validate the input. + Y = self.validate_input(X) + + # Transform: compute the graph kernel matrix. + kernel_matrix = self.compute_kernel_matrix(Y) + self._Y = Y + + # The transformed flag must be set before the diagonal call for normalization. + self._is_transformed = True + if self.normalize: + X_diag, Y_diag = self.diagonals() + kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) + + return kernel_matrix + + + + def fit_transform(self, X): + """Fit and transform: compute Gram matrix on the same data. + + Parameters + ---------- + X : list of graphs + Input graphs. + + Returns + ------- + gram_matrix : numpy array, shape = [len(X), len(X)] + The Gram matrix of X. + + """ + self.fit(X) + + # Transform: compute Gram matrix. + gram_matrix = self.compute_kernel_matrix() + + # Normalize. + self._X_diag = np.diagonal(gram_matrix).copy() + if self.normalize: + gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) + + return gram_matrix + + +# `get_params` and `set_params` are inherited from `BaseEstimator`; +# overriding them with bare stubs would break cloning and grid search. + + + def clear_attributes(self): + if hasattr(self, '_X_diag'): + delattr(self, '_X_diag') + if hasattr(self, '_graphs'): + delattr(self, '_graphs') + if hasattr(self, '_Y'): + delattr(self, '_Y') + if hasattr(self, '_run_time'): + delattr(self, '_run_time') + + + def validate_parameters(self): + """Validate all parameters for the transformer. + + Returns + ------- + None. + + """ + if self.parallel is not None and self.parallel != 'imap_unordered': + raise ValueError('Parallel mode is not set correctly.') + + if self.parallel == 'imap_unordered' and self.n_jobs is None: + self.n_jobs = multiprocessing.cpu_count() + + + def validate_input(self, X): + """Validate the given input and raise errors if it is invalid. + + Parameters + ---------- + X : list + The input to check. Should be a list of graphs. + + Raises + ------ + ValueError + Raised if the input is not correct. + + Returns + ------- + X : list + The input. A list of graphs. + + """ + if X is None: + raise ValueError('Please add graphs before computing.') + elif not isinstance(X, list): + raise ValueError('Cannot detect graphs.') + elif len(X) == 0: + raise ValueError('The graph list given is empty. No computation will be performed.') + + return X + + + def compute_kernel_matrix(self, Y=None): + """Compute the kernel matrix between the given target graphs (Y) and + the fitted graphs (X / self._graphs) or the Gram matrix for the fitted + graphs (X / self._graphs). + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. The default is None. If None, the kernel is computed + between X and itself. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + if Y is None: + # Compute Gram matrix for self._graphs (X). + kernel_matrix = self._compute_gram_matrix() +# self._gram_matrix_unnorm = np.copy(self._gram_matrix) + + else: + # Compute kernel matrix between Y and self._graphs (X).
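With `fit`, `transform` and `fit_transform` in place, the class follows the standard `scikit-learn` estimator protocol, and the division in `transform` is the usual cosine normalization k(x, y) / sqrt(k(x, x) * k(y, y)) built from the two diagonals. A hedged usage sketch of the new paradigm; the `Treelet` subclass is refactored later in this patch, `gaussiankernel` is assumed to come from `gklearn.utils.kernels`, and the toy graphs and parameter values are illustrative only:

import networkx as nx
from gklearn.kernels import Treelet
from gklearn.utils.kernels import gaussiankernel  # assumed sub-kernel

X = [nx.path_graph(4), nx.cycle_graph(5), nx.star_graph(3)]
Y = [nx.path_graph(3)]

gk = Treelet(sub_kernel=gaussiankernel, ds_infos={'directed': False}, verbose=0)
K_xx = gk.fit_transform(X)  # (3, 3) cosine-normalized Gram matrix of X
K_yx = gk.transform(Y)      # (1, 3) kernel matrix between Y and the fitted X

And since this paradigm funnels into `pairwise_kernel` (defined just below), the smallest subclass that supports `transform` only has to override that one method; `fit_transform` additionally needs the Gram-matrix machinery of the 2nd paradigm. A toy vertex-histogram kernel, purely illustrative and not part of this patch:

from collections import Counter
from gklearn.kernels import GraphKernel

class VertexHistogram(GraphKernel):
    def pairwise_kernel(self, x, y):
        # Dot product of node-label histograms; assumes a 'label' attribute.
        cx = Counter(lbl for _, lbl in x.nodes(data='label'))
        cy = Counter(lbl for _, lbl in y.nodes(data='label'))
        return float(sum(cx[k] * cy[k] for k in cx.keys() & cy.keys()))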
+ start_time = time.time() + + if self.parallel == 'imap_unordered': + kernel_matrix = self._compute_kernel_matrix_imap_unordered(Y) + + elif self.parallel is None: + kernel_matrix = self._compute_kernel_matrix_series(Y) + + self._run_time = time.time() - start_time + if self.verbose: + print('Kernel matrix of size (%d, %d) built in %s seconds.' + % (len(Y), len(self._graphs), self._run_time)) + + return kernel_matrix + + + def _compute_kernel_matrix_series(self, Y): + """Compute the kernel matrix between the given target graphs (Y) and + the fitted graphs (X / self._graphs) without parallelization. + + Parameters + ---------- + Y : list of graphs + The target graphs. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + kernel_matrix = np.zeros((len(Y), len(self._graphs))) + + for i_y, g_y in enumerate(Y): + for i_x, g_x in enumerate(self._graphs): + kernel_matrix[i_y, i_x] = self.pairwise_kernel(g_y, g_x) + + return kernel_matrix + + + def _compute_kernel_matrix_imap_unordered(self, Y): + """Compute the kernel matrix between the given target graphs (Y) and + the fitted graphs (X / self._graphs) using imap unordered parallelization. + + Parameters + ---------- + Y : list of graphs + The target graphs. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + raise Exception('Parallelization for kernel matrix is not implemented.') + + + def diagonals(self): + """Compute the kernel matrix diagonals of the fit/transformed data. + + Returns + ------- + X_diag : numpy array + The diagonal of the kernel matrix between the fitted data. + This consists of each element calculated with itself. + + Y_diag : numpy array + The diagonal of the kernel matrix, of the transform. + This consists of each element calculated with itself. + + """ + # Check if method "fit" had been called. + check_is_fitted(self, ['_graphs']) + + # Check if the diagonals of X exist. + try: + check_is_fitted(self, ['_X_diag']) + except NotFittedError: + # Compute diagonals of X. + self._X_diag = np.empty(shape=(len(self._graphs),)) + for i, x in enumerate(self._graphs): + self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? + + try: + # If transform has happened, return both diagonals. + check_is_fitted(self, ['_Y']) + self._Y_diag = np.empty(shape=(len(self._Y),)) + for (i, y) in enumerate(self._Y): + self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? + + return self._X_diag, self._Y_diag + except NotFittedError: + # Else just return X_diag. + return self._X_diag + + +# @abstractmethod + def pairwise_kernel(self, x, y): + """Compute pairwise kernel between two graphs. + + Parameters + ---------- + x, y : NetworkX Graph. + Graphs between which the kernel is computed. + + Returns + ------- + kernel: float + The computed kernel. + +# Notes +# ----- +# This method is abstract and must be implemented by a subclass. + + """ + raise NotImplementedError('Pairwise kernel computation is not implemented!') + + + ########################################################################## + # The following is the 2nd paradigm to compute kernel matrix. It is + # simplified and not compatible with `scikit-learn`.
+ ########################################################################## def compute(self, *graphs, **kwargs): - self._parallel = kwargs.get('parallel', 'imap_unordered') - self._n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) - self._normalize = kwargs.get('normalize', True) - self._verbose = kwargs.get('verbose', 2) + self.parallel = kwargs.get('parallel', 'imap_unordered') + self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) + self.normalize = kwargs.get('normalize', True) + self.verbose = kwargs.get('verbose', 2) + self.validate_parameters() if len(graphs) == 1: if not isinstance(graphs[0], list): @@ -40,7 +385,7 @@ class GraphKernel(object): self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. self._gram_matrix = self._compute_gram_matrix() self._gram_matrix_unnorm = np.copy(self._gram_matrix) - if self._normalize: + if self.normalize: self._gram_matrix = normalize_gram_matrix(self._gram_matrix) return self._gram_matrix, self._run_time @@ -103,15 +448,15 @@ class GraphKernel(object): def _compute_gram_matrix(self): start_time = time.time() - if self._parallel == 'imap_unordered': + if self.parallel == 'imap_unordered': gram_matrix = self._compute_gm_imap_unordered() - elif self._parallel is None: + elif self.parallel is None: gram_matrix = self._compute_gm_series() else: raise Exception('Parallel mode is not set correctly.') self._run_time = time.time() - start_time - if self._verbose: + if self.verbose: print('Gram matrix of size %d built in %s seconds.' % (len(self._graphs), self._run_time)) @@ -129,15 +474,15 @@ class GraphKernel(object): def _compute_kernel_list(self, g1, g_list): start_time = time.time() - if self._parallel == 'imap_unordered': + if self.parallel == 'imap_unordered': kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) - elif self._parallel is None: + elif self.parallel is None: kernel_list = self._compute_kernel_list_series(g1, g_list) else: raise Exception('Parallel mode is not set correctly.') self._run_time = time.time() - start_time - if self._verbose: + if self.verbose: print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.' % (len(g_list), self._run_time)) @@ -158,7 +503,7 @@ class GraphKernel(object): kernel = self._compute_single_kernel_series(g1, g2) self._run_time = time.time() - start_time - if self._verbose: + if self.verbose: print('Graph kernel bewteen two graphs built in %s seconds.' 
% (self._run_time)) return kernel @@ -185,24 +530,24 @@ class GraphKernel(object): return self._graphs - @property - def parallel(self): - return self._parallel +# @property +# def parallel(self): +# return self.parallel - @property - def n_jobs(self): - return self._n_jobs +# @property +# def n_jobs(self): +# return self.n_jobs - @property - def verbose(self): - return self._verbose +# @property +# def verbose(self): +# return self.verbose - @property - def normalize(self): - return self._normalize +# @property +# def normalize(self): +# return self.normalize @property diff --git a/gklearn/kernels/marginalized.py b/gklearn/kernels/marginalized.py index e3d70c6..d6c203e 100644 --- a/gklearn/kernels/marginalized.py +++ b/gklearn/kernels/marginalized.py @@ -46,7 +46,7 @@ class Marginalized(GraphKernel): self._add_dummy_labels(self._graphs) if self._remove_totters: - iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) # @todo: this may not work. self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] @@ -57,7 +57,7 @@ class Marginalized(GraphKernel): itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, - length=len_itr, verbose=(self._verbose >= 2)) + length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(self._graphs[i], self._graphs[j]) gram_matrix[i][j] = kernel @@ -70,16 +70,16 @@ class Marginalized(GraphKernel): self._add_dummy_labels(self._graphs) if self._remove_totters: - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = range(0, len(self._graphs)) - if len(self._graphs) < 100 * self._n_jobs: - chunksize = int(len(self._graphs) / self._n_jobs) + 1 + if len(self._graphs) < 100 * self.n_jobs: + chunksize = int(len(self._graphs) / self.n_jobs) + 1 else: chunksize = 100 remove_fun = self._wrapper_untotter iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), desc='removing tottering', file=sys.stdout, - length=len(self._graphs), verbose=(self._verbose >= 2)) + length=len(self._graphs), verbose=(self.verbose >= 2)) for i, g in iterator: self._graphs[i] = g pool.close() @@ -93,7 +93,7 @@ class Marginalized(GraphKernel): G_gn = gn_toshare do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -103,13 +103,13 @@ class Marginalized(GraphKernel): if self._remove_totters: g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. - iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self.verbose >= 2)) # @todo: this may not work. g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] # compute kernel list. 
kernel_list = [None] * len(g_list) - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(g1, g_list[i]) kernel_list[i] = kernel @@ -122,16 +122,16 @@ class Marginalized(GraphKernel): if self._remove_totters: g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = range(0, len(g_list)) - if len(g_list) < 100 * self._n_jobs: - chunksize = int(len(g_list) / self._n_jobs) + 1 + if len(g_list) < 100 * self.n_jobs: + chunksize = int(len(g_list) / self.n_jobs) + 1 else: chunksize = 100 remove_fun = self._wrapper_untotter iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), desc='removing tottering', file=sys.stdout, - length=len(g_list), verbose=(self._verbose >= 2)) + length=len(g_list), verbose=(self.verbose >= 2)) for i, g in iterator: g_list[i] = g pool.close() @@ -151,7 +151,7 @@ class Marginalized(GraphKernel): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/path_up_to_h.py b/gklearn/kernels/path_up_to_h.py index afe3859..0c80931 100644 --- a/gklearn/kernels/path_up_to_h.py +++ b/gklearn/kernels/path_up_to_h.py @@ -41,10 +41,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None from itertools import combinations_with_replacement itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2) - iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self._verbose >= 2)) + iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self.verbose >= 2)) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator_kernel = get_iters(itr_kernel, desc='Computing kernels', - file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) @@ -69,10 +69,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None # get all paths of all graphs before computing kernels to save time, # but this may cost a lot of memory for large datasets. 
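The `Pool` setup above repeats a chunk-size heuristic that recurs in every `imap_unordered` branch of this patch: small workloads are split evenly so each worker gets roughly one chunk, while large ones are capped at 100 items per chunk to keep scheduling responsive. Stated once as a hypothetical helper, not part of the patch:

def imap_chunksize(n_items, n_jobs):
    # Mirrors the inline `int(len(...) / n_jobs) + 1` / flat-100 heuristic.
    return n_items // n_jobs + 1 if n_items < 100 * n_jobs else 100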
- pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(self._graphs, range(0, len(self._graphs))) - if len(self._graphs) < 100 * self._n_jobs: - chunksize = int(len(self._graphs) / self._n_jobs) + 1 + if len(self._graphs) < 100 * self.n_jobs: + chunksize = int(len(self._graphs) / self.n_jobs) + 1 else: chunksize = 100 all_paths = [[] for _ in range(len(self._graphs))] @@ -84,7 +84,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), desc='getting paths', file=sys.stdout, - length=len(self._graphs), verbose=(self._verbose >= 2)) + length=len(self._graphs), verbose=(self.verbose >= 2)) for i, ps in iterator: all_paths[i] = ps pool.close() @@ -109,7 +109,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None G_plist = plist_toshare do_fun = self._wrapper_kernel_do_kernelless # @todo: what is this? parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(all_paths,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -117,8 +117,8 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None def _compute_kernel_list_series(self, g1, g_list): self._add_dummy_labels(g_list + [g1]) - iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self._verbose >= 2)) - iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self.verbose >= 2)) + iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) kernel_list = [None] * len(g_list) @@ -143,10 +143,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None # get all paths of all graphs before computing kernels to save time, # but this may cost a lot of memory for large datasets. 
- pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(g_list, range(0, len(g_list))) - if len(g_list) < 100 * self._n_jobs: - chunksize = int(len(g_list) / self._n_jobs) + 1 + if len(g_list) < 100 * self.n_jobs: + chunksize = int(len(g_list) / self.n_jobs) + 1 else: chunksize = 100 paths_g_list = [[] for _ in range(len(g_list))] @@ -161,7 +161,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize), desc='getting paths', file=sys.stdout, - length=len(g_list), verbose=(self._verbose >= 2)) + length=len(g_list), verbose=(self.verbose >= 2)) for i, ps in iterator: paths_g_list[i] = ps pool.close() @@ -180,7 +180,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/shortest_path.py b/gklearn/kernels/shortest_path.py index bfea553..0c5fccc 100644 --- a/gklearn/kernels/shortest_path.py +++ b/gklearn/kernels/shortest_path.py @@ -38,7 +38,7 @@ class ShortestPath(GraphKernel): def _compute_gm_series(self): self._all_graphs_have_edges(self._graphs) # get shortest path graph of each graph. - iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] # compute Gram matrix. @@ -48,7 +48,7 @@ class ShortestPath(GraphKernel): itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', - length=len_itr, file=sys.stdout,verbose=(self._verbose >= 2)) + length=len_itr, file=sys.stdout,verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._sp_do(self._graphs[i], self._graphs[j]) gram_matrix[i][j] = kernel @@ -60,16 +60,16 @@ class ShortestPath(GraphKernel): def _compute_gm_imap_unordered(self): self._all_graphs_have_edges(self._graphs) # get shortest path graph of each graph. 
- pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) get_sp_graphs_fun = self._wrapper_get_sp_graphs itr = zip(self._graphs, range(0, len(self._graphs))) - if len(self._graphs) < 100 * self._n_jobs: - chunksize = int(len(self._graphs) / self._n_jobs) + 1 + if len(self._graphs) < 100 * self.n_jobs: + chunksize = int(len(self._graphs) / self.n_jobs) + 1 else: chunksize = 100 iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), desc='getting sp graphs', file=sys.stdout, - length=len(self._graphs), verbose=(self._verbose >= 2)) + length=len(self._graphs), verbose=(self.verbose >= 2)) for i, g in iterator: self._graphs[i] = g pool.close() @@ -83,7 +83,7 @@ class ShortestPath(GraphKernel): G_gs = gs_toshare do_fun = self._wrapper_sp_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(self._graphs,), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -92,12 +92,12 @@ class ShortestPath(GraphKernel): self._all_graphs_have_edges([g1] + g_list) # get shortest path graphs of g1 and each graph in g_list. g1 = getSPGraph(g1, edge_weight=self._edge_weight) - iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] # compute kernel list. kernel_list = [None] * len(g_list) - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._sp_do(g1, g_list[i]) kernel_list[i] = kernel @@ -109,16 +109,16 @@ class ShortestPath(GraphKernel): self._all_graphs_have_edges([g1] + g_list) # get shortest path graphs of g1 and each graph in g_list. 
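For context on this file: `getSPGraph` applies the Floyd transformation, replacing each graph by a complete graph over the same nodes whose edges carry the shortest-path distance between their endpoints; `_sp_do` then compares those distance-labeled edges. A hedged NetworkX re-sketch (the `'cost'` attribute name is an assumption, not taken from this diff):

import networkx as nx

def sp_graph(g, weight=None):
    sp = nx.Graph()
    sp.add_nodes_from(g.nodes(data=True))
    # All-pairs shortest-path lengths become edges of the new graph.
    for u, dists in nx.shortest_path_length(g, weight=weight):
        for v, d in dists.items():
            if u != v:
                sp.add_edge(u, v, cost=d)
    return sp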
g1 = getSPGraph(g1, edge_weight=self._edge_weight) - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) get_sp_graphs_fun = self._wrapper_get_sp_graphs itr = zip(g_list, range(0, len(g_list))) - if len(g_list) < 100 * self._n_jobs: - chunksize = int(len(g_list) / self._n_jobs) + 1 + if len(g_list) < 100 * self.n_jobs: + chunksize = int(len(g_list) / self.n_jobs) + 1 else: chunksize = 100 iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), desc='getting sp graphs', file=sys.stdout, - length=len(g_list), verbose=(self._verbose >= 2)) + length=len(g_list), verbose=(self.verbose >= 2)) for i, g in iterator: g_list[i] = g pool.close() @@ -137,7 +137,7 @@ class ShortestPath(GraphKernel): itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/spectral_decomposition.py b/gklearn/kernels/spectral_decomposition.py index 561f632..bc06e26 100644 --- a/gklearn/kernels/spectral_decomposition.py +++ b/gklearn/kernels/spectral_decomposition.py @@ -28,9 +28,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_gm_series(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') @@ -41,7 +41,7 @@ class SpectralDecomposition(RandomWalkMeta): # precompute the spectral decomposition of each graph. P_list = [] D_list = [] - iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -58,7 +58,7 @@ class SpectralDecomposition(RandomWalkMeta): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) - iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel) @@ -74,9 +74,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') @@ -87,7 +87,7 @@ class SpectralDecomposition(RandomWalkMeta): # precompute the spectral decomposition of each graph. 
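The 'spectral decompose' loops in this file cache, per graph, the eigendecomposition of the (transposed) adjacency matrix, so that the walk series later reduces to a function of eigenvalues instead of repeated matrix products. A minimal sketch of what gets cached; note the keyword `weight=` here, whereas the patch passes the edge weight positionally:

import numpy as np
import networkx as nx

def spectral_cache(g, edge_weight=None):
    # A is the transpose of the adjacency matrix, as the comments above note.
    A = np.asarray(nx.adjacency_matrix(g, weight=edge_weight).todense()).T
    D, P = np.linalg.eig(A)  # A = P @ diag(D) @ inv(P)
    return D, P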
P_list = [] D_list = [] - iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -107,7 +107,7 @@ class SpectralDecomposition(RandomWalkMeta): do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(q_T_list, P_list, D_list), n_jobs=self.n_jobs, verbose=self.verbose) else: # @todo pass @@ -118,9 +118,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') @@ -133,7 +133,7 @@ class SpectralDecomposition(RandomWalkMeta): D1, P1 = np.linalg.eig(A1) P_list = [] D_list = [] - iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self.verbose >= 2)) for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -145,7 +145,7 @@ class SpectralDecomposition(RandomWalkMeta): if self._p is None: # p is uniform distribution as default. q_T1 = 1 / nx.number_of_nodes(g1) q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel) @@ -160,9 +160,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. 
Only works for undirected graphs.') @@ -175,8 +175,8 @@ class SpectralDecomposition(RandomWalkMeta): D1, P1 = np.linalg.eig(A1) P_list = [] D_list = [] - if self._verbose >= 2: - iterator = tqdm(g_list, desc='spectral decompose', file=sys.stdout) + if self.verbose >= 2: + iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout) else: iterator = g_list for G in iterator: @@ -207,7 +207,7 @@ class SpectralDecomposition(RandomWalkMeta): itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) else: # @todo pass @@ -222,9 +222,9 @@ class SpectralDecomposition(RandomWalkMeta): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2], self._verbose) + self._check_edge_weight([g1] + [g2], self.verbose) self._check_graphs([g1] + [g2]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') diff --git a/gklearn/kernels/structural_sp.py b/gklearn/kernels/structural_sp.py index 35ed9d1..1fd68f7 100644 --- a/gklearn/kernels/structural_sp.py +++ b/gklearn/kernels/structural_sp.py @@ -41,7 +41,7 @@ class StructuralSP(GraphKernel): def _compute_gm_series(self): # get shortest paths of each graph in the graphs. splist = [] - iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self.verbose >= 2)) if self._compute_method == 'trie': for g in iterator: splist.append(self._get_sps_as_trie(g)) @@ -56,7 +56,7 @@ class StructuralSP(GraphKernel): itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, - length=len_itr, verbose=(self._verbose >= 2)) + length=len_itr, verbose=(self.verbose >= 2)) if self._compute_method == 'trie': for i, j in iterator: kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j]) @@ -76,10 +76,10 @@ class StructuralSP(GraphKernel): def _compute_gm_imap_unordered(self): # get shortest paths of each graph in the graphs. 
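As background for this file: the structural shortest-path kernel sums a path-versus-path similarity over all shortest-path pairs of two graphs, chaining node and edge sub-kernels along equal-length paths. A hedged naive sketch; `node_sim` and `edge_sim` are illustrative stand-ins for the configured sub-kernels, and paths are node-index sequences as produced by `get_shortest_paths`:

def ssp_naive(g1, g2, splist1, splist2, node_sim, edge_sim):
    total = 0.0
    for p1 in splist1:
        for p2 in splist2:
            if len(p1) != len(p2):
                continue  # only equal-length paths contribute
            score = node_sim(g1.nodes[p1[0]], g2.nodes[p2[0]])
            for k in range(1, len(p1)):
                score *= edge_sim(g1.edges[p1[k - 1], p1[k]],
                                  g2.edges[p2[k - 1], p2[k]])
                score *= node_sim(g1.nodes[p1[k]], g2.nodes[p2[k]])
            total += score
    return total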
splist = [None] * len(self._graphs) - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(self._graphs, range(0, len(self._graphs))) - if len(self._graphs) < 100 * self._n_jobs: - chunksize = int(len(self._graphs) / self._n_jobs) + 1 + if len(self._graphs) < 100 * self.n_jobs: + chunksize = int(len(self._graphs) / self.n_jobs) + 1 else: chunksize = 100 # get shortest path graphs of self._graphs @@ -89,7 +89,7 @@ class StructuralSP(GraphKernel): get_sps_fun = self._wrapper_get_sps_naive iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), desc='getting shortest paths', file=sys.stdout, - length=len(self._graphs), verbose=(self._verbose >= 2)) + length=len(self._graphs), verbose=(self.verbose >= 2)) for i, sp in iterator: splist[i] = sp pool.close() @@ -107,7 +107,7 @@ class StructuralSP(GraphKernel): else: do_fun = self._wrapper_ssp_do_naive parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(splist, self._graphs), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(splist, self._graphs), n_jobs=self.n_jobs, verbose=self.verbose) return gram_matrix @@ -117,7 +117,7 @@ class StructuralSP(GraphKernel): sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) splist = [] iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, - verbose=(self._verbose >= 2)) + verbose=(self.verbose >= 2)) if self._compute_method == 'trie': for g in iterator: splist.append(self._get_sps_as_trie(g)) @@ -128,7 +128,7 @@ class StructuralSP(GraphKernel): # compute kernel list. kernel_list = [None] * len(g_list) iterator = get_iters(range(len(g_list)), desc='Computing kernels', - file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) if self._compute_method == 'trie': for i in iterator: kernel = self._ssp_do_trie(g1, g_list[i], sp1, splist[i]) @@ -145,10 +145,10 @@ class StructuralSP(GraphKernel): # get shortest paths of g1 and each graph in g_list. 
sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) splist = [None] * len(g_list) - pool = Pool(self._n_jobs) + pool = Pool(self.n_jobs) itr = zip(g_list, range(0, len(g_list))) - if len(g_list) < 100 * self._n_jobs: - chunksize = int(len(g_list) / self._n_jobs) + 1 + if len(g_list) < 100 * self.n_jobs: + chunksize = int(len(g_list) / self.n_jobs) + 1 else: chunksize = 100 # get shortest path graphs of g_list @@ -158,7 +158,7 @@ class StructuralSP(GraphKernel): get_sps_fun = self._wrapper_get_sps_naive iterator = get_iters(pool.imap_unordered(get_sps_fun, itr, chunksize), desc='getting shortest paths', file=sys.stdout, - length=len(g_list), verbose=(self._verbose >= 2)) + length=len(g_list), verbose=(self.verbose >= 2)) for i, sp in iterator: splist[i] = sp pool.close() @@ -182,7 +182,7 @@ class StructuralSP(GraphKernel): itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) return kernel_list diff --git a/gklearn/kernels/sylvester_equation.py b/gklearn/kernels/sylvester_equation.py index 9f8fc66..b898ae9 100644 --- a/gklearn/kernels/sylvester_equation.py +++ b/gklearn/kernels/sylvester_equation.py @@ -27,9 +27,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_gm_series(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') @@ -41,7 +41,7 @@ class SylvesterEquation(RandomWalkMeta): if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. - iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # # normalized adjacency matrices # A_wave_list = [] @@ -55,7 +55,7 @@ class SylvesterEquation(RandomWalkMeta): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) - iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self.verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda) @@ -71,9 +71,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs, self._verbose) + self._check_edge_weight(self._graphs, self.verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') @@ -83,7 +83,7 @@ class SylvesterEquation(RandomWalkMeta): if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. 
Note # A_wave_list actually contains the transposes of the adjacency matrices. - iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? if self._p is None: # p is uniform distribution as default. @@ -94,7 +94,7 @@ class SylvesterEquation(RandomWalkMeta): do_fun = self._wrapper_kernel_do parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(A_wave_list,), n_jobs=self.n_jobs, verbose=self.verbose) else: # @todo pass @@ -105,9 +105,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') @@ -120,11 +120,11 @@ class SylvesterEquation(RandomWalkMeta): # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() - iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] if self._p is None: # p is uniform distribution as default. - iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2)) for i in iterator: kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda) @@ -139,9 +139,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1], self._verbose) + self._check_edge_weight(g_list + [g1], self.verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') @@ -152,7 +152,7 @@ class SylvesterEquation(RandomWalkMeta): # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() - iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) + iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self.verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? if self._p is None: # p is uniform distribution as default. 
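For background on this file: the Sylvester-equation method evaluates the geometric random-walk kernel k = q^T (I - lambda * W)^{-1} p with W = A2 kron A1, but does so by solving an equivalent Sylvester-type matrix equation in O(n^3) instead of factoring the n1*n2-sized Kronecker system. A hedged brute-force reference that computes the same value directly, suitable for small graphs only and not the library's own solver:

import numpy as np

def rw_kernel_geometric(A1, A2, lmda):
    # Direct solve on the Kronecker product; uniform p and q assumed.
    n = A1.shape[0] * A2.shape[0]
    w = np.kron(A2, A1)
    p = np.full(n, 1.0 / n)
    x = np.linalg.solve(np.eye(n) - lmda * w, p)
    return np.full(n, 1.0 / n) @ x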
@@ -169,7 +169,7 @@ class SylvesterEquation(RandomWalkMeta): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose) else: # @todo pass @@ -184,9 +184,9 @@ class SylvesterEquation(RandomWalkMeta): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2], self._verbose) + self._check_edge_weight([g1] + [g2], self.verbose) self._check_graphs([g1] + [g2]) - if self._verbose >= 2: + if self.verbose >= 2: import warnings warnings.warn('All labels are ignored.') diff --git a/gklearn/kernels/treelet.py b/gklearn/kernels/treelet.py index 32cad43..d546e74 100644 --- a/gklearn/kernels/treelet.py +++ b/gklearn/kernels/treelet.py @@ -18,6 +18,8 @@ import numpy as np import networkx as nx from collections import Counter from itertools import chain +from sklearn.utils.validation import check_is_fitted +from sklearn.exceptions import NotFittedError from gklearn.utils import SpecialLabel from gklearn.utils.parallel import parallel_gm, parallel_me from gklearn.utils.utils import find_all_paths, get_mlti_dim_node_attrs @@ -26,14 +28,211 @@ from gklearn.kernels import GraphKernel class Treelet(GraphKernel): - def __init__(self, **kwargs): - GraphKernel.__init__(self) - self._node_labels = kwargs.get('node_labels', []) - self._edge_labels = kwargs.get('edge_labels', []) - self._sub_kernel = kwargs.get('sub_kernel', None) - self._ds_infos = kwargs.get('ds_infos', {}) - if self._sub_kernel is None: - raise Exception('Sub kernel not set.') + def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs): + """Initialise a treelet kernel. + """ + super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose) + self.node_labels = kwargs.get('node_labels', []) + self.edge_labels = kwargs.get('edge_labels', []) + self.sub_kernel = kwargs.get('sub_kernel', None) + self.ds_infos = kwargs.get('ds_infos', {}) + self.precompute_canonkeys = precompute_canonkeys + self.save_canonkeys = save_canonkeys + + + ########################################################################## + # The following is the 1st paradigm to compute kernel matrix, which is + # compatible with `scikit-learn`. + # ------------------------------------------------------------------- + # Special thanks to the "GraKeL" library for providing an excellent template! + ########################################################################## + + + def clear_attributes(self): + super().clear_attributes() + if hasattr(self, '_canonkeys'): + delattr(self, '_canonkeys') + if hasattr(self, '_Y_canonkeys'): + delattr(self, '_Y_canonkeys') + if hasattr(self, '_dummy_labels_considered'): + delattr(self, '_dummy_labels_considered') + + + def validate_parameters(self): + """Validate all parameters for the transformer. + + Returns + ------- + None. + + """ + super().validate_parameters() + if self.sub_kernel is None: + raise ValueError('Sub-kernel not set.') + + + def _compute_kernel_matrix_series(self, Y): + """Compute the kernel matrix between the given target graphs (Y) and + the fitted graphs (X / self._graphs) without parallelization. + + Parameters + ---------- + Y : list of graphs + The target graphs.
diff --git a/gklearn/kernels/treelet.py b/gklearn/kernels/treelet.py
index 32cad43..d546e74 100644
--- a/gklearn/kernels/treelet.py
+++ b/gklearn/kernels/treelet.py
@@ -18,6 +18,8 @@ import numpy as np
 import networkx as nx
 from collections import Counter
 from itertools import chain
+from sklearn.utils.validation import check_is_fitted
+from sklearn.exceptions import NotFittedError
 from gklearn.utils import SpecialLabel
 from gklearn.utils.parallel import parallel_gm, parallel_me
 from gklearn.utils.utils import find_all_paths, get_mlti_dim_node_attrs
@@ -26,14 +28,211 @@ from gklearn.kernels import GraphKernel

 class Treelet(GraphKernel):

-	def __init__(self, **kwargs):
-		GraphKernel.__init__(self)
-		self._node_labels = kwargs.get('node_labels', [])
-		self._edge_labels = kwargs.get('edge_labels', [])
-		self._sub_kernel = kwargs.get('sub_kernel', None)
-		self._ds_infos = kwargs.get('ds_infos', {})
-		if self._sub_kernel is None:
-			raise Exception('Sub kernel not set.')
+	def __init__(self, parallel=None, n_jobs=None, chunksize=None, normalize=True, verbose=2, precompute_canonkeys=True, save_canonkeys=False, **kwargs):
+		"""Initialise a treelet kernel.
+		"""
+		super().__init__(parallel=parallel, n_jobs=n_jobs, chunksize=chunksize, normalize=normalize, verbose=verbose)
+		self.node_labels = kwargs.get('node_labels', [])
+		self.edge_labels = kwargs.get('edge_labels', [])
+		self.sub_kernel = kwargs.get('sub_kernel', None)
+		self.ds_infos = kwargs.get('ds_infos', {})
+		self.precompute_canonkeys = precompute_canonkeys
+		self.save_canonkeys = save_canonkeys
+
+
+	##########################################################################
+	# The following is the 1st paradigm to compute kernel matrix, which is
+	# compatible with `scikit-learn`.
+	# -------------------------------------------------------------------
+	# Special thanks to the "GraKeL" library for providing an excellent template!
+	##########################################################################
+
+
+	def clear_attributes(self):
+		super().clear_attributes()
+		if hasattr(self, '_canonkeys'):
+			delattr(self, '_canonkeys')
+		if hasattr(self, '_Y_canonkeys'):
+			delattr(self, '_Y_canonkeys')
+		if hasattr(self, '_dummy_labels_considered'):
+			delattr(self, '_dummy_labels_considered')
+
+
+	def validate_parameters(self):
+		"""Validate all parameters for the transformer.
+
+		Returns
+		-------
+		None.
+
+		"""
+		super().validate_parameters()
+		if self.sub_kernel is None:
+			raise ValueError('Sub-kernel not set.')
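Since the first paradigm follows the scikit-learn transformer contract, the intended call pattern is to fit on one set of graphs and transform against another. A minimal usage sketch, assuming `fit`, `transform` and `fit_transform` are provided by the `GraphKernel` base class (not shown in this patch) with the usual scikit-learn semantics:

    import numpy as np
    import networkx as nx
    from gklearn.kernels import Treelet

    graphs = [nx.path_graph(4), nx.star_graph(3), nx.cycle_graph(5)]  # toy data

    # `sub_kernel` compares two treelet-count vectors; a Gaussian is a common choice.
    tk = Treelet(normalize=True, verbose=0,
                 sub_kernel=lambda x, y: np.exp(-np.sum((x - y) ** 2) / 2),
                 ds_infos={'directed': False})

    K_train = tk.fit_transform(graphs[:2])  # Gram matrix on the fitted graphs
    K_test = tk.transform(graphs[2:])       # rows = targets, columns = fitted graphs

With no `node_labels` or `edge_labels` given, the dummy-label path at the end of this file kicks in, so unlabeled graphs work out of the box.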
+
+
+	def _compute_kernel_matrix_series(self, Y):
+		"""Compute the kernel matrix between given target graphs (Y) and
+		the fitted graphs (X / self._graphs) without parallelization.
+
+		Parameters
+		----------
+		Y : list of graphs
+			The target graphs.
+
+		Returns
+		-------
+		kernel_matrix : numpy array, shape = [n_targets, n_inputs]
+			The computed kernel matrix.
+
+		"""
+
+		# self._add_dummy_labels will modify the input in place.
+		self._add_dummy_labels() # For self._graphs
+#		Y = [g.copy() for g in Y] # @todo: ?
+		self._add_dummy_labels(Y)
+
+		# get all canonical keys of all graphs before computing kernels to save
+		# time, but this may cost a lot of memory for large datasets.
+
+		# Canonical keys for self._graphs.
+		try:
+			check_is_fitted(self, ['_canonkeys'])
+			canonkeys_list1 = self._canonkeys
+		except NotFittedError:
+			canonkeys_list1 = []
+			iterator = get_iters(self._graphs, desc='getting canonkeys for X', file=sys.stdout, verbose=(self.verbose >= 2))
+			for g in iterator:
+				canonkeys_list1.append(self._get_canonkeys(g))
+
+			if self.save_canonkeys:
+				self._canonkeys = canonkeys_list1
+
+		# Canonical keys for Y.
+		canonkeys_list2 = []
+		iterator = get_iters(Y, desc='getting canonkeys for Y', file=sys.stdout, verbose=(self.verbose >= 2))
+		for g in iterator:
+			canonkeys_list2.append(self._get_canonkeys(g))
+
+		if self.save_canonkeys:
+			self._Y_canonkeys = canonkeys_list2
+
+		# compute kernel matrix.
+		kernel_matrix = np.zeros((len(Y), len(canonkeys_list1)))
+
+		from itertools import product
+		itr = product(range(len(Y)), range(len(canonkeys_list1)))
+		len_itr = int(len(Y) * len(canonkeys_list1))
+		iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
+					length=len_itr, verbose=(self.verbose >= 2))
+		for i_y, i_x in iterator:
+			kernel = self._kernel_do(canonkeys_list2[i_y], canonkeys_list1[i_x])
+			kernel_matrix[i_y][i_x] = kernel
+
+		return kernel_matrix
+
+
+	def _compute_kernel_matrix_imap_unordered(self, Y):
+		"""Compute the kernel matrix between given target graphs (Y) and
+		the fitted graphs (X / self._graphs) using imap unordered parallelization.
+
+		Parameters
+		----------
+		Y : list of graphs
+			The target graphs.
+
+		Returns
+		-------
+		kernel_matrix : numpy array, shape = [n_targets, n_inputs]
+			The computed kernel matrix.
+
+		"""
+		raise Exception('Parallelization for kernel matrix is not implemented.')
+
+
+	def pairwise_kernel(self, x, y, are_keys=False):
+		"""Compute the pairwise kernel between two graphs.
+
+		Parameters
+		----------
+		x, y : NetworkX Graph.
+			Graphs between which the kernel is computed.
+
+		are_keys : boolean, optional
+			If `True`, `x` and `y` are canonical keys, otherwise they are graphs.
+			The default is False.
+
+		Returns
+		-------
+		kernel : float
+			The computed kernel.
+
+		"""
+		if are_keys:
+			# x, y are canonical keys.
+			kernel = self._kernel_do(x, y)
+
+		else:
+			# x, y are graphs.
+			kernel = self._compute_single_kernel_series(x, y)
+
+		return kernel
+
+
+	def diagonals(self):
+		"""Compute the kernel matrix diagonals of the fitted/transformed data.
+
+		Returns
+		-------
+		X_diag : numpy array
+			The diagonal of the kernel matrix between the fitted data.
+			This consists of each element calculated with itself.
+
+		Y_diag : numpy array
+			The diagonal of the kernel matrix of the transformed data.
+			This consists of each element calculated with itself.
+
+		"""
+		# Check if method "fit" has been called.
+		check_is_fitted(self, ['_graphs'])
+
+		# Check if the diagonals of X exist.
+		try:
+			check_is_fitted(self, ['_X_diag'])
+		except NotFittedError:
+			# Compute diagonals of X.
+			self._X_diag = np.empty(shape=(len(self._graphs),))
+			try:
+				check_is_fitted(self, ['_canonkeys'])
+				for i, x in enumerate(self._canonkeys):
+					self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=True) # @todo: parallel?
+			except NotFittedError:
+				for i, x in enumerate(self._graphs):
+					self._X_diag[i] = self.pairwise_kernel(x, x, are_keys=False) # @todo: parallel?
+
+		try:
+			# If transform has happened, return both diagonals.
+			check_is_fitted(self, ['_Y'])
+			self._Y_diag = np.empty(shape=(len(self._Y),))
+			try:
+				check_is_fitted(self, ['_Y_canonkeys'])
+				for (i, y) in enumerate(self._Y_canonkeys):
+					self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=True) # @todo: parallel?
+			except NotFittedError:
+				for (i, y) in enumerate(self._Y):
+					self._Y_diag[i] = self.pairwise_kernel(y, y, are_keys=False) # @todo: parallel?
+
+			return self._X_diag, self._Y_diag
+
+		except NotFittedError:
+			# Else just return X_diag.
+			return self._X_diag
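`diagonals` exists to support cosine normalization of the rectangular kernel matrix returned by transform: each entry k(y, x) is divided by sqrt(k(y, y) * k(x, x)). A minimal sketch of that step (the helper name is hypothetical; presumably the base class performs the equivalent when `normalize=True`):

    import numpy as np

    def cosine_normalize(K, X_diag, Y_diag):
        # K has shape (n_targets, n_inputs); Y_diag pairs with rows,
        # X_diag with columns, matching the matrix produced by transform.
        return K / np.sqrt(np.outer(Y_diag, X_diag))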
+
+
+	##########################################################################
+	# The following is the 2nd paradigm to compute kernel matrix. It is
+	# simplified and not compatible with `scikit-learn`.
+	##########################################################################


 	def _compute_gm_series(self):
@@ -43,10 +242,13 @@ class Treelet(GraphKernel):
 		# time, but this may cost a lot of memory for large dataset.
 		canonkeys = []
 		iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout,
-							 verbose=(self._verbose >= 2))
+							 verbose=(self.verbose >= 2))
 		for g in iterator:
 			canonkeys.append(self._get_canonkeys(g))

+		if self.save_canonkeys:
+			self._canonkeys = canonkeys
+
 		# compute Gram matrix.
 		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

@@ -54,7 +256,7 @@ class Treelet(GraphKernel):
 		itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
 		len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
 		iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
-					length=len_itr, verbose=(self._verbose >= 2))
+					length=len_itr, verbose=(self.verbose >= 2))
 		for i, j in iterator:
 			kernel = self._kernel_do(canonkeys[i], canonkeys[j])
 			gram_matrix[i][j] = kernel

@@ -68,22 +270,25 @@ class Treelet(GraphKernel):
 		# get all canonical keys of all graphs before computing kernels to save
 		# time, but this may cost a lot of memory for large dataset.
-		pool = Pool(self._n_jobs)
+		pool = Pool(self.n_jobs)
 		itr = zip(self._graphs, range(0, len(self._graphs)))
-		if len(self._graphs) < 100 * self._n_jobs:
-			chunksize = int(len(self._graphs) / self._n_jobs) + 1
+		if len(self._graphs) < 100 * self.n_jobs:
+			chunksize = int(len(self._graphs) / self.n_jobs) + 1
 		else:
 			chunksize = 100
 		canonkeys = [[] for _ in range(len(self._graphs))]
 		get_fun = self._wrapper_get_canonkeys
 		iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize),
					desc='getting canonkeys', file=sys.stdout,
-					length=len(self._graphs), verbose=(self._verbose >= 2))
+					length=len(self._graphs), verbose=(self.verbose >= 2))
 		for i, ck in iterator:
 			canonkeys[i] = ck
 		pool.close()
 		pool.join()

+		if self.save_canonkeys:
+			self._canonkeys = canonkeys
+
 		# compute Gram matrix.
 		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

@@ -92,7 +297,7 @@ class Treelet(GraphKernel):
 			G_canonkeys = canonkeys_toshare
 		do_fun = self._wrapper_kernel_do
 		parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
-					glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose)
+					glbv=(canonkeys,), n_jobs=self.n_jobs, verbose=self.verbose)

 		return gram_matrix

@@ -104,13 +309,13 @@ class Treelet(GraphKernel):
 		# time, but this may cost a lot of memory for large dataset.
 		canonkeys_1 = self._get_canonkeys(g1)
 		canonkeys_list = []
-		iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self._verbose >= 2))
+		iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self.verbose >= 2))
 		for g in iterator:
 			canonkeys_list.append(self._get_canonkeys(g))

 		# compute kernel list.
 		kernel_list = [None] * len(g_list)
-		iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
+		iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self.verbose >= 2))
 		for i in iterator:
 			kernel = self._kernel_do(canonkeys_1, canonkeys_list[i])
 			kernel_list[i] = kernel

@@ -125,16 +330,16 @@ class Treelet(GraphKernel):
 		# time, but this may cost a lot of memory for large dataset.
 		canonkeys_1 = self._get_canonkeys(g1)
 		canonkeys_list = [[] for _ in range(len(g_list))]
-		pool = Pool(self._n_jobs)
+		pool = Pool(self.n_jobs)
 		itr = zip(g_list, range(0, len(g_list)))
-		if len(g_list) < 100 * self._n_jobs:
-			chunksize = int(len(g_list) / self._n_jobs) + 1
+		if len(g_list) < 100 * self.n_jobs:
+			chunksize = int(len(g_list) / self.n_jobs) + 1
 		else:
 			chunksize = 100
 		get_fun = self._wrapper_get_canonkeys
 		iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize),
					desc='getting canonkeys', file=sys.stdout,
-					length=len(g_list), verbose=(self._verbose >= 2))
+					length=len(g_list), verbose=(self.verbose >= 2))
 		for i, ck in iterator:
 			canonkeys_list[i] = ck
 		pool.close()

@@ -154,7 +359,7 @@ class Treelet(GraphKernel):
 		len_itr = len(g_list)
 		parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
			init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered',
-			n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
+			n_jobs=self.n_jobs, itr_desc='Computing kernels', verbose=self.verbose)

 		return kernel_list

@@ -187,7 +392,7 @@ class Treelet(GraphKernel):
 		keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs
 		vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys])
 		vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys])
-		kernel = self._sub_kernel(vector1, vector2)
+		kernel = self.sub_kernel(vector1, vector2)

 		return kernel
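In `_kernel_do` above, the two canonical-key dictionaries are reduced to count vectors over their shared keys and handed to the user-supplied `sub_kernel`. Any positive-definite function of two vectors works; a Gaussian on the counts is a common choice in the treelet literature. A minimal sketch (function name and bandwidth are illustrative):

    import numpy as np

    def gaussian_sub_kernel(x, y, gamma=1.0):
        # x, y: treelet-count vectors aligned on the shared canonical keys.
        return np.exp(-gamma * np.sum((x - y) ** 2))

    # Supplied at construction time: Treelet(sub_kernel=gaussian_sub_kernel, ...)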
@@ -223,7 +428,7 @@ class Treelet(GraphKernel):
 		patterns['0'] = list(G.nodes())
 		canonkey['0'] = nx.number_of_nodes(G)
 		for i in range(1, 6): # for i in range(1, 6):
-			patterns[str(i)] = find_all_paths(G, i, self._ds_infos['directed'])
+			patterns[str(i)] = find_all_paths(G, i, self.ds_infos['directed'])
 			canonkey[str(i)] = len(patterns[str(i)])

 		# n-star patterns
@@ -317,11 +522,11 @@ class Treelet(GraphKernel):
 		### pattern obtained in the structural analysis section above, which is a
 		### string corresponding to a unique treelet. A dictionary is built to keep
 		### track of the amount of every treelet.
-		if len(self._node_labels) > 0 or len(self._edge_labels) > 0:
+		if len(self.node_labels) > 0 or len(self.edge_labels) > 0:
 			canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet.

 			# linear patterns
-			canonkey_t = Counter(get_mlti_dim_node_attrs(G, self._node_labels))
+			canonkey_t = Counter(get_mlti_dim_node_attrs(G, self.node_labels))
 			for key in canonkey_t:
 				canonkey_l[('0', key)] = canonkey_t[key]

@@ -330,9 +535,9 @@ class Treelet(GraphKernel):
 			for pattern in patterns[str(i)]:
 				canonlist = []
 				for idx, node in enumerate(pattern[:-1]):
-					canonlist.append(tuple(G.nodes[node][nl] for nl in self._node_labels))
-					canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self._edge_labels))
-				canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self._node_labels))
+					canonlist.append(tuple(G.nodes[node][nl] for nl in self.node_labels))
+					canonlist.append(tuple(G[node][pattern[idx+1]][el] for el in self.edge_labels))
+				canonlist.append(tuple(G.nodes[pattern[-1]][nl] for nl in self.node_labels))
 				canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1]
 				treelet.append(tuple([str(i)] + canonkey_t))
 			canonkey_l.update(Counter(treelet))
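The comparison `canonlist if canonlist < canonlist[::-1] else canonlist[::-1]` in the hunk above is what makes path treelets orientation-invariant: a labeled path can be read from either end, so the lexicographically smaller of the two readings is kept as the canonical key. A toy illustration with made-up labels:

    # A labeled 3-node path, read in both directions:
    forward = [('C',), ('single',), ('O',), ('double',), ('N',)]
    backward = forward[::-1]

    # Keeping the lexicographically smaller reading yields one key for both.
    canon_fwd = forward if forward < forward[::-1] else forward[::-1]
    canon_bwd = backward if backward < backward[::-1] else backward[::-1]
    assert canon_fwd == canon_bwd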
@@ -343,13 +548,13 @@ class Treelet(GraphKernel):
 			for pattern in patterns[str(i) + 'star']:
 				canonlist = []
 				for leaf in pattern[1:]:
-					nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
-					elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
+					nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
+					elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
 					canonlist.append(tuple((nlabels, elabels)))
 				canonlist.sort()
 				canonlist = list(chain.from_iterable(canonlist))
 				canonkey_t = tuple(['d' if i == 5 else str(i * 2)] +
-								   [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+								   [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)]
 								   + canonlist)
 				treelet.append(canonkey_t)
 			canonkey_l.update(Counter(treelet))

@@ -359,17 +564,17 @@ class Treelet(GraphKernel):
 			for pattern in patterns['7']:
 				canonlist = []
 				for leaf in pattern[1:3]:
-					nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
-					elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
+					nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
+					elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
 					canonlist.append(tuple((nlabels, elabels)))
 				canonlist.sort()
 				canonlist = list(chain.from_iterable(canonlist))
 				canonkey_t = tuple(['7']
-								   + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist
-								   + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
-								   + [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
-								   + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
-								   + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)])
+								   + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist
+								   + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)]
+								   + [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)]
+								   + [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)]
+								   + [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)])
 				treelet.append(canonkey_t)
 			canonkey_l.update(Counter(treelet))

@@ -378,38 +583,38 @@ class Treelet(GraphKernel):
 			for pattern in patterns['11']:
 				canonlist = []
 				for leaf in pattern[1:4]:
-					nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
-					elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
+					nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
+					elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
 					canonlist.append(tuple((nlabels, elabels)))
 				canonlist.sort()
 				canonlist = list(chain.from_iterable(canonlist))
 				canonkey_t = tuple(['b']
-								   + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist
-								   + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
-								   + [tuple(G[pattern[4]][pattern[0]][el] for el in self._edge_labels)]
-								   + [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)]
-								   + [tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)])
+								   + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist
+								   + [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)]
+								   + [tuple(G[pattern[4]][pattern[0]][el] for el in self.edge_labels)]
+								   + [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels)]
+								   + [tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)])
 				treelet.append(canonkey_t)
 			canonkey_l.update(Counter(treelet))

 			# pattern 10
 			treelet = []
 			for pattern in patterns['10']:
-				canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels),
-							 tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)]
+				canonkey4 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels),
+							 tuple(G[pattern[5]][pattern[4]][el] for el in self.edge_labels)]
 				canonlist = []
 				for leaf in pattern[1:3]:
-					nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
-					elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
+					nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
+					elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
 					canonlist.append(tuple((nlabels, elabels)))
 				canonlist.sort()
 				canonkey0 = list(chain.from_iterable(canonlist))
 				canonkey_t = tuple(['a']
-								   + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
-								   + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)]
-								   + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]
-								   + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
-								   + [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)]
+								   + [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)]
+								   + [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels)]
+								   + [tuple(G[pattern[4]][pattern[3]][el] for el in self.edge_labels)]
+								   + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)]
+								   + [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)]
 								   + canonkey4 + canonkey0)
 				treelet.append(canonkey_t)
 			canonkey_l.update(Counter(treelet))
@@ -419,15 +624,15 @@ class Treelet(GraphKernel):
 			for pattern in patterns['12']:
 				canonlist0 = []
 				for leaf in pattern[1:3]:
-					nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
-					elabels = tuple(G[leaf][pattern[0]][el] for el in self._edge_labels)
+					nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
+					elabels = tuple(G[leaf][pattern[0]][el] for el in self.edge_labels)
 					canonlist0.append(tuple((nlabels, elabels)))
 				canonlist0.sort()
 				canonlist0 = list(chain.from_iterable(canonlist0))
 				canonlist3 = []
 				for leaf in pattern[4:6]:
-					nlabels = tuple(G.nodes[leaf][nl] for nl in self._node_labels)
-					elabels = tuple(G[leaf][pattern[3]][el] for el in self._edge_labels)
+					nlabels = tuple(G.nodes[leaf][nl] for nl in self.node_labels)
+					elabels = tuple(G[leaf][pattern[3]][el] for el in self.edge_labels)
 					canonlist3.append(tuple((nlabels, elabels)))
 				canonlist3.sort()
 				canonlist3 = list(chain.from_iterable(canonlist3))

@@ -435,14 +640,14 @@ class Treelet(GraphKernel):
 				# 2 possible key can be generated from 2 nodes with extended label 3,
 				# select the one with lower lexicographic order.
 				canonkey_t1 = tuple(['c']
-									+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0
-									+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)]
-									+ [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
+									+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)] + canonlist0
+									+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)]
+									+ [tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)]
 									+ canonlist3)
 				canonkey_t2 = tuple(['c']
-									+ [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3
-									+ [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
-									+ [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)]
+									+ [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels)] + canonlist3
+									+ [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)]
+									+ [tuple(G[pattern[0]][pattern[3]][el] for el in self.edge_labels)]
 									+ canonlist0)
 				treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2)
 			canonkey_l.update(Counter(treelet))

@@ -450,24 +655,24 @@ class Treelet(GraphKernel):
 			# pattern 9
 			treelet = []
 			for pattern in patterns['9']:
-				canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels),
-							 tuple(G[pattern[4]][pattern[2]][el] for el in self._edge_labels)]
-				canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels),
-							 tuple(G[pattern[5]][pattern[3]][el] for el in self._edge_labels)]
-				prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self._node_labels),
-						   tuple(G[pattern[2]][pattern[0]][el] for el in self._edge_labels)]
-				prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels),
-						   tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)]
+				canonkey2 = [tuple(G.nodes[pattern[4]][nl] for nl in self.node_labels),
+							 tuple(G[pattern[4]][pattern[2]][el] for el in self.edge_labels)]
+				canonkey3 = [tuple(G.nodes[pattern[5]][nl] for nl in self.node_labels),
+							 tuple(G[pattern[5]][pattern[3]][el] for el in self.edge_labels)]
+				prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self.node_labels),
+						   tuple(G[pattern[2]][pattern[0]][el] for el in self.edge_labels)]
+				prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self.node_labels),
+						   tuple(G[pattern[3]][pattern[0]][el] for el in self.edge_labels)]
 				if prekey2 + canonkey2 < prekey3 + canonkey3:
-					canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \
-								 + [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \
+					canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \
+								 + [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \
 								 + prekey2 + prekey3 + canonkey2 + canonkey3
 				else:
-					canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \
-								 + [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \
+					canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self.node_labels)] \
+								 + [tuple(G[pattern[1]][pattern[0]][el] for el in self.edge_labels)] \
 								 + prekey3 + prekey2 + canonkey3 + canonkey2
 				treelet.append(tuple(['9']
-									 + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)]
+									 + [tuple(G.nodes[pattern[0]][nl] for nl in self.node_labels)]
 									 + canonkey_t))
 			canonkey_l.update(Counter(treelet))
@@ -482,12 +687,33 @@ class Treelet(GraphKernel):
 		return i, self._get_canonkeys(g)


-	def _add_dummy_labels(self, Gn):
-		if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
-			for i in range(len(Gn)):
-				nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
-			self._node_labels = [SpecialLabel.DUMMY]
-		if len(self._edge_labels) == 0 or (len(self._edge_labels) == 1 and self._edge_labels[0] == SpecialLabel.DUMMY):
-			for i in range(len(Gn)):
-				nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
-			self._edge_labels = [SpecialLabel.DUMMY]
\ No newline at end of file
+	def _add_dummy_labels(self, Gn=None):
+		def _add_dummy(Gn):
+			if len(self.node_labels) == 0 or (len(self.node_labels) == 1 and self.node_labels[0] == SpecialLabel.DUMMY):
+				for i in range(len(Gn)):
+					nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
+				self.node_labels = [SpecialLabel.DUMMY]
+			if len(self.edge_labels) == 0 or (len(self.edge_labels) == 1 and self.edge_labels[0] == SpecialLabel.DUMMY):
+				for i in range(len(Gn)):
+					nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
+				self.edge_labels = [SpecialLabel.DUMMY]
+
+		if Gn is None or Gn is self._graphs:
+			# Add dummy labels for the copy of self._graphs.
+			try:
+				check_is_fitted(self, ['_dummy_labels_considered'])
+				if not self._dummy_labels_considered:
+					Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs]
+					_add_dummy(Gn)
+					self._graphs = Gn
+					self._dummy_labels_considered = True
+			except NotFittedError:
+				Gn = self._graphs # @todo: ?[g.copy() for g in self._graphs]
+				_add_dummy(Gn)
+				self._graphs = Gn
+				self._dummy_labels_considered = True
+
+		else:
+			# Add dummy labels for the input.
+			_add_dummy(Gn)
+
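The dummy-label machinery that closes treelet.py lets the labeled-treelet code path serve unlabeled graphs: every node and edge receives the same placeholder attribute, so treelets then compare by shape alone. The idea in plain networkx (the attribute name is illustrative; the patch uses SpecialLabel.DUMMY):

    import networkx as nx

    G = nx.path_graph(3)  # no node or edge attributes at all
    nx.set_node_attributes(G, '0', 'dummy')  # every node gets the same label
    nx.set_edge_attributes(G, '0', 'dummy')  # and so does every edge
    print(G.nodes(data=True))  # [(0, {'dummy': '0'}), (1, {'dummy': '0'}), ...]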
""" -# if self._parallel == 'imap_unordered': +# if self.parallel == 'imap_unordered': # # compute kernels. # def init_worker(alllabels_toshare): # global G_alllabels # G_alllabels = alllabels_toshare # do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix) # parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker, -# glbv=(all_num_of_each_label,), n_jobs=self._n_jobs, verbose=self._verbose) -# elif self._parallel is None: +# glbv=(all_num_of_each_label,), n_jobs=self.n_jobs, verbose=self.verbose) +# elif self.parallel is None: for i in range(len(gram_matrix)): for j in range(i, len(gram_matrix)): gram_matrix[i][j] = self._compute_subtree_kernel(all_num_of_each_label[i],