|
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Thu May 5 09:42:30 2022
-
- @author: ljia
- """
- import sys
- import multiprocessing
- import time
- import numpy as np
- import networkx as nx
-
- # from abc import ABC, abstractmethod
- from sklearn.base import BaseEstimator # , TransformerMixin
- from sklearn.utils.validation import check_is_fitted # check_X_y, check_array,
- from sklearn.exceptions import NotFittedError
-
- from gklearn.ged.model.distances import euclid_d
- from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
- # from gklearn.utils import normalize_gram_matrix
- from gklearn.utils import get_iters
-
-
class GEDModel(BaseEstimator):
    """Graph edit distance (GED) model compatible with `scikit-learn`.

    Parameters
    ----------
    ed_method : str
        Passed to `pairwise_ged` as the ``'method'`` option and to
        `compute_optimal_costs` as ``ed_method``.
    edit_cost_fun : str
        Passed to `pairwise_ged` as the ``'edit_cost'`` option and to
        `get_nb_edit_operations` as ``edit_cost``.
    init_edit_cost_constants : list
        Edit cost constants used when ``optim_method == 'init'``.
    optim_method : {'random', 'init', 'expert', 'fitted'}
        How the edit cost constants are obtained, see `compute_edit_costs`.
    optim_options : dict
        Extra keyword arguments forwarded to `compute_optimal_costs` when
        ``optim_method == 'fitted'``.
    node_labels, edge_labels : list
        Label names forwarded to `get_nb_edit_operations` and
        `compute_optimal_costs`.
    parallel : None or 'imap_unordered'
        Parallelization mode. Only ``None`` (serial) is implemented.
    n_jobs : int or None
        Set to ``multiprocessing.cpu_count()`` by `validate_parameters` when
        ``parallel == 'imap_unordered'`` and no value is given.
    chunksize : int or None
        Stored only; not used by this class.
    copy_graphs : bool
        If True, each graph is copied with ``g.copy()`` before computing.
    verbose : int
        Any truthy value prints timings; ``>= 2`` additionally enables the
        progress iterator and the verbose mode of `compute_optimal_costs`.

    Attributes
    ----------
    _graphs : list
        Stores the input graphs on fit input data.
        Default format of the list objects is `NetworkX` graphs.
        **We don't guarantee that the input graphs remain unchanged during the
        computation.**

    References
    ----------
    https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
    """

    # NOTE: the list/dict defaults below are shared between instances that
    # rely on them. This class never mutates them in place; they are kept
    # unchanged so that `get_params()` keeps returning the same values.
    def __init__(self,
                 ed_method='BIPARTITE',
                 edit_cost_fun='CONSTANT',
                 init_edit_cost_constants=[3, 3, 1, 3, 3, 1],
                 optim_method='init',
                 optim_options={'y_distance': euclid_d, 'mode': 'reg'},
                 node_labels=[],
                 edge_labels=[],
                 parallel=None,
                 n_jobs=None,
                 chunksize=None,
                 copy_graphs=True,  # @todo: make sure it is a full deep copy. and faster!
                 verbose=2):
        """`__init__` for `GEDModel` object; only stores the parameters."""
        self.ed_method = ed_method
        self.edit_cost_fun = edit_cost_fun
        self.init_edit_cost_constants = init_edit_cost_constants
        self.optim_method = optim_method
        self.optim_options = optim_options
        self.node_labels = node_labels
        self.edge_labels = edge_labels
        self.parallel = parallel
        self.n_jobs = n_jobs
        self.chunksize = chunksize
        self.copy_graphs = copy_graphs
        self.verbose = verbose

    ##########################################################################
    # The following is the 1st paradigm to compute GED distance matrix, which
    # is compatible with `scikit-learn`.
    ##########################################################################

    def fit(self, X, y=None):
        """Store a graph dataset (and optionally its targets).

        Parameters
        ----------
        X : list
            The graphs to fit on. Must be a non-empty list.

        y : iterable, optional
            Targets of the graphs. When given they are stored and used by
            `compute_edit_costs` if ``optim_method == 'fitted'``.

        Returns
        -------
        object
            Returns self.

        Raises
        ------
        ValueError
            If the parameters or the input are invalid.
        """
        # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used;
        self.clear_attributes()

        # Validate parameters for the transformer.
        self.validate_parameters()

        # Validate the input.
        self._graphs = self.validate_input(X)
        if y is not None:
            self._targets = y

        return self

    def transform(self, X=None, return_dm_train=False):
        """Compute the GED distance matrix between given and fitted graphs.

        Parameters
        ----------
        X : list, optional
            The graphs to compare against the fitted graphs. Required unless
            `return_dm_train` is True.

        return_dm_train : bool, optional
            If True, return the training distance matrix saved by
            ``fit_transform(..., save_dm_train=True)`` and ignore `X`.

        Returns
        -------
        dis_matrix : numpy array, shape = [len(X), n_fitted_graphs]
            The computed distance matrix (or the saved training matrix).

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the required fitted attribute is missing.
        ValueError
            If `X` is not a non-empty list.
        """
        # If `return_dm_train`, return the fitted GED distance matrix of training data.
        if return_dm_train:
            check_is_fitted(self, '_dm_train')
            self._is_transformed = True
            return self._dm_train  # @todo: copy or not?

        # Check if method "fit" had been called.
        check_is_fitted(self, '_graphs')

        # Validate the input.
        Y = self.validate_input(X)

        # Transform: compute the distance matrix.
        dis_matrix = self.compute_distance_matrix(Y)
        self._Y = Y
        self._is_transformed = True

        return dis_matrix

    def fit_transform(self, X, y=None, save_dm_train=False):
        """Fit and transform: compute GED distance matrix on the same data.

        Parameters
        ----------
        X : list of graphs
            Input graphs.

        y : iterable, optional
            Targets of the graphs, see `fit`.

        save_dm_train : bool, optional
            If True, keep the result so that it can be retrieved later with
            ``transform(return_dm_train=True)``.

        Returns
        -------
        dis_matrix : numpy array, shape = [len(X), len(X)]
            The distance matrix of X.
        """
        self.fit(X, y)

        # Compute edit cost constants.
        self.compute_edit_costs()

        # Transform: compute the distance matrix of the fitted graphs.
        dis_matrix = self.compute_distance_matrix()

        if save_dm_train:
            self._dm_train = dis_matrix

        return dis_matrix

    def get_params(self, deep=True):
        """Return the estimator parameters (delegates to `BaseEstimator`)."""
        return super().get_params(deep=deep)

    def set_params(self, **params):
        """Set the estimator parameters (delegates to `BaseEstimator`).

        Returns
        -------
        object
            Returns self.
        """
        return super().set_params(**params)

    def clear_attributes(self):
        """Remove the attributes created by a previous fit / transform.

        `_edit_cost_constants` is deliberately kept so that costs computed
        beforehand with `compute_edit_costs` survive a call to `fit`.
        """
        for name in ('_graphs', '_targets', '_Y', '_run_time', '_dm_train',
                     '_X_diag', '_Y_diag', '_is_transformed'):
            if hasattr(self, name):
                delattr(self, name)

    def validate_parameters(self):
        """Validate all parameters for the transformer.

        Raises
        ------
        ValueError
            If `parallel` is neither None nor ``'imap_unordered'``.
        """
        if self.parallel is not None and self.parallel != 'imap_unordered':
            raise ValueError('Parallel mode is not set correctly.')

        if self.parallel == 'imap_unordered' and self.n_jobs is None:
            self.n_jobs = multiprocessing.cpu_count()

    def validate_input(self, X):
        """Validate the given input and raise errors if it is invalid.

        Parameters
        ----------
        X : list
            The input to check. Should be a list of graph.

        Raises
        ------
        ValueError
            Raise if the input is None, not a list, or empty.

        Returns
        -------
        X : list
            The input. A list of graph.
        """
        if X is None:
            raise ValueError('Please add graphs before computing.')
        if not isinstance(X, list):
            raise ValueError('Cannot detect graphs. The input must be a list.')
        if len(X) == 0:
            raise ValueError('The graph list given is empty. No computation will be performed.')

        return X

    def compute_distance_matrix(self, Y=None):
        """Compute the distance matrix between given target graphs (Y) and
        the fitted graphs (X / self._graphs), or the distance matrix of the
        fitted graphs themselves.

        Parameters
        ----------
        Y : list of graphs, optional
            The target graphs. The default is None. If None the distances
            are computed between X and itself.

        Returns
        -------
        dis_matrix : numpy array, shape = [n_targets, n_inputs]
            The computed distance matrix.

        Raises
        ------
        ValueError
            If `parallel` is not a supported mode.
        NotImplementedError
            If ``parallel == 'imap_unordered'``.
        """
        # Make sure the edit costs exist, e.g. when `fit` + `transform` are
        # used instead of `fit_transform`.
        self._ensure_edit_costs()

        if Y is None:
            return self._compute_X_distance_matrix()

        start_time = time.time()

        if self.parallel == 'imap_unordered':
            dis_matrix = self._compute_distance_matrix_imap_unordered(Y)
        elif self.parallel is None:
            Y_copy = self._copy_if_requested(Y)
            graphs_copy = self._copy_if_requested(self._graphs)
            dis_matrix = self._compute_distance_matrix_series(Y_copy, graphs_copy)
        else:
            raise ValueError('Parallel mode is not set correctly.')

        self._run_time = time.time() - start_time
        if self.verbose:
            print('Distance matrix of size (%d, %d) built in %s seconds.'
                  % (len(Y), len(self._graphs), self._run_time))

        return dis_matrix

    def _compute_distance_matrix_series(self, X, Y):
        """Compute the GED distance matrix between two sets of graphs (X and Y)
        without parallelization.

        Parameters
        ----------
        X, Y : list of graphs
            The input graphs.

        Returns
        -------
        dis_matrix : numpy array, shape = [n_X, n_Y]
            The computed distance matrix.
        """
        dis_matrix = np.zeros((len(X), len(Y)))

        for i_x, g_x in enumerate(X):
            for i_y, g_y in enumerate(Y):
                dis_matrix[i_x, i_y], _ = self.compute_ged(g_x, g_y)

        return dis_matrix

    def _compute_distance_matrix_imap_unordered(self, Y):
        """Parallel counterpart of `_compute_distance_matrix_series`.

        Raises
        ------
        NotImplementedError
            Always; the parallel computation is not implemented.
        """
        raise NotImplementedError(
            'Parallelization for distance matrix is not implemented.')

    # Former name of the stub above, kept as an alias.
    _compute_kernel_matrix_imap_unordered = _compute_distance_matrix_imap_unordered

    def diagonals(self):
        """Compute the self-distances of the fitted / transformed graphs.

        Relies on `pairwise_distance`, which raises `NotImplementedError`
        unless it is overridden.

        Returns
        -------
        X_diag : numpy array
            `pairwise_distance(x, x)` for each fitted graph.

        Y_diag : numpy array
            `pairwise_distance(y, y)` for each graph of the last transform.
            Only returned (as the second item of a tuple) if `transform`
            has been called.
        """
        # Check if method "fit" had been called.
        check_is_fitted(self, ['_graphs'])

        # Compute the diagonal of X only once.
        if not hasattr(self, '_X_diag'):
            graphs = self._copy_if_requested(self._graphs)
            self._X_diag = np.empty(shape=(len(graphs),))
            for i, x in enumerate(graphs):
                self._X_diag[i] = self.pairwise_distance(x, x)  # @todo: parallel?

        # Without a transform there is only the diagonal of X.
        if not hasattr(self, '_Y'):
            return self._X_diag

        Y = self._copy_if_requested(self._Y)
        self._Y_diag = np.empty(shape=(len(Y),))
        for i, y in enumerate(Y):
            self._Y_diag[i] = self.pairwise_distance(y, y)  # @todo: parallel?

        return self._X_diag, self._Y_diag

    def pairwise_distance(self, x, y):
        """Compute the pairwise distance between two graphs.

        Parameters
        ----------
        x, y : NetworkX Graph.
            Graphs between which the distance is computed.

        Raises
        ------
        NotImplementedError
            Always; meant to be overridden.
        """
        raise NotImplementedError('Pairwise kernel computation is not implemented!')

    def compute_edit_costs(self, Y=None, Y_targets=None):
        """Compute the edit cost constants and store them on the model.

        Depending on `optim_method`:

        - ``'random'``: six values drawn with `numpy.random.rand`.
        - ``'init'``: `init_edit_cost_constants`.
        - ``'expert'``: ``[3, 3, 1, 3, 3, 1]``.
        - ``'fitted'``: result of `compute_optimal_costs` (Jia2021 method)
          on `Y` / `Y_targets`, or on the fitted graphs and targets when
          `Y` is None.

        Parameters
        ----------
        Y : list of graphs, optional
            Graphs used by the ``'fitted'`` method. The default is None.

        Y_targets : iterable, optional
            Targets of `Y`. Only used when `Y` is given.

        Raises
        ------
        ValueError
            If `optim_method` is not one of the values above.
        sklearn.exceptions.NotFittedError
            If ``'fitted'`` is used without `Y` before fitting graphs and
            targets.
        """
        if self.optim_method == 'random':
            self._edit_cost_constants = np.random.rand(6)

        elif self.optim_method == 'init':
            self._edit_cost_constants = self.init_edit_cost_constants

        elif self.optim_method == 'expert':
            self._edit_cost_constants = [3, 3, 1, 3, 3, 1]

        elif self.optim_method == 'fitted':  # Jia2021 method
            # Get proper inputs.
            if Y is None:
                check_is_fitted(self, ['_graphs'])
                check_is_fitted(self, ['_targets'])
                graphs = self._copy_if_requested(self._graphs)
                targets = self._targets
            else:
                graphs = self._copy_if_requested(Y)
                targets = Y_targets

            # Get optimization options.
            node_labels = self.node_labels
            edge_labels = self.edge_labels
            unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0)
            # Imported here rather than at module level, as in the original.
            from gklearn.ged.model.optim_costs import compute_optimal_costs
            self._edit_cost_constants = compute_optimal_costs(
                graphs, targets,
                node_labels=node_labels, edge_labels=edge_labels,
                unlabeled=unlabeled, ed_method=self.ed_method,
                verbose=(self.verbose >= 2),
                **self.optim_options)

        else:
            raise ValueError(
                'Unknown optim_method %r; expected one of "random", "init", '
                '"expert" or "fitted".' % (self.optim_method,))

    ##########################################################################
    # Internal helpers.
    ##########################################################################

    def _copy_if_requested(self, graphs):
        """Return copies of `graphs` if `copy_graphs` is set, else `graphs`."""
        return [g.copy() for g in graphs] if self.copy_graphs else graphs

    def _ensure_edit_costs(self):
        """Compute the edit cost constants if they are not available yet."""
        if not hasattr(self, '_edit_cost_constants'):
            self.compute_edit_costs()

    def _compute_X_distance_matrix(self):
        """Compute, time and return the distance matrix of the fitted graphs.

        Raises
        ------
        ValueError
            If `parallel` is not a supported mode.
        NotImplementedError
            If ``parallel == 'imap_unordered'``.
        """
        start_time = time.time()

        if self.parallel == 'imap_unordered':
            dis_matrix = self._compute_X_dm_imap_unordered(self._graphs)
        elif self.parallel is None:
            graphs = self._copy_if_requested(self._graphs)
            dis_matrix = self._compute_X_dm_series(graphs)
        else:
            raise ValueError('Parallel mode is not set correctly.')

        self._run_time = time.time() - start_time
        if self.verbose:
            print('Distance matrix of size %d built in %s seconds.'
                  % (len(self._graphs), self._run_time))

        return dis_matrix

    def _compute_X_dm_series(self, graphs):
        """Compute the symmetric distance matrix of `graphs` serially.

        Only the pairs above the diagonal are computed and then mirrored;
        the diagonal is left at zero.
        """
        N = len(graphs)
        dis_matrix = np.zeros((N, N))

        for i, G1 in get_iters(enumerate(graphs), desc='Computing distance matrix', file=sys.stdout, verbose=(self.verbose >= 2)):
            for j, G2 in enumerate(graphs[i+1:], i+1):
                dis_matrix[i, j], _ = self.compute_ged(G1, G2)
                dis_matrix[j, i] = dis_matrix[i, j]
        return dis_matrix

    def _compute_X_dm_imap_unordered(self, graphs):
        """Parallel counterpart of `_compute_X_dm_series`.

        Raises
        ------
        NotImplementedError
            Always; the parallel computation is not implemented.
        """
        raise NotImplementedError(
            'Parallelization for distance matrix is not implemented.')

    def compute_ged(self, Gi, Gj, **kwargs):
        """Compute GED between two graphs according to the edit costs.

        Parameters
        ----------
        Gi, Gj : NetworkX Graph.
            The two graphs to compare.

        **kwargs
            ``repeats`` (default 10) is forwarded to `pairwise_ged`; other
            keys are ignored.

        Returns
        -------
        dis : float
            The distance returned by `pairwise_ged`.

        n_eo_tmp
            The result of `get_nb_edit_operations` for the computed maps.
        """
        self._ensure_edit_costs()

        ged_options = {'edit_cost': self.edit_cost_fun,
                       'method': self.ed_method,
                       'edit_cost_constants': self._edit_cost_constants}
        dis, pi_forward, pi_backward = pairwise_ged(
            Gi, Gj, ged_options, repeats=kwargs.get('repeats', 10))
        n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward,
                                          edit_cost=self.edit_cost_fun,
                                          node_labels=self.node_labels,
                                          edge_labels=self.edge_labels)
        return dis, n_eo_tmp

    def is_graph(self, graph):
        """Return True if `graph` is a NetworkX graph of any kind."""
        # DiGraph, MultiGraph and MultiDiGraph are all subclasses of nx.Graph.
        return isinstance(graph, nx.Graph)

    @property
    def graphs(self):
        """The fitted graphs."""
        return self._graphs

    @property
    def run_time(self):
        """Duration in seconds of the last distance matrix computation."""
        return self._run_time

    @property
    def dis_matrix(self):
        """The distance matrix stored through the `dis_matrix` setter."""
        return self._dis_matrix

    @dis_matrix.setter
    def dis_matrix(self, value):
        self._dis_matrix = value
-
-
- # @property
- # def gram_matrix_unnorm(self):
- # return self._gram_matrix_unnorm
-
- # @gram_matrix_unnorm.setter
- # def gram_matrix_unnorm(self, value):
- # self._gram_matrix_unnorm = value
|