Browse Source

[Major Features] Add GEDModel which is compatible with `scikit-learn`.

v0.2.x
jajupmochi 3 years ago
parent
commit
5e37d4447f
5 changed files with 1014 additions and 0 deletions
  1. +1
    -0
      gklearn/ged/__init__.py
  2. +43
    -0
      gklearn/ged/model/distances.py
  3. +97
    -0
      gklearn/ged/model/ged_com.py
  4. +724
    -0
      gklearn/ged/model/ged_model.py
  5. +149
    -0
      gklearn/ged/model/optim_costs.py

+ 1
- 0
gklearn/ged/__init__.py View File

@@ -0,0 +1 @@
from gklearn.ged.model.ged_model import GEDModel

+ 43
- 0
gklearn/ged/model/distances.py View File

@@ -0,0 +1,43 @@
import numpy as np


def sum_squares(a, b):
    """
    Return the sum of the squared element-wise differences between a and b.

    This is the sum of squared errors: it is NOT divided by the number of
    elements, so it is not the mean squared error.

    :param a: sized, indexable sequence of values
    :param b: sequence indexed at the positions of ``a``; it must hold at
        least ``len(a)`` elements (extra elements are ignored)
    :return: sum over i of ``(a[i] - b[i]) ** 2``; ``0.0`` when ``a`` is empty
    """
    return np.sum([(a_i - b[i]) ** 2 for i, a_i in enumerate(a)])


def euclid_d(x, y):
    """
    Euclidean distance between two values, computed as sqrt((x - y)^2).

    :param x: first value
    :param y: second value
    :return: the non-negative square root of the squared difference
    """
    delta = x - y
    squared = delta ** 2
    return np.sqrt(squared)


def man_d(x, y):
    """
    Manhattan distance between two values, i.e. the absolute difference.

    :param x: first value
    :param y: second value
    :return: ``|x - y|``
    """
    delta = x - y
    return np.abs(delta)


def classif_d(x, y):
    """
    0/1 distance for classification targets.

    :param x: first label
    :param y: second label
    :return: a zero-dimensional numpy array holding 0 when the labels compare
        equal and 1 otherwise
    """
    if x == y:
        return np.array(0)
    return np.array(1)


def rmse(pred, ground_truth):
    """
    Root mean squared error between predictions and ground truth.

    The squared error is summed with ``sum_squares`` and divided by
    ``len(ground_truth)`` before taking the square root.
    """
    import numpy as np
    squared_error = sum_squares(pred, ground_truth)
    mean_squared_error = squared_error / len(ground_truth)
    return np.sqrt(mean_squared_error)


def accuracy(pred, ground_truth):
    """
    Fraction of positions at which prediction and ground truth compare equal.

    The two inputs are walked in lockstep; the walk stops at the shorter one.
    """
    import numpy as np
    matches = []
    for predicted, expected in zip(pred, ground_truth):
        matches.append(predicted == expected)
    return np.mean(matches)


def rbf_k(D, sigma=1):
    """
    Turn a distance (or array of distances) into a similarity value
    ``exp(-D^2 / sigma)``.

    Note that ``sigma`` divides the squared distance directly; it is neither
    squared nor doubled here.
    """
    exponent = -(D ** 2) / sigma
    return np.exp(exponent)

+ 97
- 0
gklearn/ged/model/ged_com.py View File

@@ -0,0 +1,97 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 5 14:02:17 2022

@author: ljia
"""
import sys
from gklearn.ged.model.distances import euclid_d
from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
from gklearn.utils import get_iters


def compute_ged(Gi, Gj, edit_cost, method='BIPARTITE', **kwargs):
    """
    Compute the GED between two graphs for the given edit cost constants.

    The 'CONSTANT' edit cost function is always used and ``pairwise_ged`` is
    called with ``repeats=10``.

    :param Gi: first graph
    :param Gj: second graph
    :param edit_cost: edit cost constants handed to the GED solver
    :param method: GED method name handed to the GED solver
    :param kwargs: only ``node_labels`` and ``edge_labels`` are read (both
        default to an empty list); other keys are ignored
    :return: (distance, numbers of edit operations) as returned by
        ``pairwise_ged`` and ``get_nb_edit_operations``
    """
    labels = {
        'node_labels': kwargs.get('node_labels', []),
        'edge_labels': kwargs.get('edge_labels', []),
    }
    options = dict(edit_cost='CONSTANT', method=method,
                   edit_cost_constants=edit_cost)
    distance, forward_map, backward_map = pairwise_ged(
        Gi, Gj, options, repeats=10)
    n_edit_ops = get_nb_edit_operations(
        Gi, Gj, forward_map, backward_map, edit_cost='CONSTANT', **labels)
    return distance, n_edit_ops


def compute_ged_all_dataset(Gn, edit_cost, ed_method, **kwargs):
    """
    Compute GEDs for every index pair (i, j) with i <= j of the graph list.

    Each graph is also paired with itself.

    :return: whatever ``compute_geds`` returns for these pairs
    """
    n_graphs = len(Gn)
    index_pairs = [[i, j] for i in range(n_graphs) for j in range(i, n_graphs)]
    return compute_geds(index_pairs, Gn, edit_cost, ed_method, **kwargs)


def compute_geds(G_pairs, Gn, edit_cost, ed_method, verbose=True, **kwargs):
    """
    Compute the GED for every pair of graph indices listed in G_pairs.

    :param G_pairs: list of two-element index pairs into ``Gn``
    :param Gn: list of graphs
    :param edit_cost: edit cost constants forwarded to ``compute_ged``
    :param ed_method: GED method name forwarded to ``compute_ged``
    :param verbose: forwarded to the progress iterator
    :param kwargs: forwarded to ``compute_ged``
    :return: ged_vec : the list of computed distances, n_edit_operations :
        the list of edit operation counts, both in the order of G_pairs
    """
    n_pairs = len(G_pairs)
    progress = get_iters(range(n_pairs), desc='Computing GED',
                         file=sys.stdout, length=n_pairs, verbose=verbose)
    distances, edit_operation_counts = [], []
    for pair_idx in progress:
        first, second = G_pairs[pair_idx]
        distance, n_ops = compute_ged(
            Gn[first], Gn[second], edit_cost=edit_cost, method=ed_method,
            **kwargs)
        distances.append(distance)
        edit_operation_counts.append(n_ops)

    return distances, edit_operation_counts


def compute_D(G_app, edit_cost, G_test=None, ed_method='BIPARTITE', **kwargs):
    """
    Build GED distance matrices for a training set and, optionally, a test set.

    :param G_app: list of training graphs
    :param edit_cost: edit cost constants forwarded to ``compute_ged``
    :param G_test: optional list of test graphs
    :param ed_method: GED method name forwarded to ``compute_ged``
    :param kwargs: forwarded to ``compute_ged``
    :return: ``(D_app, edit_cost)`` when G_test is None, otherwise
        ``(D_app, D_test, edit_cost)``. D_app is symmetric with a zero
        diagonal (only pairs i < j are computed); D_test has one row per test
        graph and one column per training graph.
    """
    import numpy as np
    n_app = len(G_app)
    D_app = np.zeros((n_app, n_app))

    app_rows = get_iters(enumerate(G_app), desc='Computing D - app',
                         file=sys.stdout, length=n_app)
    for row, g_row in app_rows:
        for col in range(row + 1, n_app):
            distance, _ = compute_ged(
                g_row, G_app[col], edit_cost, method=ed_method, **kwargs)
            # Mirror the value: only the upper triangle is computed.
            D_app[row, col] = distance
            D_app[col, row] = distance

    if G_test is None:
        return D_app, edit_cost

    n_test = len(G_test)
    D_test = np.zeros((n_test, n_app))
    test_rows = get_iters(enumerate(G_test), desc='Computing D - test',
                          file=sys.stdout, length=n_test)
    for row, g_test in test_rows:
        for col, g_app in enumerate(G_app):
            distance, _ = compute_ged(
                g_test, g_app, edit_cost, method=ed_method, **kwargs)
            D_test[row, col] = distance
    return D_app, D_test, edit_cost


def compute_D_random(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
    """
    Compute distance matrices using six edit costs drawn uniformly from [0, 1).

    :return: the result of ``compute_D`` for the drawn costs
    """
    import numpy as np
    random_costs = np.random.rand(6)
    return compute_D(G_app, random_costs, G_test, ed_method=ed_method,
                     **kwargs)


def compute_D_expert(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
    """
    Compute distance matrices using the fixed edit costs [3, 3, 1, 3, 3, 1].

    :return: the result of ``compute_D`` for these costs
    """
    expert_costs = [3, 3, 1, 3, 3, 1]
    return compute_D(G_app, expert_costs, G_test, ed_method=ed_method,
                     **kwargs)


def compute_D_fitted(G_app, y_app, G_test=None, y_distance=euclid_d,
                     mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs):
    """
    Fit edit costs on the training graphs and compute distance matrices
    with them.

    :param G_app: list of training graphs
    :param y_app: targets of the training graphs
    :param G_test: optional list of test graphs
    :param y_distance: distance function between two targets
    :param mode: forwarded to ``compute_optimal_costs``
    :param unlabeled: forwarded to ``compute_optimal_costs``
    :param ed_method: GED method name
    :param kwargs: forwarded to both ``compute_optimal_costs`` and
        ``compute_D``
    :return: the result of ``compute_D`` for the fitted costs
    """
    # The package directory is ``gklearn/ged/model`` (singular); importing
    # from ``gklearn.ged.models`` raised ModuleNotFoundError.
    # Imported here rather than at module level because optim_costs itself
    # imports this module.
    from gklearn.ged.model.optim_costs import compute_optimal_costs

    costs_optim = compute_optimal_costs(
        G_app, y_app, y_distance=y_distance,
        mode=mode, unlabeled=unlabeled, ed_method=ed_method, **kwargs)
    return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs)


def compute_D_GH2020(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
    """
    Compute distance matrices using edit costs loaded by
    ``get_optimal_costs_GH2020``.

    :param G_app: list of training graphs
    :param G_test: optional list of test graphs
    :param ed_method: GED method name
    :param kwargs: forwarded to both ``get_optimal_costs_GH2020`` (which reads
        ``ds_name`` and ``nb_trial``) and ``compute_D``
    :return: the result of ``compute_D`` for the loaded costs
    """
    # ``get_optimal_costs_GH2020`` is defined in gklearn/ged/model/optim_costs.py;
    # the previous import path ``gklearn.ged.optim_costs`` lacked the ``model``
    # package component.
    from gklearn.ged.model.optim_costs import get_optimal_costs_GH2020
    costs_optim = get_optimal_costs_GH2020(**kwargs)
    return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs)

+ 724
- 0
gklearn/ged/model/ged_model.py View File

@@ -0,0 +1,724 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 5 09:42:30 2022

@author: ljia
"""
import sys
import multiprocessing
import time
import numpy as np
import networkx as nx

# from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator # , TransformerMixin
from sklearn.utils.validation import check_is_fitted # check_X_y, check_array,
from sklearn.exceptions import NotFittedError

from gklearn.ged.model.distances import euclid_d
from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
# from gklearn.utils import normalize_gram_matrix
from gklearn.utils import get_iters


class GEDModel(BaseEstimator): #, ABC):
"""The graph edit distance model class compatible with `scikit-learn`.

Attributes
----------
_graphs : list
Stores the input graphs on fit input data.
Default format of the list objects is `NetworkX` graphs.
**We don't guarantee that the input graphs remain unchanged during the
computation.**

References
----------
https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
"""

def __init__(self,
             ed_method='BIPARTITE',
             edit_cost_fun='CONSTANT',
             init_edit_cost_constants=[3, 3, 1, 3, 3, 1],
             optim_method='init',
             optim_options={'y_distance': euclid_d, 'mode': 'reg'},
             node_labels=[],
             edge_labels=[],
             parallel=None,
             n_jobs=None,
             chunksize=None,
             # normalize=True,
             copy_graphs=True, # make sure it is a full deep copy. and faster!
             verbose=2):
    """Store the hyper-parameters of the `GEDModel`; no computation is done.

    Parameters
    ----------
    ed_method : str
        GED method name, passed as `'method'` in the GED options.
    edit_cost_fun : str
        Edit cost function name, passed as `'edit_cost'` in the GED options.
    init_edit_cost_constants : list
        Edit cost constants used as-is when `optim_method` is `'init'`.
    optim_method : str
        How edit cost constants are obtained in `compute_edit_costs`:
        `'random'`, `'init'`, `'expert'` or `'fitted'`.
    optim_options : dict
        Extra keyword arguments given to `compute_optimal_costs` when
        `optim_method` is `'fitted'`.
    node_labels, edge_labels : list
        Label names forwarded to the edit-operation counting and to the
        cost optimization.
    parallel : None or str
        `None` (serial) or `'imap_unordered'`; checked in
        `validate_parameters`.
    n_jobs : int, optional
        Replaced by the CPU count in `validate_parameters` when `parallel` is
        `'imap_unordered'` and this is None.
    chunksize : optional
        Stored only; not read by the code visible in this class.
    copy_graphs : bool
        When true, graphs are copied with `g.copy()` before computations.
    verbose : int
        Any truthy value prints timings; values >= 2 also enable progress
        output.

    Notes
    -----
    NOTE(review): the list/dict defaults are created once and shared by every
    instance built with the defaults; they are stored by reference here.
    """
    # @todo: the default settings of the parameters are different from those in the self.compute method.
    # self._graphs = None
    self.ed_method = ed_method
    self.edit_cost_fun = edit_cost_fun
    self.init_edit_cost_constants = init_edit_cost_constants
    self.optim_method=optim_method
    self.optim_options=optim_options
    self.node_labels=node_labels
    self.edge_labels=edge_labels
    self.parallel = parallel
    self.n_jobs = n_jobs
    self.chunksize = chunksize
    # self.normalize = normalize
    self.copy_graphs = copy_graphs
    self.verbose = verbose
    # self._run_time = 0
    # self._gram_matrix = None
    # self._gram_matrix_unnorm = None


##########################################################################
# The following is the 1st paradigm to compute GED distance matrix, which is
# compatible with `scikit-learn`.
##########################################################################


def fit(self, X, y=None):
"""Fit a graph dataset for a transformer.

Parameters
----------
X : iterable
DESCRIPTION.

y : None, optional
There is no need of a target in a transformer, yet the `scikit-learn`
pipeline API requires this parameter.

Returns
-------
object
Returns self.

"""
# self._is_tranformed = False

# Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used;
self.clear_attributes()

# Validate parameters for the transformer.
self.validate_parameters()

# Validate the input.
self._graphs = self.validate_input(X)
if y is not None:
self._targets = y
# self._targets = self.validate_input(y)

# self._X = X
# self._kernel = self._get_kernel_instance()

# Return the transformer.
return self


def transform(self, X=None, return_dm_train=False):
    """Compute the GED distance matrix between given and fitted graphs.

    Parameters
    ----------
    X : list, optional
        The graphs to compare with the fitted graphs. Required unless
        `return_dm_train` is true.

    return_dm_train : bool, optional
        If true, ignore `X` and return the training distance matrix saved
        by `fit_transform(..., save_dm_train=True)`.

    Raises
    ------
    NotFittedError
        If the model has not been fitted, if no training distance matrix
        was saved while `return_dm_train` is true, or if the edit cost
        constants have not been computed yet.

    ValueError
        If `X` is not a non-empty list.

    Returns
    -------
    dis_matrix : numpy array
        The saved training distance matrix, or the distance matrix between
        `X` and the fitted graphs as built by `compute_distance_matrix`.

    """
    # If `return_dm_train`, return the fitted GED distance matrix of training data.
    if return_dm_train:
        check_is_fitted(self, '_dm_train')
        self._is_transformed = True
        return self._dm_train # @todo: copy or not?

    # Check if method "fit" had been called.
    check_is_fitted(self, '_graphs')

    # `fit` alone does not compute the edit cost constants that `compute_ged`
    # reads; fail here with a clear message instead of an AttributeError deep
    # inside the distance computation. NotFittedError derives from both
    # ValueError and AttributeError, so existing handlers keep working.
    if not hasattr(self, '_edit_cost_constants'):
        raise NotFittedError(
            "Edit cost constants are not computed yet. Call 'fit_transform' "
            "or 'compute_edit_costs' before 'transform'.")

    # Validate the input.
    Y = self.validate_input(X)

    # Transform: compute the GED distance matrix.
    dis_matrix = self.compute_distance_matrix(Y)
    self._Y = Y

    self._is_transformed = True

    return dis_matrix


def fit_transform(self, X, y=None, save_dm_train=False):
    """Fit on the graphs, compute the edit costs and return the GED
    distance matrix of the graphs with themselves.

    Parameters
    ----------
    X : list of graphs
        Input graphs.

    y : optional
        Targets of the input graphs, forwarded to `fit`.

    save_dm_train : bool, optional
        If true, keep the resulting matrix as `_dm_train` so that
        `transform(return_dm_train=True)` can return it later.

    Returns
    -------
    dis_matrix : numpy array, shape = [len(X), len(X)]
        The distance matrix of X.

    """
    self.fit(X, y)

    # The edit cost constants must exist before any GED is computed.
    self.compute_edit_costs()

    # Distance matrix of the fitted graphs against themselves.
    train_dm = self.compute_distance_matrix()

    if save_dm_train:
        self._dm_train = train_dm

    return train_dm


def get_params(self, deep=True):
    """Get the parameters of this estimator.

    The previous stub returned None and did not accept `deep`, which broke
    `sklearn.base.clone` and every tool built on it. The work is delegated
    to the `BaseEstimator` implementation.

    Parameters
    ----------
    deep : bool, optional
        Forwarded to `BaseEstimator.get_params`. The default is True.

    Returns
    -------
    dict
        Parameter names mapped to their values.

    """
    return super().get_params(deep=deep)


def set_params(self, **params):
    """Set the parameters of this estimator.

    The previous stub accepted no parameters and did nothing, so
    `scikit-learn` tools could not configure the model. The work is
    delegated to the `BaseEstimator` implementation.

    Parameters
    ----------
    **params : dict
        Estimator parameters, forwarded to `BaseEstimator.set_params`.

    Returns
    -------
    object
        Whatever `BaseEstimator.set_params` returns (the estimator itself).

    """
    return super().set_params(**params)


def clear_attributes(self): # @todo: update
# if hasattr(self, '_X_diag'):
# delattr(self, '_X_diag')
if hasattr(self, '_graphs'):
delattr(self, '_graphs')
if hasattr(self, '_Y'):
delattr(self, '_Y')
if hasattr(self, '_run_time'):
delattr(self, '_run_time')


def validate_parameters(self):
"""Validate all parameters for the transformer.

Returns
-------
None.

"""
if self.parallel is not None and self.parallel != 'imap_unordered':
raise ValueError('Parallel mode is not set correctly.')

if self.parallel == 'imap_unordered' and self.n_jobs is None:
self.n_jobs = multiprocessing.cpu_count()


def validate_input(self, X):
    """Check that the input is a non-empty list and hand it back.

    The elements of the list are not inspected.

    Parameters
    ----------
    X : list
        The input to check. Should be a list of graphs.

    Raises
    ------
    ValueError
        If the input is None, is not a list, or is an empty list.

    Returns
    -------
    X : list
        The very same object that was passed in.

    """
    if X is None:
        raise ValueError('Please add graphs before computing.')

    if not isinstance(X, list):
        raise ValueError('Cannot detect graphs. The input must be a list.')

    if len(X) == 0:
        raise ValueError('The graph list given is empty. No computation will be performed.')

    return X


def compute_distance_matrix(self, Y=None):
"""Compute the distance matrix between a given target graphs (Y) and
the fitted graphs (X / self._graphs) or the distance matrix for the fitted
graphs (X / self._graphs).

Parameters
----------
Y : list of graphs, optional
The target graphs. The default is None. If None kernel is computed
between X and itself.

Returns
-------
kernel_matrix : numpy array, shape = [n_targets, n_inputs]
The computed kernel matrix.

"""
if Y is None:
# Compute Gram matrix for self._graphs (X).
dis_matrix = self._compute_X_distance_matrix()
# self._gram_matrix_unnorm = np.copy(self._gram_matrix)

else:
# Compute kernel matrix between Y and self._graphs (X).
start_time = time.time()

if self.parallel == 'imap_unordered':
dis_matrix = self._compute_distance_matrix_imap_unordered(Y)

elif self.parallel is None:
Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y)
graphs_copy = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
dis_matrix = self._compute_distance_matrix_series(Y_copy, graphs_copy)

self._run_time = time.time() - start_time
if self.verbose:
print('Distance matrix of size (%d, %d) built in %s seconds.'
% (len(Y), len(self._graphs), self._run_time))

return dis_matrix


def _compute_distance_matrix_series(self, X, Y):
    """Compute the GED distance matrix between two lists of graphs, one
    pair after the other (no parallelization).

    Parameters
    ----------
    X, Y : list of graphs
        The input graphs. X indexes the rows, Y the columns.

    Returns
    -------
    dis_matrix : numpy array, shape = [n_X, n_Y]
        Entry (i, j) is the distance returned by `compute_ged(X[i], Y[j])`.

    """
    n_rows, n_cols = len(X), len(Y)
    dis_matrix = np.zeros((n_rows, n_cols))

    for row, g_row in enumerate(X):
        for col, g_col in enumerate(Y):
            # The edit-operation counts are not needed here.
            distance, _ = self.compute_ged(g_row, g_col)
            dis_matrix[row, col] = distance

    return dis_matrix


def _compute_kernel_matrix_imap_unordered(self, Y):
    """Placeholder for the parallel (imap unordered) computation of the
    matrix between given target graphs (Y) and the fitted graphs
    (X / self._graphs). It is not implemented and always raises.

    Parameters
    ----------
    Y : list of graphs
        The target graphs. Unused.

    Raises
    ------
    Exception
        Always.

    """
    # NOTE(review): the name and the message still say "kernel"; this looks
    # inherited from a graph-kernel template - confirm before renaming.
    raise Exception('Parallelization for kernel matrix is not implemented.')


def diagonals(self):
    """Compute each graph's value with itself for the fit/transformed data.

    NOTE(review): this method calls `self.pairwise_kernel`, which is not
    defined in the visible part of this class (only `pairwise_distance`
    is, and it raises NotImplementedError). As written it looks like a
    leftover of a graph-kernel template - confirm whether it is still
    needed.

    Returns
    -------
    X_diag : numpy array
        One value per fitted graph, each computed between the graph and
        itself. Computed once and cached as `_X_diag`.

    Y_diag : numpy array
        One value per transformed graph (`_Y`), each computed between the
        graph and itself. Only returned when `_Y` exists; otherwise
        `X_diag` alone is returned.

    """
    # Check if method "fit" had been called.
    check_is_fitted(self, ['_graphs'])

    # Check if the diagonals of X exist.
    try:
        check_is_fitted(self, ['_X_diag'])
    except NotFittedError:
        # Compute diagonals of X.
        self._X_diag = np.empty(shape=(len(self._graphs),))
        graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
        for i, x in enumerate(graphs):
            self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel?

    try:
        # If transform has happened, return both diagonals.
        check_is_fitted(self, ['_Y'])
        # Unlike `_X_diag`, the diagonal of Y is recomputed on every call.
        self._Y_diag = np.empty(shape=(len(self._Y),))
        Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y)
        for (i, y) in enumerate(Y):
            self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel?

        return self._X_diag, self._Y_diag
    except NotFittedError:
        # Else just return X_diag
        return self._X_diag


# @abstractmethod
def pairwise_distance(self, x, y):
    """Placeholder for the pairwise computation between two graphs.

    It is not implemented in this class and always raises.

    Parameters
    ----------
    x, y : NetworkX Graph.
        The two graphs. Unused.

    Raises
    ------
    NotImplementedError
        Always.

    """
    # NOTE(review): the message still says "kernel" although the method is
    # named `pairwise_distance` - confirm the intended wording.
    raise NotImplementedError('Pairwise kernel computation is not implemented!')



def compute_edit_costs(self, Y=None, Y_targets=None):
"""Compute edit cost constants. When optimizing method is `fiited`,
apply Jia2021's metric learning method by using a given target graphs (Y)
the fitted graphs (X / self._graphs).

Parameters
----------
Y : TYPE, optional
DESCRIPTION. The default is None.

Returns
-------
None.

"""
# Get or compute.
if self.optim_method == 'random':
self._edit_cost_constants = np.random.rand(6)

elif self.optim_method == 'init':
self._edit_cost_constants = self.init_edit_cost_constants


elif self.optim_method == 'expert':
self._edit_cost_constants = [3, 3, 1, 3, 3, 1]


elif self.optim_method == 'fitted': # Jia2021 method
# Get proper inputs.
if Y is None:
check_is_fitted(self, ['_graphs'])
check_is_fitted(self, ['_targets'])
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
targets = self._targets
else:
graphs = ([g.copy() for g in Y] if self.copy_graphs else Y)
targets = Y_targets

# Get optimization options.
node_labels = self.node_labels
edge_labels = self.edge_labels
unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0)
from gklearn.ged.model.optim_costs import compute_optimal_costs
self._edit_cost_constants = compute_optimal_costs(
graphs, targets,
node_labels=node_labels, edge_labels=edge_labels,
unlabeled=unlabeled, ed_method=self.ed_method,
verbose=(self.verbose >= 2),
**self.optim_options)


##########################################################################
# The following is the 2nd paradigm to compute kernel matrix. It is
# simplified and not compatible with `scikit-learn`.
##########################################################################


# def compute(self, *graphs, **kwargs):
# self.parallel = kwargs.get('parallel', 'imap_unordered')
# self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
# self.normalize = kwargs.get('normalize', True)
# self.verbose = kwargs.get('verbose', 2)
# self.copy_graphs = kwargs.get('copy_graphs', True)
# self.save_unnormed = kwargs.get('save_unnormed', True)
# self.validate_parameters()

# # If the inputs is a list of graphs.
# if len(graphs) == 1:
# if not isinstance(graphs[0], list):
# raise Exception('Cannot detect graphs.')
# elif len(graphs[0]) == 0:
# raise Exception('The graph list given is empty. No computation was performed.')
# else:
# if self.copy_graphs:
# self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow.
# else:
# self._graphs = graphs
# self._gram_matrix = self._compute_gram_matrix()

# if self.save_unnormed:
# self._gram_matrix_unnorm = np.copy(self._gram_matrix)
# if self.normalize:
# self._gram_matrix = normalize_gram_matrix(self._gram_matrix)
# return self._gram_matrix, self._run_time

# elif len(graphs) == 2:
# # If the inputs are two graphs.
# if self.is_graph(graphs[0]) and self.is_graph(graphs[1]):
# if self.copy_graphs:
# G0, G1 = graphs[0].copy(), graphs[1].copy()
# else:
# G0, G1 = graphs[0], graphs[1]
# kernel = self._compute_single_kernel(G0, G1)
# return kernel, self._run_time

# # If the inputs are a graph and a list of graphs.
# elif self.is_graph(graphs[0]) and isinstance(graphs[1], list):
# if self.copy_graphs:
# g1 = graphs[0].copy()
# g_list = [g.copy() for g in graphs[1]]
# kernel_list = self._compute_kernel_list(g1, g_list)
# else:
# kernel_list = self._compute_kernel_list(graphs[0], graphs[1])
# return kernel_list, self._run_time

# elif isinstance(graphs[0], list) and self.is_graph(graphs[1]):
# if self.copy_graphs:
# g1 = graphs[1].copy()
# g_list = [g.copy() for g in graphs[0]]
# kernel_list = self._compute_kernel_list(g1, g_list)
# else:
# kernel_list = self._compute_kernel_list(graphs[1], graphs[0])
# return kernel_list, self._run_time

# else:
# raise Exception('Cannot detect graphs.')

# elif len(graphs) == 0 and self._graphs is None:
# raise Exception('Please add graphs before computing.')

# else:
# raise Exception('Cannot detect graphs.')


# def normalize_gm(self, gram_matrix):
# import warnings
# warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning)

# diag = gram_matrix.diagonal().copy()
# for i in range(len(gram_matrix)):
# for j in range(i, len(gram_matrix)):
# gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
# gram_matrix[j][i] = gram_matrix[i][j]
# return gram_matrix


# def compute_distance_matrix(self):
# if self._gram_matrix is None:
# raise Exception('Please compute the Gram matrix before computing distance matrix.')
# dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix)))
# for i in range(len(self._gram_matrix)):
# for j in range(i, len(self._gram_matrix)):
# dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j]
# if dis < 0:
# if dis > -1e-10:
# dis = 0
# else:
# raise ValueError('The distance is negative.')
# dis_mat[i, j] = np.sqrt(dis)
# dis_mat[j, i] = dis_mat[i, j]
# dis_max = np.max(np.max(dis_mat))
# dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
# dis_mean = np.mean(np.mean(dis_mat))
# return dis_mat, dis_max, dis_min, dis_mean


def _compute_X_distance_matrix(self):
start_time = time.time()

if self.parallel == 'imap_unordered':
dis_matrix = self._compute_X_dm_imap_unordered()
elif self.parallel is None:
graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs)
dis_matrix = self._compute_X_dm_series(graphs)
else:
raise Exception('Parallel mode is not set correctly.')

self._run_time = time.time() - start_time
if self.verbose:
print('Distance matrix of size %d built in %s seconds.'
% (len(self._graphs), self._run_time))

return dis_matrix


def _compute_X_dm_series(self, graphs):
    """Compute the symmetric GED distance matrix of a list of graphs, one
    pair after the other.

    Only pairs with i < j are computed and mirrored; the diagonal stays 0.

    Parameters
    ----------
    graphs : list of graphs
        The input graphs.

    Returns
    -------
    dis_matrix : numpy array, shape = [len(graphs), len(graphs)]
        The distance matrix.

    """
    n_graphs = len(graphs)
    dis_matrix = np.zeros((n_graphs, n_graphs))

    show_progress = (self.verbose >= 2)
    rows = get_iters(enumerate(graphs), desc='Computing distance matrix',
                     file=sys.stdout, verbose=show_progress)
    for row, g_row in rows:
        for col in range(row + 1, n_graphs):
            distance, _ = self.compute_ged(g_row, graphs[col])
            dis_matrix[row, col] = distance
            dis_matrix[col, row] = distance
    return dis_matrix


def _compute_X_dm_imap_unordered(self, graphs):
pass


def compute_ged(self, Gi, Gj, **kwargs):
    """
    Compute the GED between two graphs with the settings of the model.

    Uses `edit_cost_fun`, `ed_method` and the stored `_edit_cost_constants`;
    `pairwise_ged` is called with `repeats=10`. Extra keyword arguments are
    accepted but not used.

    Returns
    -------
    tuple
        (distance, numbers of edit operations) as returned by
        `pairwise_ged` and `get_nb_edit_operations`.
    """
    cost_fun = self.edit_cost_fun
    options = {
        'edit_cost': cost_fun,
        'method': self.ed_method,
        'edit_cost_constants': self._edit_cost_constants,
    }
    distance, forward_map, backward_map = pairwise_ged(
        Gi, Gj, options, repeats=10)
    n_edit_ops = get_nb_edit_operations(
        Gi, Gj, forward_map, backward_map,
        edit_cost=cost_fun,
        node_labels=self.node_labels,
        edge_labels=self.edge_labels)
    return distance, n_edit_ops


# def _compute_kernel_list(self, g1, g_list):
# start_time = time.time()

# if self.parallel == 'imap_unordered':
# kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list)
# elif self.parallel is None:
# kernel_list = self._compute_kernel_list_series(g1, g_list)
# else:
# raise Exception('Parallel mode is not set correctly.')

# self._run_time = time.time() - start_time
# if self.verbose:
# print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.'
# % (len(g_list), self._run_time))

# return kernel_list


# def _compute_kernel_list_series(self, g1, g_list):
# pass


# def _compute_kernel_list_imap_unordered(self, g1, g_list):
# pass


# def _compute_single_kernel(self, g1, g2):
# start_time = time.time()

# kernel = self._compute_single_kernel_series(g1, g2)

# self._run_time = time.time() - start_time
# if self.verbose:
# print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time))

# return kernel


# def _compute_single_kernel_series(self, g1, g2):
# pass


def is_graph(self, graph):
    """Tell whether the object is one of the four NetworkX graph classes
    (`Graph`, `DiGraph`, `MultiGraph`, `MultiDiGraph`) or a subclass.

    Returns
    -------
    bool
    """
    graph_types = (nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph)
    return isinstance(graph, graph_types)


@property
def graphs(self):
    """The graphs stored by `fit` (`_graphs`); unavailable before fitting."""
    return self._graphs


# @property
# def parallel(self):
# return self.parallel


# @property
# def n_jobs(self):
# return self.n_jobs


# @property
# def verbose(self):
# return self.verbose


# @property
# def normalize(self):
# return self.normalize


@property
def run_time(self):
    """Duration in seconds of the last distance matrix computation."""
    return self._run_time


@property
def dis_matrix(self):
    """The distance matrix stored as `_dis_matrix`."""
    # NOTE(review): no method visible in this class assigns `_dis_matrix`
    # (`fit_transform` saves `_dm_train` instead); it only exists after the
    # setter below has been used - confirm the intended attribute.
    return self._dis_matrix

@dis_matrix.setter
def dis_matrix(self, value):
    """Store `value` as `_dis_matrix`."""
    self._dis_matrix = value


# @property
# def gram_matrix_unnorm(self):
# return self._gram_matrix_unnorm

# @gram_matrix_unnorm.setter
# def gram_matrix_unnorm(self, value):
# self._gram_matrix_unnorm = value

+ 149
- 0
gklearn/ged/model/optim_costs.py View File

@@ -0,0 +1,149 @@
import numpy as np

from gklearn.ged.model.distances import sum_squares, euclid_d
from gklearn.ged.model.ged_com import compute_geds


def optimize_costs_unlabeled(nb_cost_mat, dis_k_vec):
    r"""
    Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat,
    for unlabeled graphs.

    Only columns 0, 1, 3 and 4 of nb_cost_mat are used; the costs at
    positions 2 and 5 are returned as 0. At most 1000 randomly chosen rows
    are used for the fit.

    ! take care that nb_cost_mat do not contains 0 lines
    :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graph
    :param dis_k_vec: The N distances to fit
    :return: (edit_costs_new, residual): the list of 6 non-negative edit
        costs and the optimal value of the least-squares problem
    """
    # The docstring is a raw string: ``\i`` and ``\m`` are invalid escape
    # sequences in a normal string literal.
    import cvxpy as cp
    import numpy as np
    MAX_SAMPLE = 1000
    nb_cost_mat_m = np.array([[row[0], row[1], row[3], row[4]] for row in nb_cost_mat])
    dis_k_vec = np.array(dis_k_vec)

    # Fit on a random subset of the pairs to bound the problem size.
    N = nb_cost_mat_m.shape[0]
    sub_sample = np.random.permutation(np.arange(N))
    sub_sample = sub_sample[:MAX_SAMPLE]

    # Non-negative least squares on the four remaining costs.
    x = cp.Variable(nb_cost_mat_m.shape[1])
    cost = cp.sum_squares((nb_cost_mat_m[sub_sample, :] @ x) - dis_k_vec[sub_sample])
    prob = cp.Problem(cp.Minimize(cost), [x >= 0])
    prob.solve()
    edit_costs_new = [x.value[0], x.value[1], 0, x.value[2], x.value[3], 0]
    # Clip values that are not strictly positive to 0.
    edit_costs_new = [xi if xi > 0 else 0 for xi in edit_costs_new]
    residual = prob.value
    return edit_costs_new, residual


def optimize_costs_classif_unlabeled(nb_cost_mat, Y):
    r"""
    Optimize edit costs for a classification problem on unlabeled graphs,
    according to edit operations in nb_cost_mat.

    Only columns 0, 1, 3 and 4 of nb_cost_mat are used; the costs at
    positions 2 and 5 are returned as 0.

    ! take care that nb_cost_mat do not contains 0 lines
    :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit
    operations for each pair of graph
    :param Y: {-1,1}^N vector of common classes
    :return: (edit_costs_new, residual): the list of 6 edit costs and the
        last element of the second value returned by ``reg_log``
    """
    # The docstring is a raw string: ``\i`` and ``\m`` are invalid escape
    # sequences in a normal string literal.
    # NOTE(review): ``ml`` is a top-level module that is not part of the files
    # shown here - confirm it is available where this function is used.
    from ml import reg_log
    nb_cost_mat_m = np.array([[row[0], row[1], row[3], row[4]]
                              for row in nb_cost_mat])
    w, J, _ = reg_log(nb_cost_mat_m, Y, pos_contraint=True)
    edit_costs_new = [w[0], w[1], 0, w[2], w[3], 0]
    residual = J[-1]

    return edit_costs_new, residual


def optimize_costs_classif(nb_cost_mat, Y):
    r"""
    Optimize edit costs for a classification problem according to edit
    operations in nb_cost_mat.

    ! take care that nb_cost_mat do not contains 0 lines
    :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graph
    :param Y: {-1,1}^N vector of common classes
    :return: (w, residual): the first value returned by ``reg_log`` and the
        last element of its second value
    """
    # The docstring is a raw string: ``\i`` and ``\m`` are invalid escape
    # sequences in a normal string literal.
    # NOTE(review): ``ml`` is a top-level module that is not part of the files
    # shown here - confirm it is available where this function is used.
    from ml import reg_log
    w, J, _ = reg_log(nb_cost_mat, Y, pos_contraint=True)
    return w, J[-1]


def optimize_costs(nb_cost_mat, dis_k_vec):
    r"""
    Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat.

    Solves a least-squares problem in which every cost is at least 0.01,
    cost[2] <= cost[0] + cost[1] and cost[5] <= cost[3] + cost[4].

    ! take care that nb_cost_mat do not contains 0 lines
    :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graph
    :param dis_k_vec: The N distances to fit
    :return: (edit_costs_new, residual): the optimal costs and the optimal
        value of the least-squares problem
    """
    # The docstring is a raw string: ``\i`` and ``\m`` are invalid escape
    # sequences in a normal string literal.
    import cvxpy as cp
    x = cp.Variable(nb_cost_mat.shape[1])
    cost = cp.sum_squares((nb_cost_mat @ x) - dis_k_vec)
    constraints = [
        # Keep every cost strictly positive.
        x >= [0.01 for i in range(nb_cost_mat.shape[1])],
        # x[0] + x[1] - x[2] >= 0
        np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
        # x[3] + x[4] - x[5] >= 0
        np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
    prob = cp.Problem(cp.Minimize(cost), constraints)
    prob.solve()
    edit_costs_new = x.value
    residual = prob.value

    return edit_costs_new, residual


def compute_optimal_costs(G, y, init_costs=[3, 3, 1, 3, 3, 1],
                          y_distance=euclid_d,
                          mode='reg', unlabeled=False,
                          ed_method='BIPARTITE',
                          verbose=True,
                          **kwargs):
    """
    Fit edit costs so that graph edit distances match the distances between
    targets.

    Starting from init_costs, the procedure alternates 5 times between
    optimizing the costs for the current numbers of edit operations and
    recomputing the GEDs (and edit operations) with the new costs.

    :param G: list of graphs
    :param y: targets; ``len(y)`` sets the number of graphs that are paired
    :param init_costs: edit costs used for the first GED computation (not
        modified here)
    :param y_distance: distance function between two targets
    :param mode: 'reg' or 'classif'; selects the cost optimizer
    :param unlabeled: if true, use the optimizer variant for unlabeled graphs
    :param ed_method: GED method name forwarded to ``compute_geds``
    :param verbose: print the iteration number and show progress
    :param kwargs: forwarded to ``compute_geds``
    :return: the edit costs obtained at the last iteration
    :raises ValueError: if mode is neither 'reg' nor 'classif'
    """
    # Fail fast: an unknown mode used to surface as an UnboundLocalError only
    # after the first (expensive) all-pairs GED computation.
    if mode not in ('reg', 'classif'):
        raise ValueError(
            "Unknown mode %r; expected 'reg' or 'classif'." % (mode,))

    if mode == 'reg':
        method_optim = optimize_costs_unlabeled if unlabeled else optimize_costs
    else:
        method_optim = (optimize_costs_classif_unlabeled if unlabeled
                        else optimize_costs_classif)

    N = len(y)

    # All index pairs i < j together with the target distance to fit.
    G_pairs = []
    distances_vec = []
    for i in range(N):
        for j in range(i+1, N):
            G_pairs.append([i, j])
            distances_vec.append(y_distance(y[i], y[j]))

    ged_vec_init, n_edit_operations = compute_geds(G_pairs, G, init_costs, ed_method,
                                                   verbose=verbose, **kwargs)

    # Residual history; it is collected but not returned.
    residual_list = [sum_squares(ged_vec_init, distances_vec)]

    ite_max = 5
    for ite in range(ite_max):
        if verbose:
            print('ite', ite + 1, '/', ite_max, ':')
        # Optimize the costs for the current numbers of edit operations, then
        # recompute GEDs and numbers of edit operations with the new costs.
        edit_costs_new, residual = method_optim(
            np.array(n_edit_operations), distances_vec)
        ged_vec, n_edit_operations = compute_geds(G_pairs, G, edit_costs_new, ed_method,
                                                  verbose=verbose, **kwargs)
        residual_list.append(sum_squares(ged_vec, distances_vec))

    return edit_costs_new


def get_optimal_costs_GH2020(**kwargs):
    """
    Load pre-computed edit costs from a pickle file.

    The file is ``cj/output/costs.<ds_name>.<nb_trial>.pkl``, resolved
    against the current working directory.

    :param kwargs: ``ds_name`` (str) and ``nb_trial`` are read
    :return: the object stored in the pickle file
    """
    import pickle
    import os
    dataset = kwargs.get('ds_name')
    trial = kwargs.get('nb_trial')
    pickle_name = 'costs.' + dataset + '.' + str(trial) + '.pkl'
    pickle_path = os.path.join('cj/output/', pickle_name)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

Loading…
Cancel
Save