# Extracted from patch 5e37d4447f96953ffd7004d0e695dd30d53f9242
# From: jajupmochi
# Date: Fri, 6 May 2022 14:12:31 +0200
# Subject: [PATCH] [Major Features] Add GEDModel which is compatible with
#          `scikit-learn`.
#
# The patch creates four modules under gklearn/ged/model/ (distances.py,
# ged_com.py, ged_model.py, optim_costs.py; 1014 insertions in total) and
# adds one line to gklearn/ged/__init__.py:
#     from gklearn.ged.model.ged_model import GEDModel

# ---- gklearn/ged/model/distances.py ----
import numpy as np


def sum_squares(a, b):
    """Return the sum of squared element-wise differences between a and b.

    This is the *sum* of squared errors, not the mean: `rmse` below is the
    function that divides by the number of samples.

    Parameters
    ----------
    a, b : sequences of numbers
        `b` is indexed with the positions of `a`, so it must be at least as
        long as `a`.

    Returns
    -------
    The sum over i of ``(a[i] - b[i]) ** 2`` (``0.0`` for empty input).
    """
    return np.sum([(a_i - b[i]) ** 2 for i, a_i in enumerate(a)])


def euclid_d(x, y):
    """Return the 1D Euclidean distance between x and y."""
    return np.sqrt((x - y) ** 2)


def man_d(x, y):
    """Return the 1D Manhattan distance between x and y."""
    return np.abs(x - y)


def classif_d(x, y):
    """Return a 0/1 distance for classification targets.

    The result is a 0-dimensional numpy array holding 0 when ``x == y`` and
    1 otherwise.
    """
    return np.array(0 if x == y else 1)


def rmse(pred, ground_truth):
    """Return the root mean squared error between pred and ground_truth."""
    return np.sqrt(sum_squares(pred, ground_truth) / len(ground_truth))


def accuracy(pred, ground_truth):
    """Return the fraction of positions where pred equals ground_truth."""
    return np.mean([p == t for p, t in zip(pred, ground_truth)])


def rbf_k(D, sigma=1):
    """Return ``exp(-D**2 / sigma)`` for a distance (or array of distances) D.

    Note that `sigma` divides the squared distance directly; it is not
    squared or doubled here.
    """
    return np.exp(-(D ** 2) / sigma)
# ---- gklearn/ged/model/ged_com.py ----
# Created on Thu May 5 14:02:17 2022
# @author: ljia
import sys

import numpy as np

from gklearn.ged.model.distances import euclid_d
from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
from gklearn.utils import get_iters


def compute_ged(Gi, Gj, edit_cost, method='BIPARTITE', **kwargs):
    """Compute the GED between two graphs for the given edit cost constants.

    Parameters
    ----------
    Gi, Gj : graphs
        The two graphs to compare.
    edit_cost : sequence of numbers
        Edit cost constants, passed on as `edit_cost_constants` with the
        'CONSTANT' edit cost function.
    method : str
        GED method name forwarded to `pairwise_ged`.
    **kwargs
        Only `node_labels` and `edge_labels` are read (both default to an
        empty list); other keys are ignored.

    Returns
    -------
    dis
        The distance returned by `pairwise_ged`.
    n_eo_tmp
        The result of `get_nb_edit_operations` for the returned mappings.
    """
    ged_options = {'edit_cost': 'CONSTANT',
                   'method': method,
                   'edit_cost_constants': edit_cost}
    node_labels = kwargs.get('node_labels', [])
    edge_labels = kwargs.get('edge_labels', [])
    dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10)
    n_eo_tmp = get_nb_edit_operations(
        Gi, Gj, pi_forward, pi_backward, edit_cost='CONSTANT',
        node_labels=node_labels, edge_labels=edge_labels)
    return dis, n_eo_tmp


def compute_ged_all_dataset(Gn, edit_cost, ed_method, **kwargs):
    """Compute GEDs for every index pair (i, j) with i <= j of `Gn`.

    Returns whatever `compute_geds` returns for those pairs.
    """
    N = len(Gn)
    G_pairs = [[i, j] for i in range(N) for j in range(i, N)]
    return compute_geds(G_pairs, Gn, edit_cost, ed_method, **kwargs)


def compute_geds(G_pairs, Gn, edit_cost, ed_method, verbose=True, **kwargs):
    """Compute the GED for every index pair in `G_pairs`.

    Parameters
    ----------
    G_pairs : list of [i, j]
        Index pairs into `Gn`.
    Gn : list of graphs
    edit_cost : sequence of numbers
        Edit cost constants used for all pairs.
    ed_method : str
        GED method name.
    verbose : bool
        Forwarded to `get_iters` (progress reporting).
    **kwargs
        Forwarded to `compute_ged`.

    Returns
    -------
    ged_vec : list
        The computed distances, in the order of `G_pairs`.
    n_edit_operations : list
        The edit-operation counts, in the same order.
    """
    ged_vec = []
    n_edit_operations = []
    pairs = get_iters(G_pairs, desc='Computing GED', file=sys.stdout,
                      length=len(G_pairs), verbose=verbose)
    for i, j in pairs:
        dis, n_eo_tmp = compute_ged(
            Gn[i], Gn[j], edit_cost=edit_cost, method=ed_method, **kwargs)
        ged_vec.append(dis)
        n_edit_operations.append(n_eo_tmp)

    return ged_vec, n_edit_operations


def compute_D(G_app, edit_cost, G_test=None, ed_method='BIPARTITE', **kwargs):
    """Compute GED distance matrices for a training set and optional test set.

    Returns
    -------
    (D_app, edit_cost) when `G_test` is None, else (D_app, D_test, edit_cost).
    `D_app` is a symmetric (N, N) array with a zero diagonal; only the upper
    triangle is computed and then mirrored. `D_test` has shape
    (len(G_test), N).
    """
    N = len(G_app)
    D_app = np.zeros((N, N))

    for i, G1 in get_iters(enumerate(G_app), desc='Computing D - app',
                           file=sys.stdout, length=N):
        for j in range(i + 1, N):
            D_app[i, j], _ = compute_ged(
                G1, G_app[j], edit_cost, method=ed_method, **kwargs)
            D_app[j, i] = D_app[i, j]

    if G_test is None:
        return D_app, edit_cost

    D_test = np.zeros((len(G_test), N))
    for i, G1 in get_iters(enumerate(G_test), desc='Computing D - test',
                           file=sys.stdout, length=len(G_test)):
        for j, G2 in enumerate(G_app):
            D_test[i, j], _ = compute_ged(
                G1, G2, edit_cost, method=ed_method, **kwargs)
    return D_app, D_test, edit_cost


def compute_D_random(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
    """Run `compute_D` with six edit costs drawn uniformly from [0, 1)."""
    edit_costs = np.random.rand(6)
    return compute_D(G_app, edit_costs, G_test, ed_method=ed_method, **kwargs)


def compute_D_expert(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
    """Run `compute_D` with the fixed edit costs [3, 3, 1, 3, 3, 1]."""
    edit_cost = [3, 3, 1, 3, 3, 1]
    return compute_D(G_app, edit_cost, G_test, ed_method=ed_method, **kwargs)


def compute_D_fitted(G_app, y_app, G_test=None, y_distance=euclid_d,
                     mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs):
    """Fit edit costs on (G_app, y_app), then run `compute_D` with them."""
    # Imported here because optim_costs imports this module at load time.
    # The module lives in the `model` package (gklearn/ged/model/), not in
    # `models`.
    from gklearn.ged.model.optim_costs import compute_optimal_costs

    costs_optim = compute_optimal_costs(
        G_app, y_app, y_distance=y_distance,
        mode=mode, unlabeled=unlabeled, ed_method=ed_method, **kwargs)
    return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs)


def compute_D_GH2020(G_app, G_test=None, ed_method='BIPARTITE', **kwargs):
    """Load pre-computed edit costs (see `get_optimal_costs_GH2020`), then
    run `compute_D` with them.

    `kwargs` is passed both to the cost loader (which reads `ds_name` and
    `nb_trial`) and to `compute_D`.
    """
    # Same package note as in compute_D_fitted: the module is
    # gklearn/ged/model/optim_costs.py.
    from gklearn.ged.model.optim_costs import get_optimal_costs_GH2020

    costs_optim = get_optimal_costs_GH2020(**kwargs)
    return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs)


# ---- gklearn/ged/model/ged_model.py ----
# Created on Thu May 5 09:42:30 2022
# @author: ljia
import sys
import multiprocessing
import time
import numpy as np
import networkx as nx

from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError

from gklearn.ged.model.distances import euclid_d
from gklearn.ged.util import pairwise_ged, get_nb_edit_operations
from gklearn.utils import get_iters


class GEDModel(BaseEstimator):
    """The graph edit distance model class compatible with `scikit-learn`.

    Attributes
    ----------
    _graphs : list
        Stores the input graphs on fit input data.
        Default format of the list objects is `NetworkX` graphs.
        **We don't guarantee that the input graphs remain unchanged during the
        computation.**
    _targets : object
        The `y` passed to `fit`, stored only when it is not None.
    _edit_cost_constants : sequence
        The edit cost constants set by `compute_edit_costs`.
    _run_time : float
        Wall-clock seconds of the last distance-matrix computation.

    References
    ----------
    https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
    """

    # NOTE: the list/dict defaults below are shared between instances. They
    # are stored as-is (scikit-learn requires `__init__` to store parameters
    # unmodified), so they must never be mutated in place.
    def __init__(self,
                 ed_method='BIPARTITE',
                 edit_cost_fun='CONSTANT',
                 init_edit_cost_constants=[3, 3, 1, 3, 3, 1],
                 optim_method='init',
                 optim_options={'y_distance': euclid_d, 'mode': 'reg'},
                 node_labels=[],
                 edge_labels=[],
                 parallel=None,
                 n_jobs=None,
                 chunksize=None,
                 copy_graphs=True,  # make sure it is a full deep copy. and faster!
                 verbose=2):
        """`__init__` for `GEDModel` object; stores every parameter as given."""
        self.ed_method = ed_method
        self.edit_cost_fun = edit_cost_fun
        self.init_edit_cost_constants = init_edit_cost_constants
        self.optim_method = optim_method
        self.optim_options = optim_options
        self.node_labels = node_labels
        self.edge_labels = edge_labels
        self.parallel = parallel
        self.n_jobs = n_jobs
        self.chunksize = chunksize
        self.copy_graphs = copy_graphs
        self.verbose = verbose

    ##########################################################################
    # The following is the 1st paradigm to compute GED distance matrix, which
    # is compatible with `scikit-learn`.
    ##########################################################################

    def fit(self, X, y=None):
        """Store a graph dataset (and optional targets) on the estimator.

        Parameters
        ----------
        X : list of graphs
            Validated with `validate_input`.
        y : optional
            Targets; stored as `_targets` when not None. They are required
            later only when `optim_method == 'fitted'`.

        Returns
        -------
        object
            Returns self.
        """
        # Clear any prior attributes stored on the estimator.
        self.clear_attributes()

        # Validate parameters for the transformer.
        self.validate_parameters()

        # Validate the input.
        self._graphs = self.validate_input(X)
        if y is not None:
            self._targets = y

        return self

    def transform(self, X=None, return_dm_train=False):
        """Compute the GED distance matrix between given and fitted data.

        Parameters
        ----------
        X : list of graphs
            The graphs to compare against the fitted graphs.
        return_dm_train : bool
            If True, return the training distance matrix saved by
            `fit_transform(..., save_dm_train=True)` and ignore `X`.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the required fitted attribute is missing.
        ValueError
            If `X` is not a non-empty list.

        Returns
        -------
        dis_matrix : numpy array, shape = [len(X), n_fitted_graphs]
        """
        # If `return_dm_train`, return the fitted GED distance matrix of
        # training data.
        if return_dm_train:
            check_is_fitted(self, '_dm_train')
            self._is_transformed = True
            return self._dm_train  # @todo: copy or not?

        # Check if method "fit" had been called.
        check_is_fitted(self, '_graphs')

        # Validate the input.
        Y = self.validate_input(X)

        # `fit` alone does not compute edit costs (only `fit_transform`
        # does), so make sure they exist before any GED is computed.
        if not hasattr(self, '_edit_cost_constants'):
            self.compute_edit_costs()

        # Transform: compute the distance matrix.
        dis_matrix = self.compute_distance_matrix(Y)
        self._Y = Y
        self._is_transformed = True

        return dis_matrix

    def fit_transform(self, X, y=None, save_dm_train=False):
        """Fit and transform: compute GED distance matrix on the same data.

        Parameters
        ----------
        X : list of graphs
            Input graphs.
        y : optional
            Targets, see `fit`.
        save_dm_train : bool
            If True, keep the result as `_dm_train` so that
            `transform(return_dm_train=True)` can return it.

        Returns
        -------
        dis_matrix : numpy array, shape = [len(X), len(X)]
            The distance matrix of X.
        """
        self.fit(X, y)

        # Compute edit cost constants.
        self.compute_edit_costs()

        # Transform: compute the distance matrix of the fitted graphs.
        dis_matrix = self.compute_distance_matrix()

        if save_dm_train:
            self._dm_train = dis_matrix

        return dis_matrix

    def get_params(self, deep=True):
        """Return the estimator parameters (delegates to `BaseEstimator`).

        The method previously returned None, which breaks `sklearn` tools
        such as `clone` that rely on the parameter dict.
        """
        return super().get_params(deep=deep)

    def set_params(self, **params):
        """Set estimator parameters (delegates to `BaseEstimator`).

        Returns self.
        """
        return super().set_params(**params)

    def clear_attributes(self):  # @todo: update
        """Delete every attribute produced by a previous fit/transform.

        Targets, edit costs and the saved training matrix are cleared too so
        that a re-fit can never silently reuse values from older data.
        """
        for name in ('_graphs', '_Y', '_run_time', '_targets',
                     '_edit_cost_constants', '_dm_train'):
            if hasattr(self, name):
                delattr(self, name)

    def validate_parameters(self):
        """Validate all parameters for the transformer.

        Raises
        ------
        ValueError
            If `parallel` is neither None nor 'imap_unordered'.

        Notes
        -----
        When `parallel == 'imap_unordered'` and `n_jobs` is None, `n_jobs`
        is set to the CPU count.
        """
        if self.parallel is not None and self.parallel != 'imap_unordered':
            raise ValueError('Parallel mode is not set correctly.')

        if self.parallel == 'imap_unordered' and self.n_jobs is None:
            self.n_jobs = multiprocessing.cpu_count()

    def validate_input(self, X):
        """Validate the given input and raise errors if it is invalid.

        Parameters
        ----------
        X : list
            The input to check. Should be a list of graph.

        Raises
        ------
        ValueError
            If `X` is None, not a list, or empty.

        Returns
        -------
        X : list
            The input, unchanged.
        """
        if X is None:
            raise ValueError('Please add graphs before computing.')
        elif not isinstance(X, list):
            raise ValueError('Cannot detect graphs. The input must be a list.')
        elif len(X) == 0:
            raise ValueError('The graph list given is empty. No computation will be performed.')

        return X

    def compute_distance_matrix(self, Y=None):
        """Compute the distance matrix between the given target graphs (Y)
        and the fitted graphs (X / self._graphs), or the distance matrix of
        the fitted graphs with themselves.

        Parameters
        ----------
        Y : list of graphs, optional
            The target graphs. If None, the distance matrix is computed
            between X and itself.

        Raises
        ------
        ValueError
            If `parallel` holds an unsupported value.

        Returns
        -------
        dis_matrix : numpy array, shape = [n_targets, n_inputs]
        """
        if Y is None:
            return self._compute_X_distance_matrix()

        start_time = time.time()

        if self.parallel == 'imap_unordered':
            dis_matrix = self._compute_distance_matrix_imap_unordered(Y)
        elif self.parallel is None:
            Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y)
            graphs_copy = ([g.copy() for g in self._graphs]
                           if self.copy_graphs else self._graphs)
            dis_matrix = self._compute_distance_matrix_series(Y_copy, graphs_copy)
        else:
            # Without this branch `dis_matrix` would be unbound below.
            raise ValueError('Parallel mode is not set correctly.')

        self._run_time = time.time() - start_time
        if self.verbose:
            print('Distance matrix of size (%d, %d) built in %s seconds.'
                  % (len(Y), len(self._graphs), self._run_time))

        return dis_matrix

    def _compute_distance_matrix_series(self, X, Y):
        """Compute the GED distance matrix between two sets of graphs (X and
        Y) without parallelization.

        Returns
        -------
        dis_matrix : numpy array, shape = [n_X, n_Y]
        """
        dis_matrix = np.zeros((len(X), len(Y)))

        for i_x, g_x in enumerate(X):
            for i_y, g_y in enumerate(Y):
                dis_matrix[i_x, i_y], _ = self.compute_ged(g_x, g_y)

        return dis_matrix

    def _compute_distance_matrix_imap_unordered(self, Y):
        """Parallel counterpart of `_compute_distance_matrix_series`.

        Not implemented yet; always raises.
        """
        raise NotImplementedError('Parallelization for kernel matrix is not implemented.')

    # Former (kernel-era) name, kept so existing references keep working.
    _compute_kernel_matrix_imap_unordered = _compute_distance_matrix_imap_unordered

    def diagonals(self):
        """Compute the self-distances of the fit/transformed data.

        Relies on `pairwise_distance`, which is not implemented in this
        class, so the call currently raises `NotImplementedError` unless a
        subclass provides it.

        Returns
        -------
        X_diag : numpy array
            One value per fitted graph.
        Y_diag : numpy array
            One value per transformed graph; only returned if `transform`
            has been called.
        """
        # Check if method "fit" had been called.
        check_is_fitted(self, ['_graphs'])

        # Check if the diagonals of X exist.
        try:
            check_is_fitted(self, ['_X_diag'])
        except NotFittedError:
            # Compute diagonals of X.
            self._X_diag = np.empty(shape=(len(self._graphs),))
            graphs = ([g.copy() for g in self._graphs]
                      if self.copy_graphs else self._graphs)
            for i, x in enumerate(graphs):
                self._X_diag[i] = self.pairwise_distance(x, x)  # @todo: parallel?

        try:
            # If transform has happened, return both diagonals.
            check_is_fitted(self, ['_Y'])
        except NotFittedError:
            # Else just return X_diag.
            return self._X_diag

        self._Y_diag = np.empty(shape=(len(self._Y),))
        Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y)
        for i, y in enumerate(Y):
            self._Y_diag[i] = self.pairwise_distance(y, y)  # @todo: parallel?

        return self._X_diag, self._Y_diag

    def pairwise_distance(self, x, y):
        """Compute the pairwise distance between two graphs.

        Not implemented in this class; always raises.

        Parameters
        ----------
        x, y : NetworkX Graph.
        """
        raise NotImplementedError('Pairwise kernel computation is not implemented!')

    def compute_edit_costs(self, Y=None, Y_targets=None):
        """Compute edit cost constants and store them in
        `_edit_cost_constants`.

        Depending on `optim_method`:

        - 'random': six values drawn with `np.random.rand`;
        - 'init': `init_edit_cost_constants`;
        - 'expert': [3, 3, 1, 3, 3, 1];
        - 'fitted': the result of `compute_optimal_costs` (Jia2021 method).

        Parameters
        ----------
        Y : list of graphs, optional
            Graphs used for fitting the costs ('fitted' only). If None, the
            fitted graphs and targets are used.
        Y_targets : optional
            Targets matching `Y`.

        Raises
        ------
        ValueError
            If `optim_method` is none of the values above.
        """
        if self.optim_method == 'random':
            self._edit_cost_constants = np.random.rand(6)

        elif self.optim_method == 'init':
            self._edit_cost_constants = self.init_edit_cost_constants

        elif self.optim_method == 'expert':
            self._edit_cost_constants = [3, 3, 1, 3, 3, 1]

        elif self.optim_method == 'fitted':  # Jia2021 method
            # Get proper inputs.
            if Y is None:
                check_is_fitted(self, ['_graphs'])
                check_is_fitted(self, ['_targets'])
                graphs = ([g.copy() for g in self._graphs]
                          if self.copy_graphs else self._graphs)
                targets = self._targets
            else:
                graphs = ([g.copy() for g in Y] if self.copy_graphs else Y)
                targets = Y_targets

            # Get optimization options.
            node_labels = self.node_labels
            edge_labels = self.edge_labels
            unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0)
            from gklearn.ged.model.optim_costs import compute_optimal_costs
            self._edit_cost_constants = compute_optimal_costs(
                graphs, targets,
                node_labels=node_labels, edge_labels=edge_labels,
                unlabeled=unlabeled, ed_method=self.ed_method,
                verbose=(self.verbose >= 2),
                **self.optim_options)

        else:
            # Previously an unknown method left the costs unset and the
            # failure only surfaced later inside `compute_ged`.
            raise ValueError('Unknown optim_method: %r.' % (self.optim_method,))

    ##########################################################################
    # Helpers for the distance matrix of the fitted graphs.
    # (The commented-out kernel-based "2nd paradigm" code that used to live
    # here was dead code and has been removed.)
    ##########################################################################

    def _compute_X_distance_matrix(self):
        """Compute (and time) the distance matrix of the fitted graphs."""
        start_time = time.time()

        graphs = ([g.copy() for g in self._graphs]
                  if self.copy_graphs else self._graphs)
        if self.parallel == 'imap_unordered':
            dis_matrix = self._compute_X_dm_imap_unordered(graphs)
        elif self.parallel is None:
            dis_matrix = self._compute_X_dm_series(graphs)
        else:
            raise ValueError('Parallel mode is not set correctly.')

        self._run_time = time.time() - start_time
        if self.verbose:
            print('Distance matrix of size %d built in %s seconds.'
                  % (len(self._graphs), self._run_time))

        return dis_matrix

    def _compute_X_dm_series(self, graphs):
        """Compute the symmetric distance matrix of `graphs` sequentially.

        Only pairs with i < j are computed and then mirrored; the diagonal
        stays 0.
        """
        N = len(graphs)
        dis_matrix = np.zeros((N, N))

        for i, G1 in get_iters(enumerate(graphs), desc='Computing distance matrix',
                               file=sys.stdout, verbose=(self.verbose >= 2)):
            for j in range(i + 1, N):
                dis_matrix[i, j], _ = self.compute_ged(G1, graphs[j])
                dis_matrix[j, i] = dis_matrix[i, j]
        return dis_matrix

    def _compute_X_dm_imap_unordered(self, graphs=None):
        """Parallel counterpart of `_compute_X_dm_series`.

        Not implemented yet; raises instead of silently returning None as
        the distance matrix.
        """
        raise NotImplementedError('Parallelization for distance matrix is not implemented.')

    def compute_ged(self, Gi, Gj, **kwargs):
        """Compute GED between two graphs with the current edit costs.

        Requires `_edit_cost_constants` (see `compute_edit_costs`).
        `kwargs` is accepted but not used.

        Returns
        -------
        dis
            The distance returned by `pairwise_ged`.
        n_eo_tmp
            The result of `get_nb_edit_operations` for the returned mappings.
        """
        ged_options = {'edit_cost': self.edit_cost_fun,
                       'method': self.ed_method,
                       'edit_cost_constants': self._edit_cost_constants}
        dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10)
        n_eo_tmp = get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward,
                                          edit_cost=self.edit_cost_fun,
                                          node_labels=self.node_labels,
                                          edge_labels=self.edge_labels)
        return dis, n_eo_tmp

    def is_graph(self, graph):
        """Return True if `graph` is an instance of any NetworkX graph class."""
        return isinstance(
            graph, (nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph))

    @property
    def graphs(self):
        """The fitted graphs."""
        return self._graphs

    @property
    def run_time(self):
        """Seconds taken by the last distance-matrix computation."""
        return self._run_time

    @property
    def dis_matrix(self):
        """A user-settable distance matrix slot (`_dis_matrix`).

        Nothing in this class assigns it; it must be set through the setter
        before being read.
        """
        return self._dis_matrix

    @dis_matrix.setter
    def dis_matrix(self, value):
        self._dis_matrix = value


# ---- gklearn/ged/model/optim_costs.py ----
import numpy as np

from gklearn.ged.model.distances import sum_squares, euclid_d
from gklearn.ged.model.ged_com import compute_geds


def optimize_costs_unlabeled(nb_cost_mat, dis_k_vec):
    """Optimize edit costs to fit `dis_k_vec` for unlabeled graphs.

    Only columns 0, 1, 3 and 4 of `nb_cost_mat` are fitted; positions 2 and
    5 of the returned costs are fixed to 0. At most 1000 randomly chosen
    rows are used for the fit.

    ! Take care that `nb_cost_mat` does not contain all-zero rows.

    :param nb_cost_mat: N x 6 matrix of non-negative integers encoding the
        number of edit operations for each pair of graphs
    :param dis_k_vec: the N distances to fit
    :return: (edit_costs_new, residual), a list of 6 non-negative costs and
        the optimal value of the least-squares problem
    """
    import cvxpy as cp
    MAX_SAMPLE = 1000
    nb_cost_mat_m = np.asarray(nb_cost_mat)[:, [0, 1, 3, 4]]
    dis_k_vec = np.array(dis_k_vec)

    N = nb_cost_mat_m.shape[0]
    sub_sample = np.random.permutation(np.arange(N))[:MAX_SAMPLE]

    x = cp.Variable(nb_cost_mat_m.shape[1])
    cost = cp.sum_squares((nb_cost_mat_m[sub_sample, :] @ x) - dis_k_vec[sub_sample])
    prob = cp.Problem(cp.Minimize(cost), [x >= 0])
    prob.solve()
    edit_costs_new = [x.value[0], x.value[1], 0, x.value[2], x.value[3], 0]
    # Clamp tiny negative values the solver may return despite x >= 0.
    edit_costs_new = [xi if xi > 0 else 0 for xi in edit_costs_new]
    residual = prob.value
    return edit_costs_new, residual


def optimize_costs_classif_unlabeled(nb_cost_mat, Y):
    """Optimize edit costs for a classification target, unlabeled graphs.

    Only columns 0, 1, 3 and 4 of `nb_cost_mat` are fitted; positions 2 and
    5 of the returned costs are fixed to 0.

    ! Take care that `nb_cost_mat` does not contain all-zero rows.

    :param nb_cost_mat: N x 6 matrix of non-negative integers encoding the
        number of edit operations for each pair of graphs
    :param Y: vector in {-1, 1}^N of common classes
    :return: (edit_costs_new, residual)
    """
    # NOTE(review): `ml` is not part of this patch; presumably an external
    # module providing a constrained logistic regression. Confirm it is
    # available wherever this function is used.
    from ml import reg_log
    nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]]
                              for x in nb_cost_mat])
    w, J, _ = reg_log(nb_cost_mat_m, Y, pos_contraint=True)
    edit_costs_new = [w[0], w[1], 0, w[2], w[3], 0]
    residual = J[-1]

    return edit_costs_new, residual


def optimize_costs_classif(nb_cost_mat, Y):
    """Optimize edit costs for a classification target, labeled graphs.

    ! Take care that `nb_cost_mat` does not contain all-zero rows.

    :param nb_cost_mat: N x 6 matrix of non-negative integers encoding the
        number of edit operations for each pair of graphs
    :param Y: vector in {-1, 1}^N of common classes
    :return: (w, J[-1]) as returned by `reg_log`
    """
    # NOTE(review): `ml` is not part of this patch; see
    # optimize_costs_classif_unlabeled.
    from ml import reg_log
    w, J, _ = reg_log(nb_cost_mat, Y, pos_contraint=True)
    return w, J[-1]


def optimize_costs(nb_cost_mat, dis_k_vec):
    """Optimize edit costs to fit `dis_k_vec` for labeled graphs.

    Solves a constrained least-squares problem in which every cost is at
    least 0.01, x[0] + x[1] >= x[2] and x[3] + x[4] >= x[5].

    ! Take care that `nb_cost_mat` does not contain all-zero rows.

    :param nb_cost_mat: N x 6 array of non-negative integers encoding the
        number of edit operations for each pair of graphs
    :param dis_k_vec: the N distances to fit
    :return: (edit_costs_new, residual), the solver's value for the 6 costs
        and the optimal objective value
    """
    import cvxpy as cp
    x = cp.Variable(nb_cost_mat.shape[1])
    cost = cp.sum_squares((nb_cost_mat @ x) - dis_k_vec)
    constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T @ x >= 0.0,
                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T @ x >= 0.0]
    prob = cp.Problem(cp.Minimize(cost), constraints)
    prob.solve()
    edit_costs_new = x.value
    residual = prob.value

    return edit_costs_new, residual


def compute_optimal_costs(G, y, init_costs=None,
                          y_distance=euclid_d,
                          mode='reg', unlabeled=False,
                          ed_method='BIPARTITE',
                          verbose=True,
                          ite_max=5,
                          **kwargs):
    """Alternately compute GEDs and re-fit the edit costs to target distances.

    Parameters
    ----------
    G : list of graphs
    y : sequence
        One target per graph.
    init_costs : sequence of 6 numbers, optional
        Initial edit costs; defaults to [3, 3, 1, 3, 3, 1].
    y_distance : callable
        Distance between two targets.
    mode : {'reg', 'classif'}
    unlabeled : bool
        Selects the optimizer variant that ignores columns 2 and 5.
    ed_method : str
        GED method name.
    verbose : bool
    ite_max : int
        Number of optimisation rounds (5 by default, as before).
    **kwargs
        Forwarded to `compute_geds`.

    Raises
    ------
    ValueError
        If `mode` is neither 'reg' nor 'classif'.

    Returns
    -------
    The edit costs of the last optimisation round (`init_costs` if
    `ite_max` is 0).
    """
    if init_costs is None:
        init_costs = [3, 3, 1, 3, 3, 1]

    # Pick the optimizer first so that an invalid mode fails before any GED
    # is computed.
    optimizers = {
        ('reg', True): optimize_costs_unlabeled,
        ('reg', False): optimize_costs,
        ('classif', True): optimize_costs_classif_unlabeled,
        ('classif', False): optimize_costs_classif,
    }
    try:
        method_optim = optimizers[(mode, bool(unlabeled))]
    except KeyError:
        raise ValueError("mode must be 'reg' or 'classif', got %r." % (mode,)) from None

    N = len(y)
    G_pairs = [[i, j] for i in range(N) for j in range(i + 1, N)]
    distances_vec = [y_distance(y[i], y[j]) for i, j in G_pairs]

    _, n_edit_operations = compute_geds(G_pairs, G, init_costs, ed_method,
                                        verbose=verbose, **kwargs)

    edit_costs_new = init_costs
    for ite in range(ite_max):
        if verbose:
            print('ite', ite + 1, '/', ite_max, ':')
        edit_costs_new, _ = method_optim(
            np.array(n_edit_operations), distances_vec)
        # The edit-operation counts are only needed as input of the next
        # round, so the (expensive) recomputation is skipped after the
        # last one.
        if ite + 1 < ite_max:
            _, n_edit_operations = compute_geds(
                G_pairs, G, edit_costs_new, ed_method,
                verbose=verbose, **kwargs)

    return edit_costs_new


def get_optimal_costs_GH2020(**kwargs):
    """Load edit costs from 'cj/output/costs.<ds_name>.<nb_trial>.pkl'.

    Reads `ds_name` and `nb_trial` from `kwargs`; the path is relative to
    the current working directory.
    """
    import pickle
    import os
    dir_root = 'cj/output/'
    ds_name = kwargs.get('ds_name')
    nb_trial = kwargs.get('nb_trial')
    file_name = os.path.join(dir_root, 'costs.' + ds_name + '.' + str(nb_trial) + '.pkl')
    # SECURITY NOTE: pickle.load executes arbitrary code from the file; only
    # use this with cost files you produced yourself.
    with open(file_name, 'rb') as f:
        edit_costs = pickle.load(f)
    return edit_costs