From 731ab1d45b7cc8c33cb6c0aff21a265b860b3f34 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 2 Jul 2020 18:02:22 +0200 Subject: [PATCH] Add ability to GEDEnv to use pre-defined costs between node labels. --- gklearn/ged/env/ged_data.py | 37 ++++++--- gklearn/ged/env/ged_env.py | 6 ++ gklearn/ged/util/util.py | 1 + gklearn/preimage/median_preimage_generator_cml.py | 95 ++++++++++++++++------- 4 files changed, 103 insertions(+), 36 deletions(-) diff --git a/gklearn/ged/env/ged_data.py b/gklearn/ged/env/ged_data.py index 9cef41a..cf932b0 100644 --- a/gklearn/ged/env/ged_data.py +++ b/gklearn/ged/env/ged_data.py @@ -23,6 +23,7 @@ class GEDData(object): self._edit_cost = None self._node_costs = None self._edge_costs = None + self._node_label_costs = None self._node_labels = [] self._edge_labels = [] self._init_type = Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES @@ -84,15 +85,21 @@ class GEDData(object): * and 0 otherwise. */ """ - if self._eager_init(): # @todo: check if correct - return self._node_costs[label1, label2] - if label1 == label2: - return 0 - if label1 == SpecialLabel.DUMMY: # @todo: check dummy - return self._edit_cost.node_ins_cost_fun(label2) # self._node_labels[label2 - 1]) # @todo: check - if label2 == SpecialLabel.DUMMY: # @todo: check dummy - return self._edit_cost.node_del_cost_fun(label1) # self._node_labels[label1 - 1]) - return self._edit_cost.node_rel_cost_fun(label1, label2) # self._node_labels[label1 - 1], self._node_labels[label2 - 1]) + if self._node_label_costs is None: + if self._eager_init(): # @todo: check if correct + return self._node_costs[label1, label2] + if label1 == label2: + return 0 + if label1 == SpecialLabel.DUMMY: # @todo: check dummy + return self._edit_cost.node_ins_cost_fun(label2) # self._node_labels[label2 - 1]) # @todo: check + if label2 == SpecialLabel.DUMMY: # @todo: check dummy + return self._edit_cost.node_del_cost_fun(label1) # self._node_labels[label1 - 1]) + return 
self._edit_cost.node_rel_cost_fun(label1, label2) # self._node_labels[label1 - 1], self._node_labels[label2 - 1]) + # use pre-computed node label costs. + else: + id1 = 0 if label1 == SpecialLabel.DUMMY else self._node_label_to_id(label1) # @todo: this is slow. + id2 = 0 if label2 == SpecialLabel.DUMMY else self._node_label_to_id(label2) + return self._node_label_costs[id1, id2] def edge_cost(self, label1, label2): @@ -198,6 +205,12 @@ class GEDData(object): self._delete_edit_cost = True + def id_to_node_label(self, label_id): + if label_id > len(self._node_labels) or label_id == 0: + raise Exception('Invalid node label ID', str(label_id), '.') + return self._node_labels[label_id - 1] + + def _node_label_to_id(self, node_label): n_id = 0 for n_l in self._node_labels: @@ -208,6 +221,12 @@ class GEDData(object): return n_id + 1 + def id_to_edge_label(self, label_id): + if label_id > len(self._edge_labels) or label_id == 0: + raise Exception('Invalid edge label ID', str(label_id), '.') + return self._edge_labels[label_id - 1] + + def _edge_label_to_id(self, edge_label): e_id = 0 for e_l in self._edge_labels: diff --git a/gklearn/ged/env/ged_env.py b/gklearn/ged/env/ged_env.py index e6dc2f6..56a598a 100644 --- a/gklearn/ged/env/ged_env.py +++ b/gklearn/ged/env/ged_env.py @@ -226,6 +226,12 @@ class GEDEnv(object): */ """ return self.__ged_data._init_type + + + def set_label_costs(self, label_costs): + """Set the costs between labels. 
+ """ + self.__ged_data._node_label_costs = label_costs def set_method(self, method, options=''): diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index 91504fe..45a9fd4 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -55,6 +55,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True for g in graphs: ged_env.add_nx_graph(g, '') listID = ged_env.get_all_graph_ids() + ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None) ged_env.init(init_type=options['init_option']) if parallel: options['threads'] = 1 diff --git a/gklearn/preimage/median_preimage_generator_cml.py b/gklearn/preimage/median_preimage_generator_cml.py index a1eadc2..ac9361c 100644 --- a/gklearn/preimage/median_preimage_generator_cml.py +++ b/gklearn/preimage/median_preimage_generator_cml.py @@ -5,31 +5,26 @@ Created on Tue Jun 16 16:04:46 2020 @author: ljia """ - -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Mar 26 18:27:22 2020 - -@author: ljia -""" import numpy as np import time import random import multiprocessing import networkx as nx import cvxpy as cp +import itertools from gklearn.preimage import PreimageGenerator from gklearn.preimage.utils import compute_k_dis -from gklearn.ged.util import compute_geds_cml, ged_options_to_string +from gklearn.ged.util import compute_geds_cml from gklearn.ged.env import GEDEnv -from gklearn.ged.median import MedianGraphEstimator -from gklearn.ged.median import constant_node_costs,mge_options_to_string -from gklearn.utils import Timer +from gklearn.ged.median import MedianGraphEstimatorPy +from gklearn.ged.median import constant_node_costs, mge_options_to_string +from gklearn.utils import Timer, SpecialLabel from gklearn.utils.utils import get_graph_kernel_by_name class MedianPreimageGeneratorCML(PreimageGenerator): + """Generator median preimages by cost matrices learning using the pure Python version of GEDEnv. 
Works only for symbolic labeled graphs. + """ def __init__(self, dataset=None): PreimageGenerator.__init__(self, dataset=dataset) @@ -37,7 +32,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator): self.__mge = None self.__ged_options = {} self.__mge_options = {} - self.__fit_method = 'k-graphs' +# self.__fit_method = 'k-graphs' + self.__init_method = 'random' self.__init_ecc = None self.__parallel = True self.__n_jobs = multiprocessing.cpu_count() @@ -47,8 +43,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator): self.__max_itrs_without_update = 3 self.__epsilon_residual = 0.01 self.__epsilon_ec = 0.1 - self.__allow_zeros = False - self.__triangle_rule = True + self.__allow_zeros = True +# self.__triangle_rule = True # values to compute. self.__runtime_optimize_ec = None self.__runtime_generate_preimage = None @@ -64,6 +60,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator): self.__itrs = 0 self.__converged = False self.__num_updates_ecc = 0 + self.__node_label_costs = None + self.__edge_label_costs = None # values that can be set or to be computed. 
self.__edit_cost_constants = [] self.__gram_matrix_unnorm = None @@ -76,7 +74,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator): self._verbose = kwargs.get('verbose', 2) self.__ged_options = kwargs.get('ged_options', {}) self.__mge_options = kwargs.get('mge_options', {}) - self.__fit_method = kwargs.get('fit_method', 'k-graphs') +# self.__fit_method = kwargs.get('fit_method', 'k-graphs') + self.__init_method = kwargs.get('init_method', 'random') self.__init_ecc = kwargs.get('init_ecc', None) self.__edit_cost_constants = kwargs.get('edit_cost_constants', []) self.__parallel = kwargs.get('parallel', True) @@ -89,8 +88,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator): self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1) self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None) self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None) - self.__allow_zeros = kwargs.get('allow_zeros', False) - self.__triangle_rule = kwargs.get('triangle_rule', True) + self.__allow_zeros = kwargs.get('allow_zeros', True) +# self.__triangle_rule = kwargs.get('triangle_rule', True) def run(self): @@ -122,10 +121,10 @@ class MedianPreimageGeneratorCML(PreimageGenerator): end_precompute_gm = time.time() start -= self.__runtime_precompute_gm - if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset': - start = time.time() - self.__runtime_precompute_gm = 0 - end_precompute_gm = start +# if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset': +# start = time.time() +# self.__runtime_precompute_gm = 0 +# end_precompute_gm = start # 2. optimize edit cost constants. self.__optimize_edit_cost_vector() @@ -197,7 +196,48 @@ class MedianPreimageGeneratorCML(PreimageGenerator): def __optimize_edit_cost_vector(self): """Learn edit cost vector. """ - if self.__fit_method == 'random': # random + if self.__init_method == 'random': # random + # Get list of node labels. 
+ nls = self._dataset.get_all_node_labels() + # Generate random costs. + nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls)) + rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl) + self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1)) + # Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData. + i = 0 + # Costs of deletions. + for row in range(1, len(nls) + 1): + self.__node_label_costs[row, 0] = rand_costs[i] + i += 1 + # Costs of insertions. + for col in range(1, len(nls) + 1): + self.__node_label_costs[0, col] = rand_costs[i] + i += 1 + # Costs of substitutions. + for row in range(1, len(nls) + 1): + for col in range(row + 1, len(nls) + 1): + self.__node_label_costs[row, col] = rand_costs[i] + self.__node_label_costs[col, row] = rand_costs[i] + i += 1 + +# self.__node_label_costs = {} +# for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)): +# self.__node_label_costs[(nl1, nl2)] = rand_costs[i] +# # Add costs for deletion. +# for j, nl in enumerate(nls): +# self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j] +# # Add costs for insertion. +# for k, nl in enumerate(nls): +# self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k] +# # Add self costs. +# for nl in nls: +# self.__node_label_costs[(nl, nl)] = 0 +# self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0 + + # Optimize edit cost matrices. 
+ self.__optimize_ecm_by_kernel_distances() + + elif self.__fit_method == 'random': # random if self.__ged_options['edit_cost'] == 'LETTER': self.__edit_cost_constants = random.sample(range(1, 1000), 3) self.__edit_cost_constants = [item * 0.001 for item in self.__edit_cost_constants] @@ -279,6 +319,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): options['edge_labels'] = self._dataset.edge_labels options['node_attrs'] = self._dataset.node_attrs options['edge_attrs'] = self._dataset.edge_attrs + options['node_label_costs'] = self.__node_label_costs ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1)) residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] time_list = [time.time() - time0] @@ -881,8 +922,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator): ged_env.init(init_type=self.__ged_options['init_option']) # Set up the madian graph estimator. - self.__mge = MedianGraphEstimator(ged_env, constant_node_costs(self.__ged_options['edit_cost'])) - self.__mge.set_refine_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options)) + self.__mge = MedianGraphEstimatorPy(ged_env, constant_node_costs(self.__ged_options['edit_cost'])) + self.__mge.set_refine_method(self.__ged_options['method'], self.__ged_options) options = self.__mge_options.copy() if not 'seed' in options: options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage. 
@@ -897,8 +938,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator): ged_options = self.__ged_options.copy() if self.__parallel: ged_options['threads'] = 1 - self.__mge.set_init_method(ged_options['method'], ged_options_to_string(ged_options)) - self.__mge.set_descent_method(ged_options['method'], ged_options_to_string(ged_options)) + self.__mge.set_init_method(ged_options['method'], ged_options) + self.__mge.set_descent_method(ged_options['method'], ged_options) # Run the estimator. self.__mge.run(graph_ids, set_median_id, gen_median_id)