From 31a8a9c51d1e3679274f4801f890238ee578ed1e Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 3 Jul 2020 12:10:04 +0200 Subject: [PATCH] Add ability to GEDEnv to use pre-defined costs between edge labels. --- gklearn/ged/env/ged_data.py | 26 +++-- gklearn/ged/env/ged_env.py | 7 +- gklearn/ged/util/lsape_solver.py | 1 + gklearn/ged/util/util.py | 3 +- gklearn/preimage/median_preimage_generator_cml.py | 116 +++++++++++++++------- gklearn/utils/dataset.py | 10 ++ 6 files changed, 114 insertions(+), 49 deletions(-) diff --git a/gklearn/ged/env/ged_data.py b/gklearn/ged/env/ged_data.py index cf932b0..0e6881f 100644 --- a/gklearn/ged/env/ged_data.py +++ b/gklearn/ged/env/ged_data.py @@ -24,6 +24,7 @@ class GEDData(object): self._node_costs = None self._edge_costs = None self._node_label_costs = None + self._edge_label_costs = None self._node_labels = [] self._edge_labels = [] self._init_type = Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES @@ -114,15 +115,22 @@ class GEDData(object): * and 0 otherwise. */ """ - if self._eager_init(): # @todo: check if correct - return self._node_costs[label1, label2] - if label1 == label2: - return 0 - if label1 == SpecialLabel.DUMMY: - return self._edit_cost.edge_ins_cost_fun(label2) # self._edge_labels[label2 - 1]) - if label2 == SpecialLabel.DUMMY: - return self._edit_cost.edge_del_cost_fun(label1) # self._edge_labels[label1 - 1]) - return self._edit_cost.edge_rel_cost_fun(label1, label2) # self._edge_labels[label1 - 1], self._edge_labels[label2 - 1]) + if self._edge_label_costs is None: + if self._eager_init(): # @todo: check if correct + return self._node_costs[label1, label2] + if label1 == label2: + return 0 + if label1 == SpecialLabel.DUMMY: + return self._edit_cost.edge_ins_cost_fun(label2) # self._edge_labels[label2 - 1]) + if label2 == SpecialLabel.DUMMY: + return self._edit_cost.edge_del_cost_fun(label1) # self._edge_labels[label1 - 1]) + return self._edit_cost.edge_rel_cost_fun(label1, label2) # self._edge_labels[label1 - 1], self._edge_labels[label2 - 1]) + + # use pre-computed edge label costs. + else: + id1 = 0 if label1 == SpecialLabel.DUMMY else self._edge_label_to_id(label1) # @todo: this is slow. + id2 = 0 if label2 == SpecialLabel.DUMMY else self._edge_label_to_id(label2) + return self._edge_label_costs[id1, id2] def compute_induced_cost(self, g, h, node_map): diff --git a/gklearn/ged/env/ged_env.py b/gklearn/ged/env/ged_env.py index 56a598a..b31ecb9 100644 --- a/gklearn/ged/env/ged_env.py +++ b/gklearn/ged/env/ged_env.py @@ -228,10 +228,13 @@ class GEDEnv(object): return self.__ged_data._init_type - def set_label_costs(self, label_costs): + def set_label_costs(self, node_label_costs=None, edge_label_costs=None): """Set the costs between labels. """ - self.__ged_data._node_label_costs = label_costs + if node_label_costs is not None: + self.__ged_data._node_label_costs = node_label_costs + if edge_label_costs is not None: + self.__ged_data._edge_label_costs = edge_label_costs def set_method(self, method, options=''): diff --git a/gklearn/ged/util/lsape_solver.py b/gklearn/ged/util/lsape_solver.py index aef9c11..72c2776 100644 --- a/gklearn/ged/util/lsape_solver.py +++ b/gklearn/ged/util/lsape_solver.py @@ -8,6 +8,7 @@ Created on Mon Jun 22 15:37:36 2020 import numpy as np from scipy.optimize import linear_sum_assignment + class LSAPESolver(object): diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index 45a9fd4..cdced21 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -55,7 +55,8 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True for g in graphs: ged_env.add_nx_graph(g, '') listID = ged_env.get_all_graph_ids() - ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None) + ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None, + options['edge_label_costs'] if 'edge_label_costs' in options else None) ged_env.init(init_type=options['init_option']) if parallel: options['threads'] = 1 diff --git a/gklearn/preimage/median_preimage_generator_cml.py b/gklearn/preimage/median_preimage_generator_cml.py index ac9361c..c4a92a6 100644 --- a/gklearn/preimage/median_preimage_generator_cml.py +++ b/gklearn/preimage/median_preimage_generator_cml.py @@ -196,46 +196,16 @@ class MedianPreimageGeneratorCML(PreimageGenerator): def __optimize_edit_cost_vector(self): """Learn edit cost vector. """ - if self.__init_method == 'random': # random - # Get list of node labels. - nls = self._dataset.get_all_node_labels() - # Generate random costs. - nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls)) - rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl) - self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1)) - # Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. These is the same setting as in GEDData. - i = 0 - # Costs of insertions. - for row in range(1, len(nls) + 1): - self.__node_label_costs[row, 0] = rand_costs[i] - i += 1 - # Costs of deletions. - for col in range(1, len(nls) + 1): - self.__node_label_costs[0, col] = rand_costs[i] - i += 1 - # Costs of substitutions. - for row in range(1, len(nls) + 1): - for col in range(row + 1, len(nls) + 1): - self.__node_label_costs[row, col] = rand_costs[i] - self.__node_label_costs[col, row] = rand_costs[i] - i += 1 - -# self.__node_label_costs = {} -# for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)): -# self.__node_label_costs[(nl1, nl2)] = rand_costs[i] -# # Add costs for deletion. -# for j, nl in enumerate(nls): -# self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j] -# # Add costs for insertion. -# for k, nl in enumerate(nls): -# self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k] -# # Add self costs. -# for nl in nls: -# self.__node_label_costs[(nl, nl)] = 0 -# self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0 + # Initialize label costs randomly. + if self.__init_method == 'random': + # Initialize label costs. + self.__initialize_label_costs() # Optimize edit cost matrices. self.__optimize_ecm_by_kernel_distances() + # Initialize all label costs with the same value. + elif self.__init_method == 'uniform': # random + pass elif self.__fit_method == 'random': # random if self.__ged_options['edit_cost'] == 'LETTER': @@ -297,6 +267,77 @@ class MedianPreimageGeneratorCML(PreimageGenerator): pass + def __initialize_label_costs(self): + self.__initialize_node_label_costs() + self.__initialize_edge_label_costs() + + + def __initialize_node_label_costs(self): + # Get list of node labels. + nls = self._dataset.get_all_node_labels() + # Generate random costs. + nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls)) + rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl) + rand_costs /= np.max(rand_costs) # @todo: maybe not needed. + self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1)) + # Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData. + i = 0 + # Costs of insertions. + for row in range(1, len(nls) + 1): + self.__node_label_costs[row, 0] = rand_costs[i] + i += 1 + # Costs of deletions. + for col in range(1, len(nls) + 1): + self.__node_label_costs[0, col] = rand_costs[i] + i += 1 + # Costs of substitutions. + for row in range(1, len(nls) + 1): + for col in range(row + 1, len(nls) + 1): + self.__node_label_costs[row, col] = rand_costs[i] + self.__node_label_costs[col, row] = rand_costs[i] + i += 1 + +# self.__node_label_costs = {} +# for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)): +# self.__node_label_costs[(nl1, nl2)] = rand_costs[i] +# # Add costs for deletion. +# for j, nl in enumerate(nls): +# self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j] +# # Add costs for insertion. +# for k, nl in enumerate(nls): +# self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k] +# # Add self costs. +# for nl in nls: +# self.__node_label_costs[(nl, nl)] = 0 +# self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0 + + + def __initialize_edge_label_costs(self): + # Get list of edge labels. + els = self._dataset.get_all_edge_labels() + # Generate random costs. + nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els)) + rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el) + rand_costs /= np.max(rand_costs) # @todo: maybe not needed. + self.__edge_label_costs = np.zeros((len(els) + 1, len(els) + 1)) + # Initialize edge label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData. + i = 0 + # Costs of insertions. + for row in range(1, len(els) + 1): + self.__edge_label_costs[row, 0] = rand_costs[i] + i += 1 + # Costs of deletions. + for col in range(1, len(els) + 1): + self.__edge_label_costs[0, col] = rand_costs[i] + i += 1 + # Costs of substitutions. + for row in range(1, len(els) + 1): + for col in range(row + 1, len(els) + 1): + self.__edge_label_costs[row, col] = rand_costs[i] + self.__edge_label_costs[col, row] = rand_costs[i] + i += 1 + + def __optimize_ecm_by_kernel_distances(self): # compute distances in feature space. dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix() @@ -320,6 +361,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): options['node_attrs'] = self._dataset.node_attrs options['edge_attrs'] = self._dataset.edge_attrs options['node_label_costs'] = self.__node_label_costs + options['edge_label_costs'] = self.__edge_label_costs ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1)) residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] time_list = [time.time() - time0] diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index c692c63..19c9993 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -545,6 +545,16 @@ class Dataset(object): if nl not in node_labels: node_labels.append(nl) return node_labels + + + def get_all_edge_labels(self): + edge_labels = [] + for g in self.__graphs: + for e in g.edges(): + el = tuple(g.edges[e].items()) + if el not in edge_labels: + edge_labels.append(el) + return edge_labels def __get_dataset_size(self):