Browse Source

Add the ability for GEDEnv to use pre-defined costs between edge labels.

v0.2.x
jajupmochi 5 years ago
parent
commit
31a8a9c51d
6 changed files with 114 additions and 49 deletions
  1. +17
    -9
      gklearn/ged/env/ged_data.py
  2. +5
    -2
      gklearn/ged/env/ged_env.py
  3. +1
    -0
      gklearn/ged/util/lsape_solver.py
  4. +2
    -1
      gklearn/ged/util/util.py
  5. +79
    -37
      gklearn/preimage/median_preimage_generator_cml.py
  6. +10
    -0
      gklearn/utils/dataset.py

+ 17
- 9
gklearn/ged/env/ged_data.py View File

@@ -24,6 +24,7 @@ class GEDData(object):
self._node_costs = None
self._edge_costs = None
self._node_label_costs = None
self._edge_label_costs = None
self._node_labels = []
self._edge_labels = []
self._init_type = Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES
@@ -114,15 +115,22 @@ class GEDData(object):
* and 0 otherwise.
*/
"""
if self._eager_init(): # @todo: check if correct
return self._node_costs[label1, label2]
if label1 == label2:
return 0
if label1 == SpecialLabel.DUMMY:
return self._edit_cost.edge_ins_cost_fun(label2) # self._edge_labels[label2 - 1])
if label2 == SpecialLabel.DUMMY:
return self._edit_cost.edge_del_cost_fun(label1) # self._edge_labels[label1 - 1])
return self._edit_cost.edge_rel_cost_fun(label1, label2) # self._edge_labels[label1 - 1], self._edge_labels[label2 - 1])
if self._edge_label_costs is None:
if self._eager_init(): # @todo: check if correct
return self._node_costs[label1, label2]
if label1 == label2:
return 0
if label1 == SpecialLabel.DUMMY:
return self._edit_cost.edge_ins_cost_fun(label2) # self._edge_labels[label2 - 1])
if label2 == SpecialLabel.DUMMY:
return self._edit_cost.edge_del_cost_fun(label1) # self._edge_labels[label1 - 1])
return self._edit_cost.edge_rel_cost_fun(label1, label2) # self._edge_labels[label1 - 1], self._edge_labels[label2 - 1])
# use pre-computed edge label costs.
else:
id1 = 0 if label1 == SpecialLabel.DUMMY else self._edge_label_to_id(label1) # @todo: this is slow.
id2 = 0 if label2 == SpecialLabel.DUMMY else self._edge_label_to_id(label2)
return self._edge_label_costs[id1, id2]
def compute_induced_cost(self, g, h, node_map):


+ 5
- 2
gklearn/ged/env/ged_env.py View File

@@ -228,10 +228,13 @@ class GEDEnv(object):
return self.__ged_data._init_type
def set_label_costs(self, node_label_costs=None, edge_label_costs=None):
    """Set the pre-defined costs between labels.

    Each argument that is not None is stored on the underlying GEDData
    object. An argument left as None leaves the value that is currently
    stored untouched, so this method cannot be used to reset a cost table
    back to None.

    Parameters
    ----------
    node_label_costs : optional
        Costs between node labels; stored as ``_node_label_costs``.
    edge_label_costs : optional
        Costs between edge labels; stored as ``_edge_label_costs``.
    """
    # Only overwrite what the caller actually supplied, so that node and
    # edge costs can be configured independently of each other.
    if node_label_costs is not None:
        self.__ged_data._node_label_costs = node_label_costs
    if edge_label_costs is not None:
        self.__ged_data._edge_label_costs = edge_label_costs
def set_method(self, method, options=''):


+ 1
- 0
gklearn/ged/util/lsape_solver.py View File

@@ -8,6 +8,7 @@ Created on Mon Jun 22 15:37:36 2020
import numpy as np
from scipy.optimize import linear_sum_assignment


class LSAPESolver(object):


+ 2
- 1
gklearn/ged/util/util.py View File

@@ -55,7 +55,8 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
for g in graphs:
ged_env.add_nx_graph(g, '')
listID = ged_env.get_all_graph_ids()
ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None)
ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None,
options['edge_label_costs'] if 'edge_label_costs' in options else None)
ged_env.init(init_type=options['init_option'])
if parallel:
options['threads'] = 1


+ 79
- 37
gklearn/preimage/median_preimage_generator_cml.py View File

@@ -196,46 +196,16 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
def __optimize_edit_cost_vector(self):
"""Learn edit cost vector.
"""
if self.__init_method == 'random': # random
# Get list of node labels.
nls = self._dataset.get_all_node_labels()
# Generate random costs.
nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1))
# Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. These is the same setting as in GEDData.
i = 0
# Costs of insertions.
for row in range(1, len(nls) + 1):
self.__node_label_costs[row, 0] = rand_costs[i]
i += 1
# Costs of deletions.
for col in range(1, len(nls) + 1):
self.__node_label_costs[0, col] = rand_costs[i]
i += 1
# Costs of substitutions.
for row in range(1, len(nls) + 1):
for col in range(row + 1, len(nls) + 1):
self.__node_label_costs[row, col] = rand_costs[i]
self.__node_label_costs[col, row] = rand_costs[i]
i += 1
# self.__node_label_costs = {}
# for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
# self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
# # Add costs for deletion.
# for j, nl in enumerate(nls):
# self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
# # Add costs for insertion.
# for k, nl in enumerate(nls):
# self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
# # Add self costs.
# for nl in nls:
# self.__node_label_costs[(nl, nl)] = 0
# self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0
# Initialize label costs randomly.
if self.__init_method == 'random':
# Initialize label costs.
self.__initialize_label_costs()
# Optimize edit cost matrices.
self.__optimize_ecm_by_kernel_distances()
# Initialize all label costs with the same value.
elif self.__init_method == 'uniform': # random
pass
elif self.__fit_method == 'random': # random
if self.__ged_options['edit_cost'] == 'LETTER':
@@ -297,6 +267,77 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
pass
def __initialize_label_costs(self):
    """Initialize the node label costs, then the edge label costs."""
    # Node costs are set up before edge costs.
    self.__initialize_node_label_costs()
    self.__initialize_edge_label_costs()
def __initialize_node_label_costs(self):
    """Randomly initialize the node label cost matrix.

    The result is stored in ``self.__node_label_costs`` as a square matrix
    with one row/column per node label returned by
    ``self._dataset.get_all_node_labels()``, preceded by one row/column
    (index 0) for the dummy label. Row 0 and column 0 hold the costs
    between each label and the dummy label; the remaining block holds
    symmetric substitution costs with a zero diagonal. Costs are distinct
    random integers scaled so that the largest cost is 1.

    When the dataset has no node labels the matrix is the 1 x 1 zero
    matrix.
    """
    # Get list of node labels.
    nls = self._dataset.get_all_node_labels()
    nb_labels = len(nls)
    # One cost per unordered pair of distinct labels, plus two per label
    # for its entries in the dummy row and the dummy column.
    nb_costs = nb_labels * (nb_labels - 1) // 2 + 2 * nb_labels
    costs = np.zeros((nb_labels + 1, nb_labels + 1))
    # Without labels there is nothing to draw, and taking the maximum of
    # an empty sample would raise ValueError.
    if nb_costs > 0:
        # Generate random costs as an explicit float array so that the
        # in-place scaling below does not depend on implicit coercion.
        rand_costs = np.array(
            random.sample(range(1, 10 * nb_costs + 1), nb_costs), dtype=float)
        rand_costs /= rand_costs.max()  # @todo: maybe not needed.
        # NOTE(review): these two groups were originally commented as
        # "insertions" (column 0) and "deletions" (row 0). GEDData's edge
        # cost lookup indexes [id1, id2] with id 0 for the dummy label and
        # treats a dummy first label as an insertion, which would make
        # row 0 the insertions -- confirm the convention for node costs.
        # Column 0: real label as first index, dummy label as second.
        costs[1:, 0] = rand_costs[:nb_labels]
        # Row 0: dummy label as first index, real label as second.
        costs[0, 1:] = rand_costs[nb_labels:2 * nb_labels]
        # Costs of substitutions: fill the strict upper triangle of the
        # label block row by row and mirror it below the diagonal.
        rows, cols = np.triu_indices(nb_labels, k=1)
        costs[rows + 1, cols + 1] = rand_costs[2 * nb_labels:]
        costs[cols + 1, rows + 1] = rand_costs[2 * nb_labels:]
    self.__node_label_costs = costs


def __initialize_edge_label_costs(self):
    """Randomly initialize the edge label cost matrix.

    The result is stored in ``self.__edge_label_costs`` as a square matrix
    with one row/column per edge label returned by
    ``self._dataset.get_all_edge_labels()``, preceded by one row/column
    (index 0) for the dummy label. This matches how GEDData looks the
    costs up: ``_edge_label_costs[id1, id2]`` with id 0 for the dummy
    label. Costs are distinct random integers scaled so that the largest
    cost is 1; substitution costs are symmetric with a zero diagonal.

    When the dataset has no edge labels the matrix is the 1 x 1 zero
    matrix.
    """
    # Get list of edge labels.
    els = self._dataset.get_all_edge_labels()
    nb_labels = len(els)
    # One cost per unordered pair of distinct labels, plus two per label
    # for its entries in the dummy row and the dummy column.
    nb_costs = nb_labels * (nb_labels - 1) // 2 + 2 * nb_labels
    costs = np.zeros((nb_labels + 1, nb_labels + 1))
    # Without labels there is nothing to draw, and taking the maximum of
    # an empty sample would raise ValueError.
    if nb_costs > 0:
        # Generate random costs as an explicit float array so that the
        # in-place scaling below does not depend on implicit coercion.
        rand_costs = np.array(
            random.sample(range(1, 10 * nb_costs + 1), nb_costs), dtype=float)
        rand_costs /= rand_costs.max()  # @todo: maybe not needed.
        # Column 0 (second label is the dummy): GEDData's edge cost treats
        # a dummy second label as a deletion of the first label.
        costs[1:, 0] = rand_costs[:nb_labels]
        # Row 0 (first label is the dummy): GEDData's edge cost treats a
        # dummy first label as an insertion of the second label.
        costs[0, 1:] = rand_costs[nb_labels:2 * nb_labels]
        # Costs of substitutions: fill the strict upper triangle of the
        # label block row by row and mirror it below the diagonal.
        rows, cols = np.triu_indices(nb_labels, k=1)
        costs[rows + 1, cols + 1] = rand_costs[2 * nb_labels:]
        costs[cols + 1, rows + 1] = rand_costs[2 * nb_labels:]
    self.__edge_label_costs = costs
def __optimize_ecm_by_kernel_distances(self):
# compute distances in feature space.
dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix()
@@ -320,6 +361,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
options['node_attrs'] = self._dataset.node_attrs
options['edge_attrs'] = self._dataset.edge_attrs
options['node_label_costs'] = self.__node_label_costs
options['edge_label_costs'] = self.__edge_label_costs
ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
time_list = [time.time() - time0]


+ 10
- 0
gklearn/utils/dataset.py View File

@@ -545,6 +545,16 @@ class Dataset(object):
if nl not in node_labels:
node_labels.append(nl)
return node_labels
def get_all_edge_labels(self):
    """Return the distinct edge labels of all graphs, in first-seen order.

    An edge's label is the tuple of its ``(attribute, value)`` items, so
    an edge without attributes contributes the empty tuple.
    """
    distinct = []
    for graph in self.__graphs:
        for edge in graph.edges():
            label = tuple(graph.edges[edge].items())
            # Skip labels that were already collected.
            if label in distinct:
                continue
            distinct.append(label)
    return distinct
def __get_dataset_size(self):


Loading…
Cancel
Save