Add ability to GEDEnv to use pre-defined costs between edge labels.

5 years ago · 31a8a9c51d
--- a/gklearn/ged/env/ged_data.py
+++ b/gklearn/ged/env/ged_data.py
@@ -24,6 +24,7 @@ class GEDData(object):
 		self._node_costs = None
 		self._edge_costs = None
 		self._node_label_costs = None
 		self._edge_label_costs = None
 		self._node_labels = []
 		self._edge_labels = []
 		self._init_type = Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES
@@ -114,15 +115,22 @@ class GEDData(object):
 	 * and 0 otherwise.
 	 */
 		"""
 		if self._eager_init(): # @todo: check if correct
 			return self._node_costs[label1, label2]
 		if label1 == label2:
 			return 0
 		if label1 == SpecialLabel.DUMMY:
 			return self._edit_cost.edge_ins_cost_fun(label2) # self._edge_labels[label2 - 1])
 		if label2 == SpecialLabel.DUMMY:
 			return self._edit_cost.edge_del_cost_fun(label1) # self._edge_labels[label1 - 1])
 		return self._edit_cost.edge_rel_cost_fun(label1, label2) # self._edge_labels[label1 - 1], self._edge_labels[label2 - 1])
 		if self._edge_label_costs is None:
 			if self._eager_init(): # @todo: check if correct
 				return self._node_costs[label1, label2]
 			if label1 == label2:
 				return 0
 			if label1 == SpecialLabel.DUMMY:
 				return self._edit_cost.edge_ins_cost_fun(label2) # self._edge_labels[label2 - 1])
 			if label2 == SpecialLabel.DUMMY:
 				return self._edit_cost.edge_del_cost_fun(label1) # self._edge_labels[label1 - 1])
 			return self._edit_cost.edge_rel_cost_fun(label1, label2) # self._edge_labels[label1 - 1], self._edge_labels[label2 - 1])
 		
 		# use pre-computed edge label costs.
 		else:
 			id1 = 0 if label1 == SpecialLabel.DUMMY else self._edge_label_to_id(label1) # @todo: this is slow.
 			id2 = 0 if label2 == SpecialLabel.DUMMY else self._edge_label_to_id(label2)
 			return self._edge_label_costs[id1, id2]
 	
 	
 	def compute_induced_cost(self, g, h, node_map):
--- a/gklearn/ged/env/ged_env.py
+++ b/gklearn/ged/env/ged_env.py
@@ -228,10 +228,13 @@ class GEDEnv(object):
 		return self.__ged_data._init_type
 	
 	
 	def set_label_costs(self, node_label_costs=None, edge_label_costs=None):
 		"""Set the pre-defined costs between node labels and/or between edge labels.

 		Each argument is stored on the underlying GED data only when it is not
 		None, so the two kinds of costs can be set independently and a call
 		never clears costs that were set earlier. A single positional argument
 		is still interpreted as the node label costs.

 		Parameters
 		----------
 		node_label_costs : optional
 			Stored as ``_node_label_costs`` of the GED data; ignored when None.
 		edge_label_costs : optional
 			Stored as ``_edge_label_costs`` of the GED data; ignored when None.
 		"""
 		if node_label_costs is not None:
 			self.__ged_data._node_label_costs = node_label_costs
 		if edge_label_costs is not None:
 			self.__ged_data._edge_label_costs = edge_label_costs
 		
 		
 	def set_method(self, method, options=''):
--- a/gklearn/ged/util/lsape_solver.py
+++ b/gklearn/ged/util/lsape_solver.py
@@ -8,6 +8,7 @@ Created on Mon Jun 22 15:37:36 2020
 import numpy as np
 from scipy.optimize import linear_sum_assignment


 class LSAPESolver(object):
 	
 	
--- a/gklearn/ged/util/util.py
+++ b/gklearn/ged/util/util.py
@@ -55,7 +55,8 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
 	for g in graphs:
 		ged_env.add_nx_graph(g, '')
 	listID = ged_env.get_all_graph_ids()	
 	ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None)
 	ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None,
 						 options['edge_label_costs'] if 'edge_label_costs' in options else None)
 	ged_env.init(init_type=options['init_option'])
 	if parallel:
 		options['threads'] = 1
--- a/gklearn/preimage/median_preimage_generator_cml.py
+++ b/gklearn/preimage/median_preimage_generator_cml.py
@@ -196,46 +196,16 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 	def __optimize_edit_cost_vector(self):
 		"""Learn edit cost vector.	
 		"""
 		if self.__init_method == 'random': # random
 			# Get list of node labels.
 			nls = self._dataset.get_all_node_labels()
 			# Generate random costs.
 			nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
 			rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
 			self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1))
 			# Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
 			i = 0
 			# Costs of insertions.
 			for row in range(1, len(nls) + 1):
 				self.__node_label_costs[row, 0] = rand_costs[i]
 				i += 1
 			# Costs of deletions.
 			for col in range(1, len(nls) + 1):
 				self.__node_label_costs[0, col] = rand_costs[i]
 				i += 1
 			# Costs of substitutions.				
 			for row in range(1, len(nls) + 1):
 				for col in range(row + 1, len(nls) + 1):
 					self.__node_label_costs[row, col] = rand_costs[i]
 					self.__node_label_costs[col, row] = rand_costs[i]
 					i += 1
 					
 # 			self.__node_label_costs = {}
 # 			for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
 # 				self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
 # 			# Add costs for deletion.
 # 			for j, nl in enumerate(nls):
 # 				self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
 # 			# Add costs for insertion.
 # 			for k, nl in enumerate(nls):
 # 				self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
 # 			# Add self costs.
 # 			for nl in nls:
 # 				self.__node_label_costs[(nl, nl)] = 0
 # 			self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0
 		 # Initialize label costs randomly.
 		if self.__init_method == 'random':
 			# Initialize label costs.
 			self.__initialize_label_costs()
 				
 			# Optimize edit cost matrices.
 			self.__optimize_ecm_by_kernel_distances()
 		# Initialize all label costs with the same value.
 		elif self.__init_method == 'uniform': # random
 			pass
 	
 		elif self.__fit_method == 'random': # random
 			if self.__ged_options['edit_cost'] == 'LETTER':
@@ -297,6 +267,77 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 			pass
 		
 		
 	def __initialize_label_costs(self):
 		"""Initialize the node label costs first, then the edge label costs."""
 		# The order is kept fixed: both initializers draw from the shared `random` state.
 		for initialize in (self.__initialize_node_label_costs, self.__initialize_edge_label_costs):
 			initialize()
 				
 				
 	def __initialize_node_label_costs(self):
 		"""Fill ``self.__node_label_costs`` with a random node label cost matrix.

 		The matrix has shape ``(n + 1, n + 1)`` for ``n`` node labels in the
 		dataset; index 0 stands for the dummy label and index ``k`` for the
 		``k``-th label returned by ``self._dataset.get_all_node_labels()``.
 		The diagonal is zero, the block over real labels is symmetric, and all
 		other entries are distinct random values in (0, 1] (distinct integers
 		scaled by their maximum). With no node labels the result is a 1x1 zero
 		matrix instead of an error.
 		"""
 		nls = self._dataset.get_all_node_labels()
 		nb_labels = len(nls)
 		# n entries for column 0, n entries for row 0, one entry per unordered pair of labels.
 		nb_nl = nb_labels * (nb_labels - 1) // 2 + 2 * nb_labels
 		# Explicit float array: the normalisation no longer relies on `list /= numpy scalar` coercion.
 		rand_costs = np.asarray(random.sample(range(1, 10 * nb_nl + 1), nb_nl), dtype=float)
 		if nb_nl > 0: # np.max of an empty draw would raise ValueError.
 			rand_costs /= rand_costs.max() # @todo: maybe not needed.
 		costs = np.zeros((nb_labels + 1, nb_labels + 1))
 		# Costs against the dummy label in column 0, then in row 0.
 		# NOTE(review): the original comments called column 0 "insertions" and row 0 "deletions";
 		# confirm against the node cost lookup in GEDData which of the two each one really is.
 		costs[1:, 0] = rand_costs[:nb_labels]
 		costs[0, 1:] = rand_costs[nb_labels:2 * nb_labels]
 		# Costs of substitutions: upper triangle in row-major order, mirrored to keep the matrix symmetric.
 		rows, cols = np.triu_indices(nb_labels, k=1)
 		substitution_costs = rand_costs[2 * nb_labels:]
 		costs[rows + 1, cols + 1] = substitution_costs
 		costs[cols + 1, rows + 1] = substitution_costs
 		self.__node_label_costs = costs
 				
 # 			self.__node_label_costs = {}
 # 			for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
 # 				self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
 # 			# Add costs for deletion.
 # 			for j, nl in enumerate(nls):
 # 				self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
 # 			# Add costs for insertion.
 # 			for k, nl in enumerate(nls):
 # 				self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
 # 			# Add self costs.
 # 			for nl in nls:
 # 				self.__node_label_costs[(nl, nl)] = 0
 # 			self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0


 	def __initialize_edge_label_costs(self):
 		"""Fill ``self.__edge_label_costs`` with a random edge label cost matrix.

 		The matrix has shape ``(n + 1, n + 1)`` for ``n`` edge labels in the
 		dataset; index 0 stands for the dummy label and index ``k`` for the
 		``k``-th label returned by ``self._dataset.get_all_edge_labels()``.
 		The diagonal is zero, the block over real labels is symmetric, and all
 		other entries are distinct random values in (0, 1] (distinct integers
 		scaled by their maximum). With no edge labels the result is a 1x1 zero
 		matrix instead of an error.
 		"""
 		els = self._dataset.get_all_edge_labels()
 		nb_labels = len(els)
 		# n entries for column 0, n entries for row 0, one entry per unordered pair of labels.
 		nb_el = nb_labels * (nb_labels - 1) // 2 + 2 * nb_labels
 		# Explicit float array: the normalisation no longer relies on `list /= numpy scalar` coercion.
 		rand_costs = np.asarray(random.sample(range(1, 10 * nb_el + 1), nb_el), dtype=float)
 		if nb_el > 0: # np.max of an empty draw would raise ValueError.
 			rand_costs /= rand_costs.max() # @todo: maybe not needed.
 		costs = np.zeros((nb_labels + 1, nb_labels + 1))
 		# The edge cost lookup in GEDData reads this matrix as [id of label1, id of label2] with
 		# id 0 for the dummy label, and a dummy label1 means insertion. Hence column 0 holds the
 		# deletion costs and row 0 the insertion costs.
 		costs[1:, 0] = rand_costs[:nb_labels]
 		costs[0, 1:] = rand_costs[nb_labels:2 * nb_labels]
 		# Costs of substitutions: upper triangle in row-major order, mirrored to keep the matrix symmetric.
 		rows, cols = np.triu_indices(nb_labels, k=1)
 		substitution_costs = rand_costs[2 * nb_labels:]
 		costs[rows + 1, cols + 1] = substitution_costs
 		costs[cols + 1, rows + 1] = substitution_costs
 		self.__edge_label_costs = costs
 		
 		
 	def __optimize_ecm_by_kernel_distances(self):		
 		# compute distances in feature space.
 		dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix()
@@ -320,6 +361,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 		options['node_attrs'] = self._dataset.node_attrs
 		options['edge_attrs'] = self._dataset.edge_attrs
 		options['node_label_costs'] = self.__node_label_costs
 		options['edge_label_costs'] = self.__edge_label_costs
 		ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
 		residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]	
 		time_list = [time.time() - time0]
--- a/gklearn/utils/dataset.py
+++ b/gklearn/utils/dataset.py
@@ -545,6 +545,16 @@ class Dataset(object):
 				if nl not in node_labels:
 					node_labels.append(nl)
 		return node_labels
 	
 	
 	def get_all_edge_labels(self):
 		"""Return the distinct edge labels of all graphs, in order of first appearance.

 		An edge label is the tuple of ``(key, value)`` pairs of the edge's
 		attribute dictionary, so an edge without attributes yields ``()``.
 		"""
 		distinct_labels = []
 		for graph in self.__graphs:
 			for edge in graph.edges():
 				label = tuple(graph.edges[edge].items())
 				# List membership rather than a set: attribute values are not required to be hashable.
 				if label in distinct_labels:
 					continue
 				distinct_labels.append(label)
 		return distinct_labels
 		
 	
 	def __get_dataset_size(self):