Fix bugs in ged.util.util.get_nb_edit_operations_symbolic_cml() and add test for it.

5 years ago · 0db57fe3ce
--- a/gklearn/ged/util/util.py
+++ b/gklearn/ged/util/util.py
@@ -49,16 +49,18 @@ def compute_ged(g1, g2, options):


 def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True):
 	
 	node_label_costs = options['node_label_costs'] if 'node_label_costs' in options else None
 	edge_label_costs = options['edge_label_costs'] if 'edge_label_costs' in options else None

 	# initialize ged env.
 	ged_env = GEDEnv()
 	ged_env.set_edit_cost(options['edit_cost'], edit_cost_constants=options['edit_cost_constants'])
 	for g in graphs:
 		ged_env.add_nx_graph(g, '')
 	listID = ged_env.get_all_graph_ids()	
 	listID = ged_env.get_all_graph_ids()
 	
 	node_labels = ged_env.get_all_node_labels()
 	edge_labels =  ged_env.get_all_edge_labels()
 	node_label_costs = label_costs_to_matrix(options['node_label_costs'], len(node_labels)) if 'node_label_costs' in options else None
 	edge_label_costs = label_costs_to_matrix(options['edge_label_costs'], len(edge_labels)) if 'edge_label_costs' in options else None
 	ged_env.set_label_costs(node_label_costs, edge_label_costs)
 	ged_env.init(init_type=options['init_option'])
 	if parallel:
@@ -69,11 +71,9 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
 	# compute ged.
 	# options used to compute numbers of edit operations.
 	neo_options = {'edit_cost': options['edit_cost'],
 # 				'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], 
 # 				'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs'],
 				'is_cml': True,
 				'node_labels': ged_env.get_all_node_labels(),
 				'edge_labels': ged_env.get_all_edge_labels()}
 				'node_labels': node_labels,
 				'edge_labels': edge_labels}
 	ged_mat = np.zeros((len(graphs), len(graphs)))
 	if parallel:
 		len_itr = int(len(graphs) * (len(graphs) - 1) / 2)
@@ -243,11 +243,45 @@ def _compute_ged(env, gid1, gid2, g1, g2):
 	return dis, pi_forward, pi_backward


 def label_costs_to_matrix(costs, nb_labels):
 	"""Expand a flat vector of label costs into a square cost matrix.

 	Parameters
 	----------
 	costs : numpy.array (or any integer-indexable sequence)
 		Costs for one kind of label, read in this order: nb_labels insertion costs, nb_labels deletion costs, then one substitution cost per unordered pair of distinct labels, pairs being enumerated row by row: (1, 2), (1, 3), ..., (2, 3), ... Entries beyond the ones needed are not read.
 	nb_labels : integer
 		Number of labels, not counting the dummy label.

 	Returns
 	-------
 	cost_matrix : numpy.array
 		The label cost matrix of size (nb_labels + 1, nb_labels + 1). Each row/column corresponds to a label and index 0 is the dummy label. Insertion costs fill row 0, deletion costs fill column 0, and every substitution cost is written symmetrically at [row, col] and [col, row]. The diagonal stays 0.
 	"""
 	size = nb_labels + 1
 	matrix = np.zeros((size, size))
 	# Insertions: first nb_labels entries, stored in the dummy row.
 	for k in range(nb_labels):
 		matrix[0, k + 1] = costs[k]
 	# Deletions: next nb_labels entries, stored in the dummy column.
 	for k in range(nb_labels):
 		matrix[k + 1, 0] = costs[nb_labels + k]
 	# Substitutions: remaining entries, one per unordered pair of real labels.
 	pos = 2 * nb_labels
 	for row in range(1, size):
 		for col in range(row + 1, size):
 			matrix[row, col] = matrix[col, row] = costs[pos]
 			pos += 1

 	return matrix


 def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
 	if is_cml:
 		if edit_cost == 'CONSTANT':
 			node_label_costs = kwargs.get('node_label_costs')
 			edge_label_costs = kwargs.get('edge_label_costs')
 			node_labels = kwargs.get('node_labels', [])
 			edge_labels = kwargs.get('edge_labels', [])
 			return get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
@@ -273,12 +307,12 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is
 		
 def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, 
 										node_labels=[], edge_labels=[]):
 	"""Compute the number of each edit operations for symbolic-labeled graphs, where the costs are different for each pair of nodes.
 	"""Compute times that edit operations are used in an edit path for symbolic-labeled graphs, where the costs are different for each pair of nodes.
 	
 	Returns
 	-------
 	list
 		A vector of costs bewteen labels, formed in the order of node insertion costs, node deletion costs, node substitition costs, edge insertion costs, edge deletion costs, edge substitition costs. The dummy label is the first label, and the self label costs are not included.
 		A vector of numbers of times that costs bewteen labels are used in an edit path, formed in the order of node insertion costs, node deletion costs, node substitition costs, edge insertion costs, edge deletion costs, edge substitition costs. The dummy label is the first label, and the self label costs are not included.
 	"""
 	# Initialize.
 	nb_ops_node = np.zeros((1 + len(node_labels), 1 + len(node_labels)))
@@ -290,7 +324,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
 		label1 = tuple(g1.nodes[nodes1[i]].items()) # @todo: order and faster
 		idx_label1 = node_labels.index(label1) # @todo: faster
 		if map_i == np.inf: # deletions.
 			nb_ops_node[0, idx_label1 + 1] += 1
 			nb_ops_node[idx_label1 + 1, 0] += 1
 		else: # substitutions.
 			label2 = tuple(g2.nodes[map_i].items())
 			if label1 != label2:
@@ -302,7 +336,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
 		if map_i == np.inf:
 			label = tuple(g2.nodes[nodes2[i]].items())
 			idx_label = node_labels.index(label) # @todo: faster
 			nb_ops_node[idx_label + 1, 0] += 1
 			nb_ops_node[0, idx_label + 1] += 1
 			
 	# For edges.
 	edges1 = [e for e in g1.edges()]
@@ -314,7 +348,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
 		idxt1 = nodes1.index(nt1) # @todo: faster
 		# At least one of the nodes is removed, thus the edge is removed.
 		if forward_map[idxf1] == np.inf or forward_map[idxt1] == np.inf:
 			nb_ops_edge[0, idx_label1 + 1] += 1
 			nb_ops_edge[idx_label1 + 1, 0] += 1
 		# corresponding edge is in g2.
 		else:
 			nf2, nt2 = forward_map[idxf1], forward_map[idxt1]
@@ -335,38 +369,38 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
 					nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1
 			# Corresponding nodes are in g2, however the edge is removed.
 			else:
 				nb_ops_edge[0, idx_label1 + 1] += 1
 				nb_ops_edge[idx_label1 + 1, 0] += 1
 	# insertions.
 	for e in g2.edges():
 		if e not in edges2_marked:
 			label = tuple(g2.edges[e].items())
 	for nt, nf in g2.edges():
 		if (nt, nf) not in edges2_marked and (nf, nt) not in edges2_marked: # @todo: for directed.
 			label = tuple(g2.edges[(nt, nf)].items())
 			idx_label = edge_labels.index(label) # @todo: faster
 			nb_ops_edge[idx_label + 1, 0] += 1
 			nb_ops_edge[0, idx_label + 1] += 1
 			
 	# Reform the costs into a vector.
 	cost_vector = []
 	# Add node insertion costs.
 	# Reform the numbers of edit oeprations into a vector.
 	nb_eo_vector = []
 	# node insertion.
 	for i in range(1, len(nb_ops_node)):
 		cost_vector.append(nb_ops_node[i, 0])
 	# Add node deletion costs.
 		nb_eo_vector.append(nb_ops_node[0, i])
 	# node deletion.
 	for i in range(1, len(nb_ops_node)):
 		cost_vector.append(nb_ops_node[0, i])
 	# Add node substitution costs.
 		nb_eo_vector.append(nb_ops_node[i, 0])
 	# node substitution.
 	for i in range(1, len(nb_ops_node)):
 		for j in range(i + 1, len(nb_ops_node)):
 			cost_vector.append(nb_ops_node[i, j])
 	# Add edge insertion costs.
 			nb_eo_vector.append(nb_ops_node[i, j])
 	# edge insertion.
 	for i in range(1, len(nb_ops_edge)):
 		cost_vector.append(nb_ops_edge[i, 0])
 	# Add edge deletion costs.
 		nb_eo_vector.append(nb_ops_edge[0, i])
 	# edge deletion.
 	for i in range(1, len(nb_ops_edge)):
 		cost_vector.append(nb_ops_edge[0, i])
 	# Add edge substitution costs.
 		nb_eo_vector.append(nb_ops_edge[i, 0])
 	# edge substitution.
 	for i in range(1, len(nb_ops_edge)):
 		for j in range(i + 1, len(nb_ops_edge)):
 			cost_vector.append(nb_ops_edge[i, j])
 			nb_eo_vector.append(nb_ops_edge[i, j])
 	
 	return cost_vector
 	return nb_eo_vector
 	

 def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
--- a/gklearn/preimage/median_preimage_generator_cml.py
+++ b/gklearn/preimage/median_preimage_generator_cml.py
@@ -279,37 +279,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 		nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
 		rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
 		rand_costs /= np.max(rand_costs) # @todo: maybe not needed.
 		self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1))
 		# Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
 		i = 0
 		# Costs of insertions.
 		for row in range(1, len(nls) + 1):
 			self.__node_label_costs[row, 0] = rand_costs[i]
 			i += 1
 		# Costs of deletions.
 		for col in range(1, len(nls) + 1):
 			self.__node_label_costs[0, col] = rand_costs[i]
 			i += 1
 		# Costs of substitutions.				
 		for row in range(1, len(nls) + 1):
 			for col in range(row + 1, len(nls) + 1):
 				self.__node_label_costs[row, col] = rand_costs[i]
 				self.__node_label_costs[col, row] = rand_costs[i]
 				i += 1
 				
 # 			self.__node_label_costs = {}
 # 			for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
 # 				self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
 # 			# Add costs for deletion.
 # 			for j, nl in enumerate(nls):
 # 				self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
 # 			# Add costs for insertion.
 # 			for k, nl in enumerate(nls):
 # 				self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
 # 			# Add self costs.
 # 			for nl in nls:
 # 				self.__node_label_costs[(nl, nl)] = 0
 # 			self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0
 		self.__node_label_costs = rand_costs


 	def __initialize_edge_label_costs(self):
@@ -319,23 +289,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 		nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els))
 		rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el)
 		rand_costs /= np.max(rand_costs) # @todo: maybe not needed.
 		self.__edge_label_costs = np.zeros((len(els) + 1, len(els) + 1))
 		# Initialize edge label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
 		i = 0
 		# Costs of insertions.
 		for row in range(1, len(els) + 1):
 			self.__edge_label_costs[row, 0] = rand_costs[i]
 			i += 1
 		# Costs of deletions.
 		for col in range(1, len(els) + 1):
 			self.__edge_label_costs[0, col] = rand_costs[i]
 			i += 1
 		# Costs of substitutions.				
 		for row in range(1, len(els) + 1):
 			for col in range(row + 1, len(els) + 1):
 				self.__edge_label_costs[row, col] = rand_costs[i]
 				self.__edge_label_costs[col, row] = rand_costs[i]
 				i += 1
 		self.__edge_label_costs = rand_costs
 		
 		
 	def __optimize_ecm_by_kernel_distances(self):		
--- a/gklearn/tests/ged/test_ged_env.py
+++ b/gklearn/tests/ged/test_ged_env.py
--- a/gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py
+++ b/gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py
@@ -0,0 +1,122 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Mon Jul  6 12:08:24 2020

@author: ljia
 """
 import random
 import numpy as np

 def test_get_nb_edit_operations_symbolic_cml():
 	"""Check that the counted edit operations reproduce the computed GED.

 	The first two graphs of the predefined "MUTAG" dataset are compared using randomly drawn label costs. The dot product of the concatenated (node, edge) label cost vectors with the vector returned by get_nb_edit_operations_symbolic_cml() must agree with the distance returned by _compute_ged(), with a relative error below 10e-6 (i.e. 1e-5).
 	"""
 	# Step 1: load the predefined dataset and pick the two graphs under test.
 	from gklearn.utils import Dataset

 	ds_name = 'MUTAG'
 	dataset = Dataset()
 	dataset.load_predefined_dataset(ds_name)
 	graph1, graph2 = dataset.graphs[0], dataset.graphs[1]

 	# Step 2: draw random label costs and compute the graph edit distance.
 	node_label_costs, edge_label_costs = _initialize_label_costs(dataset)
 	pi_forward, pi_backward, dis, node_labels, edge_labels = _compute_ged(
 		dataset, node_label_costs, edge_label_costs)

 	# Step 3: count how often each label cost is used along the edit path.
 	from gklearn.ged.util.util import get_nb_edit_operations_symbolic_cml

 	n_edit_operations = get_nb_edit_operations_symbolic_cml(
 		graph1, graph2, pi_forward, pi_backward, node_labels, edge_labels)

 	# Step 4: costs weighted by their usage counts must add up to the distance.
 	all_costs = np.concatenate((node_label_costs, edge_label_costs))
 	recomputed = np.dot(all_costs, n_edit_operations)
 	relative_error = np.abs((recomputed - dis) / dis)
 	assert relative_error < 10e-6
 		
 		
 def _initialize_label_costs(dataset):
 	"""Return the pair (node label costs, edge label costs) drawn at random for `dataset`.

 	The node costs are generated before the edge costs.
 	"""
 	return _initialize_node_label_costs(dataset), _initialize_edge_label_costs(dataset)
 	
 	
 def _initialize_node_label_costs(dataset):
 	"""Draw a random cost vector for the node labels of `dataset`.

 	The vector holds two entries per label plus one entry per unordered pair of distinct labels. The entries are distinct integers sampled from 1 .. 10 * length, divided by the largest sampled value, so the maximum of the returned numpy array is 1.
 	"""
 	n_labels = len(dataset.get_all_node_labels())
 	n_costs = int((n_labels * (n_labels - 1)) / 2 + 2 * n_labels)
 	drawn = random.sample(range(1, 10 * n_costs + 1), n_costs)
 	# Dividing the list by a NumPy scalar produces a numpy array.
 	return drawn / np.max(drawn)


 def _initialize_edge_label_costs(dataset):
 	"""Draw a random cost vector for the edge labels of `dataset`.

 	The vector holds two entries per label plus one entry per unordered pair of distinct labels. The entries are distinct integers sampled from 1 .. 10 * length, divided by the largest sampled value, so the maximum of the returned numpy array is 1.
 	"""
 	n_labels = len(dataset.get_all_edge_labels())
 	n_costs = int((n_labels * (n_labels - 1)) / 2 + 2 * n_labels)
 	drawn = random.sample(range(1, 10 * n_costs + 1), n_costs)
 	# Dividing the list by a NumPy scalar produces a numpy array.
 	return drawn / np.max(drawn)


 def _compute_ged(dataset, node_label_costs, edge_label_costs):
 	"""Run a 'BIPARTITE' GED computation between the first two graphs registered from `dataset`.

 	Parameters
 	----------
 	dataset : object exposing `graphs`
 		Every graph is added to the GED environment; only the first two graph ids returned by the environment are compared.
 	node_label_costs, edge_label_costs : cost vectors
 		Flat label cost vectors, converted with label_costs_to_matrix() before being handed to the environment.

 	Returns
 	-------
 	tuple
 		(pi_forward, pi_backward, dis, node_labels, edge_labels). The two maps are lists of node identifiers of the other graph, with np.inf wherever the raw map index is not smaller than that graph's number of nodes. dis is the upper bound reported by the environment, and the label lists are the ones collected by the environment.
 	"""
 	from gklearn.ged.env import GEDEnv
 	from gklearn.ged.util.util import label_costs_to_matrix
 	import networkx as nx

 	def _as_node_ids(index_map, target_nodes, nb_target):
 		# Replace positional indices by node identifiers; out-of-range indices become np.inf.
 		return [target_nodes[idx] if idx < nb_target else np.inf for idx in index_map]

 	# Set up the environment with constant edit costs and register all graphs.
 	env = GEDEnv()
 	env.set_edit_cost('CONSTANT', edit_cost_constants=[3, 3, 1, 3, 3, 1])
 	for graph in dataset.graphs:
 		env.add_nx_graph(graph, '')

 	node_labels = env.get_all_node_labels()
 	edge_labels = env.get_all_edge_labels()
 	graph_ids = env.get_all_graph_ids()

 	# Label costs have to be set before the environment is initialized.
 	node_cost_matrix = label_costs_to_matrix(node_label_costs, len(node_labels))
 	edge_cost_matrix = label_costs_to_matrix(edge_label_costs, len(edge_labels))
 	env.set_label_costs(node_cost_matrix, edge_cost_matrix)
 	env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES')

 	# Configure and run the GED method on the first two graphs.
 	env.set_method('BIPARTITE', {'initialization_method': 'RANDOM', 'threads': 1})
 	env.init_method()
 	gid1, gid2 = graph_ids[0], graph_ids[1]
 	env.run_method(gid1, gid2)

 	raw_forward = env.get_forward_map(gid1, gid2)
 	raw_backward = env.get_backward_map(gid1, gid2)
 	dis = env.get_upper_bound(gid1, gid2)

 	# Translate the raw maps so that removed nodes are labelled np.inf.
 	first, second = dataset.graphs[0], dataset.graphs[1]
 	nodes1 = list(first.nodes())
 	nodes2 = list(second.nodes())
 	nb1 = nx.number_of_nodes(first)
 	nb2 = nx.number_of_nodes(second)
 	pi_forward = _as_node_ids(raw_forward, nodes2, nb2)
 	pi_backward = _as_node_ids(raw_backward, nodes1, nb1)

 	return pi_forward, pi_backward, dis, node_labels, edge_labels
 		

 # Allow running this test module directly as a script, without a test runner.
 if __name__ == "__main__":
 	test_get_nb_edit_operations_symbolic_cml()