From 0db57fe3cedad9c1146aac01daddb7a059c5aa6e Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 6 Jul 2020 18:21:31 +0200 Subject: [PATCH] Fix bugs in ged.util.util.get_nb_edit_operations_symbolic_cml() and add test for it. --- gklearn/ged/util/util.py | 104 ++++++++++++------ gklearn/preimage/median_preimage_generator_cml.py | 50 +-------- gklearn/tests/{ => ged}/test_ged_env.py | 0 .../test_get_nb_edit_operations_symbolic_cml.py | 122 +++++++++++++++++++++ 4 files changed, 193 insertions(+), 83 deletions(-) rename gklearn/tests/{ => ged}/test_ged_env.py (100%) create mode 100644 gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index b06fadc..0cffeba 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -49,16 +49,18 @@ def compute_ged(g1, g2, options): def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True): - - node_label_costs = options['node_label_costs'] if 'node_label_costs' in options else None - edge_label_costs = options['edge_label_costs'] if 'edge_label_costs' in options else None # initialize ged env. ged_env = GEDEnv() ged_env.set_edit_cost(options['edit_cost'], edit_cost_constants=options['edit_cost_constants']) for g in graphs: ged_env.add_nx_graph(g, '') - listID = ged_env.get_all_graph_ids() + listID = ged_env.get_all_graph_ids() + + node_labels = ged_env.get_all_node_labels() + edge_labels = ged_env.get_all_edge_labels() + node_label_costs = label_costs_to_matrix(options['node_label_costs'], len(node_labels)) if 'node_label_costs' in options else None + edge_label_costs = label_costs_to_matrix(options['edge_label_costs'], len(edge_labels)) if 'edge_label_costs' in options else None ged_env.set_label_costs(node_label_costs, edge_label_costs) ged_env.init(init_type=options['init_option']) if parallel: @@ -69,11 +71,9 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True # compute ged. 
# options used to compute numbers of edit operations. neo_options = {'edit_cost': options['edit_cost'], -# 'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], -# 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs'], 'is_cml': True, - 'node_labels': ged_env.get_all_node_labels(), - 'edge_labels': ged_env.get_all_edge_labels()} + 'node_labels': node_labels, + 'edge_labels': edge_labels} ged_mat = np.zeros((len(graphs), len(graphs))) if parallel: len_itr = int(len(graphs) * (len(graphs) - 1) / 2) @@ -243,11 +243,45 @@ def _compute_ged(env, gid1, gid2, g1, g2): return dis, pi_forward, pi_backward +def label_costs_to_matrix(costs, nb_labels): + """Reform a label cost vector to a matrix. + + Parameters + ---------- + costs : numpy.array + The vector containing costs between labels, in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs. + nb_labels : integer + Number of labels. + + Returns + ------- + cost_matrix : numpy.array. + The reformed label cost matrix of size (nb_labels + 1, nb_labels + 1). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData. + """ + # Initialize label cost matrix. + cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1)) + i = 0 + # Costs of insertions. + for col in range(1, nb_labels + 1): + cost_matrix[0, col] = costs[i] + i += 1 + # Costs of deletions. + for row in range(1, nb_labels + 1): + cost_matrix[row, 0] = costs[i] + i += 1 + # Costs of substitutions. 
+ for row in range(1, nb_labels + 1): + for col in range(row + 1, nb_labels + 1): + cost_matrix[row, col] = costs[i] + cost_matrix[col, row] = costs[i] + i += 1 + + return cost_matrix + + def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs): if is_cml: if edit_cost == 'CONSTANT': - node_label_costs = kwargs.get('node_label_costs') - edge_label_costs = kwargs.get('edge_label_costs') node_labels = kwargs.get('node_labels', []) edge_labels = kwargs.get('edge_labels', []) return get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, @@ -273,12 +307,12 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, node_labels=[], edge_labels=[]): - """Compute the number of each edit operations for symbolic-labeled graphs, where the costs are different for each pair of nodes. + """Compute the number of times each edit operation is used in an edit path for symbolic-labeled graphs, where the costs are different for each pair of nodes. Returns ------- list - A vector of costs bewteen labels, formed in the order of node insertion costs, node deletion costs, node substitition costs, edge insertion costs, edge deletion costs, edge substitition costs. The dummy label is the first label, and the self label costs are not included. + A vector of numbers of times that costs between labels are used in an edit path, formed in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs. The dummy label is the first label, and the self label costs are not included. """ # Initialize. 
nb_ops_node = np.zeros((1 + len(node_labels), 1 + len(node_labels))) @@ -290,7 +324,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, label1 = tuple(g1.nodes[nodes1[i]].items()) # @todo: order and faster idx_label1 = node_labels.index(label1) # @todo: faster if map_i == np.inf: # deletions. - nb_ops_node[0, idx_label1 + 1] += 1 + nb_ops_node[idx_label1 + 1, 0] += 1 else: # substitutions. label2 = tuple(g2.nodes[map_i].items()) if label1 != label2: @@ -302,7 +336,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, if map_i == np.inf: label = tuple(g2.nodes[nodes2[i]].items()) idx_label = node_labels.index(label) # @todo: faster - nb_ops_node[idx_label + 1, 0] += 1 + nb_ops_node[0, idx_label + 1] += 1 # For edges. edges1 = [e for e in g1.edges()] @@ -314,7 +348,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, idxt1 = nodes1.index(nt1) # @todo: faster # At least one of the nodes is removed, thus the edge is removed. if forward_map[idxf1] == np.inf or forward_map[idxt1] == np.inf: - nb_ops_edge[0, idx_label1 + 1] += 1 + nb_ops_edge[idx_label1 + 1, 0] += 1 # corresponding edge is in g2. else: nf2, nt2 = forward_map[idxf1], forward_map[idxt1] @@ -335,38 +369,38 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map, nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1 # Corresponding nodes are in g2, however the edge is removed. else: - nb_ops_edge[0, idx_label1 + 1] += 1 + nb_ops_edge[idx_label1 + 1, 0] += 1 # insertions. - for e in g2.edges(): - if e not in edges2_marked: - label = tuple(g2.edges[e].items()) + for nt, nf in g2.edges(): + if (nt, nf) not in edges2_marked and (nf, nt) not in edges2_marked: # @todo: for directed. + label = tuple(g2.edges[(nt, nf)].items()) idx_label = edge_labels.index(label) # @todo: faster - nb_ops_edge[idx_label + 1, 0] += 1 + nb_ops_edge[0, idx_label + 1] += 1 - # Reform the costs into a vector. 
- cost_vector = [] - # Add node insertion costs. + # Reform the numbers of edit operations into a vector. + nb_eo_vector = [] + # node insertion. for i in range(1, len(nb_ops_node)): - cost_vector.append(nb_ops_node[i, 0]) - # Add node deletion costs. + nb_eo_vector.append(nb_ops_node[0, i]) + # node deletion. for i in range(1, len(nb_ops_node)): - cost_vector.append(nb_ops_node[0, i]) - # Add node substitution costs. + nb_eo_vector.append(nb_ops_node[i, 0]) + # node substitution. for i in range(1, len(nb_ops_node)): for j in range(i + 1, len(nb_ops_node)): - cost_vector.append(nb_ops_node[i, j]) - # Add edge insertion costs. + nb_eo_vector.append(nb_ops_node[i, j]) + # edge insertion. for i in range(1, len(nb_ops_edge)): - cost_vector.append(nb_ops_edge[i, 0]) - # Add edge deletion costs. + nb_eo_vector.append(nb_ops_edge[0, i]) + # edge deletion. for i in range(1, len(nb_ops_edge)): - cost_vector.append(nb_ops_edge[0, i]) - # Add edge substitution costs. + nb_eo_vector.append(nb_ops_edge[i, 0]) + # edge substitution. for i in range(1, len(nb_ops_edge)): for j in range(i + 1, len(nb_ops_edge)): - cost_vector.append(nb_ops_edge[i, j]) + nb_eo_vector.append(nb_ops_edge[i, j]) - return cost_vector + return nb_eo_vector def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, diff --git a/gklearn/preimage/median_preimage_generator_cml.py b/gklearn/preimage/median_preimage_generator_cml.py index c4a92a6..161475a 100644 --- a/gklearn/preimage/median_preimage_generator_cml.py +++ b/gklearn/preimage/median_preimage_generator_cml.py @@ -279,37 +279,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls)) rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl) rand_costs /= np.max(rand_costs) # @todo: maybe not needed. 
- self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1)) - # Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData. - i = 0 - # Costs of insertions. - for row in range(1, len(nls) + 1): - self.__node_label_costs[row, 0] = rand_costs[i] - i += 1 - # Costs of deletions. - for col in range(1, len(nls) + 1): - self.__node_label_costs[0, col] = rand_costs[i] - i += 1 - # Costs of substitutions. - for row in range(1, len(nls) + 1): - for col in range(row + 1, len(nls) + 1): - self.__node_label_costs[row, col] = rand_costs[i] - self.__node_label_costs[col, row] = rand_costs[i] - i += 1 - -# self.__node_label_costs = {} -# for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)): -# self.__node_label_costs[(nl1, nl2)] = rand_costs[i] -# # Add costs for deletion. -# for j, nl in enumerate(nls): -# self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j] -# # Add costs for insertion. -# for k, nl in enumerate(nls): -# self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k] -# # Add self costs. -# for nl in nls: -# self.__node_label_costs[(nl, nl)] = 0 -# self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0 + self.__node_label_costs = rand_costs def __initialize_edge_label_costs(self): @@ -319,23 +289,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els)) rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el) rand_costs /= np.max(rand_costs) # @todo: maybe not needed. - self.__edge_label_costs = np.zeros((len(els) + 1, len(els) + 1)) - # Initialize edge label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData. - i = 0 - # Costs of insertions. - for row in range(1, len(els) + 1): - self.__edge_label_costs[row, 0] = rand_costs[i] - i += 1 - # Costs of deletions. 
- for col in range(1, len(els) + 1): - self.__edge_label_costs[0, col] = rand_costs[i] - i += 1 - # Costs of substitutions. - for row in range(1, len(els) + 1): - for col in range(row + 1, len(els) + 1): - self.__edge_label_costs[row, col] = rand_costs[i] - self.__edge_label_costs[col, row] = rand_costs[i] - i += 1 + self.__edge_label_costs = rand_costs def __optimize_ecm_by_kernel_distances(self): diff --git a/gklearn/tests/test_ged_env.py b/gklearn/tests/ged/test_ged_env.py similarity index 100% rename from gklearn/tests/test_ged_env.py rename to gklearn/tests/ged/test_ged_env.py diff --git a/gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py b/gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py new file mode 100644 index 0000000..aa40cca --- /dev/null +++ b/gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Jul 6 12:08:24 2020 + +@author: ljia +""" +import random +import numpy as np + +def test_get_nb_edit_operations_symbolic_cml(): + """Test get_nb_edit_operations_symbolic_cml(). + """ + """**1. Get dataset.**""" + + from gklearn.utils import Dataset + + # Predefined dataset name, use dataset "MUTAG". + ds_name = 'MUTAG' + + # Initialize a Dataset. + dataset = Dataset() + # Load predefined dataset "MUTAG". + dataset.load_predefined_dataset(ds_name) + graph1 = dataset.graphs[0] + graph2 = dataset.graphs[1] + + """**2. Compute graph edit distance.**""" + +# try: + # Initialize label costs randomly. + node_label_costs, edge_label_costs = _initialize_label_costs(dataset) + + # Compute GEDs. + pi_forward, pi_backward, dis, node_labels, edge_labels = _compute_ged(dataset, node_label_costs, edge_label_costs) + + + # Compute numbers of edit operations. 
+ + from gklearn.ged.util.util import get_nb_edit_operations_symbolic_cml + + n_edit_operations = get_nb_edit_operations_symbolic_cml(graph1, graph2, pi_forward, pi_backward, node_labels, edge_labels) + + assert np.abs((np.dot(np.concatenate((node_label_costs, edge_label_costs)), n_edit_operations) - dis) / dis) < 10e-6 + +# except Exception as exception: +# assert False, exception + + +def _initialize_label_costs(dataset): + node_label_costs = _initialize_node_label_costs(dataset) + edge_label_costs = _initialize_edge_label_costs(dataset) + return node_label_costs, edge_label_costs + + +def _initialize_node_label_costs(dataset): + # Get list of node labels. + nls = dataset.get_all_node_labels() + # Generate random costs. + nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls)) + rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl) + rand_costs /= np.max(rand_costs) + + return rand_costs + + +def _initialize_edge_label_costs(dataset): + # Get list of edge labels. + els = dataset.get_all_edge_labels() + # Generate random costs. + nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els)) + rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el) + rand_costs /= np.max(rand_costs) + + return rand_costs + + +def _compute_ged(dataset, node_label_costs, edge_label_costs): + from gklearn.ged.env import GEDEnv + from gklearn.ged.util.util import label_costs_to_matrix + import networkx as nx + + ged_env = GEDEnv() # initialize GED environment. + ged_env.set_edit_cost('CONSTANT', # GED cost type. + edit_cost_constants=[3, 3, 1, 3, 3, 1] # edit costs. 
+ ) + for g in dataset.graphs: + ged_env.add_nx_graph(g, '') # add graphs + + node_labels = ged_env.get_all_node_labels() + edge_labels = ged_env.get_all_edge_labels() + listID = ged_env.get_all_graph_ids() # get list IDs of graphs + ged_env.set_label_costs(label_costs_to_matrix(node_label_costs, len(node_labels)), + label_costs_to_matrix(edge_label_costs, len(edge_labels))) + ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES') # initialize GED environment. + options = {'initialization_method': 'RANDOM', # or 'NODE', etc. + 'threads': 1 # parallel threads. + } + ged_env.set_method('BIPARTITE', # GED method. + options # options for GED method. + ) + ged_env.init_method() # initialize GED method. + + ged_env.run_method(listID[0], listID[1]) # run. + + pi_forward = ged_env.get_forward_map(listID[0], listID[1]) # forward map. + pi_backward = ged_env.get_backward_map(listID[0], listID[1]) # backward map. + dis = ged_env.get_upper_bound(listID[0], listID[1]) # GED between two graphs. + + # Convert the maps from node indices to node names; out-of-range indices (removed nodes) are marked as np.inf. + nodes1 = [n for n in dataset.graphs[0].nodes()] + nodes2 = [n for n in dataset.graphs[1].nodes()] + nb1 = nx.number_of_nodes(dataset.graphs[0]) + nb2 = nx.number_of_nodes(dataset.graphs[1]) + pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] + pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] + + return pi_forward, pi_backward, dis, node_labels, edge_labels + + +if __name__ == "__main__": + test_get_nb_edit_operations_symbolic_cml() \ No newline at end of file