Browse Source

Fix bugs in ged.util.util.get_nb_edit_operations_symbolic_cml() and add test for it.

v0.2.x
jajupmochi 5 years ago
parent
commit
0db57fe3ce
4 changed files with 193 additions and 83 deletions
  1. +69
    -35
      gklearn/ged/util/util.py
  2. +2
    -48
      gklearn/preimage/median_preimage_generator_cml.py
  3. +0
    -0
      gklearn/tests/ged/test_ged_env.py
  4. +122
    -0
      gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py

+ 69
- 35
gklearn/ged/util/util.py View File

@@ -49,16 +49,18 @@ def compute_ged(g1, g2, options):


def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True):
node_label_costs = options['node_label_costs'] if 'node_label_costs' in options else None
edge_label_costs = options['edge_label_costs'] if 'edge_label_costs' in options else None

# initialize ged env.
ged_env = GEDEnv()
ged_env.set_edit_cost(options['edit_cost'], edit_cost_constants=options['edit_cost_constants'])
for g in graphs:
ged_env.add_nx_graph(g, '')
listID = ged_env.get_all_graph_ids()
listID = ged_env.get_all_graph_ids()
node_labels = ged_env.get_all_node_labels()
edge_labels = ged_env.get_all_edge_labels()
node_label_costs = label_costs_to_matrix(options['node_label_costs'], len(node_labels)) if 'node_label_costs' in options else None
edge_label_costs = label_costs_to_matrix(options['edge_label_costs'], len(edge_labels)) if 'edge_label_costs' in options else None
ged_env.set_label_costs(node_label_costs, edge_label_costs)
ged_env.init(init_type=options['init_option'])
if parallel:
@@ -69,11 +71,9 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
# compute ged.
# options used to compute numbers of edit operations.
neo_options = {'edit_cost': options['edit_cost'],
# 'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'],
# 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs'],
'is_cml': True,
'node_labels': ged_env.get_all_node_labels(),
'edge_labels': ged_env.get_all_edge_labels()}
'node_labels': node_labels,
'edge_labels': edge_labels}
ged_mat = np.zeros((len(graphs), len(graphs)))
if parallel:
len_itr = int(len(graphs) * (len(graphs) - 1) / 2)
@@ -243,11 +243,45 @@ def _compute_ged(env, gid1, gid2, g1, g2):
return dis, pi_forward, pi_backward


def label_costs_to_matrix(costs, nb_labels):
    """Reform a label cost vector to a matrix.

    Parameters
    ----------
    costs : numpy.array or sequence
        The vector containing the costs for one kind of label (nodes or
        edges), in the order: insertion costs (one per label), deletion costs
        (one per label), substitution costs (one per unordered pair of
        distinct labels ``(i, j)`` with ``i < j``, enumerated row by row).
        Entries beyond the first ``2 * nb_labels + nb_labels * (nb_labels - 1) / 2``
        are ignored.
    nb_labels : integer
        Number of labels (the dummy label excluded).

    Returns
    -------
    cost_matrix : numpy.array
        The reformed label cost matrix of size (nb_labels + 1, nb_labels + 1).
        Each row/column of cost_matrix corresponds to a label, and the first
        label (index 0) is the dummy label. Row 0 holds the insertion costs,
        column 0 holds the deletion costs, and the remaining part is filled
        symmetrically with the substitution costs; the diagonal is zero.

    Raises
    ------
    IndexError
        If ``costs`` contains fewer entries than required for ``nb_labels``.
    """
    nb_substitutions = nb_labels * (nb_labels - 1) // 2
    nb_required = 2 * nb_labels + nb_substitutions
    # Fail early with an explicit message instead of midway through the fill.
    if len(costs) < nb_required:
        raise IndexError(
            'costs has %d entries, but %d are required for %d labels.'
            % (len(costs), nb_required, nb_labels))

    # Initialize label cost matrix; index 0 is reserved for the dummy label.
    cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
    # Costs of insertions (dummy -> label).
    cost_matrix[0, 1:] = costs[:nb_labels]
    # Costs of deletions (label -> dummy).
    cost_matrix[1:, 0] = costs[nb_labels:2 * nb_labels]
    # Costs of substitutions. np.triu_indices enumerates the strict upper
    # triangle row by row, which is the order used in the cost vector.
    rows, cols = np.triu_indices(nb_labels, k=1)
    substitution_costs = costs[2 * nb_labels:nb_required]
    cost_matrix[rows + 1, cols + 1] = substitution_costs
    cost_matrix[cols + 1, rows + 1] = substitution_costs
    return cost_matrix


def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
if is_cml:
if edit_cost == 'CONSTANT':
node_label_costs = kwargs.get('node_label_costs')
edge_label_costs = kwargs.get('edge_label_costs')
node_labels = kwargs.get('node_labels', [])
edge_labels = kwargs.get('edge_labels', [])
return get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
@@ -273,12 +307,12 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is
def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
node_labels=[], edge_labels=[]):
"""Compute the number of each edit operations for symbolic-labeled graphs, where the costs are different for each pair of nodes.
"""Compute times that edit operations are used in an edit path for symbolic-labeled graphs, where the costs are different for each pair of nodes.
Returns
-------
list
A vector of costs bewteen labels, formed in the order of node insertion costs, node deletion costs, node substitition costs, edge insertion costs, edge deletion costs, edge substitition costs. The dummy label is the first label, and the self label costs are not included.
A vector of the numbers of times that the costs between labels are used in an edit path, formed in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs. The dummy label is the first label, and the self label costs are not included.
"""
# Initialize.
nb_ops_node = np.zeros((1 + len(node_labels), 1 + len(node_labels)))
@@ -290,7 +324,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
label1 = tuple(g1.nodes[nodes1[i]].items()) # @todo: order and faster
idx_label1 = node_labels.index(label1) # @todo: faster
if map_i == np.inf: # deletions.
nb_ops_node[0, idx_label1 + 1] += 1
nb_ops_node[idx_label1 + 1, 0] += 1
else: # substitutions.
label2 = tuple(g2.nodes[map_i].items())
if label1 != label2:
@@ -302,7 +336,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
if map_i == np.inf:
label = tuple(g2.nodes[nodes2[i]].items())
idx_label = node_labels.index(label) # @todo: faster
nb_ops_node[idx_label + 1, 0] += 1
nb_ops_node[0, idx_label + 1] += 1
# For edges.
edges1 = [e for e in g1.edges()]
@@ -314,7 +348,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
idxt1 = nodes1.index(nt1) # @todo: faster
# At least one of the nodes is removed, thus the edge is removed.
if forward_map[idxf1] == np.inf or forward_map[idxt1] == np.inf:
nb_ops_edge[0, idx_label1 + 1] += 1
nb_ops_edge[idx_label1 + 1, 0] += 1
# corresponding edge is in g2.
else:
nf2, nt2 = forward_map[idxf1], forward_map[idxt1]
@@ -335,38 +369,38 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1
# Corresponding nodes are in g2, however the edge is removed.
else:
nb_ops_edge[0, idx_label1 + 1] += 1
nb_ops_edge[idx_label1 + 1, 0] += 1
# insertions.
for e in g2.edges():
if e not in edges2_marked:
label = tuple(g2.edges[e].items())
for nt, nf in g2.edges():
if (nt, nf) not in edges2_marked and (nf, nt) not in edges2_marked: # @todo: for directed.
label = tuple(g2.edges[(nt, nf)].items())
idx_label = edge_labels.index(label) # @todo: faster
nb_ops_edge[idx_label + 1, 0] += 1
nb_ops_edge[0, idx_label + 1] += 1
# Reform the costs into a vector.
cost_vector = []
# Add node insertion costs.
# Reform the numbers of edit operations into a vector.
nb_eo_vector = []
# node insertion.
for i in range(1, len(nb_ops_node)):
cost_vector.append(nb_ops_node[i, 0])
# Add node deletion costs.
nb_eo_vector.append(nb_ops_node[0, i])
# node deletion.
for i in range(1, len(nb_ops_node)):
cost_vector.append(nb_ops_node[0, i])
# Add node substitution costs.
nb_eo_vector.append(nb_ops_node[i, 0])
# node substitution.
for i in range(1, len(nb_ops_node)):
for j in range(i + 1, len(nb_ops_node)):
cost_vector.append(nb_ops_node[i, j])
# Add edge insertion costs.
nb_eo_vector.append(nb_ops_node[i, j])
# edge insertion.
for i in range(1, len(nb_ops_edge)):
cost_vector.append(nb_ops_edge[i, 0])
# Add edge deletion costs.
nb_eo_vector.append(nb_ops_edge[0, i])
# edge deletion.
for i in range(1, len(nb_ops_edge)):
cost_vector.append(nb_ops_edge[0, i])
# Add edge substitution costs.
nb_eo_vector.append(nb_ops_edge[i, 0])
# edge substitution.
for i in range(1, len(nb_ops_edge)):
for j in range(i + 1, len(nb_ops_edge)):
cost_vector.append(nb_ops_edge[i, j])
nb_eo_vector.append(nb_ops_edge[i, j])
return cost_vector
return nb_eo_vector

def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,


+ 2
- 48
gklearn/preimage/median_preimage_generator_cml.py View File

@@ -279,37 +279,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
rand_costs /= np.max(rand_costs) # @todo: maybe not needed.
self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1))
# Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
i = 0
# Costs of insertions.
for row in range(1, len(nls) + 1):
self.__node_label_costs[row, 0] = rand_costs[i]
i += 1
# Costs of deletions.
for col in range(1, len(nls) + 1):
self.__node_label_costs[0, col] = rand_costs[i]
i += 1
# Costs of substitutions.
for row in range(1, len(nls) + 1):
for col in range(row + 1, len(nls) + 1):
self.__node_label_costs[row, col] = rand_costs[i]
self.__node_label_costs[col, row] = rand_costs[i]
i += 1
# self.__node_label_costs = {}
# for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
# self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
# # Add costs for deletion.
# for j, nl in enumerate(nls):
# self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
# # Add costs for insertion.
# for k, nl in enumerate(nls):
# self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
# # Add self costs.
# for nl in nls:
# self.__node_label_costs[(nl, nl)] = 0
# self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0
self.__node_label_costs = rand_costs


def __initialize_edge_label_costs(self):
@@ -319,23 +289,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els))
rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el)
rand_costs /= np.max(rand_costs) # @todo: maybe not needed.
self.__edge_label_costs = np.zeros((len(els) + 1, len(els) + 1))
# Initialize edge label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
i = 0
# Costs of insertions.
for row in range(1, len(els) + 1):
self.__edge_label_costs[row, 0] = rand_costs[i]
i += 1
# Costs of deletions.
for col in range(1, len(els) + 1):
self.__edge_label_costs[0, col] = rand_costs[i]
i += 1
# Costs of substitutions.
for row in range(1, len(els) + 1):
for col in range(row + 1, len(els) + 1):
self.__edge_label_costs[row, col] = rand_costs[i]
self.__edge_label_costs[col, row] = rand_costs[i]
i += 1
self.__edge_label_costs = rand_costs
def __optimize_ecm_by_kernel_distances(self):


gklearn/tests/test_ged_env.py → gklearn/tests/ged/test_ged_env.py View File


+ 122
- 0
gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py View File

@@ -0,0 +1,122 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 6 12:08:24 2020

@author: ljia
"""
import random
import numpy as np

def test_get_nb_edit_operations_symbolic_cml():
    """Test get_nb_edit_operations_symbolic_cml().

    Computes an edit path between the first two graphs of the "MUTAG"
    dataset under random label costs, counts how often each label cost is
    used along that path, and checks that the dot product of the cost vector
    and the count vector reproduces the distance reported by the GED
    environment.
    """
    from gklearn.utils import Dataset
    from gklearn.ged.util.util import get_nb_edit_operations_symbolic_cml

    # 1. Get dataset.
    # Predefined dataset name, use dataset "MUTAG".
    ds_name = 'MUTAG'
    # Initialize a Dataset and load the predefined dataset.
    dataset = Dataset()
    dataset.load_predefined_dataset(ds_name)
    graph1 = dataset.graphs[0]
    graph2 = dataset.graphs[1]

    # 2. Compute graph edit distance.
    # Initialize label costs randomly.
    node_label_costs, edge_label_costs = _initialize_label_costs(dataset)
    # Compute GEDs.
    pi_forward, pi_backward, dis, node_labels, edge_labels = _compute_ged(
        dataset, node_label_costs, edge_label_costs)

    # 3. Compute numbers of edit operations.
    n_edit_operations = get_nb_edit_operations_symbolic_cml(
        graph1, graph2, pi_forward, pi_backward, node_labels, edge_labels)

    # 4. The costs weighted by their usage counts must add up to the distance.
    label_costs = np.concatenate((node_label_costs, edge_label_costs))
    total_cost = np.dot(label_costs, n_edit_operations)
    # Relative tolerance w.r.t. dis (the original literal was 10e-6, i.e. 1e-5).
    # np.isclose avoids dividing by dis, which is undefined when dis == 0.
    assert np.isclose(total_cost, dis, rtol=1e-5, atol=0.0)
def _initialize_label_costs(dataset):
    """Return random node label costs and edge label costs for ``dataset``.

    The node costs are generated before the edge costs; the result is the
    pair ``(node_label_costs, edge_label_costs)``.
    """
    return (_initialize_node_label_costs(dataset),
            _initialize_edge_label_costs(dataset))
def _initialize_node_label_costs(dataset):
# Get list of node labels.
nls = dataset.get_all_node_labels()
# Generate random costs.
nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
rand_costs /= np.max(rand_costs)
return rand_costs


def _initialize_edge_label_costs(dataset):
# Get list of edge labels.
els = dataset.get_all_edge_labels()
# Generate random costs.
nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els))
rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el)
rand_costs /= np.max(rand_costs)
return rand_costs


def _compute_ged(dataset, node_label_costs, edge_label_costs):
    """Compute an edit path between the first two graphs of ``dataset``.

    All graphs of the dataset are added to a GED environment using 'CONSTANT'
    edit costs and the given label cost vectors; the 'BIPARTITE' method is
    then run on the first two graph ids.

    Returns
    -------
    tuple
        ``(pi_forward, pi_backward, dis, node_labels, edge_labels)``: the
        forward and backward node maps expressed with the node identifiers
        of the two graphs (``np.inf`` for a removed/inserted node), the upper
        bound of the distance, and the node and edge labels collected by the
        environment.
    """
    from gklearn.ged.env import GEDEnv
    from gklearn.ged.util.util import label_costs_to_matrix
    import networkx as nx

    # Initialize the GED environment with the cost type and edit costs.
    env = GEDEnv()
    env.set_edit_cost('CONSTANT', edit_cost_constants=[3, 3, 1, 3, 3, 1])
    # Add graphs.
    for graph in dataset.graphs:
        env.add_nx_graph(graph, '')

    node_labels = env.get_all_node_labels()
    edge_labels = env.get_all_edge_labels()
    graph_ids = env.get_all_graph_ids()  # list of graph IDs.
    node_cost_matrix = label_costs_to_matrix(node_label_costs, len(node_labels))
    edge_cost_matrix = label_costs_to_matrix(edge_label_costs, len(edge_labels))
    env.set_label_costs(node_cost_matrix, edge_cost_matrix)
    env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES')

    # Configure and initialize the GED method.
    method_options = {
        'initialization_method': 'RANDOM',  # or 'NODE', etc.
        'threads': 1,  # parallel threads.
    }
    env.set_method('BIPARTITE', method_options)
    env.init_method()

    # Run on the first two graphs and collect the results.
    id1, id2 = graph_ids[0], graph_ids[1]
    env.run_method(id1, id2)
    raw_forward = env.get_forward_map(id1, id2)
    raw_backward = env.get_backward_map(id1, id2)
    dis = env.get_upper_bound(id1, id2)  # GED between the two graphs.

    # Translate map indices into node identifiers; an index outside the
    # target graph marks a removed node and is labelled np.inf.
    g1, g2 = dataset.graphs[0], dataset.graphs[1]
    nodes1 = list(g1.nodes())
    nodes2 = list(g2.nodes())
    nb1 = nx.number_of_nodes(g1)
    nb2 = nx.number_of_nodes(g2)
    pi_forward = []
    for idx in raw_forward:
        pi_forward.append(nodes2[idx] if idx < nb2 else np.inf)
    pi_backward = []
    for idx in raw_backward:
        pi_backward.append(nodes1[idx] if idx < nb1 else np.inf)

    return pi_forward, pi_backward, dis, node_labels, edge_labels

if __name__ == "__main__":
    # Allow running this test module directly as a script.
    test_get_nb_edit_operations_symbolic_cml()

Loading…
Cancel
Save