Browse Source

Add ability to GEDEnv to use pre-defined costs between node labels.

v0.2.x
jajupmochi 5 years ago
parent
commit
731ab1d45b
4 changed files with 103 additions and 36 deletions
  1. +28
    -9
      gklearn/ged/env/ged_data.py
  2. +6
    -0
      gklearn/ged/env/ged_env.py
  3. +1
    -0
      gklearn/ged/util/util.py
  4. +68
    -27
      gklearn/preimage/median_preimage_generator_cml.py

+ 28
- 9
gklearn/ged/env/ged_data.py View File

@@ -23,6 +23,7 @@ class GEDData(object):
self._edit_cost = None
self._node_costs = None
self._edge_costs = None
self._node_label_costs = None
self._node_labels = []
self._edge_labels = []
self._init_type = Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES
@@ -84,15 +85,21 @@ class GEDData(object):
* and 0 otherwise.
*/
"""
if self._eager_init(): # @todo: check if correct
return self._node_costs[label1, label2]
if label1 == label2:
return 0
if label1 == SpecialLabel.DUMMY: # @todo: check dummy
return self._edit_cost.node_ins_cost_fun(label2) # self._node_labels[label2 - 1]) # @todo: check
if label2 == SpecialLabel.DUMMY: # @todo: check dummy
return self._edit_cost.node_del_cost_fun(label1) # self._node_labels[label1 - 1])
return self._edit_cost.node_rel_cost_fun(label1, label2) # self._node_labels[label1 - 1], self._node_labels[label2 - 1])
if self._node_label_costs is None:
if self._eager_init(): # @todo: check if correct
return self._node_costs[label1, label2]
if label1 == label2:
return 0
if label1 == SpecialLabel.DUMMY: # @todo: check dummy
return self._edit_cost.node_ins_cost_fun(label2) # self._node_labels[label2 - 1]) # @todo: check
if label2 == SpecialLabel.DUMMY: # @todo: check dummy
return self._edit_cost.node_del_cost_fun(label1) # self._node_labels[label1 - 1])
return self._edit_cost.node_rel_cost_fun(label1, label2) # self._node_labels[label1 - 1], self._node_labels[label2 - 1])
# use pre-computed node label costs.
else:
id1 = 0 if label1 == SpecialLabel.DUMMY else self._node_label_to_id(label1) # @todo: this is slow.
id2 = 0 if label2 == SpecialLabel.DUMMY else self._node_label_to_id(label2)
return self._node_label_costs[id1, id2]
def edge_cost(self, label1, label2):
@@ -198,6 +205,12 @@ class GEDData(object):
self._delete_edit_cost = True
def id_to_node_label(self, label_id):
if label_id > len(self._node_labels) or label_id == 0:
raise Exception('Invalid node label ID', str(label_id), '.')
return self._node_labels[label_id - 1]
def _node_label_to_id(self, node_label):
n_id = 0
for n_l in self._node_labels:
@@ -208,6 +221,12 @@ class GEDData(object):
return n_id + 1


def id_to_edge_label(self, label_id):
if label_id > len(self._edge_labels) or label_id == 0:
raise Exception('Invalid edge label ID', str(label_id), '.')
return self._edge_labels[label_id - 1]


def _edge_label_to_id(self, edge_label):
e_id = 0
for e_l in self._edge_labels:


+ 6
- 0
gklearn/ged/env/ged_env.py View File

@@ -226,6 +226,12 @@ class GEDEnv(object):
*/
"""
return self.__ged_data._init_type
def set_label_costs(self, label_costs):
"""Set the costs between labels.
"""
self.__ged_data._node_label_costs = label_costs
def set_method(self, method, options=''):


+ 1
- 0
gklearn/ged/util/util.py View File

@@ -55,6 +55,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
for g in graphs:
ged_env.add_nx_graph(g, '')
listID = ged_env.get_all_graph_ids()
ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None)
ged_env.init(init_type=options['init_option'])
if parallel:
options['threads'] = 1


+ 68
- 27
gklearn/preimage/median_preimage_generator_cml.py View File

@@ -5,31 +5,26 @@ Created on Tue Jun 16 16:04:46 2020

@author: ljia
"""

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:27:22 2020

@author: ljia
"""
import numpy as np
import time
import random
import multiprocessing
import networkx as nx
import cvxpy as cp
import itertools
from gklearn.preimage import PreimageGenerator
from gklearn.preimage.utils import compute_k_dis
from gklearn.ged.util import compute_geds_cml, ged_options_to_string
from gklearn.ged.util import compute_geds_cml
from gklearn.ged.env import GEDEnv
from gklearn.ged.median import MedianGraphEstimator
from gklearn.ged.median import constant_node_costs,mge_options_to_string
from gklearn.utils import Timer
from gklearn.ged.median import MedianGraphEstimatorPy
from gklearn.ged.median import constant_node_costs, mge_options_to_string
from gklearn.utils import Timer, SpecialLabel
from gklearn.utils.utils import get_graph_kernel_by_name


class MedianPreimageGeneratorCML(PreimageGenerator):
"""Generator median preimages by cost matrices learning using the pure Python version of GEDEnv. Works only for symbolic labeled graphs.
"""
def __init__(self, dataset=None):
PreimageGenerator.__init__(self, dataset=dataset)
@@ -37,7 +32,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
self.__mge = None
self.__ged_options = {}
self.__mge_options = {}
self.__fit_method = 'k-graphs'
# self.__fit_method = 'k-graphs'
self.__init_method = 'random'
self.__init_ecc = None
self.__parallel = True
self.__n_jobs = multiprocessing.cpu_count()
@@ -47,8 +43,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
self.__max_itrs_without_update = 3
self.__epsilon_residual = 0.01
self.__epsilon_ec = 0.1
self.__allow_zeros = False
self.__triangle_rule = True
self.__allow_zeros = True
# self.__triangle_rule = True
# values to compute.
self.__runtime_optimize_ec = None
self.__runtime_generate_preimage = None
@@ -64,6 +60,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
self.__itrs = 0
self.__converged = False
self.__num_updates_ecc = 0
self.__node_label_costs = None
self.__edge_label_costs = None
# values that can be set or to be computed.
self.__edit_cost_constants = []
self.__gram_matrix_unnorm = None
@@ -76,7 +74,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
self._verbose = kwargs.get('verbose', 2)
self.__ged_options = kwargs.get('ged_options', {})
self.__mge_options = kwargs.get('mge_options', {})
self.__fit_method = kwargs.get('fit_method', 'k-graphs')
# self.__fit_method = kwargs.get('fit_method', 'k-graphs')
self.__init_method = kwargs.get('init_method', 'random')
self.__init_ecc = kwargs.get('init_ecc', None)
self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
self.__parallel = kwargs.get('parallel', True)
@@ -89,8 +88,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1)
self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
self.__allow_zeros = kwargs.get('allow_zeros', False)
self.__triangle_rule = kwargs.get('triangle_rule', True)
self.__allow_zeros = kwargs.get('allow_zeros', True)
# self.__triangle_rule = kwargs.get('triangle_rule', True)
def run(self):
@@ -122,10 +121,10 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
end_precompute_gm = time.time()
start -= self.__runtime_precompute_gm
if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset':
start = time.time()
self.__runtime_precompute_gm = 0
end_precompute_gm = start
# if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset':
# start = time.time()
# self.__runtime_precompute_gm = 0
# end_precompute_gm = start
# 2. optimize edit cost constants.
self.__optimize_edit_cost_vector()
@@ -197,7 +196,48 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
def __optimize_edit_cost_vector(self):
"""Learn edit cost vector.
"""
if self.__fit_method == 'random': # random
if self.__init_method == 'random': # random
# Get list of node labels.
nls = self._dataset.get_all_node_labels()
# Generate random costs.
nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1))
# Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. These is the same setting as in GEDData.
i = 0
# Costs of insertions.
for row in range(1, len(nls) + 1):
self.__node_label_costs[row, 0] = rand_costs[i]
i += 1
# Costs of deletions.
for col in range(1, len(nls) + 1):
self.__node_label_costs[0, col] = rand_costs[i]
i += 1
# Costs of substitutions.
for row in range(1, len(nls) + 1):
for col in range(row + 1, len(nls) + 1):
self.__node_label_costs[row, col] = rand_costs[i]
self.__node_label_costs[col, row] = rand_costs[i]
i += 1
# self.__node_label_costs = {}
# for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
# self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
# # Add costs for deletion.
# for j, nl in enumerate(nls):
# self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
# # Add costs for insertion.
# for k, nl in enumerate(nls):
# self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
# # Add self costs.
# for nl in nls:
# self.__node_label_costs[(nl, nl)] = 0
# self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0
# Optimize edit cost matrices.
self.__optimize_ecm_by_kernel_distances()
elif self.__fit_method == 'random': # random
if self.__ged_options['edit_cost'] == 'LETTER':
self.__edit_cost_constants = random.sample(range(1, 1000), 3)
self.__edit_cost_constants = [item * 0.001 for item in self.__edit_cost_constants]
@@ -279,6 +319,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
options['edge_labels'] = self._dataset.edge_labels
options['node_attrs'] = self._dataset.node_attrs
options['edge_attrs'] = self._dataset.edge_attrs
options['node_label_costs'] = self.__node_label_costs
ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
time_list = [time.time() - time0]
@@ -881,8 +922,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
ged_env.init(init_type=self.__ged_options['init_option'])
# Set up the madian graph estimator.
self.__mge = MedianGraphEstimator(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
self.__mge.set_refine_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
self.__mge = MedianGraphEstimatorPy(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
self.__mge.set_refine_method(self.__ged_options['method'], self.__ged_options)
options = self.__mge_options.copy()
if not 'seed' in options:
options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage.
@@ -897,8 +938,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
ged_options = self.__ged_options.copy()
if self.__parallel:
ged_options['threads'] = 1
self.__mge.set_init_method(ged_options['method'], ged_options_to_string(ged_options))
self.__mge.set_descent_method(ged_options['method'], ged_options_to_string(ged_options))
self.__mge.set_init_method(ged_options['method'], ged_options)
self.__mge.set_descent_method(ged_options['method'], ged_options)
# Run the estimator.
self.__mge.run(graph_ids, set_median_id, gen_median_id)


Loading…
Cancel
Save