
update preimage.

v0.1
jajupmochi committed 5 years ago
parent commit 29903787ed
12 changed files with 1572 additions and 75 deletions
  1. gklearn/preimage/common_types.py (+17, -0)
  2. gklearn/preimage/cpp2python.py (+134, -0)
  3. gklearn/preimage/fitDistance.py (+21, -2)
  4. gklearn/preimage/ged.py (+32, -31)
  5. gklearn/preimage/median_graph_estimator.py (+826, -0)
  6. gklearn/preimage/median_preimage_generator.py (+15, -0)
  7. gklearn/preimage/misc.py (+108, -0)
  8. gklearn/preimage/preimage_generator.py (+12, -0)
  9. gklearn/preimage/python_code.py (+122, -0)
  10. gklearn/preimage/test_median_graph_estimator.py (+91, -0)
  11. gklearn/preimage/timer.py (+40, -0)
  12. gklearn/preimage/xp_fit_method.py (+154, -42)

gklearn/preimage/common_types.py (+17, -0)

@@ -0,0 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 19 18:17:38 2020

@author: ljia
"""

from enum import Enum, auto

class AlgorithmState(Enum):
    """Can be used to specify the state of an algorithm.
    """
    CALLED = auto() # The algorithm has been called.
    INITIALIZED = auto() # The algorithm has been initialized.
    CONVERGED = auto() # The algorithm has converged.
    TERMINATED = auto() # The algorithm has terminated.
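
A quick sanity check of the enum (note that `auto` must be called as `auto()`, as above, so that each member gets a distinct value):

    from gklearn.preimage.common_types import AlgorithmState

    state = AlgorithmState.INITIALIZED
    print(state)                           # AlgorithmState.INITIALIZED
    print(state is AlgorithmState.CALLED)  # False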

gklearn/preimage/cpp2python.py (+134, -0)

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 20 11:09:04 2020

@author: ljia
"""
import re

def convert_function(cpp_code):
#    f_cpp = open('cpp_code.cpp', 'r')
#    # f_cpp = open('cpp_ext/src/median_graph_estimator.ipp', 'r')
#    cpp_code = f_cpp.read()
    python_code = cpp_code.replace('else if (', 'elif ')
    python_code = python_code.replace('if (', 'if ')
    python_code = python_code.replace('else {', 'else:')
    python_code = python_code.replace(') {', ':')
    python_code = python_code.replace(';\n', '\n')
    python_code = re.sub('\n(.*)}\n', '\n\n', python_code)
#    python_code = python_code.replace('}\n', '')
    python_code = python_code.replace('throw', 'raise')
    python_code = python_code.replace('error', 'Exception')
    python_code = python_code.replace('"', '\'')
    python_code = python_code.replace('\\\'', '"')
    python_code = python_code.replace('try {', 'try:')
    python_code = python_code.replace('true', 'True')
    python_code = python_code.replace('false', 'False')
    python_code = python_code.replace('catch (...', 'except')
#    python_code = re.sub('std::string\(\'(.*)\'\)', '$1', python_code)
    return python_code



# # python_code = python_code.replace('}\n', '')

# python_code = python_code.replace('option.first', 'opt_name')
# python_code = python_code.replace('option.second', 'opt_val')
# python_code = python_code.replace('ged::Error', 'Exception')
# python_code = python_code.replace('std::string(\'Invalid argument "\')', '\'Invalid argument "\'')

# f_cpp.close()
# f_python = open('python_code.py', 'w')
# f_python.write(python_code)
# f_python.close()


def convert_function_comment(cpp_fun_cmt, param_types):
    cpp_fun_cmt = cpp_fun_cmt.replace('\t', '')
    cpp_fun_cmt = cpp_fun_cmt.replace('\n * ', ' ')
    # split the input comment according to key words.
    param_split = None
    note = None
    cmt_split = cpp_fun_cmt.split('@brief')[1]
    brief = cmt_split
    if '@param' in cmt_split:
        cmt_split = cmt_split.split('@param')
        brief = cmt_split[0]
        param_split = cmt_split[1:]
    if '@note' in cmt_split[-1]:
        note_split = cmt_split[-1].split('@note')
        if param_split is not None:
            param_split.pop()
            param_split.append(note_split[0])
        else:
            brief = note_split[0]
        note = note_split[1]
    # get parameters.
    if param_split is not None:
        for idx, param in enumerate(param_split):
            _, param_name, param_desc = param.split(' ', 2)
            param_name = function_comment_strip(param_name, ' *\n\t/')
            param_desc = function_comment_strip(param_desc, ' *\n\t/')
            param_split[idx] = (param_name, param_desc)
    # strip comments.
    brief = function_comment_strip(brief, ' *\n\t/')
    if note is not None:
        note = function_comment_strip(note, ' *\n\t/')
    # construct the Python function comment.
    python_fun_cmt = '"""'
    python_fun_cmt += brief + '\n'
    if param_split is not None and len(param_split) > 0:
        python_fun_cmt += '\nParameters\n----------'
        for idx, param in enumerate(param_split):
            python_fun_cmt += '\n' + param[0] + ' : ' + param_types[idx]
            python_fun_cmt += '\n\t' + param[1] + '\n'
    if note is not None:
        python_fun_cmt += '\nNote\n----\n' + note + '\n'
    python_fun_cmt += '"""'
    return python_fun_cmt


def function_comment_strip(comment, bad_chars):
    head_removed, tail_removed = False, False
    while not head_removed or not tail_removed:
        if comment[0] in bad_chars:
            comment = comment[1:]
            head_removed = False
        else:
            head_removed = True
        if comment[-1] in bad_chars:
            comment = comment[:-1]
            tail_removed = False
        else:
            tail_removed = True
    return comment

if __name__ == '__main__':
#    python_code = convert_function("""
#        if (print_to_stdout_ == 2) {
#            std::cout << "\n===========================================================\n";
#            std::cout << "Block gradient descent for initial median " << median_pos + 1 << " of " << medians.size() << ".\n";
#            std::cout << "-----------------------------------------------------------\n";
#        }
#    """)
    python_fun_cmt = convert_function_comment("""
    /*!
     * @brief Returns the sum of distances.
     * @param[in] state The state of the estimator.
     * @return The sum of distances of the median when the estimator was in the state @p state during the last call to run().
     */
    """, ['string', 'string'])

gklearn/preimage/fitDistance.py (+21, -2)

@@ -260,10 +260,29 @@ def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides',
 		nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
 		x = cp.Variable(nb_cost_mat_new.shape[1])
 		cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-		constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
+		constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
 		               np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
 		prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-		prob.solve()
+		try:
+			prob.solve(verbose=True)
+		except MemoryError as error0:
+			print('\nUsing solver "OSQP" caused a memory error.')
+			print('the original error message is\n', error0)
+			print('solver status: ', prob.status)
+			print('trying solver "CVXOPT" instead...\n')
+			try:
+				prob.solve(solver=cp.CVXOPT, verbose=True)
+			except Exception as error1:
+				print('\nAn error occurred when using solver "CVXOPT".')
+				print('the original error message is\n', error1)
+				print('solver status: ', prob.status)
+				print('trying solver "MOSEK" instead. Note that this solver is commercial and a license is required.\n')
+				prob.solve(solver=cp.MOSEK, verbose=True)
+			else:
+				print('solver status: ', prob.status)
+		else:
+			print('solver status: ', prob.status)
+		print()
 		edit_costs_new = x.value
 		residual = np.sqrt(prob.value)
 	elif rw_constraints == '2constraints':
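
The fallback chain above (default solver, then CVXOPT, then MOSEK) can be exercised in isolation. A minimal sketch, assuming cvxpy and the optional CVXOPT backend are installed; the data here is random and only illustrates the pattern:

    import cvxpy as cp
    import numpy as np

    # Non-negative least squares with a lower bound, as in update_costs().
    A = np.random.rand(20, 5)
    b = np.random.rand(20)
    x = cp.Variable(5)
    prob = cp.Problem(cp.Minimize(cp.sum_squares(A @ x - b)), [x >= 0.001])
    try:
        prob.solve(verbose=True)                     # default backend
    except (MemoryError, cp.error.SolverError):
        prob.solve(solver=cp.CVXOPT, verbose=True)   # fallback backend
    print('status:', prob.status, 'residual:', np.sqrt(prob.value))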


gklearn/preimage/ged.py (+32, -31)

@@ -14,42 +14,13 @@ from multiprocessing import Pool
 from functools import partial


 #from gedlibpy_linlin import librariesImport, gedlibpy
-from libs import *
+from gklearn.gedlib import librariesImport, gedlibpy


 def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP',
         edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
     """
     Compute GED for 2 graphs.
     """
-    def convertGraph(G, cost):
-        """Convert a graph to the proper NetworkX format that can be
-        recognized by library gedlibpy.
-        """
-        G_new = nx.Graph()
-        if cost == 'LETTER' or cost == 'LETTER2':
-            for nd, attrs in G.nodes(data=True):
-                G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
-                               y=str(attrs['attributes'][1]))
-            for nd1, nd2, attrs in G.edges(data=True):
-                G_new.add_edge(str(nd1), str(nd2))
-        elif cost == 'NON_SYMBOLIC':
-            for nd, attrs in G.nodes(data=True):
-                G_new.add_node(str(nd))
-                for a_name in G.graph['node_attrs']:
-                    G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
-            for nd1, nd2, attrs in G.edges(data=True):
-                G_new.add_edge(str(nd1), str(nd2))
-                for a_name in G.graph['edge_attrs']:
-                    G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
-        else:
-            for nd, attrs in G.nodes(data=True):
-                G_new.add_node(str(nd), chem=attrs['atom'])
-            for nd1, nd2, attrs in G.edges(data=True):
-                G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
-#                G_new.add_edge(str(nd1), str(nd2))
-        return G_new
 #    dataset = dataset.lower()

@@ -178,6 +149,36 @@ def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method
     return dis, pi_forward, pi_backward


+def convertGraph(G, cost):
+    """Convert a graph to the proper NetworkX format that can be
+    recognized by library gedlibpy.
+    """
+    G_new = nx.Graph()
+    if cost == 'LETTER' or cost == 'LETTER2':
+        for nd, attrs in G.nodes(data=True):
+            G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
+                           y=str(attrs['attributes'][1]))
+        for nd1, nd2, attrs in G.edges(data=True):
+            G_new.add_edge(str(nd1), str(nd2))
+    elif cost == 'NON_SYMBOLIC':
+        for nd, attrs in G.nodes(data=True):
+            G_new.add_node(str(nd))
+            for a_name in G.graph['node_attrs']:
+                G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
+        for nd1, nd2, attrs in G.edges(data=True):
+            G_new.add_edge(str(nd1), str(nd2))
+            for a_name in G.graph['edge_attrs']:
+                G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
+    else:
+        for nd, attrs in G.nodes(data=True):
+            G_new.add_node(str(nd), chem=attrs['atom'])
+        for nd1, nd2, attrs in G.edges(data=True):
+            G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
+#            G_new.add_edge(str(nd1), str(nd2))
+    return G_new


 def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',
           edit_cost_constant=[], stabilizer='min', repeat=50):
     """


gklearn/preimage/median_graph_estimator.py (+826, -0)

@@ -0,0 +1,826 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 16 18:04:55 2020

@author: ljia
"""
import numpy as np
from gklearn.preimage.common_types import AlgorithmState
from gklearn.preimage import misc
from gklearn.preimage.timer import Timer
from gklearn.utils.utils import graph_isIdentical
import time
from tqdm import tqdm
import sys
import networkx as nx


class MedianGraphEstimator(object):
def __init__(self, ged_env, constant_node_costs):
"""Constructor.
Parameters
----------
ged_env : gklearn.gedlib.gedlibpy.GEDEnv
Initialized GED environment. The edit costs must be set by the user.
constant_node_costs : Boolean
Set to True if the node relabeling costs are constant.
"""
self.__ged_env = ged_env
self.__init_method = 'BRANCH_FAST'
self.__init_options = ''
self.__descent_method = 'BRANCH_FAST'
self.__descent_options = ''
self.__refine_method = 'IPFP'
self.__refine_options = ''
self.__constant_node_costs = constant_node_costs
self.__labeled_nodes = (ged_env.get_num_node_labels() > 1)
self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1))
self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1))
self.__labeled_edges = (ged_env.get_num_edge_labels() > 1)
self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1))
self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1))
self.__init_type = 'RANDOM'
self.__num_random_inits = 10
self.__desired_num_random_inits = 10
self.__use_real_randomness = True
self.__seed = 0
self.__refine = True
self.__time_limit_in_sec = 0
self.__epsilon = 0.0001
self.__max_itrs = 100
self.__max_itrs_without_update = 3
self.__num_inits_increase_order = 10
self.__init_type_increase_order = 'K-MEANS++'
self.__max_itrs_increase_order = 10
self.__print_to_stdout = 2
self.__median_id = np.inf # @todo: check
self.__median_node_id_prefix = '' # @todo: check
self.__node_maps_from_median = {}
self.__sum_of_distances = 0
self.__best_init_sum_of_distances = np.inf
self.__converged_sum_of_distances = np.inf
self.__runtime = None
self.__runtime_initialized = None
self.__runtime_converged = None
self.__itrs = [] # @todo: check: {} ?
self.__num_decrease_order = 0
self.__num_increase_order = 0
self.__num_converged_descents = 0
self.__state = AlgorithmState.TERMINATED
if ged_env is None:
raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.')
elif not ged_env.is_initialized():
raise Exception('The GED environment is uninitialized. Call gedlibpy.GEDEnv.init() before passing it to the constructor of MedianGraphEstimator.')
def set_options(self, options):
"""Sets the options of the estimator.

Parameters
----------
options : string
String that specifies with which options to run the estimator.
"""
self.__set_default_options()
options_map = misc.options_string_to_options_map(options)
for opt_name, opt_val in options_map.items():
if opt_name == 'init-type':
self.__init_type = opt_val
if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN':
raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"')
elif opt_name == 'random-inits':
try:
self.__num_random_inits = int(opt_val)
self.__desired_num_random_inits = self.__num_random_inits
except:
raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')

if self.__num_random_inits <= 0:
raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')
elif opt_name == 'randomness':
if opt_val == 'PSEUDO':
self.__use_real_randomness = False
elif opt_val == 'REAL':
self.__use_real_randomness = True
else:
raise Exception('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"')
elif opt_name == 'stdout':
if opt_val == '0':
self.__print_to_stdout = 0
elif opt_val == '1':
self.__print_to_stdout = 1
elif opt_val == '2':
self.__print_to_stdout = 2
else:
raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"')
elif opt_name == 'refine':
if opt_val == 'TRUE':
self.__refine = True
elif opt_val == 'FALSE':
self.__refine = False
else:
raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"')
elif opt_name == 'time-limit':
try:
self.__time_limit_in_sec = float(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit <convertible to double>] [...]')
elif opt_name == 'max-itrs':
try:
self.__max_itrs = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs <convertible to int>] [...]')
elif opt_name == 'max-itrs-without-update':
try:
self.__max_itrs_without_update = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update <convertible to int>] [...]')
elif opt_name == 'seed':
try:
self.__seed = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed <convertible to int greater equal 0>] [...]')
elif opt_name == 'epsilon':
try:
self.__epsilon = float(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]')
if self.__epsilon <= 0:
raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]')
elif opt_name == 'inits-increase-order':
try:
self.__num_inits_increase_order = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')
if self.__num_inits_increase_order <= 0:
raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')

elif opt_name == 'init-type-increase-order':
self.__init_type_increase_order = opt_val
if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++':
raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"')
elif opt_name == 'max-itrs-increase-order':
try:
self.__max_itrs_increase_order = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order <convertible to int>] [...]')

else:
valid_options = '[--init-type <arg>] [--random-inits <arg>] [--randomness <arg>] [--seed <arg>] [--stdout <arg>] '
valid_options += '[--time-limit <arg>] [--max-itrs <arg>] [--epsilon <arg>] '
valid_options += '[--inits-increase-order <arg>] [--init-type-increase-order <arg>] [--max-itrs-increase-order <arg>]'
raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"')
def set_init_method(self, init_method, init_options=''):
"""Selects method to be used for computing the initial medoid graph.
Parameters
----------
init_method : string
The selected method. Default: ged::Options::GEDMethod::BRANCH_UNIFORM.
init_options : string
The options for the selected method. Default: "".
Notes
-----
Has no effect unless "--init-type MEDOID" is passed to set_options().
"""
self.__init_method = init_method
self.__init_options = init_options

def set_descent_method(self, descent_method, descent_options=''):
"""Selects method to be used for block gradient descent.
Parameters
----------
descent_method : string
The selected method. Default: ged::Options::GEDMethod::BRANCH_FAST.
descent_options : string
The options for the selected method. Default: "".
"""
self.__descent_method = descent_method
self.__descent_options = descent_options

def set_refine_method(self, refine_method, refine_options):
"""Selects method to be used for improving the sum of distances and the node maps for the converged median.
Parameters
----------
refine_method : string
The selected method. Default: "IPFP".
refine_options : string
The options for the selected method. Default: "".
Notes
-----
Has no effect if "--refine FALSE" is passed to set_options().
"""
self.__refine_method = refine_method
self.__refine_options = refine_options

def run(self, graph_ids, set_median_id, gen_median_id):
"""Computes a generalized median graph.
Parameters
----------
graph_ids : list[integer]
The IDs of the graphs for which the median should be computed. Must have been added to the environment passed to the constructor.
set_median_id : integer
The ID of the computed set-median. A dummy graph with this ID must have been added to the environment passed to the constructor. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph().
gen_median_id : integer
The ID of the computed generalized median. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph().
"""
# Sanity checks.
if len(graph_ids) == 0:
raise Exception('Empty vector of graph IDs, unable to compute median.')
all_graphs_empty = True
for graph_id in graph_ids:
if self.__ged_env.get_graph_num_nodes(graph_id) > 0:
self.__median_node_id_prefix = self.__ged_env.get_original_node_ids(graph_id)[0]
all_graphs_empty = False
break
if all_graphs_empty:
raise Exception('All graphs in the collection are empty.')
# Start timer and record start time.
start = time.time()
timer = Timer(self.__time_limit_in_sec)
self.__median_id = gen_median_id
self.__state = AlgorithmState.TERMINATED
# Get ExchangeGraph representations of the input graphs.
graphs = {}
for graph_id in graph_ids:
# @todo: get_nx_graph() function may need to be modified according to the coming code.
graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id, True, True, False)
# print(self.__ged_env.get_graph_internal_id(0))
# print(graphs[0].graph)
# print(graphs[0].nodes(data=True))
# print(graphs[0].edges(data=True))
# print(nx.adjacency_matrix(graphs[0]))

# Construct initial medians.
medians = []
self.__construct_initial_medians(graph_ids, timer, medians)
end_init = time.time()
self.__runtime_initialized = end_init - start
# print(medians[0].graph)
# print(medians[0].nodes(data=True))
# print(medians[0].edges(data=True))
# print(nx.adjacency_matrix(medians[0]))
# Reset information about iterations and number of times the median decreases and increases.
self.__itrs = [0] * len(medians)
self.__num_decrease_order = 0
self.__num_increase_order = 0
self.__num_converged_descents = 0
# Initialize the best median.
best_sum_of_distances = np.inf
self.__best_init_sum_of_distances = np.inf
node_maps_from_best_median = {}
# Run block gradient descent from all initial medians.
self.__ged_env.set_method(self.__descent_method, self.__descent_options)
for median_pos in range(0, len(medians)):
# Terminate if the timer has expired and at least one SOD has been computed.
if timer.expired() and median_pos > 0:
break
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n===========================================================')
print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.')
print('-----------------------------------------------------------')
# Get reference to the median.
median = medians[median_pos]
# Load initial median into the environment.
self.__ged_env.load_nx_graph(median, gen_median_id)
self.__ged_env.init(self.__ged_env.get_init_type())
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout)
# Compute node maps and sum of distances for initial median.
self.__sum_of_distances = 0
self.__node_maps_from_median.clear() # @todo
for graph_id in graph_ids:
self.__ged_env.run_method(gen_median_id, graph_id)
self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id)
# print(self.__node_maps_from_median[graph_id])
self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: the C++ implementation of this function in GedLibBind.ipp calls get_node_map() once more, which is not necessary.
# print(self.__sum_of_distances)
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress.update(1)
self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances)
self.__ged_env.load_nx_graph(median, set_median_id)
# print(self.__best_init_sum_of_distances)
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')
# Run block gradient descent from initial median.
converged = False
itrs_without_update = 0
while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n===========================================================')
print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.')
print('-----------------------------------------------------------')
# Initialize flags that tell us what happened in the iteration.
median_modified = False
node_maps_modified = False
decreased_order = False
increased_order = False
# Update the median. # @todo!!!!!!!!!!!!!!!!!!!!!!
median_modified = self.__update_median(graphs, median)
if not median_modified or self.__itrs[median_pos] == 0:
decreased_order = False
if not decreased_order or self.__itrs[median_pos] == 0:
increased_order = False
# Update the number of iterations without update of the median.
if median_modified or decreased_order or increased_order:
itrs_without_update = 0
else:
itrs_without_update += 1
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('Loading median to environment: ... ', end='')
# Load the median into the environment.
# @todo: should this function use the original node label?
self.__ged_env.load_nx_graph(median, gen_median_id)
self.__ged_env.init(self.__ged_env.get_init_type())
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('done.')
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('Updating induced costs: ... ', end='')

# Compute induced costs of the old node maps w.r.t. the updated median.
for graph_id in graph_ids:
# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id))
# @todo: watch out if compute_induced_cost is correct, this may influence: increase/decrease order, induced_cost() in the following code.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
self.__ged_env.compute_induced_cost(gen_median_id, graph_id)
# print('---------------------------------------')
# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id))
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('done.')
# Update the node maps.
node_maps_modified = self.__update_node_maps() # @todo

# Update the order of the median if no improvement can be found with the current order.
# Update the sum of distances.
old_sum_of_distances = self.__sum_of_distances
self.__sum_of_distances = 0
for graph_id in self.__node_maps_from_median:
self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: see above.
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('Old local SOD: ', old_sum_of_distances)
print('New local SOD: ', self.__sum_of_distances)
print('Best converged SOD: ', best_sum_of_distances)
print('Modified median: ', median_modified)
print('Modified node maps: ', node_maps_modified)
print('Decreased order: ', decreased_order)
print('Increased order: ', increased_order)
print('===========================================================\n')
converged = not (median_modified or node_maps_modified or decreased_order or increased_order)
self.__itrs[median_pos] += 1
# Update the best median.
if self.__sum_of_distances < self.__best_init_sum_of_distances:
best_sum_of_distances = self.__sum_of_distances
node_maps_from_best_median = self.__node_maps_from_median
best_median = median
# Update the number of converged descents.
if converged:
self.__num_converged_descents += 1
# Store the best encountered median.
self.__sum_of_distances = best_sum_of_distances
self.__node_maps_from_median = node_maps_from_best_median
self.__ged_env.load_nx_graph(best_median, gen_median_id)
self.__ged_env.init(self.__ged_env.get_init_type())
end_descent = time.time()
self.__runtime_converged = end_descent - start
# Refine the sum of distances and the node maps for the converged median.
self.__converged_sum_of_distances = self.__sum_of_distances
if self.__refine:
self.__improve_sum_of_distances(timer) # @todo
# Record end time, set runtime and reset the number of initial medians.
end = time.time()
self.__runtime = end - start
self.__num_random_inits = self.__desired_num_random_inits
# Print global information.
if self.__print_to_stdout != 0:
print('\n===========================================================')
print('Finished computation of generalized median graph.')
print('-----------------------------------------------------------')
print('Best SOD after initialization: ', self.__best_init_sum_of_distances)
print('Converged SOD: ', self.__converged_sum_of_distances)
if self.__refine:
print('Refined SOD: ', self.__sum_of_distances)
print('Overall runtime: ', self.__runtime)
print('Runtime of initialization: ', self.__runtime_initialized)
print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized)
if self.__refine:
print('Runtime of refinement: ', self.__runtime - self.__runtime_converged)
print('Number of initial medians: ', len(medians))
total_itr = 0
num_started_descents = 0
for itr in self.__itrs:
total_itr += itr
if itr > 0:
num_started_descents += 1
print('Size of graph collection: ', len(graph_ids))
print('Number of started descents: ', num_started_descents)
print('Number of converged descents: ', self.__num_converged_descents)
print('Overall number of iterations: ', total_itr)
print('Overall number of times the order decreased: ', self.__num_decrease_order)
print('Overall number of times the order increased: ', self.__num_increase_order)
print('===========================================================\n')
def get_sum_of_distances(self, state=''):
"""Returns the sum of distances.
Parameters
----------
state : string
The state of the estimator. Can be 'initialized' or 'converged'. Default: ""
Returns
-------
float
The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned.
"""
if not self.__median_available():
raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().')
if state == 'initialized':
return self.__best_init_sum_of_distances
if state == 'converged':
return self.__converged_sum_of_distances
return self.__sum_of_distances
def __set_default_options(self):
self.__init_type = 'RANDOM'
self.__num_random_inits = 10
self.__desired_num_random_inits = 10
self.__use_real_randomness = True
self.__seed = 0
self.__refine = True
self.__time_limit_in_sec = 0
self.__epsilon = 0.0001
self.__max_itrs = 100
self.__max_itrs_without_update = 3
self.__num_inits_increase_order = 10
self.__init_type_increase_order = 'K-MEANS++'
self.__max_itrs_increase_order = 10
self.__print_to_stdout = 2
def __construct_initial_medians(self, graph_ids, timer, initial_medians):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n===========================================================')
print('Constructing initial median(s).')
print('-----------------------------------------------------------')
# Compute or sample the initial median(s).
initial_medians.clear()
if self.__init_type == 'MEDOID':
self.__compute_medoid(graph_ids, timer, initial_medians)
elif self.__init_type == 'MAX':
pass # @todo
# compute_max_order_graph_(graph_ids, initial_medians)
elif self.__init_type == 'MIN':
pass # @todo
# compute_min_order_graph_(graph_ids, initial_medians)
elif self.__init_type == 'MEAN':
pass # @todo
# compute_mean_order_graph_(graph_ids, initial_medians)
else:
pass # @todo
# sample_initial_medians_(graph_ids, initial_medians)

# Print information about current iteration.
if self.__print_to_stdout == 2:
print('===========================================================')
def __compute_medoid(self, graph_ids, timer, initial_medians):
# Use method selected for initialization phase.
self.__ged_env.set_method(self.__init_method, self.__init_options)
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout)
# Compute the medoid.
medoid_id = graph_ids[0]
best_sum_of_distances = np.inf
for g_id in graph_ids:
if timer.expired():
self.__state = AlgorithmState.CALLED
break
sum_of_distances = 0
for h_id in graph_ids:
self.__ged_env.run_method(g_id, h_id)
sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id)
if sum_of_distances < best_sum_of_distances:
best_sum_of_distances = sum_of_distances
medoid_id = g_id
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress.update(1)
initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')
def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
if self.__state == AlgorithmState.TERMINATED:
self.__state = AlgorithmState.INITIALIZED
return True
return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
def __update_median(self, graphs, median):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('Updating median: ', end='')
# Store copy of the old median.
old_median = median.copy() # @todo: this is just a shallow copy.
# Update the node labels.
if self.__labeled_nodes:
self.__update_node_labels(graphs, median)
# Update the edges and their labels.
self.__update_edges(graphs, median)
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('done.')
return not self.__are_graphs_equal(median, old_median)
def __update_node_labels(self, graphs, median):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('nodes ... ', end='')
# Iterate through all nodes of the median.
for i in range(0, nx.number_of_nodes(median)):
# print('i: ', i)
# Collect the labels of the substituted nodes.
node_labels = []
for graph_id, graph in graphs.items():
# print('graph_id: ', graph_id)
# print(self.__node_maps_from_median[graph_id])
k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i)
# print('k: ', k)
if k != np.inf:
node_labels.append(graph.nodes[k])
# Compute the median label and update the median.
if len(node_labels) > 0:
median_label = self.__ged_env.get_median_node_label(node_labels)
if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon:
nx.set_node_attributes(median, {i: median_label})
def __update_edges(self, graphs, median):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('edges ... ', end='')
# Clear the adjacency lists of the median and reset number of edges to 0.
median_edges = list(median.edges)
for (head, tail) in median_edges:
median.remove_edge(head, tail)
# @todo: what if edge is not labeled?
# Iterate through all possible edges (i,j) of the median.
for i in range(0, nx.number_of_nodes(median)):
for j in range(i + 1, nx.number_of_nodes(median)):
# Collect the labels of the edges to which (i,j) is mapped by the node maps.
edge_labels = []
for graph_id, graph in graphs.items():
k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i)
l = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], j)
if k != np.inf and l != np.inf:
if graph.has_edge(k, l):
edge_labels.append(graph.edges[(k, l)])
# Compute the median edge label and the overall edge relabeling cost.
rel_cost = 0
median_label = self.__ged_env.get_edge_label(1)
if median.has_edge(i, j):
median_label = median.edges[(i, j)]
if self.__labeled_edges and len(edge_labels) > 0:
new_median_label = self.__ged_env.median_edge_label(edge_labels)
if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon:
median_label = new_median_label
for edge_label in edge_labels:
rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label)
# Update the median.
if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs):
median.add_edge(i, j, **median_label)
else:
if median.has_edge(i, j):
median.remove_edge(i, j)


def __update_node_maps(self):
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
# Update the node maps.
node_maps_were_modified = False
for graph_id in self.__node_maps_from_median:
self.__ged_env.run_method(self.__median_id, graph_id)
if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < self.__ged_env.get_induced_cost(self.__median_id, graph_id) - self.__epsilon: # @todo: see above.
self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) # @todo: node_map may not be assigned.
node_maps_were_modified = True
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress.update(1)
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')
# Return true if the node maps were modified.
return node_maps_were_modified
def __improve_sum_of_distances(self, timer):
pass
def __median_available(self):
return self.__median_id != np.inf
def __get_node_image_from_map(self, node_map, node):
"""
Return ID of the node mapping of `node` in `node_map`.

Parameters
----------
node_map : list[tuple(int, int)]
List of node maps where the mapping node is found.
node : int
The mapping node of this node is returned

Raises
------
Exception
If the node with ID `node` is not contained in the source nodes of the node map.

Returns
-------
int
ID of the mapping of `node`.
Notes
-----
This function is not implemented in the `ged::MedianGraphEstimator` class of the `GEDLIB` library. Instead it is a Python implementation of the `ged::NodeMap::image` function.
"""
if node < len(node_map):
return node_map[node][1] if node_map[node][1] < len(node_map) else np.inf
else:
raise Exception('The node with ID ' + str(node) + ' is not contained in the source nodes of the node map.')
def __are_graphs_equal(self, g1, g2):
"""
Check if the two graphs are equal.

Parameters
----------
g1 : NetworkX graph object
Graph 1 to be compared.
g2 : NetworkX graph object
Graph 2 to be compared.

Returns
-------
bool
True if the two graph are equal.
Notes
-----
This is not an identical check. Here the two graphs are equal if and only if their original_node_ids, nodes, all node labels, edges and all edge labels are equal. This function is specifically designed for class `MedianGraphEstimator` and should not be used elsewhere.
"""
# check original node ids.
if not g1.graph['original_node_ids'] == g2.graph['original_node_ids']:
return False
# check nodes.
nlist1 = [n for n in g1.nodes(data=True)]
nlist2 = [n for n in g2.nodes(data=True)]
if not nlist1 == nlist2:
return False
# check edges.
elist1 = [n for n in g1.edges(data=True)]
elist2 = [n for n in g2.edges(data=True)]
if not elist1 == elist2:
return False

return True
def compute_my_cost(g, h, node_map):
cost = 0.0
for node in g.nodes:
cost += 0
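
A condensed usage sketch of the estimator, distilled from test_median_graph_estimator.py further below; the list `graphs` of already-converted NetworkX graphs is assumed:

    from gklearn.gedlib import librariesImport, gedlibpy
    from gklearn.preimage.median_graph_estimator import MedianGraphEstimator

    # Set up and initialize a GED environment holding the collection.
    ged_env = gedlibpy.GEDEnv()
    ged_env.set_edit_cost('LETTER2', edit_cost_constant=[0.02987291, 0.0178211, 0.01431966, 0.001, 0.001])
    for G in graphs:
        ged_env.add_nx_graph(G, '')
    graph_ids = ged_env.get_all_graph_ids()
    set_median_id = ged_env.add_graph('set_median')   # dummy graphs that receive the medians
    gen_median_id = ged_env.add_graph('gen_median')
    ged_env.init(init_option='EAGER_WITHOUT_SHUFFLED_COPIES')

    # Configure and run the estimator.
    mge = MedianGraphEstimator(ged_env, constant_node_costs=False)  # LETTER2 has non-constant relabel costs
    mge.set_options('--init-type MEDOID --random-inits 1 --seed 1 --stdout 2 --refine FALSE')
    mge.set_init_method('IPFP', '--threads 4')
    mge.set_descent_method('IPFP', '--threads 4')
    mge.run(graph_ids, set_median_id, gen_median_id)
    print(mge.get_sum_of_distances('initialized'), mge.get_sum_of_distances('converged'))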

gklearn/preimage/median_preimage_generator.py (+15, -0)

@@ -0,0 +1,15 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:27:22 2020

@author: ljia
"""
from gklearn.preimage.preimage_generator import PreimageGenerator
# from gklearn.utils.dataset import Dataset

class MedianPreimageGenerator(PreimageGenerator):
    def __init__(self, mge, dataset):
        self.__mge = mge
        self.__dataset = dataset

gklearn/preimage/misc.py (+108, -0)

@@ -0,0 +1,108 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 19 18:13:56 2020

@author: ljia
"""

def options_string_to_options_map(options_string):
    """Transforms an options string into an options map.

    Parameters
    ----------
    options_string : string
        Options string of the form "[--<option> <arg>] [...]".

    Returns
    -------
    options_map : dict{string : string}
        Map with one key-value pair (<option>, <arg>) for each option contained in the string.
    """
    if options_string == '':
        return {}
    options_map = {}
    words = []
    tokenize(options_string, ' ', words)
    expect_option_name = True
    for word in words:
        if expect_option_name:
            is_opt_name, word = is_option_name(word)
            if is_opt_name:
                option_name = word
                if option_name in options_map:
                    raise Exception('Multiple specification of option "' + option_name + '".')
                options_map[option_name] = ''
            else:
                raise Exception('Invalid options "' + options_string + '". Usage: options = "[--<option> <arg>] [...]"')
        else:
            is_opt_name, word = is_option_name(word)
            if is_opt_name:
                raise Exception('Invalid options "' + options_string + '". Usage: options = "[--<option> <arg>] [...]"')
            else:
                options_map[option_name] = word
        expect_option_name = not expect_option_name
    return options_map


def tokenize(sentence, sep, words):
    """Separates a sentence into words separated by sep (unless contained in single quotes).

    Parameters
    ----------
    sentence : string
        The sentence that should be tokenized.
    sep : string
        The separator. Must be different from "'".
    words : list[string]
        The obtained words (appended in place).
    """
    outside_quotes = True
    word_length = 0
    pos_word_start = 0
    for pos in range(0, len(sentence)):
        if sentence[pos] == '\'':
            if not outside_quotes and pos < len(sentence) - 1:
                if sentence[pos + 1] != sep:
                    raise Exception('Sentence contains closing single quote which is followed by a char different from ' + sep + '.')
            word_length += 1
            outside_quotes = not outside_quotes
        elif outside_quotes and sentence[pos] == sep:
            if word_length > 0:
                words.append(sentence[pos_word_start:pos_word_start + word_length])
            pos_word_start = pos + 1
            word_length = 0
        else:
            word_length += 1
    if not outside_quotes:
        raise Exception('Sentence contains unbalanced single quotes.')
    if word_length > 0:
        words.append(sentence[pos_word_start:pos_word_start + word_length])


def is_option_name(word):
    """Checks whether a word is an option name and, if so, removes the leading dashes.

    Parameters
    ----------
    word : string
        Word.

    Returns
    -------
    is_opt_name : bool
        True if word is of the form "--<option>".
    word : string
        The word without the leading dashes.
    """
    if word[0] == '\'':
        word = word[1:len(word) - 1]
        return False, word
    if len(word) < 3:
        return False, word
    if word[0] == '-' and word[1] == '-' and word[2] != '-':
        word = word[2:]
        return True, word
    return False, word
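
A quick check of the option parsing:

    from gklearn.preimage.misc import options_string_to_options_map

    opts = options_string_to_options_map('--init-type MEDOID --random-inits 10 --seed 1')
    print(opts)  # {'init-type': 'MEDOID', 'random-inits': '10', 'seed': '1'}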

gklearn/preimage/preimage_generator.py (+12, -0)

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:26:36 2020

@author: ljia
"""

class PreimageGenerator(object):
    def __init__(self):
        pass

gklearn/preimage/python_code.py (+122, -0)

@@ -0,0 +1,122 @@
elif opt_name == 'random-inits':
try:
num_random_inits_ = std::stoul(opt_val)
desired_num_random_inits_ = num_random_inits_

except:
raise Error('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')

if num_random_inits_ <= 0:
raise Error('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')

}
elif opt_name == 'randomness':
if opt_val == 'PSEUDO':
use_real_randomness_ = False

elif opt_val == 'REAL':
use_real_randomness_ = True

else:
raise Error('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"')

}
elif opt_name == 'stdout':
if opt_val == '0':
print_to_stdout_ = 0

elif opt_val == '1':
print_to_stdout_ = 1

elif opt_val == '2':
print_to_stdout_ = 2

else:
raise Error('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"')

}
elif opt_name == 'refine':
if opt_val == 'TRUE':
refine_ = True

elif opt_val == 'FALSE':
refine_ = False

else:
raise Error('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"')

}
elif opt_name == 'time-limit':
try:
time_limit_in_sec_ = std::stod(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit <convertible to double>] [...]')

}
elif opt_name == 'max-itrs':
try:
max_itrs_ = std::stoi(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs <convertible to int>] [...]')

}
elif opt_name == 'max-itrs-without-update':
try:
max_itrs_without_update_ = std::stoi(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update <convertible to int>] [...]')

}
elif opt_name == 'seed':
try:
seed_ = std::stoul(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed <convertible to int greater equal 0>] [...]')

}
elif opt_name == 'epsilon':
try:
epsilon_ = std::stod(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]')

if epsilon_ <= 0:
raise Error('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]')

}
elif opt_name == 'inits-increase-order':
try:
num_inits_increase_order_ = std::stoul(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')

if num_inits_increase_order_ <= 0:
raise Error('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')

}
elif opt_name == 'init-type-increase-order':
init_type_increase_order_ = opt_val
if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++':
raise Exception(std::string('Invalid argument ') + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"')

}
elif opt_name == 'max-itrs-increase-order':
try:
max_itrs_increase_order_ = std::stoi(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order <convertible to int>] [...]')

}
else:
std::string valid_options('[--init-type <arg>] [--random-inits <arg>] [--randomness <arg>] [--seed <arg>] [--stdout <arg>] ')
valid_options += '[--time-limit <arg>] [--max-itrs <arg>] [--epsilon <arg>] '
valid_options += '[--inits-increase-order <arg>] [--init-type-increase-order <arg>] [--max-itrs-increase-order <arg>]'
raise Error(std::string('Invalid option "') + opt_name + '". Usage: options = "' + valid_options + '"')


gklearn/preimage/test_median_graph_estimator.py (+91, -0)

@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 16 17:26:40 2020

@author: ljia
"""
def test_median_graph_estimator():
    from gklearn.utils.graphfiles import loadDataset
    from gklearn.preimage.median_graph_estimator import MedianGraphEstimator
    from gklearn.gedlib import librariesImport, gedlibpy
    from gklearn.preimage.utils import get_same_item_indices
    from gklearn.preimage.ged import convertGraph
    import multiprocessing

    # estimator parameters.
    init_type = 'MEDOID'
    num_inits = 1
    threads = multiprocessing.cpu_count()
    time_limit = 60000

    # algorithm parameters.
    algo = 'IPFP'
    initial_solutions = 40
    algo_options_suffix = ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1'

    edit_cost_name = 'LETTER2'
    edit_cost_constants = [0.02987291, 0.0178211, 0.01431966, 0.001, 0.001]
    ds_name = 'COIL-DEL'

    # Load dataset.
#    dataset = '../../datasets/COIL-DEL/COIL-DEL_A.txt'
    dataset = '../../datasets/Letter-high/Letter-high_A.txt'
    Gn, y_all = loadDataset(dataset)
    y_idx = get_same_item_indices(y_all)
    for i, (y, values) in enumerate(y_idx.items()):
        Gn_i = [Gn[val] for val in values]
        break

    # Set up the environment.
    ged_env = gedlibpy.GEDEnv()
#    gedlibpy.restart_env()
    ged_env.set_edit_cost(edit_cost_name, edit_cost_constant=edit_cost_constants)
    for G in Gn_i:
        ged_env.add_nx_graph(convertGraph(G, edit_cost_name), '')
    graph_ids = ged_env.get_all_graph_ids()
    set_median_id = ged_env.add_graph('set_median')
    gen_median_id = ged_env.add_graph('gen_median')
    ged_env.init(init_option='EAGER_WITHOUT_SHUFFLED_COPIES')

    # Set up the estimator.
    mge = MedianGraphEstimator(ged_env, constant_node_costs(edit_cost_name))
    mge.set_refine_method(algo, '--threads ' + str(threads) + ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1')
    mge_options = '--time-limit ' + str(time_limit) + ' --stdout 2 --init-type ' + init_type
    mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --refine FALSE' # @todo: std::to_string(rng())

    # Select the GED algorithm.
    algo_options = '--threads ' + str(threads) + algo_options_suffix
    mge.set_options(mge_options)
    mge.set_init_method(algo, algo_options)
    mge.set_descent_method(algo, algo_options)

    # Run the estimator.
    mge.run(graph_ids, set_median_id, gen_median_id)

    # Get SODs.
    sod_sm = mge.get_sum_of_distances('initialized')
    sod_gm = mge.get_sum_of_distances('converged')
    print('sod_sm, sod_gm: ', sod_sm, sod_gm)

    # Get median graphs.
    set_median = ged_env.get_nx_graph(set_median_id)
    gen_median = ged_env.get_nx_graph(gen_median_id)
    return set_median, gen_median


def constant_node_costs(edit_cost_name):
    if edit_cost_name == 'NON_SYMBOLIC' or edit_cost_name == 'LETTER2' or edit_cost_name == 'LETTER':
        return False
#    elif edit_cost_name != '':
#        # throw ged::Error("Invalid dataset " + dataset + ". Usage: ./median_tests <AIDS|Mutagenicity|Letter-high|Letter-med|Letter-low|monoterpenoides|SYNTHETICnew|Fingerprint|COIL-DEL>");
#        return False
#    return True


if __name__ == '__main__':
    set_median, gen_median = test_median_graph_estimator()

gklearn/preimage/timer.py (+40, -0)

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 23 09:52:50 2020

@author: ljia
"""
import time

class Timer(object):
    """A timer class that can be used by methods that support time limits.

    Note
    ----
    This is the Python implementation of `the C++ code in GEDLIB <https://github.com/dbblumenthal/gedlib/blob/master/src/env/timer.hpp>`__.
    """

    def __init__(self, time_limit_in_sec):
        """Constructs a timer for a given time limit.

        Parameters
        ----------
        time_limit_in_sec : float
            The time limit in seconds.
        """
        self.__time_limit_in_sec = time_limit_in_sec
        self.__start_time = time.time()

    def expired(self):
        """Checks if the time limit has expired.

        Returns
        -------
        bool
            True if the time limit has expired and False otherwise.
        """
        if self.__time_limit_in_sec > 0:
            runtime = time.time() - self.__start_time
            return runtime >= self.__time_limit_in_sec
        return False
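
A minimal usage sketch of the timer:

    from gklearn.preimage.timer import Timer
    import time

    timer = Timer(0.5)           # budget of half a second; a limit of 0 disables expiry
    while not timer.expired():
        time.sleep(0.1)          # stand-in for one unit of work
    print('time limit reached')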

gklearn/preimage/xp_fit_method.py (+154, -42)

@@ -12,6 +12,7 @@ from shutil import copyfile
 import networkx as nx
 import matplotlib.pyplot as plt
 import os
+import time

 from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL
 from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
@@ -69,6 +70,10 @@ def get_dataset(ds_name):
         Gn, y_all = loadDataset(dataset)
     elif ds_name == 'Synthie':
         pass
+    elif ds_name == 'COIL-DEL':
+        dataset = '../../datasets/COIL-DEL/COIL-DEL_A.txt'
+        graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/COIL-DEL/'
+        Gn, y_all = loadDataset(dataset)
     elif ds_name == 'COIL-RAG':
         pass
     elif ds_name == 'COLORS-3':
@@ -109,7 +114,8 @@ def init_output_file(ds_name, gkernel, fit_method, dir_output):




 def xp_fit_method_for_non_symbolic(parameters, save_results=True, initial_solutions=1,
-                                   Gn_data=None, k_dis_data=None, Kmatrix=None):
+                                   Gn_data=None, k_dis_data=None, Kmatrix=None,
+                                   is_separate=False):
     # 1. set parameters.
     print('1. setting parameters...')
@@ -142,11 +148,12 @@ def xp_fit_method_for_non_symbolic(parameters, save_results=True, initial_soluti
         dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None,
                                                                      None, Kmatrix=Kmatrix, gkernel=gkernel)
     else:
-        dis_mat = k_dis_data[0]
-        dis_max = k_dis_data[1]
-        dis_min = k_dis_data[2]
-        dis_mean = k_dis_data[3]
-        print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min, dis_mean)
+#        dis_mat = k_dis_data[0]
+#        dis_max = k_dis_data[1]
+#        dis_min = k_dis_data[2]
+#        dis_mean = k_dis_data[3]
+#        print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min, dis_mean)
+        pass

     if save_results:
@@ -213,8 +220,11 @@ def xp_fit_method_for_non_symbolic(parameters, save_results=True, initial_soluti
             # get Gram matrix for this part of data.
             if Kmatrix is not None:
-                Kmatrix_sub = Kmatrix[values,:]
-                Kmatrix_sub = Kmatrix_sub[:,values]
+                if is_separate:
+                    Kmatrix_sub = Kmatrix[i].copy()
+                else:
+                    Kmatrix_sub = Kmatrix[values,:]
+                    Kmatrix_sub = Kmatrix_sub[:,values]
             else:
                 Kmatrix_sub = None
@@ -395,7 +405,48 @@ def draw_Letter_graph(graph, file_prefix):
     plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
#    plt.show()
     plt.clf()
+
+
+def compute_gm_for_each_class(Gn, y_all, gkernel, parallel='imap_unordered', is_separate=True):
+    if is_separate:
+        print('the Gram matrix is computed for each class.')
+        y_idx = get_same_item_indices(y_all)
+        Kmatrix = []
+        run_time = []
+        k_dis_data = []
+        for i, (y, values) in enumerate(y_idx.items()):
+            print('The ', str(i), ' class:')
+            Gn_i = [Gn[val] for val in values]
+            time0 = time.time()
+            Kmatrix.append(compute_kernel(Gn_i, gkernel, None, None, True, parallel=parallel))
+            run_time.append(time.time() - time0)
+            k_dis_data.append(kernel_distance_matrix(Gn_i, None, None,
+                                                     Kmatrix=Kmatrix[i], gkernel=gkernel, verbose=True))
+        np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
+                 Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate)
+        dis_max = np.max([item[1] for item in k_dis_data])
+        dis_min = np.min([item[2] for item in k_dis_data])
+        dis_mean = np.mean([item[3] for item in k_dis_data])
+        print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min,
+              dis_mean)
+    else:
+        time0 = time.time()
+        Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel=parallel)
+        run_time = time.time() - time0
+        np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
+                 Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate)
+        k_dis_data = kernel_distance_matrix(Gn, None, None,
+                                            Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
+        print('the Gram matrix is computed for the whole dataset.')
+        print('pair distances - dis_max, dis_min, dis_mean:', k_dis_data[1],
+              k_dis_data[2], k_dis_data[3])
+        print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
+#    k_dis_data = [dis_mat, dis_max, dis_min, dis_mean]
+    return Kmatrix, run_time, k_dis_data


 if __name__ == "__main__":
#    #### xp 1: Letter-high, spkernel.
# #### xp 1: Letter-high, spkernel. # #### xp 1: Letter-high, spkernel.
@@ -573,7 +624,7 @@ if __name__ == "__main__":
 #                                   Kmatrix=Kmatrix)

-#    #### xp 5: Fingerprint, sspkernel, using LETTER2.
+#    #### xp 5: Fingerprint, sspkernel, using LETTER2, only node attrs.
 #    # load dataset.
 #    print('getting dataset and computing kernel distance matrix first...')
 #    ds_name = 'Fingerprint'
@@ -593,17 +644,17 @@ if __name__ == "__main__":
 #        del G.edges[edge]['attributes']
 #        del G.edges[edge]['orient']
 #        del G.edges[edge]['angle']
-#    Gn = Gn[805:815]
-#    y_all = y_all[805:815]
+##    Gn = Gn[805:815]
+##    y_all = y_all[805:815]
 #    for G in Gn:
 #        G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
 #
 #    # compute/read Gram matrix and pair distances.
-#    Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
-#    np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
-#             Kmatrix=Kmatrix)
-##    gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
-##    Kmatrix = gmfile['Kmatrix']
+##    Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
+##    np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
+##             Kmatrix=Kmatrix)
+#    gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
+#    Kmatrix = gmfile['Kmatrix']
 ##    run_time = gmfile['run_time']
 ##    Kmatrix = Kmatrix[[0,1,2,3,4],:]
 ##    Kmatrix = Kmatrix[:,[0,1,2,3,4]]
@@ -612,11 +663,7 @@ if __name__ == "__main__":
# 									   Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
## Kmatrix = np.zeros((len(Gn), len(Gn)))
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
#
# # fitting and computing.
# fit_methods = ['k-graphs', 'random', 'random', 'random']
# for fit_method in fit_methods:
@@ -627,7 +674,8 @@ if __name__ == "__main__":
# 			  'edit_cost_name': 'LETTER2',
# 			  'ged_method': 'mIPFP',
# 			  'attr_distance': 'euclidean',
# 			  'fit_method': fit_method,
# 			  'init_ecc': [1,1,1,1,1]} # [0.525, 0.525, 0.001, 0.125, 0.125]}
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# 							   initial_solutions=40,
# 							   Gn_data = [Gn, y_all, graph_dir],
@@ -773,38 +821,102 @@ if __name__ == "__main__":
# 							   Kmatrix=Kmatrix)


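Every experiment block in this script hands xp_fit_method_for_non_symbolic the same shape of parameter dictionary; the sketch below spells the pattern out once, with the values of the Letter-low experiment as an illustration only:

# Illustrative parameter set; the values are those of the Letter-low block below.
parameters = {'ds_name': 'Letter-low',			# dataset name
			  'gkernel': 'spkernel',			# graph kernel
			  'edit_cost_name': 'LETTER2',		# GED edit cost model
			  'ged_method': 'mIPFP',			# GED computation method
			  'attr_distance': 'euclidean',		# distance between node attribute vectors
			  'fit_method': 'k-graphs',			# strategy to fit edit cost constants
			  'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]}	# initial edit cost constants
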
# #### xp 9: Letter-low, spkernel.
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'Letter-low'
# gkernel = 'spkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
# # remove graphs without nodes and edges.
# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0
# and nx.number_of_edges(G) != 0)]
# idx = [G[0] for G in Gn]
# Gn = [G[1] for G in Gn]
# y_all = [y_all[i] for i in idx]
## Gn = Gn[0:50]
## y_all = y_all[0:50]
#
# # compute/read Gram matrix and pair distances.
# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
# Kmatrix=Kmatrix)
## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
## Kmatrix = gmfile['Kmatrix']
## run_time = gmfile['run_time']
## Kmatrix = Kmatrix[[0,1,2,3,4],:]
## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
## Kmatrix = np.zeros((len(Gn), len(Gn)))
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
#
# # fitting and computing.
# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'LETTER2',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method,
# 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]}
# print('parameters: ', parameters)
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=40,
# Gn_data = [Gn, y_all, graph_dir],
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
# Kmatrix=Kmatrix)
	#### xp 5: COIL-DEL, sspkernel, using LETTER2, only node attrs.
	# load dataset.
	print('getting dataset and computing kernel distance matrix first...')
	ds_name = 'COIL-DEL'
	gkernel = 'structuralspkernel'
	Gn, y_all, graph_dir = get_dataset(ds_name)
	# remove graphs without nodes; edge-less graphs are kept here.
	Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 0]
# 								   and nx.number_of_edges(G) != 0)]
	idx = [G[0] for G in Gn]
	Gn = [G[1] for G in Gn]
	y_all = [y_all[i] for i in idx]
# Gn = Gn[0:50]
# y_all = y_all[0:50]
	# remove unused labels.
	for G in Gn:
		G.graph['edge_labels'] = []
		for edge in G.edges:
			del G.edges[edge]['bond_type']
			del G.edges[edge]['valence']
# Gn = Gn[805:815]
# y_all = y_all[805:815]
	# name each graph after its 'name' attribute, GXL style.
	for G in Gn:
		G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
	# compute/read Gram matrix and pair distances.
	is_separate = True
	Kmatrix, run_time, k_dis_data = compute_gm_for_each_class(Gn,
															   y_all,
															   gkernel,
															   parallel='imap_unordered',
															   is_separate=is_separate)
# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
# 		  Kmatrix=Kmatrix)
# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
# Kmatrix = gmfile['Kmatrix']
# run_time = gmfile['run_time']
# Kmatrix = Kmatrix[[0,1,2,3,4],:]
# Kmatrix = Kmatrix[:,[0,1,2,3,4]]
# print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# 									  Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
# Kmatrix = np.zeros((len(Gn), len(Gn)))
# dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
	# fitting and computing.
	fit_methods = ['k-graphs', 'random', 'random', 'random']
	for fit_method in fit_methods:
		print('\n-------------------------------------')
		print('fit method:', fit_method)
@@ -814,10 +926,10 @@ if __name__ == "__main__":
					  'ged_method': 'mIPFP',
					  'attr_distance': 'euclidean',
					  'fit_method': fit_method,
					  'init_ecc': [3,3,1,3,3]} # [0.525, 0.525, 0.001, 0.125, 0.125]}
		xp_fit_method_for_non_symbolic(parameters, save_results=True,
									   initial_solutions=40,
									   Gn_data=[Gn, y_all, graph_dir],
									   k_dis_data=k_dis_data,
									   Kmatrix=Kmatrix,
									   is_separate=is_separate)
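
The commented np.load lines above show how a previously saved Gram matrix archive is reused; a minimal reload sketch, assuming only the file naming used in this script and an archive written with is_separate=True, could look like:

import numpy as np

# Hypothetical reload of the archive written by compute_gm_for_each_class.
# With is_separate=True the per-class matrices have different shapes, so they
# are stored as an object array and need allow_pickle=True to load.
gmfile = np.load('results/xp_fit_method/Kmatrix.COIL-DEL.structuralspkernel.gm.npz',
				 allow_pickle=True)
Kmatrix = gmfile['Kmatrix']
run_time = gmfile['run_time']
print('is_separate:', gmfile['is_separate'])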
