From d97bfe954c0d4461c7ab973fc9909a393c2d4fc2 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 26 Jun 2020 12:44:48 +0200 Subject: [PATCH 1/7] Add classes MedianGraphEstimatorPy and MedianPreimageGeneratorPy which use GEDEnv implemented in pure Python. --- gklearn/ged/env/ged_data.py | 41 + gklearn/ged/env/ged_env.py | 329 ++++- gklearn/ged/median/__init__.py | 1 + gklearn/ged/median/median_graph_estimator_py.py | 1711 ++++++++++++++++++++++ gklearn/preimage/__init__.py | 3 +- gklearn/preimage/median_preimage_generator_py.py | 1035 +++++++++++++ 6 files changed, 3117 insertions(+), 3 deletions(-) create mode 100644 gklearn/ged/median/median_graph_estimator_py.py create mode 100644 gklearn/preimage/median_preimage_generator_py.py diff --git a/gklearn/ged/env/ged_data.py b/gklearn/ged/env/ged_data.py index b09805c..9cef41a 100644 --- a/gklearn/ged/env/ged_data.py +++ b/gklearn/ged/env/ged_data.py @@ -41,6 +41,17 @@ class GEDData(object): return len(self._graphs) + def graph(self, graph_id): + """ + /*! + * @brief Provides access to a graph. + * @param[in] graph_id The ID of the graph. + * @return Constant reference to the graph with ID @p graph_id. + */ + """ + return self._graphs[graph_id] + + def shuffled_graph_copies_available(self): """ /*! @@ -51,6 +62,16 @@ class GEDData(object): return (self._init_type == Options.InitType.EAGER_WITH_SHUFFLED_COPIES or self._init_type == Options.InitType.LAZY_WITH_SHUFFLED_COPIES) + def num_graphs_without_shuffled_copies(self): + """ + /*! + * @brief Returns the number of graphs in the instance without the shuffled copies. + * @return Number of graphs without shuffled copies contained in the instance. + */ + """ + return self._num_graphs_without_shuffled_copies + + def node_cost(self, label1, label2): """ /*! @@ -177,5 +198,25 @@ class GEDData(object): self._delete_edit_cost = True + def _node_label_to_id(self, node_label): + # Returns the 1-based ID of node_label, registering the label first if it is new. + n_id = 0 + for n_l in self._node_labels: + if n_l == node_label: + return n_id + 1 + n_id += 1 + self._node_labels.append(node_label) + return n_id + 1 + + + def _edge_label_to_id(self, edge_label): + # Returns the 1-based ID of edge_label, registering the label first if it is new. + e_id = 0 + for e_l in self._edge_labels: + if e_l == edge_label: + return e_id + 1 + e_id += 1 + self._edge_labels.append(edge_label) + return e_id + 1 + + + def _eager_init(self): return (self._init_type == Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES or self._init_type == Options.InitType.EAGER_WITH_SHUFFLED_COPIES) \ No newline at end of file diff --git a/gklearn/ged/env/ged_env.py b/gklearn/ged/env/ged_env.py index 9fbdd4a..e6dc2f6 100644 --- a/gklearn/ged/env/ged_env.py +++ b/gklearn/ged/env/ged_env.py @@ -63,6 +63,23 @@ class GEDEnv(object): return graph_id + def clear_graph(self, graph_id): + """ + /*! + * @brief Clears and de-initializes a graph that has previously been added to the environment. Call init() after calling this method. + * @param[in] graph_id ID of graph that has to be cleared. + */ + """ + if graph_id >= self.__ged_data.num_graphs_without_shuffled_copies(): + raise Exception('The graph with ID ' + str(graph_id) + ' has not been added to the environment.') + self.__ged_data._graphs[graph_id].clear() + self.__original_to_internal_node_ids[graph_id].clear() + self.__internal_to_original_node_ids[graph_id].clear() + self.__ged_data._strings_to_internal_node_ids[graph_id].clear() + self.__ged_data._internal_node_ids_to_strings[graph_id].clear() + self.__initialized = False + + def add_node(self, graph_id, node_id, node_label): """ /*!
@@ -80,7 +97,8 @@ class GEDEnv(object): self.__internal_to_original_node_ids[graph_id][internal_node_id] = node_id self.__ged_data._strings_to_internal_node_ids[graph_id][str(node_id)] = internal_node_id self.__ged_data._internal_node_ids_to_strings[graph_id][internal_node_id] = str(node_id) - # @todo: node_label_to_id_ + label_id = self.__ged_data._node_label_to_id(node_label) + # @todo: ged_data_.graphs_[graph_id].set_label def add_edge(self, graph_id, nd_from, nd_to, edge_label, ignore_duplicates=True): """ /*! @@ -98,7 +117,8 @@ class GEDEnv(object): self.__initialized = False # @todo: check ignore_duplicates. self.__ged_data._graphs[graph_id].add_edge(self.__original_to_internal_node_ids[graph_id][nd_from], self.__original_to_internal_node_ids[graph_id][nd_to], label=edge_label) - # @todo: edge_id and label_id, edge_label_to_id_. + label_id = self.__ged_data._edge_label_to_id(edge_label) + # @todo: ged_data_.graphs_[graph_id].set_label def add_nx_graph(self, g, classe, ignore_duplicates=True) : """ /*! @@ -123,6 +143,40 @@ class GEDEnv(object): return graph_id + def load_nx_graph(self, nx_graph, graph_id, graph_name='', graph_class=''): + """ + Loads a NetworkX graph into the GED environment. + + Parameters + ---------- + nx_graph : NetworkX Graph object + The graph that should be loaded. + + graph_id : int or None + The ID of a graph contained in the environment (the existing graph is overwritten), or `None`, in which case a new graph is added. + + graph_name : string, optional + The name of the newly added graph. The default is ''. Has no effect unless `graph_id` equals `None`. + + graph_class : string, optional + The class of the newly added graph. The default is ''. Has no effect unless `graph_id` equals `None`. + + Returns + ------- + int + The ID of the newly loaded graph. + """ + if graph_id is None: # @todo: undefined. + graph_id = self.add_graph(graph_name, graph_class) + else: + self.clear_graph(graph_id) + for node in nx_graph.nodes: + self.add_node(graph_id, node, tuple(sorted(nx_graph.nodes[node].items(), key=lambda kv: kv[0]))) + for edge in nx_graph.edges: + self.add_edge(graph_id, edge[0], edge[1], tuple(sorted(nx_graph.edges[(edge[0], edge[1])].items(), key=lambda kv: kv[0]))) + return graph_id + + def init(self, init_type=Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES, print_to_stdout=False): if isinstance(init_type, str): init_type = OptionsStringMap.InitType[init_type] @@ -154,6 +208,26 @@ class GEDEnv(object): self.__new_graph_ids.clear() + def is_initialized(self): + """ + /*! + * @brief Checks if the environment is initialized. + * @return True if the environment is initialized. + */ + """ + return self.__initialized + + + def get_init_type(self): + """ + /*! + * @brief Returns the initialization type of the last initialization. + * @return Initialization type. + */ + """ + return self.__ged_data._init_type + + def set_method(self, method, options=''): """ /*! @@ -263,6 +337,58 @@ class GEDEnv(object): self.__ged_method.init() + def get_num_node_labels(self): + """ + /*! + * @brief Returns the number of node labels. + * @return Number of pairwise different node labels contained in the environment. + * @note If @p 1 is returned, the nodes are unlabeled. + */ + """ + return len(self.__ged_data._node_labels) + + + def get_node_label(self, label_id, to_dict=True): + """ + /*! + * @brief Returns node label. + * @param[in] label_id ID of node label that should be returned. Must be between 1 and num_node_labels(). + * @return Node label for selected label ID.
+ */ + """ + if label_id < 1 or label_id > self.get_num_node_labels(): + raise Exception('The environment does not contain a node label with ID ' + str(label_id) + '.') + if to_dict: + return dict(self.__ged_data._node_labels[label_id - 1]) + return self.__ged_data._node_labels[label_id - 1] + + + def get_num_edge_labels(self): + """ + /*! + * @brief Returns the number of edge labels. + * @return Number of pairwise different edge labels contained in the environment. + * @note If @p 1 is returned, the edges are unlabeled. + */ + """ + return len(self.__ged_data._edge_labels) + + + def get_edge_label(self, label_id, to_dict=True): + """ + /*! + * @brief Returns edge label. + * @param[in] label_id ID of edge label that should be returned. Must be between 1 and num_edge_labels(). + * @return Edge label for selected label ID. + */ + """ + if label_id < 1 or label_id > self.get_num_edge_labels(): + raise Exception('The environment does not contain an edge label with ID ' + str(label_id) + '.') + if to_dict: + return dict(self.__ged_data._edge_labels[label_id - 1]) + return self.__ged_data._edge_labels[label_id - 1] + + def get_upper_bound(self, g_id, h_id): """ /*! @@ -363,6 +489,205 @@ class GEDEnv(object): .. note:: I don't know how to connect the two maps to reconstruct the adjacency matrix. Please come back when I know how it works! """ return self.get_node_map(g_id, h_id).backward_map + + + def compute_induced_cost(self, g_id, h_id, node_map): + """ + /*! + * @brief Computes the edit cost between two graphs induced by a node map. + * @param[in] g_id ID of input graph. + * @param[in] h_id ID of input graph. + * @param[in,out] node_map Node map whose induced edit cost is to be computed. + */ + """ + self.__ged_data.compute_induced_cost(self.__ged_data._graphs[g_id], self.__ged_data._graphs[h_id], node_map) + + + def get_nx_graph(self, graph_id): + """ + * @brief Returns the NetworkX.Graph() representation. + * @param[in] graph_id ID of the selected graph. + """ + graph = nx.Graph() # @todo: add graph attributes. + graph.graph['id'] = graph_id + + nb_nodes = self.get_graph_num_nodes(graph_id) + original_node_ids = self.get_original_node_ids(graph_id) + node_labels = self.get_graph_node_labels(graph_id, to_dict=True) + graph.graph['original_node_ids'] = original_node_ids + + for node_id in range(0, nb_nodes): + graph.add_node(node_id, **node_labels[node_id]) + + edges = self.get_graph_edges(graph_id, to_dict=True) + for (head, tail), labels in edges.items(): + graph.add_edge(head, tail, **labels) + + return graph + + + def get_graph_node_labels(self, graph_id, to_dict=True): + """ + Searches and returns all the labels of nodes on a graph, selected by its ID. + + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The list of nodes' labels on the selected graph + :rtype: list[dict{string : string}] + + .. seealso:: get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_original_node_ids(), get_graph_edges(), get_graph_adjacence_matrix() + .. note:: These functions allow collecting all the information on a graph. + """ + graph = self.__ged_data.graph(graph_id) + node_labels = [] + for n in graph.nodes(): + node_labels.append(graph.nodes[n]['label']) + if to_dict: + return [dict(i) for i in node_labels] + return node_labels + + + def get_graph_edges(self, graph_id, to_dict=True): + """ + Searches and returns all the edges on a graph, selected by its ID.
+ + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The dict of edges on the selected graph + :rtype: dict{tuple(size_t, size_t) : dict{string : string}} + + .. seealso:: get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_original_node_ids(), get_graph_node_labels(), get_graph_adjacence_matrix() + .. note:: These functions allow collecting all the information on a graph. + """ + graph = self.__ged_data.graph(graph_id) + if to_dict: + edges = {} + for n1, n2, attr in graph.edges(data=True): + edges[(n1, n2)] = dict(attr['label']) + return edges + return {(n1, n2): attr['label'] for n1, n2, attr in graph.edges(data=True)} + + + + def get_graph_name(self, graph_id): + """ + /*! + * @brief Returns the graph name. + * @param[in] graph_id ID of an input graph that has been added to the environment. + * @return Name of the input graph. + */ + """ + return self.__ged_data._graph_names[graph_id] + + + def get_graph_num_nodes(self, graph_id): + """ + /*! + * @brief Returns the number of nodes. + * @param[in] graph_id ID of an input graph that has been added to the environment. + * @return Number of nodes in the graph. + */ + """ + return nx.number_of_nodes(self.__ged_data.graph(graph_id)) + + + def get_original_node_ids(self, graph_id): + """ + Searches and returns all the IDs of nodes on a graph, selected by its ID. + + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The list of IDs of the nodes on the selected graph + :rtype: list[string] + + .. seealso:: get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_graph_node_labels(), get_graph_edges(), get_graph_adjacence_matrix() + .. note:: These functions allow collecting all the information on a graph. + """ + return [i for i in self.__internal_to_original_node_ids[graph_id].values()] + + + def get_node_rel_cost(self, node_label_1, node_label_2): + """ + /*! + * @brief Returns node relabeling cost. + * @param[in] node_label_1 First node label. + * @param[in] node_label_2 Second node label. + * @return Node relabeling cost for the given node labels. + */ + """ + if isinstance(node_label_1, dict): + node_label_1 = tuple(sorted(node_label_1.items(), key=lambda kv: kv[0])) + if isinstance(node_label_2, dict): + node_label_2 = tuple(sorted(node_label_2.items(), key=lambda kv: kv[0])) + return self.__ged_data._edit_cost.node_rel_cost_fun(node_label_1, node_label_2) + + + def get_node_del_cost(self, node_label): + """ + /*! + * @brief Returns node deletion cost. + * @param[in] node_label Node label. + * @return Cost of deleting node with given label. + */ + """ + if isinstance(node_label, dict): + node_label = tuple(sorted(node_label.items(), key=lambda kv: kv[0])) + return self.__ged_data._edit_cost.node_del_cost_fun(node_label) + + + def get_node_ins_cost(self, node_label): + """ + /*! + * @brief Returns node insertion cost. + * @param[in] node_label Node label. + * @return Cost of inserting node with given label. + */ + """ + if isinstance(node_label, dict): + node_label = tuple(sorted(node_label.items(), key=lambda kv: kv[0])) + return self.__ged_data._edit_cost.node_ins_cost_fun(node_label) + + + def get_edge_rel_cost(self, edge_label_1, edge_label_2): + """ + /*! + * @brief Returns edge relabeling cost. + * @param[in] edge_label_1 First edge label. + * @param[in] edge_label_2 Second edge label. + * @return Edge relabeling cost for the given edge labels.
+ */ + """ + if isinstance(edge_label_1, dict): + edge_label_1 = tuple(sorted(edge_label_1.items(), key=lambda kv: kv[0])) + if isinstance(edge_label_2, dict): + edge_label_2 = tuple(sorted(edge_label_2.items(), key=lambda kv: kv[0])) + return self.__ged_data._edit_cost.edge_rel_cost_fun(edge_label_1, edge_label_2) + + + def get_edge_del_cost(self, edge_label): + """ + /*! + * @brief Returns edge deletion cost. + * @param[in] edge_label Edge label. + * @return Cost of deleting edge with given label. + */ + """ + if isinstance(edge_label, dict): + edge_label = tuple(sorted(edge_label.items(), key=lambda kv: kv[0])) + return self.__ged_data._edit_cost.edge_del_cost_fun(edge_label) + + + def get_edge_ins_cost(self, edge_label): + """ + /*! + * @brief Returns edge insertion cost. + * @param[in] edge_label Edge label. + * @return Cost of inserting edge with given label. + */ + """ + if isinstance(edge_label, dict): + edge_label = tuple(sorted(edge_label.items(), key=lambda kv: kv[0])) + return self.__ged_data._edit_cost.edge_ins_cost_fun(edge_label) def get_all_graph_ids(self): diff --git a/gklearn/ged/median/__init__.py b/gklearn/ged/median/__init__.py index 9a291ae..0a96c31 100644 --- a/gklearn/ged/median/__init__.py +++ b/gklearn/ged/median/__init__.py @@ -1,2 +1,3 @@ from gklearn.ged.median.median_graph_estimator import MedianGraphEstimator +from gklearn.ged.median.median_graph_estimator_py import MedianGraphEstimatorPy from gklearn.ged.median.utils import constant_node_costs, mge_options_to_string diff --git a/gklearn/ged/median/median_graph_estimator_py.py b/gklearn/ged/median/median_graph_estimator_py.py new file mode 100644 index 0000000..41dc3c9 --- /dev/null +++ b/gklearn/ged/median/median_graph_estimator_py.py @@ -0,0 +1,1711 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Mar 16 18:04:55 2020 + +@author: ljia +""" +import numpy as np +from gklearn.ged.env import AlgorithmState, NodeMap +from gklearn.ged.util import misc +from gklearn.utils import Timer +import time +from tqdm import tqdm +import sys +import networkx as nx +import multiprocessing +from multiprocessing import Pool +from functools import partial + + +class MedianGraphEstimatorPy(object): # @todo: differ dummy_node from undefined node? + """Estimate median graphs using the pure Python version of GEDEnv. + """ + + def __init__(self, ged_env, constant_node_costs): + """Constructor. + + Parameters + ---------- + ged_env : gklearn.ged.env.GEDEnv + Initialized GED environment. The edit costs must be set by the user. + + constant_node_costs : Boolean + Set to True if the node relabeling costs are constant.
+ """ + if ged_env is None: + raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimatorPy is null.') + elif not ged_env.is_initialized(): + raise Exception('The GED environment is uninitialized. Call gklearn.ged.env.GEDEnv.init() before passing it to the constructor of MedianGraphEstimatorPy.') + + self.__ged_env = ged_env + self.__init_method = 'BRANCH_FAST' + self.__init_options = '' + self.__descent_method = 'BRANCH_FAST' + self.__descent_options = '' + self.__refine_method = 'IPFP' + self.__refine_options = '' + self.__constant_node_costs = constant_node_costs + self.__labeled_nodes = (ged_env.get_num_node_labels() > 1) + self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1, to_dict=False)) + self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1, to_dict=False)) + self.__labeled_edges = (ged_env.get_num_edge_labels() > 1) + self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1, to_dict=False)) + self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1, to_dict=False)) + self.__init_type = 'RANDOM' + self.__num_random_inits = 10 + self.__desired_num_random_inits = 10 + self.__use_real_randomness = True + self.__seed = 0 + self.__parallel = True + self.__update_order = True + self.__sort_graphs = True # sort graphs by size when computing GEDs. + self.__refine = True + self.__time_limit_in_sec = 0 + self.__epsilon = 0.0001 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__num_inits_increase_order = 10 + self.__init_type_increase_order = 'K-MEANS++' + self.__max_itrs_increase_order = 10 + self.__print_to_stdout = 2 + self.__median_id = np.inf # @todo: check + self.__node_maps_from_median = {} + self.__sum_of_distances = 0 + self.__best_init_sum_of_distances = np.inf + self.__converged_sum_of_distances = np.inf + self.__runtime = None + self.__runtime_initialized = None + self.__runtime_converged = None + self.__itrs = [] # @todo: check: {} ? + self.__num_decrease_order = 0 + self.__num_increase_order = 0 + self.__num_converged_descents = 0 + self.__state = AlgorithmState.TERMINATED + self.__label_names = {} + + + def set_options(self, options): + """Sets the options of the estimator. + + Parameters + ---------- + options : string + String that specifies with which options to run the estimator. + """ + self.__set_default_options() + options_map = misc.options_string_to_options_map(options) + for opt_name, opt_val in options_map.items(): + if opt_name == 'init-type': + self.__init_type = opt_val + if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN': + raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|MIN|MAX|MEAN] [...]"') + elif opt_name == 'random-inits': + try: + self.__num_random_inits = int(opt_val) + self.__desired_num_random_inits = self.__num_random_inits + except: + raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"') + + if self.__num_random_inits <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"') + + elif opt_name == 'randomness': + if opt_val == 'PSEUDO': + self.__use_real_randomness = False + + elif opt_val == 'REAL': + self.__use_real_randomness = True + + else: + raise Exception('Invalid argument "' + opt_val + '" for option randomness.
Usage: options = "[--randomness REAL|PSEUDO] [...]"') + + elif opt_name == 'stdout': + if opt_val == '0': + self.__print_to_stdout = 0 + + elif opt_val == '1': + self.__print_to_stdout = 1 + + elif opt_val == '2': + self.__print_to_stdout = 2 + + else: + raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') + + elif opt_name == 'parallel': + if opt_val == 'TRUE': + self.__parallel = True + + elif opt_val == 'FALSE': + self.__parallel = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option parallel. Usage: options = "[--parallel TRUE|FALSE] [...]"') + + elif opt_name == 'update-order': + if opt_val == 'TRUE': + self.__update_order = True + + elif opt_val == 'FALSE': + self.__update_order = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option update-order. Usage: options = "[--update-order TRUE|FALSE] [...]"') + + elif opt_name == 'sort-graphs': + if opt_val == 'TRUE': + self.__sort_graphs = True + + elif opt_val == 'FALSE': + self.__sort_graphs = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option sort-graphs. Usage: options = "[--sort-graphs TRUE|FALSE] [...]"') + + elif opt_name == 'refine': + if opt_val == 'TRUE': + self.__refine = True + + elif opt_val == 'FALSE': + self.__refine = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"') + + elif opt_name == 'time-limit': + try: + self.__time_limit_in_sec = float(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit <convertible to double>] [...]"') + + elif opt_name == 'max-itrs': + try: + self.__max_itrs = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs <convertible to int>] [...]"') + + elif opt_name == 'max-itrs-without-update': + try: + self.__max_itrs_without_update = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update <convertible to int>] [...]"') + + elif opt_name == 'seed': + try: + self.__seed = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed <convertible to int greater equal 0>] [...]"') + + elif opt_name == 'epsilon': + try: + self.__epsilon = float(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]"') + + if self.__epsilon <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]"') + + elif opt_name == 'inits-increase-order': + try: + self.__num_inits_increase_order = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"') + + if self.__num_inits_increase_order <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"') + + elif opt_name == 'init-type-increase-order': + self.__init_type_increase_order = opt_val + if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': + raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order.
Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') + + elif opt_name == 'max-itrs-increase-order': + try: + self.__max_itrs_increase_order = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order <convertible to int>] [...]"') + + else: + valid_options = '[--init-type <arg>] [--random-inits <arg>] [--randomness <arg>] [--seed <arg>] [--stdout <arg>] ' + valid_options += '[--time-limit <arg>] [--max-itrs <arg>] [--epsilon <arg>] ' + valid_options += '[--inits-increase-order <arg>] [--init-type-increase-order <arg>] [--max-itrs-increase-order <arg>]' + raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"') + + + def set_init_method(self, init_method, init_options=''): + """Selects method to be used for computing the initial medoid graph. + + Parameters + ---------- + init_method : string + The selected method. Default: 'BRANCH_FAST'. + + init_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect unless "--init-type MEDOID" is passed to set_options(). + """ + self.__init_method = init_method + self.__init_options = init_options + + + def set_descent_method(self, descent_method, descent_options=''): + """Selects method to be used for block gradient descent. + + Parameters + ---------- + descent_method : string + The selected method. Default: 'BRANCH_FAST'. + + descent_options : string + The options for the selected method. Default: "". + """ + self.__descent_method = descent_method + self.__descent_options = descent_options + + + def set_refine_method(self, refine_method, refine_options): + """Selects method to be used for improving the sum of distances and the node maps for the converged median. + + Parameters + ---------- + refine_method : string + The selected method. Default: "IPFP". + + refine_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect if "--refine FALSE" is passed to set_options(). + """ + self.__refine_method = refine_method + self.__refine_options = refine_options + + + def run(self, graph_ids, set_median_id, gen_median_id): + """Computes a generalized median graph. + + Parameters + ---------- + graph_ids : list[integer] + The IDs of the graphs for which the median should be computed. Must have been added to the environment passed to the constructor. + + set_median_id : integer + The ID of the computed set-median. A dummy graph with this ID must have been added to the environment passed to the constructor. Upon termination, the computed median can be obtained via gklearn.ged.env.GEDEnv.get_nx_graph(). + + gen_median_id : integer + The ID of the computed generalized median. Upon termination, the computed median can be obtained via gklearn.ged.env.GEDEnv.get_nx_graph(). + """ + # Sanity checks. + if len(graph_ids) == 0: + raise Exception('Empty vector of graph IDs, unable to compute median.') + all_graphs_empty = True + for graph_id in graph_ids: + if self.__ged_env.get_graph_num_nodes(graph_id) > 0: + all_graphs_empty = False + break + if all_graphs_empty: + raise Exception('All graphs in the collection are empty.') + + # Start timer and record start time.
+ start = time.time() + timer = Timer(self.__time_limit_in_sec) + self.__median_id = gen_median_id + self.__state = AlgorithmState.TERMINATED + + # Get NetworkX graph representations of the input graphs. + graphs = {} + for graph_id in graph_ids: + # @todo: get_nx_graph() function may need to be modified according to the coming code. + graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id) +# print(self.__ged_env.get_graph_internal_id(0)) +# print(graphs[0].graph) +# print(graphs[0].nodes(data=True)) +# print(graphs[0].edges(data=True)) +# print(nx.adjacency_matrix(graphs[0])) + + # Construct initial medians. + medians = [] + self.__construct_initial_medians(graph_ids, timer, medians) + end_init = time.time() + self.__runtime_initialized = end_init - start +# print(medians[0].graph) +# print(medians[0].nodes(data=True)) +# print(medians[0].edges(data=True)) +# print(nx.adjacency_matrix(medians[0])) + + # Reset information about iterations and number of times the median decreases and increases. + self.__itrs = [0] * len(medians) + self.__num_decrease_order = 0 + self.__num_increase_order = 0 + self.__num_converged_descents = 0 + + # Initialize the best median. + best_sum_of_distances = np.inf + self.__best_init_sum_of_distances = np.inf + node_maps_from_best_median = {} + + # Run block gradient descent from all initial medians. + self.__ged_env.set_method(self.__descent_method, self.__descent_options) + for median_pos in range(0, len(medians)): + + # Terminate if the timer has expired and at least one SOD has been computed. + if timer.expired() and median_pos > 0: + break + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('-----------------------------------------------------------') + + # Get reference to the median. + median = medians[median_pos] + + # Load initial median into the environment. + self.__ged_env.load_nx_graph(median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + + # Compute node maps and sum of distances for initial median. +# xxx = self.__node_maps_from_median + self.__compute_init_node_maps(graph_ids, gen_median_id) +# yyy = self.__node_maps_from_median + + self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) + self.__ged_env.load_nx_graph(median, set_median_id) +# print(self.__best_init_sum_of_distances) + + # Run block gradient descent from initial median. + converged = False + itrs_without_update = 0 + while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update): + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('-----------------------------------------------------------') + + # Initialize flags that tell us what happened in the iteration. + median_modified = False + node_maps_modified = False + decreased_order = False + increased_order = False + + # Update the median. 
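+ # One block gradient descent iteration first updates the median for fixed + # node maps; the order-decrease and order-increase moves below are only + # tried when the median update changed nothing or in the first iteration.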
+ median_modified = self.__update_median(graphs, median) + if self.__update_order: + if not median_modified or self.__itrs[median_pos] == 0: + decreased_order = self.__decrease_order(graphs, median) + if not decreased_order or self.__itrs[median_pos] == 0: + increased_order = self.__increase_order(graphs, median) + + # Update the number of iterations without update of the median. + if median_modified or decreased_order or increased_order: + itrs_without_update = 0 + else: + itrs_without_update += 1 + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Loading median to environment: ... ', end='') + + # Load the median into the environment. + # @todo: should this function use the original node label? + self.__ged_env.load_nx_graph(median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Updating induced costs: ... ', end='') + + # Compute induced costs of the old node maps w.r.t. the updated median. + for graph_id in graph_ids: +# print(self.__node_maps_from_median[graph_id].induced_cost()) +# xxx = self.__node_maps_from_median[graph_id] + self.__ged_env.compute_induced_cost(gen_median_id, graph_id, self.__node_maps_from_median[graph_id]) +# print('---------------------------------------') +# print(self.__node_maps_from_median[graph_id].induced_cost()) + # @todo:!!!!!!!!!!!!!!!!!!!!!!!!!!!!This value is slightly different from the C++ program, which might be a bug! Use it very carefully! + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Update the node maps. + node_maps_modified = self.__update_node_maps() + + # Update the order of the median if no improvement can be found with the current order. + + # Update the sum of distances. + old_sum_of_distances = self.__sum_of_distances + self.__sum_of_distances = 0 + for graph_id, node_map in self.__node_maps_from_median.items(): + self.__sum_of_distances += node_map.induced_cost() +# print(self.__sum_of_distances) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Old local SOD: ', old_sum_of_distances) + print('New local SOD: ', self.__sum_of_distances) + print('Best converged SOD: ', best_sum_of_distances) + print('Modified median: ', median_modified) + print('Modified node maps: ', node_maps_modified) + print('Decreased order: ', decreased_order) + print('Increased order: ', increased_order) + print('===========================================================\n') + + converged = not (median_modified or node_maps_modified or decreased_order or increased_order) + + self.__itrs[median_pos] += 1 + + # Update the best median. + if self.__sum_of_distances < best_sum_of_distances: + best_sum_of_distances = self.__sum_of_distances + node_maps_from_best_median = self.__node_maps_from_median.copy() # @todo: this is a shallow copy, not sure if it is enough. + best_median = median + + # Update the number of converged descents. + if converged: + self.__num_converged_descents += 1 + + # Store the best encountered median.
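+ # Among all initial medians, the one whose descent converged to the smallest + # SOD is restored below, together with its node maps, and reloaded into the + # environment before the optional refinement phase.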
+ self.__sum_of_distances = best_sum_of_distances + self.__node_maps_from_median = node_maps_from_best_median + self.__ged_env.load_nx_graph(best_median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + end_descent = time.time() + self.__runtime_converged = end_descent - start + + # Refine the sum of distances and the node maps for the converged median. + self.__converged_sum_of_distances = self.__sum_of_distances + if self.__refine: + self.__improve_sum_of_distances(timer) + + # Record end time, set runtime and reset the number of initial medians. + end = time.time() + self.__runtime = end - start + self.__num_random_inits = self.__desired_num_random_inits + + # Print global information. + if self.__print_to_stdout != 0: + print('\n===========================================================') + print('Finished computation of generalized median graph.') + print('-----------------------------------------------------------') + print('Best SOD after initialization: ', self.__best_init_sum_of_distances) + print('Converged SOD: ', self.__converged_sum_of_distances) + if self.__refine: + print('Refined SOD: ', self.__sum_of_distances) + print('Overall runtime: ', self.__runtime) + print('Runtime of initialization: ', self.__runtime_initialized) + print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized) + if self.__refine: + print('Runtime of refinement: ', self.__runtime - self.__runtime_converged) + print('Number of initial medians: ', len(medians)) + total_itr = 0 + num_started_descents = 0 + for itr in self.__itrs: + total_itr += itr + if itr > 0: + num_started_descents += 1 + print('Size of graph collection: ', len(graph_ids)) + print('Number of started descents: ', num_started_descents) + print('Number of converged descents: ', self.__num_converged_descents) + print('Overall number of iterations: ', total_itr) + print('Overall number of times the order decreased: ', self.__num_decrease_order) + print('Overall number of times the order increased: ', self.__num_increase_order) + print('===========================================================\n') + + + def __improve_sum_of_distances(self, timer): # @todo: go through and test + # Use method selected for refinement phase. + self.__ged_env.set_method(self.__refine_method, self.__refine_options) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress = tqdm(desc='Improving node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + print('\n===========================================================') + print('Improving node maps and SOD for converged median.') + print('-----------------------------------------------------------') + progress.update(1) + + # Improving the node maps. 
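+ # For each collection graph, the refinement method is re-run between the + # converged median and that graph; the stored node map is replaced only if + # the new upper bound improves on its induced cost. With sort-graphs enabled, + # the GED is computed from the smaller towards the larger graph and the + # resulting node map is inverted by swapping its forward and backward maps.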
+ nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) + for graph_id, node_map in self.__node_maps_from_median.items(): + if timer.expired(): + if self.__state == AlgorithmState.TERMINATED: + self.__state = AlgorithmState.CONVERGED + break + + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(self.__median_id, graph_id) + if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost(): + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) + else: + self.__ged_env.run_method(graph_id, self.__median_id) + if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost(): + node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp + + # Print information. + if self.__print_to_stdout == 2: + progress.update(1) + + self.__sum_of_distances = 0.0 + for key, val in self.__node_maps_from_median.items(): + self.__sum_of_distances += val.induced_cost() + + # Print information. + if self.__print_to_stdout == 2: + print('===========================================================\n') + + + def __median_available(self): + return self.__median_id != np.inf + + + def get_state(self): + if not self.__median_available(): + raise Exception('No median has been computed. Call run() before calling get_state().') + return self.__state + + + def get_sum_of_distances(self, state=''): + """Returns the sum of distances. + + Parameters + ---------- + state : string + The state of the estimator. Can be 'initialized' or 'converged'. Default: "" + + Returns + ------- + float + The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned. + """ + if not self.__median_available(): + raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().') + if state == 'initialized': + return self.__best_init_sum_of_distances + if state == 'converged': + return self.__converged_sum_of_distances + return self.__sum_of_distances + + + def get_runtime(self, state): + if not self.__median_available(): + raise Exception('No median has been computed. Call run() before calling get_runtime().') + if state == AlgorithmState.INITIALIZED: + return self.__runtime_initialized + if state == AlgorithmState.CONVERGED: + return self.__runtime_converged + return self.__runtime + + + def get_num_itrs(self): + if not self.__median_available(): + raise Exception('No median has been computed. Call run() before calling get_num_itrs().') + return self.__itrs + + + def get_num_times_order_decreased(self): + if not self.__median_available(): + raise Exception('No median has been computed. Call run() before calling get_num_times_order_decreased().') + return self.__num_decrease_order + + + def get_num_times_order_increased(self): + if not self.__median_available(): + raise Exception('No median has been computed.
Call run() before calling get_num_times_order_increased().') + return self.__num_increase_order + + + def get_num_converged_descents(self): + if not self.__median_available(): + raise Exception('No median has been computed. Call run() before calling get_num_converged_descents().') + return self.__num_converged_descents + + + def get_ged_env(self): + return self.__ged_env + + + def __set_default_options(self): + self.__init_type = 'RANDOM' + self.__num_random_inits = 10 + self.__desired_num_random_inits = 10 + self.__use_real_randomness = True + self.__seed = 0 + self.__parallel = True + self.__update_order = True + self.__sort_graphs = True + self.__refine = True + self.__time_limit_in_sec = 0 + self.__epsilon = 0.0001 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__num_inits_increase_order = 10 + self.__init_type_increase_order = 'K-MEANS++' + self.__max_itrs_increase_order = 10 + self.__print_to_stdout = 2 + self.__label_names = {} + + + def __construct_initial_medians(self, graph_ids, timer, initial_medians): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Constructing initial median(s).') + print('-----------------------------------------------------------') + + # Compute or sample the initial median(s). + initial_medians.clear() + if self.__init_type == 'MEDOID': + self.__compute_medoid(graph_ids, timer, initial_medians) + elif self.__init_type == 'MAX': + pass # @todo +# compute_max_order_graph_(graph_ids, initial_medians) + elif self.__init_type == 'MIN': + pass # @todo +# compute_min_order_graph_(graph_ids, initial_medians) + elif self.__init_type == 'MEAN': + pass # @todo +# compute_mean_order_graph_(graph_ids, initial_medians) + else: + pass # @todo +# sample_initial_medians_(graph_ids, initial_medians) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('===========================================================') + + + def __compute_medoid(self, graph_ids, timer, initial_medians): + # Use method selected for initialization phase. + self.__ged_env.set_method(self.__init_method, self.__init_options) + + # Compute the medoid. + if self.__parallel: + # @todo: notice when parallel self.__ged_env is not modified. + sum_of_distances_list = [np.inf] * len(graph_ids) + len_itr = len(graph_ids) + itr = zip(graph_ids, range(0, len(graph_ids))) + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(ged_env_toshare): + global G_ged_env + G_ged_env = ged_env_toshare + do_fun = partial(_compute_medoid_parallel, graph_ids, self.__sort_graphs) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) + if self.__print_to_stdout == 2: + iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), + desc='Computing medoid', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_fun, itr, chunksize) + for i, dis in iterator: + sum_of_distances_list[i] = dis + pool.close() + pool.join() + + medoid_id = np.argmin(sum_of_distances_list) + best_sum_of_distances = sum_of_distances_list[medoid_id] + + initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + + else: + # Print information about current iteration. 
+ if self.__print_to_stdout == 2: + progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout) + + medoid_id = graph_ids[0] + best_sum_of_distances = np.inf + for g_id in graph_ids: + if timer.expired(): + self.__state = AlgorithmState.CALLED + break + nb_nodes_g = self.__ged_env.get_graph_num_nodes(g_id) + sum_of_distances = 0 + for h_id in graph_ids: # @todo: this can be faster, only a half is needed. + nb_nodes_h = self.__ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not self.__sort_graphs: + self.__ged_env.run_method(g_id, h_id) # @todo + sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) + else: + self.__ged_env.run_method(h_id, g_id) + sum_of_distances += self.__ged_env.get_upper_bound(h_id, g_id) + if sum_of_distances < best_sum_of_distances: + best_sum_of_distances = sum_of_distances + medoid_id = g_id + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + + def __compute_init_node_maps(self, graph_ids, gen_median_id): + # Compute node maps and sum of distances for initial median. + if self.__parallel: + # @todo: notice when parallel self.__ged_env is not modified. + self.__sum_of_distances = 0 + self.__node_maps_from_median.clear() + sum_of_distances_list = [0] * len(graph_ids) + + len_itr = len(graph_ids) + itr = graph_ids + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(ged_env_toshare): + global G_ged_env + G_ged_env = ged_env_toshare + nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self.__sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) + if self.__print_to_stdout == 2: + iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), + desc='Computing initial node maps', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_fun, itr, chunksize) + for g_id, sod, node_maps in iterator: + sum_of_distances_list[g_id] = sod + self.__node_maps_from_median[g_id] = node_maps + pool.close() + pool.join() + + self.__sum_of_distances = np.sum(sum_of_distances_list) +# xxx = self.__node_maps_from_median + + else: + # Print information about current iteration. 
+ if self.__print_to_stdout == 2: + progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout) + + self.__sum_of_distances = 0 + self.__node_maps_from_median.clear() + nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + for graph_id in graph_ids: + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(gen_median_id, graph_id) + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) + else: + self.__ged_env.run_method(graph_id, gen_median_id) + node_map_tmp = self.__ged_env.get_node_map(graph_id, gen_median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp + # print(self.__node_maps_from_median[graph_id]) + self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() + # print(self.__sum_of_distances) + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + + def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): + if self.__state == AlgorithmState.TERMINATED: + self.__state = AlgorithmState.INITIALIZED + return True + return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) + + + def __update_median(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Updating median: ', end='') + + # Store copy of the old median. + old_median = median.copy() # @todo: this is just a shallow copy. + + # Update the node labels. + if self.__labeled_nodes: + self.__update_node_labels(graphs, median) + + # Update the edges and their labels. + self.__update_edges(graphs, median) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + return not self.__are_graphs_equal(median, old_median) + + + def __update_node_labels(self, graphs, median): +# print('----------------------------') + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('nodes ... ', end='') + + # Iterate through all nodes of the median. + for i in range(0, nx.number_of_nodes(median)): +# print('i: ', i) + # Collect the labels of the substituted nodes. + node_labels = [] + for graph_id, graph in graphs.items(): +# print('graph_id: ', graph_id) +# print(self.__node_maps_from_median[graph_id]) +# print(self.__node_maps_from_median[graph_id].forward_map, self.__node_maps_from_median[graph_id].backward_map) + k = self.__node_maps_from_median[graph_id].image(i) +# print('k: ', k) + if k != np.inf: + node_labels.append(graph.nodes[k]) + + # Compute the median label and update the median. + if len(node_labels) > 0: +# median_label = self.__ged_env.get_median_node_label(node_labels) + median_label = self.__get_median_node_label(node_labels) + if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: + nx.set_node_attributes(median, {i: median_label}) + + + def __update_edges(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('edges ... 
', end='') + +# # Clear the adjacency lists of the median and reset number of edges to 0. +# median_edges = list(median.edges) +# for (head, tail) in median_edges: +# median.remove_edge(head, tail) + + # @todo: what if edge is not labeled? + # Iterate through all possible edges (i,j) of the median. + for i in range(0, nx.number_of_nodes(median)): + for j in range(i + 1, nx.number_of_nodes(median)): + + # Collect the labels of the edges to which (i,j) is mapped by the node maps. + edge_labels = [] + for graph_id, graph in graphs.items(): + k = self.__node_maps_from_median[graph_id].image(i) + l = self.__node_maps_from_median[graph_id].image(j) + if k != np.inf and l != np.inf: + if graph.has_edge(k, l): + edge_labels.append(graph.edges[(k, l)]) + + # Compute the median edge label and the overall edge relabeling cost. + rel_cost = 0 + median_label = self.__ged_env.get_edge_label(1, to_dict=True) + if median.has_edge(i, j): + median_label = median.edges[(i, j)] + if self.__labeled_edges and len(edge_labels) > 0: + new_median_label = self.__get_median_edge_label(edge_labels) + if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: + median_label = new_median_label + for edge_label in edge_labels: + rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label) + + # Update the median. + if median.has_edge(i, j): + median.remove_edge(i, j) + if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs): + median.add_edge(i, j, **median_label) +# else: +# if median.has_edge(i, j): +# median.remove_edge(i, j) + + + def __update_node_maps(self): + # Update the node maps. + if self.__parallel: + # @todo: notice when parallel self.__ged_env is not modified. + node_maps_were_modified = False +# xxx = self.__node_maps_from_median.copy() + + len_itr = len(self.__node_maps_from_median) + itr = [item for item in self.__node_maps_from_median.items()] + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(ged_env_toshare): + global G_ged_env + G_ged_env = ged_env_toshare + nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) + do_fun = partial(_update_node_maps_parallel, self.__median_id, self.__epsilon, self.__sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) + if self.__print_to_stdout == 2: + iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), + desc='Updating node maps', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_fun, itr, chunksize) + for g_id, node_map, nm_modified in iterator: + self.__node_maps_from_median[g_id] = node_map + if nm_modified: + node_maps_were_modified = True + pool.close() + pool.join() +# yyy = self.__node_maps_from_median.copy() + + else: + # Print information about current iteration. 
+ if self.__print_to_stdout == 2: + progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + + node_maps_were_modified = False + nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) + for graph_id, node_map in self.__node_maps_from_median.items(): + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(self.__median_id, graph_id) + if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon: + # xxx = self.__node_maps_from_median[graph_id] + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) + node_maps_were_modified = True + + else: + self.__ged_env.run_method(graph_id, self.__median_id) + if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost() - self.__epsilon: + node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp + node_maps_were_modified = True + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + # Return true if the node maps were modified. + return node_maps_were_modified + + + def __decrease_order(self, graphs, median): + # Print information about current iteration + if self.__print_to_stdout == 2: + print('Trying to decrease order: ... ', end='') + + if nx.number_of_nodes(median) <= 1: + if self.__print_to_stdout == 2: + print('median graph has only 1 node, skip decrease.') + return False + + # Initialize ID of the node that is to be deleted. + id_deleted_node = [None] # @todo: or np.inf + decreased_order = False + + # Decrease the order as long as the best deletion delta is negative. + while self.__compute_best_deletion_delta(graphs, median, id_deleted_node) < -self.__epsilon: + decreased_order = True + self.__delete_node_from_median(id_deleted_node[0], median) + if nx.number_of_nodes(median) <= 1: + if self.__print_to_stdout == 2: + print('decrease stopped because median graph remains only 1 node. ', end='') + break + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Return true iff the order was decreased. + return decreased_order + + + def __compute_best_deletion_delta(self, graphs, median, id_deleted_node): + best_delta = 0.0 + + # Determine node that should be deleted (if any). + for i in range(0, nx.number_of_nodes(median)): + # Compute cost delta. + delta = 0.0 + for graph_id, graph in graphs.items(): + k = self.__node_maps_from_median[graph_id].image(i) + if k == np.inf: + delta -= self.__node_del_cost + else: + delta += self.__node_ins_cost - self.__ged_env.get_node_rel_cost(median.nodes[i], graph.nodes[k]) + for j, j_label in median[i].items(): + l = self.__node_maps_from_median[graph_id].image(j) + if k == np.inf or l == np.inf: + delta -= self.__edge_del_cost + elif not graph.has_edge(k, l): + delta -= self.__edge_del_cost + else: + delta += self.__edge_ins_cost - self.__ged_env.get_edge_rel_cost(j_label, graph.edges[(k, l)]) + + # Update best deletion delta. 
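+ # delta accumulates, over all graphs, the change in induced cost that deleting + # node i from the median would cause: a saved node deletion when i is unmapped, + # a node insertion minus the saved relabeling when it is mapped, and analogous + # edge terms for the edges incident to i. A negative delta lowers the SOD.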
+ if delta < best_delta - self.__epsilon: + best_delta = delta + id_deleted_node[0] = i +# id_deleted_node[0] = 3 # @todo: + + return best_delta + + + def __delete_node_from_median(self, id_deleted_node, median): + # Update the median. + mapping = {} + for i in range(0, nx.number_of_nodes(median)): + if i != id_deleted_node: + new_i = (i if i < id_deleted_node else (i - 1)) + mapping[i] = new_i + median.remove_node(id_deleted_node) + nx.relabel_nodes(median, mapping, copy=False) + + # Update the node maps. +# xxx = self.__node_maps_from_median + for key, node_map in self.__node_maps_from_median.items(): + new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) + is_unassigned_target_node = [True] * node_map.num_target_nodes() + for i in range(0, nx.number_of_nodes(median) + 1): + if i != id_deleted_node: + new_i = (i if i < id_deleted_node else (i - 1)) + k = node_map.image(i) + new_node_map.add_assignment(new_i, k) + if k != np.inf: + is_unassigned_target_node[k] = False + for k in range(0, node_map.num_target_nodes()): + if is_unassigned_target_node[k]: + new_node_map.add_assignment(np.inf, k) +# print(self.__node_maps_from_median[key].forward_map, self.__node_maps_from_median[key].backward_map) +# print(new_node_map.forward_map, new_node_map.backward_map) + self.__node_maps_from_median[key] = new_node_map + + # Increase overall number of decreases. + self.__num_decrease_order += 1 + + + def __increase_order(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Trying to increase order: ... ', end='') + + # Initialize the best configuration and the best label of the node that is to be inserted. + best_config = {} + best_label = self.__ged_env.get_node_label(1, to_dict=True) + increased_order = False + + # Increase the order as long as the best insertion delta is negative. + while self.__compute_best_insertion_delta(graphs, best_config, best_label) < - self.__epsilon: + increased_order = True + self.__add_node_to_median(best_config, best_label, median) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Return true iff the order was increased. + return increased_order + + + def __compute_best_insertion_delta(self, graphs, best_config, best_label): + # Construct sets of inserted nodes. + no_inserted_node = True + inserted_nodes = {} + for graph_id, graph in graphs.items(): + inserted_nodes[graph_id] = [] + best_config[graph_id] = np.inf + for k in range(nx.number_of_nodes(graph)): + if self.__node_maps_from_median[graph_id].pre_image(k) == np.inf: + no_inserted_node = False + inserted_nodes[graph_id].append((k, tuple(item for item in graph.nodes[k].items()))) # @todo: can order of label names be guaranteed? + + # Return 0.0 if no node is inserted in any of the graphs. + if no_inserted_node: + return 0.0 + + # Compute insertion configuration, label, and delta. + best_delta = 0.0 # @todo + if len(self.__label_names['node_labels']) == 0 and len(self.__label_names['node_attrs']) == 0: # @todo + best_delta = self.__compute_insertion_delta_unlabeled(inserted_nodes, best_config, best_label) + elif len(self.__label_names['node_labels']) > 0: # self.__constant_node_costs: + best_delta = self.__compute_insertion_delta_constant(inserted_nodes, best_config, best_label) + else: + best_delta = self.__compute_insertion_delta_generic(inserted_nodes, best_config, best_label) + + # Return the best delta.
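+ # best_delta starts at 0.0 and can only decrease, so a non-negative return + # value means that no single-node insertion improves the SOD; the caller in + # __increase_order() keeps inserting nodes while the returned delta stays + # below -epsilon.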
+ return best_delta
+
+
+ def __compute_insertion_delta_unlabeled(self, inserted_nodes, best_config, best_label): # @todo: go through and test.
+ # Construct the best configuration and compute its insertion delta.
+ best_delta = 0.0
+ best_config.clear()
+ for graph_id, node_set in inserted_nodes.items():
+ if len(node_set) == 0:
+ best_config[graph_id] = np.inf
+ best_delta += self.__node_del_cost
+ else:
+ best_config[graph_id] = node_set[0][0]
+ best_delta -= self.__node_ins_cost
+
+ # Return the best insertion delta.
+ return best_delta
+
+
+ def __compute_insertion_delta_constant(self, inserted_nodes, best_config, best_label):
+ # Construct histogram and inverse label maps.
+ hist = {}
+ inverse_label_maps = {}
+ for graph_id, node_set in inserted_nodes.items():
+ inverse_label_maps[graph_id] = {}
+ for node in node_set:
+ k = node[0]
+ label = node[1]
+ if label not in inverse_label_maps[graph_id]:
+ inverse_label_maps[graph_id][label] = k
+ if label not in hist:
+ hist[label] = 1
+ else:
+ hist[label] += 1
+
+ # Determine the best label.
+ best_count = 0
+ for key, val in hist.items():
+ if val > best_count:
+ best_count = val
+ best_label_tuple = key
+
+ # Get the best label.
+ best_label.clear()
+ for key, val in best_label_tuple:
+ best_label[key] = val
+
+ # Construct the best configuration and compute its insertion delta.
+ best_config.clear()
+ best_delta = 0.0
+ node_rel_cost = self.__ged_env.get_node_rel_cost(self.__ged_env.get_node_label(1, to_dict=False), self.__ged_env.get_node_label(2, to_dict=False))
+ triangle_ineq_holds = (node_rel_cost <= self.__node_del_cost + self.__node_ins_cost)
+ for graph_id, _ in inserted_nodes.items():
+ if best_label_tuple in inverse_label_maps[graph_id]:
+ best_config[graph_id] = inverse_label_maps[graph_id][best_label_tuple]
+ best_delta -= self.__node_ins_cost
+ elif triangle_ineq_holds and not len(inserted_nodes[graph_id]) == 0:
+ best_config[graph_id] = inserted_nodes[graph_id][0][0]
+ best_delta += node_rel_cost - self.__node_ins_cost
+ else:
+ best_config[graph_id] = np.inf
+ best_delta += self.__node_del_cost
+
+ # Return the best insertion delta.
+ return best_delta
+
+
+ def __compute_insertion_delta_generic(self, inserted_nodes, best_config, best_label):
+ # Collect all node labels of inserted nodes.
+ node_labels = []
+ for _, node_set in inserted_nodes.items():
+ for node in node_set:
+ node_labels.append(node[1])
+
+ # Compute node label medians that serve as initial solutions for block gradient descent.
+ initial_node_labels = []
+ self.__compute_initial_node_labels(node_labels, initial_node_labels)
+
+ # Determine best insertion configuration, label, and delta via parallel block gradient descent from all initial node labels.
+ best_delta = 0.0
+ for node_label in initial_node_labels:
+ # Construct local configuration.
+ config = {}
+ for graph_id, _ in inserted_nodes.items():
+ config[graph_id] = tuple((np.inf, self.__ged_env.get_node_label(1, to_dict=False)))
+
+ # Run block gradient descent.
+ converged = False
+ itr = 0
+ while not self.__insertion_termination_criterion_met(converged, itr):
+ converged = not self.__update_config(node_label, inserted_nodes, config, node_labels)
+ node_label_dict = dict(node_label)
+ converged = converged and (not self.__update_node_label([dict(item) for item in node_labels], node_label_dict)) # @todo: the dict is tupled again in the function, can be better.
+ node_label = tuple(item for item in node_label_dict.items()) # @todo: watch out: initial_node_labels[i] is not modified here. 
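+ # One block-coordinate-descent sweep is complete: the configuration
+ # was updated for the fixed label, then the label for the fixed
+ # configuration; converged holds only if neither step changed.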
+
+ itr += 1
+
+ # Compute insertion delta of converged solution.
+ delta = 0.0
+ for _, node in config.items():
+ if node[0] == np.inf:
+ delta += self.__node_del_cost
+ else:
+ delta += self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost
+
+ # Update best delta and global configuration if improvement has been found.
+ if delta < best_delta - self.__epsilon:
+ best_delta = delta
+ best_label.clear()
+ for key, val in node_label:
+ best_label[key] = val
+ best_config.clear()
+ for graph_id, val in config.items():
+ best_config[graph_id] = val[0]
+
+ # Return the best delta.
+ return best_delta
+
+
+ def __compute_initial_node_labels(self, node_labels, median_labels):
+ median_labels.clear()
+ if self.__use_real_randomness: # @todo: may not work if parallelized.
+ rng = np.random.randint(0, high=2**32 - 1, size=1)
+ urng = np.random.RandomState(seed=rng[0])
+ else:
+ urng = np.random.RandomState(seed=self.__seed)
+
+ # Generate the initial node label medians.
+ if self.__init_type_increase_order == 'K-MEANS++':
+ # Use k-means++ heuristic to generate the initial node label medians.
+ already_selected = [False] * len(node_labels)
+ selected_label_id = urng.randint(low=0, high=len(node_labels), size=1)[0] # c++ test: 23
+ median_labels.append(node_labels[selected_label_id])
+ already_selected[selected_label_id] = True
+# xxx = [41, 0, 18, 9, 6, 14, 21, 25, 33] for c++ test
+# iii = 0 for c++ test
+ while len(median_labels) < self.__num_inits_increase_order:
+ weights = [np.inf] * len(node_labels)
+ for label_id in range(0, len(node_labels)):
+ if already_selected[label_id]:
+ weights[label_id] = 0
+ continue
+ for label in median_labels:
+ weights[label_id] = min(weights[label_id], self.__ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id])))
+
+ # Get non-zero weights.
+ weights_p, idx_p = [], []
+ for i, w in enumerate(weights):
+ if w != 0:
+ weights_p.append(w)
+ idx_p.append(i)
+ if len(weights_p) > 0:
+ p = np.array(weights_p) / np.sum(weights_p)
+ selected_label_id = urng.choice(range(0, len(weights_p)), size=1, p=p)[0] # for c++ test: xxx[iii]
+ selected_label_id = idx_p[selected_label_id]
+# iii += 1 for c++ test
+ median_labels.append(node_labels[selected_label_id])
+ already_selected[selected_label_id] = True
+ else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self.__num_inits_increase_order.
+ break
+ else:
+ # Compute the initial node medians as the medians of randomly generated clusters of (roughly) equal size.
+ # @todo: go through and test.
+ shuffled_node_labels = [label for label in node_labels]
+ urng.shuffle(shuffled_node_labels) # mirrors std::shuffle(shuffled_node_labels.begin(), shuffled_node_labels.end(), urng) in the C++ version.
+ cluster_size = len(node_labels) / self.__num_inits_increase_order
+ pos = 0
+ cluster = []
+ while len(median_labels) < self.__num_inits_increase_order - 1:
+ while pos < (len(median_labels) + 1) * cluster_size:
+ cluster.append(shuffled_node_labels[pos])
+ pos += 1
+ median_labels.append(self.__get_median_node_label(cluster))
+ cluster.clear()
+ while pos < len(shuffled_node_labels):
+ cluster.append(shuffled_node_labels[pos])
+ pos += 1
+ median_labels.append(self.__get_median_node_label(cluster))
+ cluster.clear()
+
+ # Run Lloyd's Algorithm. 
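+ # Lloyd's algorithm alternates two steps until convergence: assign
+ # each node label to its closest current median (__update_clusters),
+ # then recompute the median label of each cluster (__update_node_label).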
+ converged = False + closest_median_ids = [np.inf] * len(node_labels) + clusters = [[] for _ in range(len(median_labels))] + itr = 1 + while not self.__insertion_termination_criterion_met(converged, itr): + converged = not self.__update_clusters(node_labels, median_labels, closest_median_ids) + if not converged: + for cluster in clusters: + cluster.clear() + for label_id in range(0, len(node_labels)): + clusters[closest_median_ids[label_id]].append(node_labels[label_id]) + for cluster_id in range(0, len(clusters)): + node_label = dict(median_labels[cluster_id]) + self.__update_node_label([dict(item) for item in clusters[cluster_id]], node_label) # @todo: the dict is tupled again in the function, can be better. + median_labels[cluster_id] = tuple(item for item in node_label.items()) + itr += 1 + + + def __insertion_termination_criterion_met(self, converged, itr): + return converged or (itr >= self.__max_itrs_increase_order if self.__max_itrs_increase_order > 0 else False) + + + def __update_config(self, node_label, inserted_nodes, config, node_labels): + # Determine the best configuration. + config_modified = False + for graph_id, node_set in inserted_nodes.items(): + best_assignment = config[graph_id] + best_cost = 0.0 + if best_assignment[0] == np.inf: + best_cost = self.__node_del_cost + else: + best_cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(best_assignment[1])) - self.__node_ins_cost + for node in node_set: + cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost + if cost < best_cost - self.__epsilon: + best_cost = cost + best_assignment = node + config_modified = True + if self.__node_del_cost < best_cost - self.__epsilon: + best_cost = self.__node_del_cost + best_assignment = tuple((np.inf, best_assignment[1])) + config_modified = True + config[graph_id] = best_assignment + + # Collect the node labels contained in the best configuration. + node_labels.clear() + for key, val in config.items(): + if val[0] != np.inf: + node_labels.append(val[1]) + + # Return true if the configuration was modified. + return config_modified + + + def __update_node_label(self, node_labels, node_label): + if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling __update_config(). + return False + new_node_label = self.__get_median_node_label(node_labels) + if self.__ged_env.get_node_rel_cost(new_node_label, node_label) > self.__epsilon: + node_label.clear() + for key, val in new_node_label.items(): + node_label[key] = val + return True + return False + + + def __update_clusters(self, node_labels, median_labels, closest_median_ids): + # Determine the closest median for each node label. + clusters_modified = False + for label_id in range(0, len(node_labels)): + closest_median_id = np.inf + dist_to_closest_median = np.inf + for median_id in range(0, len(median_labels)): + dist_to_median = self.__ged_env.get_node_rel_cost(dict(median_labels[median_id]), dict(node_labels[label_id])) + if dist_to_median < dist_to_closest_median - self.__epsilon: + dist_to_closest_median = dist_to_median + closest_median_id = median_id + if closest_median_id != closest_median_ids[label_id]: + closest_median_ids[label_id] = closest_median_id + clusters_modified = True + + # Return true if the clusters were modified. + return clusters_modified + + + def __add_node_to_median(self, best_config, best_label, median): + # Update the median. 
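+ # The inserted node receives the next free integer id (the current
+ # order of the median); each node map is then extended by the image
+ # chosen for its graph in best_config.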
+ nb_nodes_median = nx.number_of_nodes(median)
+ median.add_node(nb_nodes_median, **best_label)
+
+ # Update the node maps.
+ for graph_id, node_map in self.__node_maps_from_median.items():
+ node_map_as_rel = []
+ node_map.as_relation(node_map_as_rel)
+ new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes())
+ for assignment in node_map_as_rel:
+ new_node_map.add_assignment(assignment[0], assignment[1])
+ new_node_map.add_assignment(nx.number_of_nodes(median) - 1, best_config[graph_id])
+ self.__node_maps_from_median[graph_id] = new_node_map
+
+ # Increase overall number of increases.
+ self.__num_increase_order += 1
+
+
+ def __are_graphs_equal(self, g1, g2):
+ """
+ Check if the two graphs are equal.
+
+ Parameters
+ ----------
+ g1 : NetworkX graph object
+ Graph 1 to be compared.
+
+ g2 : NetworkX graph object
+ Graph 2 to be compared.
+
+ Returns
+ -------
+ bool
+ True if the two graphs are equal.
+
+ Notes
+ -----
+ This is not an isomorphism check. Here the two graphs are equal if and only if their original_node_ids, nodes, all node labels, edges and all edge labels are equal. This function is specifically designed for class `MedianGraphEstimator` and should not be used elsewhere.
+ """
+ # check original node ids.
+ if not g1.graph['original_node_ids'] == g2.graph['original_node_ids']:
+ return False # @todo: why check this?
+ # check nodes.
+ nlist1 = [n for n in g1.nodes(data=True)] # @todo: shallow?
+ nlist2 = [n for n in g2.nodes(data=True)]
+ if not nlist1 == nlist2:
+ return False
+ # check edges.
+ elist1 = [n for n in g1.edges(data=True)]
+ elist2 = [n for n in g2.edges(data=True)]
+ if not elist1 == elist2:
+ return False
+
+ return True
+
+
+ def compute_my_cost(g, h, node_map):
+ cost = 0.0
+ for node in g.nodes:
+ cost += 0
+ return cost
+
+
+ def set_label_names(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
+ self.__label_names = {'node_labels': node_labels, 'edge_labels': edge_labels,
+ 'node_attrs': node_attrs, 'edge_attrs': edge_attrs}
+
+
+ def __get_median_node_label(self, node_labels):
+ if len(self.__label_names['node_labels']) > 0:
+ return self.__get_median_label_symbolic(node_labels)
+ elif len(self.__label_names['node_attrs']) > 0:
+ return self.__get_median_label_nonsymbolic(node_labels)
+ else:
+ raise Exception('Node label names are not given.')
+
+
+ def __get_median_edge_label(self, edge_labels):
+ if len(self.__label_names['edge_labels']) > 0:
+ return self.__get_median_label_symbolic(edge_labels)
+ elif len(self.__label_names['edge_attrs']) > 0:
+ return self.__get_median_label_nonsymbolic(edge_labels)
+ else:
+ raise Exception('Edge label names are not given.')
+
+
+ def __get_median_label_symbolic(self, labels):
+ # Construct histogram.
+ hist = {}
+ for label in labels:
+ label = tuple([kv for kv in label.items()]) # @todo: this may be slow.
+ if label not in hist:
+ hist[label] = 1
+ else:
+ hist[label] += 1
+
+ # Return the label that appears most frequently.
+ best_count = 0
+ median_label = {}
+ for label, count in hist.items():
+ if count > best_count:
+ best_count = count
+ median_label = {kv[0]: kv[1] for kv in label}
+
+ return median_label
+
+
+ def __get_median_label_nonsymbolic(self, labels):
+ if len(labels) == 0:
+ return {} # @todo
+ else:
+ # Transform the labels into coordinates and compute mean label as initial solution. 
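+ # Weiszfeld's algorithm approximates the geometric median of the
+ # coordinate vectors: starting from the arithmetic mean, each point is
+ # re-weighted by the inverse of its distance to the current estimate
+ # until the update falls below epsilon or 100 iterations are reached.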
+ labels_as_coords = [] + sums = {} + for key, val in labels[0].items(): + sums[key] = 0 + for label in labels: + coords = {} + for key, val in label.items(): + label_f = float(val) + sums[key] += label_f + coords[key] = label_f + labels_as_coords.append(coords) + median = {} + for key, val in sums.items(): + median[key] = val / len(labels) + + # Run main loop of Weiszfeld's Algorithm. + epsilon = 0.0001 + delta = 1.0 + num_itrs = 0 + all_equal = False + while ((delta > epsilon) and (num_itrs < 100) and (not all_equal)): + numerator = {} + for key, val in sums.items(): + numerator[key] = 0 + denominator = 0 + for label_as_coord in labels_as_coords: + norm = 0 + for key, val in label_as_coord.items(): + norm += (val - median[key]) ** 2 + norm = np.sqrt(norm) + if norm > 0: + for key, val in label_as_coord.items(): + numerator[key] += val / norm + denominator += 1.0 / norm + if denominator == 0: + all_equal = True + else: + new_median = {} + delta = 0.0 + for key, val in numerator.items(): + this_median = val / denominator + new_median[key] = this_median + delta += np.abs(median[key] - this_median) + median = new_median + + num_itrs += 1 + + # Transform the solution to strings and return it. + median_label = {} + for key, val in median.items(): + median_label[key] = str(val) + return median_label + + +# def __get_median_edge_label_symbolic(self, edge_labels): +# pass + + +# def __get_median_edge_label_nonsymbolic(self, edge_labels): +# if len(edge_labels) == 0: +# return {} +# else: +# # Transform the labels into coordinates and compute mean label as initial solution. +# edge_labels_as_coords = [] +# sums = {} +# for key, val in edge_labels[0].items(): +# sums[key] = 0 +# for edge_label in edge_labels: +# coords = {} +# for key, val in edge_label.items(): +# label = float(val) +# sums[key] += label +# coords[key] = label +# edge_labels_as_coords.append(coords) +# median = {} +# for key, val in sums.items(): +# median[key] = val / len(edge_labels) +# +# # Run main loop of Weiszfeld's Algorithm. +# epsilon = 0.0001 +# delta = 1.0 +# num_itrs = 0 +# all_equal = False +# while ((delta > epsilon) and (num_itrs < 100) and (not all_equal)): +# numerator = {} +# for key, val in sums.items(): +# numerator[key] = 0 +# denominator = 0 +# for edge_label_as_coord in edge_labels_as_coords: +# norm = 0 +# for key, val in edge_label_as_coord.items(): +# norm += (val - median[key]) ** 2 +# norm += np.sqrt(norm) +# if norm > 0: +# for key, val in edge_label_as_coord.items(): +# numerator[key] += val / norm +# denominator += 1.0 / norm +# if denominator == 0: +# all_equal = True +# else: +# new_median = {} +# delta = 0.0 +# for key, val in numerator.items(): +# this_median = val / denominator +# new_median[key] = this_median +# delta += np.abs(median[key] - this_median) +# median = new_median +# +# num_itrs += 1 +# +# # Transform the solution to ged::GXLLabel and return it. +# median_label = {} +# for key, val in median.items(): +# median_label[key] = str(val) +# return median_label + + +def _compute_medoid_parallel(graph_ids, sort, itr): + g_id = itr[0] + i = itr[1] + # @todo: timer not considered here. 
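+ # Note: this and the following module-level helpers are intended as
+ # multiprocessing workers; G_ged_env is presumably a module-global
+ # GEDEnv initialized in each worker process.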
+# if timer.expired(): +# self.__state = AlgorithmState.CALLED +# break + nb_nodes_g = G_ged_env.get_graph_num_nodes(g_id) + sum_of_distances = 0 + for h_id in graph_ids: + nb_nodes_h = G_ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not sort: + G_ged_env.run_method(g_id, h_id) + sum_of_distances += G_ged_env.get_upper_bound(g_id, h_id) + else: + G_ged_env.run_method(h_id, g_id) + sum_of_distances += G_ged_env.get_upper_bound(h_id, g_id) + return i, sum_of_distances + + +def _compute_init_node_maps_parallel(gen_median_id, sort, nb_nodes_median, itr): + graph_id = itr + nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not sort: + G_ged_env.run_method(gen_median_id, graph_id) + node_map = G_ged_env.get_node_map(gen_median_id, graph_id) +# print(self.__node_maps_from_median[graph_id]) + else: + G_ged_env.run_method(graph_id, gen_median_id) + node_map = G_ged_env.get_node_map(graph_id, gen_median_id) + node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map + sum_of_distance = node_map.induced_cost() +# print(self.__sum_of_distances) + return graph_id, sum_of_distance, node_map + + +def _update_node_maps_parallel(median_id, epsilon, sort, nb_nodes_median, itr): + graph_id = itr[0] + node_map = itr[1] + + node_maps_were_modified = False + nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not sort: + G_ged_env.run_method(median_id, graph_id) + if G_ged_env.get_upper_bound(median_id, graph_id) < node_map.induced_cost() - epsilon: + node_map = G_ged_env.get_node_map(median_id, graph_id) + node_maps_were_modified = True + else: + G_ged_env.run_method(graph_id, median_id) + if G_ged_env.get_upper_bound(graph_id, median_id) < node_map.induced_cost() - epsilon: + node_map = G_ged_env.get_node_map(graph_id, median_id) + node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map + node_maps_were_modified = True + + return graph_id, node_map, node_maps_were_modified \ No newline at end of file diff --git a/gklearn/preimage/__init__.py b/gklearn/preimage/__init__.py index 9713a65..a9284b2 100644 --- a/gklearn/preimage/__init__.py +++ b/gklearn/preimage/__init__.py @@ -11,8 +11,9 @@ __author__ = "Linlin Jia" __date__ = "March 2020" from gklearn.preimage.preimage_generator import PreimageGenerator -from gklearn.preimage.median_preimage_generator import MedianPreimageGenerator from gklearn.preimage.random_preimage_generator import RandomPreimageGenerator +from gklearn.preimage.median_preimage_generator import MedianPreimageGenerator +from gklearn.preimage.median_preimage_generator_py import MedianPreimageGeneratorPy from gklearn.preimage.median_preimage_generator_cml import MedianPreimageGeneratorCML from gklearn.preimage.kernel_knn_cv import kernel_knn_cv from gklearn.preimage.generate_random_preimages_by_class import generate_random_preimages_by_class diff --git a/gklearn/preimage/median_preimage_generator_py.py b/gklearn/preimage/median_preimage_generator_py.py new file mode 100644 index 0000000..cdc7a3c --- /dev/null +++ b/gklearn/preimage/median_preimage_generator_py.py @@ -0,0 +1,1035 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Mar 26 18:27:22 2020 + +@author: ljia +""" +import numpy as np +import time +import random +import multiprocessing +import networkx as nx +import cvxpy as cp +from gklearn.preimage import PreimageGenerator +from gklearn.preimage.utils import compute_k_dis +from gklearn.ged.util import 
compute_geds_cml
+from gklearn.ged.median import MedianGraphEstimatorPy
+from gklearn.ged.median import constant_node_costs, mge_options_to_string
+from gklearn.ged.env import GEDEnv
+from gklearn.utils import Timer
+from gklearn.utils.utils import get_graph_kernel_by_name
+
+
+class MedianPreimageGeneratorPy(PreimageGenerator):
+ """Generate median preimages using the pure Python version of GEDEnv.
+ """
+
+ def __init__(self, dataset=None):
+ PreimageGenerator.__init__(self, dataset=dataset)
+ # arguments to set.
+ self.__mge = None
+ self.__ged_options = {}
+ self.__mge_options = {}
+ self.__fit_method = 'k-graphs'
+ self.__init_ecc = None
+ self.__parallel = True
+ self.__n_jobs = multiprocessing.cpu_count()
+ self.__ds_name = None
+ self.__time_limit_in_sec = 0
+ self.__max_itrs = 100
+ self.__max_itrs_without_update = 3
+ self.__epsilon_residual = 0.01
+ self.__epsilon_ec = 0.1
+ self.__allow_zeros = False
+ self.__triangle_rule = True
+ # values to compute.
+ self.__runtime_optimize_ec = None
+ self.__runtime_generate_preimage = None
+ self.__runtime_total = None
+ self.__set_median = None
+ self.__gen_median = None
+ self.__best_from_dataset = None
+ self.__sod_set_median = None
+ self.__sod_gen_median = None
+ self.__k_dis_set_median = None
+ self.__k_dis_gen_median = None
+ self.__k_dis_dataset = None
+ self.__itrs = 0
+ self.__converged = False
+ self.__num_updates_ecc = 0
+ # values that can be set or to be computed.
+ self.__edit_cost_constants = []
+ self.__gram_matrix_unnorm = None
+ self.__runtime_precompute_gm = None
+
+
+ def set_options(self, **kwargs):
+ self._kernel_options = kwargs.get('kernel_options', {})
+ self._graph_kernel = kwargs.get('graph_kernel', None)
+ self._verbose = kwargs.get('verbose', 2)
+ self.__ged_options = kwargs.get('ged_options', {})
+ self.__mge_options = kwargs.get('mge_options', {})
+ self.__fit_method = kwargs.get('fit_method', 'k-graphs')
+ self.__init_ecc = kwargs.get('init_ecc', None)
+ self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
+ self.__parallel = kwargs.get('parallel', True)
+ self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
+ self.__ds_name = kwargs.get('ds_name', None)
+ self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
+ self.__max_itrs = kwargs.get('max_itrs', 100)
+ self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
+ self.__epsilon_residual = kwargs.get('epsilon_residual', 0.01)
+ self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1)
+ self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
+ self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
+ self.__allow_zeros = kwargs.get('allow_zeros', False)
+ self.__triangle_rule = kwargs.get('triangle_rule', True)
+
+
+ def run(self):
+ self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'],
+ node_labels=self._dataset.node_labels,
+ edge_labels=self._dataset.edge_labels,
+ node_attrs=self._dataset.node_attrs,
+ edge_attrs=self._dataset.edge_attrs,
+ ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
+ kernel_options=self._kernel_options)
+
+ # record start time.
+ start = time.time()
+
+ # 1. precompute gram matrix. 
+ if self.__gram_matrix_unnorm is None: + gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options) + self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm + end_precompute_gm = time.time() + self.__runtime_precompute_gm = end_precompute_gm - start + else: + if self.__runtime_precompute_gm is None: + raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.') + self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm + if self._kernel_options['normalize']: + self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm)) + else: + self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm) + end_precompute_gm = time.time() + start -= self.__runtime_precompute_gm + + if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset': + start = time.time() + self.__runtime_precompute_gm = 0 + end_precompute_gm = start + + # 2. optimize edit cost constants. + self.__optimize_edit_cost_constants() + end_optimize_ec = time.time() + self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm + + # 3. compute set median and gen median using optimized edit costs. + if self._verbose >= 2: + print('\nstart computing set median and gen median using optimized edit costs...\n') + self.__gmg_bcu() + end_generate_preimage = time.time() + self.__runtime_generate_preimage = end_generate_preimage - end_optimize_ec + self.__runtime_total = end_generate_preimage - start + if self._verbose >= 2: + print('medians computed.') + print('SOD of the set median: ', self.__sod_set_median) + print('SOD of the generalized median: ', self.__sod_gen_median) + + # 4. compute kernel distances to the true median. + if self._verbose >= 2: + print('\nstart computing distances to true median....\n') + self.__compute_distances_to_true_median() + + # 5. print out results. 
+ if self._verbose: + print() + print('================================================================================') + print('Finished generation of preimages.') + print('--------------------------------------------------------------------------------') + print('The optimized edit cost constants:', self.__edit_cost_constants) + print('SOD of the set median:', self.__sod_set_median) + print('SOD of the generalized median:', self.__sod_gen_median) + print('Distance in kernel space for set median:', self.__k_dis_set_median) + print('Distance in kernel space for generalized median:', self.__k_dis_gen_median) + print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset) + print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm) + print('Time to optimize edit costs:', self.__runtime_optimize_ec) + print('Time to generate pre-images:', self.__runtime_generate_preimage) + print('Total time:', self.__runtime_total) + print('Total number of iterations for optimizing:', self.__itrs) + print('Total number of updating edit costs:', self.__num_updates_ecc) + print('Is optimization of edit costs converged:', self.__converged) + print('================================================================================') + print() + + + def get_results(self): + results = {} + results['edit_cost_constants'] = self.__edit_cost_constants + results['runtime_precompute_gm'] = self.__runtime_precompute_gm + results['runtime_optimize_ec'] = self.__runtime_optimize_ec + results['runtime_generate_preimage'] = self.__runtime_generate_preimage + results['runtime_total'] = self.__runtime_total + results['sod_set_median'] = self.__sod_set_median + results['sod_gen_median'] = self.__sod_gen_median + results['k_dis_set_median'] = self.__k_dis_set_median + results['k_dis_gen_median'] = self.__k_dis_gen_median + results['k_dis_dataset'] = self.__k_dis_dataset + results['itrs'] = self.__itrs + results['converged'] = self.__converged + results['num_updates_ecc'] = self.__num_updates_ecc + results['mge'] = {} + results['mge']['num_decrease_order'] = self.__mge.get_num_times_order_decreased() + results['mge']['num_increase_order'] = self.__mge.get_num_times_order_increased() + results['mge']['num_converged_descents'] = self.__mge.get_num_converged_descents() + return results + + + def __optimize_edit_cost_constants(self): + """fit edit cost constants. 
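+
+ Depending on `self.__fit_method`, the constants are drawn at random
+ ('random'), taken from expert values ('expert'), fitted by least
+ squares to kernel distances ('k-graphs', 'whole-dataset'), or kept as
+ provided ('precomputed').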
+ """ + if self.__fit_method == 'random': # random + if self.__ged_options['edit_cost'] == 'LETTER': + self.__edit_cost_constants = random.sample(range(1, 1000), 3) + self.__edit_cost_constants = [item * 0.001 for item in self.__edit_cost_constants] + elif self.__ged_options['edit_cost'] == 'LETTER2': + random.seed(time.time()) + self.__edit_cost_constants = random.sample(range(1, 1000), 5) + self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants] + elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC': + self.__edit_cost_constants = random.sample(range(1, 1000), 6) + self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants] + if self._dataset.node_attrs == []: + self.__edit_cost_constants[2] = 0 + if self._dataset.edge_attrs == []: + self.__edit_cost_constants[5] = 0 + else: + self.__edit_cost_constants = random.sample(range(1, 1000), 6) + self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants] + if self._verbose >= 2: + print('edit cost constants used:', self.__edit_cost_constants) + elif self.__fit_method == 'expert': # expert + if self.__init_ecc is None: + if self.__ged_options['edit_cost'] == 'LETTER': + self.__edit_cost_constants = [0.9, 1.7, 0.75] + elif self.__ged_options['edit_cost'] == 'LETTER2': + self.__edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425] + else: + self.__edit_cost_constants = [3, 3, 1, 3, 3, 1] + else: + self.__edit_cost_constants = self.__init_ecc + elif self.__fit_method == 'k-graphs': + if self.__init_ecc is None: + if self.__ged_options['edit_cost'] == 'LETTER': + self.__init_ecc = [0.9, 1.7, 0.75] + elif self.__ged_options['edit_cost'] == 'LETTER2': + self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425] + elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC': + self.__init_ecc = [0, 0, 1, 1, 1, 0] + if self._dataset.node_attrs == []: + self.__init_ecc[2] = 0 + if self._dataset.edge_attrs == []: + self.__init_ecc[5] = 0 + else: + self.__init_ecc = [3, 3, 1, 3, 3, 1] + # optimize on the k-graph subset. + self.__optimize_ecc_by_kernel_distances() + elif self.__fit_method == 'whole-dataset': + if self.__init_ecc is None: + if self.__ged_options['edit_cost'] == 'LETTER': + self.__init_ecc = [0.9, 1.7, 0.75] + elif self.__ged_options['edit_cost'] == 'LETTER2': + self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425] + else: + self.__init_ecc = [3, 3, 1, 3, 3, 1] + # optimizeon the whole set. + self.__optimize_ecc_by_kernel_distances() + elif self.__fit_method == 'precomputed': + pass + + + def __optimize_ecc_by_kernel_distances(self): + # compute distances in feature space. + dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix() + dis_k_vec = [] + for i in range(len(dis_k_mat)): + # for j in range(i, len(dis_k_mat)): + for j in range(i + 1, len(dis_k_mat)): + dis_k_vec.append(dis_k_mat[i, j]) + dis_k_vec = np.array(dis_k_vec) + + # init ged. 
+ if self._verbose >= 2:
+ print('\ninitial:')
+ time0 = time.time()
+ graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
+ self.__edit_cost_constants = self.__init_ecc
+ options = self.__ged_options.copy()
+ options['edit_cost_constants'] = self.__edit_cost_constants # @todo
+ options['node_labels'] = self._dataset.node_labels
+ options['edge_labels'] = self._dataset.edge_labels
+ options['node_attrs'] = self._dataset.node_attrs
+ options['edge_attrs'] = self._dataset.edge_attrs
+ ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
+ residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
+ time_list = [time.time() - time0]
+ edit_cost_list = [self.__init_ecc]
+ nb_cost_mat = np.array(n_edit_operations)
+ nb_cost_mat_list = [nb_cost_mat]
+ if self._verbose >= 2:
+ print('Current edit cost constants:', self.__edit_cost_constants)
+ print('Residual list:', residual_list)
+
+ # run iteration from initial edit costs.
+ self.__converged = False
+ itrs_without_update = 0
+ self.__itrs = 0
+ self.__num_updates_ecc = 0
+ timer = Timer(self.__time_limit_in_sec)
+ while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
+ if self._verbose >= 2:
+ print('\niteration', self.__itrs + 1)
+ time0 = time.time()
+ # "fit" geds to distances in feature space by tuning edit costs using the Least Squares Method.
+# np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
+# nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
+# n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
+# ged_mat=ged_mat)
+ self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec)
+ for i in range(len(self.__edit_cost_constants)):
+ if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
+ self.__edit_cost_constants[i] = 0
+ if self.__edit_cost_constants[i] < 0:
+ raise ValueError('The edit cost is negative.')
+ # for i in range(len(self.__edit_cost_constants)):
+ # if self.__edit_cost_constants[i] < 0:
+ # self.__edit_cost_constants[i] = 0
+
+ # compute new GEDs and numbers of edit operations.
+ options = self.__ged_options.copy() # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75])
+ options['edit_cost_constants'] = self.__edit_cost_constants # @todo
+ options['node_labels'] = self._dataset.node_labels
+ options['edge_labels'] = self._dataset.edge_labels
+ options['node_attrs'] = self._dataset.node_attrs
+ options['edge_attrs'] = self._dataset.edge_attrs
+ ged_vec, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
+ residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
+ time_list.append(time.time() - time0)
+ edit_cost_list.append(self.__edit_cost_constants)
+ nb_cost_mat = np.array(n_edit_operations)
+ nb_cost_mat_list.append(nb_cost_mat)
+
+ # check convergence. 
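+ # The optimization has converged when the relative change of every
+ # edit cost constant and of the residual since the previous iteration
+ # is within __epsilon_ec and __epsilon_residual, respectively.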
+ ec_changed = False + for i, cost in enumerate(self.__edit_cost_constants): + if cost == 0: + if edit_cost_list[-2][i] > self.__epsilon_ec: + ec_changed = True + break + elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ec: + ec_changed = True + break +# if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec: +# ec_changed = True +# break + residual_changed = False + if residual_list[-1] == 0: + if residual_list[-2] > self.__epsilon_residual: + residual_changed = True + elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_residual: + residual_changed = True + self.__converged = not (ec_changed or residual_changed) + if self.__converged: + itrs_without_update += 1 + else: + itrs_without_update = 0 + self.__num_updates_ecc += 1 + + # print current states. + if self._verbose >= 2: + print() + print('-------------------------------------------------------------------------') + print('States of iteration', self.__itrs + 1) + print('-------------------------------------------------------------------------') +# print('Time spend:', self.__runtime_optimize_ec) + print('Total number of iterations for optimizing:', self.__itrs + 1) + print('Total number of updating edit costs:', self.__num_updates_ecc) + print('Was optimization of edit costs converged:', self.__converged) + print('Did edit costs change:', ec_changed) + print('Did residual change:', residual_changed) + print('Iterations without update:', itrs_without_update) + print('Current edit cost constants:', self.__edit_cost_constants) + print('Residual list:', residual_list) + print('-------------------------------------------------------------------------') + + self.__itrs += 1 + + + def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): +# if self.__state == AlgorithmState.TERMINATED: +# self.__state = AlgorithmState.INITIALIZED + return True + return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) + + + def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'): + # if self.__ds_name == 'Letter-high': + if self.__ged_options['edit_cost'] == 'LETTER': + raise Exception('Cannot compute for cost "LETTER".') + pass + # # method 1: set alpha automatically, just tune c_vir and c_eir by + # # LMS using cvxpy. + # alpha = 0.5 + # coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec) + ## if np.count_nonzero(nb_cost_mat[:,4]) == 0: + ## alpha = 0.75 + ## else: + ## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0]) + ## alpha = alpha * 0.99 + # param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1]) + # param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5]) + # nb_cost_mat_new = np.column_stack((param_vir, param_eir)) + # dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3] + # + # x = cp.Variable(nb_cost_mat_new.shape[1]) + # cost = cp.sum_squares(nb_cost_mat_new * x - dis_new) + # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] + # prob = cp.Problem(cp.Minimize(cost), constraints) + # prob.solve() + # edit_costs_new = x.value + # edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha]) + # residual = np.sqrt(prob.value) + + # # method 2: tune c_vir, c_eir and alpha by nonlinear programming by + # # scipy.optimize.minimize. 
+ # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] + # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] + # w2 = nb_cost_mat[:,3] + # w3 = dis_k_vec + # func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ + # + w2 * x[2] - w3 * x[3]) ** 2) + # bounds = ((0, None), (0., None), (0.5, 0.5), (0, None)) + # res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds) + # edit_costs_new = res.x[0:3] + # residual = res.fun + + # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy. + + + # # method 4: tune c_vir, c_eir and alpha by QP function + # # scipy.optimize.least_squares. An initial guess is required. + # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] + # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] + # w2 = nb_cost_mat[:,3] + # w3 = dis_k_vec + # func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ + # + w2 * x[2] - w3 * x[3]) ** 2 + # res = optimize.root(func, [0.9, 1.7, 0.75, 100]) + # edit_costs_new = res.x + # residual = None + elif self.__ged_options['edit_cost'] == 'LETTER2': + # # 1. if c_vi != c_vr, c_ei != c_er. + # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + # x = cp.Variable(nb_cost_mat_new.shape[1]) + # cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + ## # 1.1 no constraints. + ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] + # # 1.2 c_vs <= c_vi + c_vr. + # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + ## # 2. if c_vi == c_vr, c_ei == c_er. + ## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]] + ## nb_cost_mat_new[:,0] += nb_cost_mat[:,1] + ## nb_cost_mat_new[:,2] += nb_cost_mat[:,5] + ## x = cp.Variable(nb_cost_mat_new.shape[1]) + ## cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + ## # 2.1 no constraints. + ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] + ### # 2.2 c_vs <= c_vi + c_vr. 
+ ### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + ### np.array([2.0, -1.0, 0.0]).T@x >= 0.0] + # + # prob = cp.Problem(cp.Minimize(cost_fun), constraints) + # prob.solve() + # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] + # edit_costs_new = np.array(edit_costs_new) + # residual = np.sqrt(prob.value) + if not self.__triangle_rule and self.__allow_zeros: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif self.__triangle_rule and self.__allow_zeros: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01, + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif not self.__triangle_rule and not self.__allow_zeros: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + edit_costs_new = x.value + residual = np.sqrt(prob.value) + # elif method == 'inequality_modified': + # # c_vs <= c_vi + c_vr. + # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + # x = cp.Variable(nb_cost_mat_new.shape[1]) + # cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + # prob = cp.Problem(cp.Minimize(cost_fun), constraints) + # prob.solve() + # # use same costs for insertion and removal rather than the fitted costs. + # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] + # edit_costs_new = np.array(edit_costs_new) + # residual = np.sqrt(prob.value) + elif self.__triangle_rule and not self.__allow_zeros: + # c_vs <= c_vi + c_vr. + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif rw_constraints == '2constraints': # @todo: rearrange it later. + # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er. 
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
+ constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
+ np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
+ np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+ edit_costs_new = x.value
+ residual = np.sqrt(prob.value)
+
+ elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
+ is_n_attr = np.count_nonzero(nb_cost_mat[:,2])
+ is_e_attr = np.count_nonzero(nb_cost_mat[:,5])
+
+ if self.__ds_name == 'SYNTHETICnew': # @todo: rearrange this later.
+ # nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
+ nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
+ # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+ # np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
+ # constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]]
+ constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([0.0, 1.0, -1.0]).T@x == 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ prob.solve()
+ # print(x.value)
+ edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value,
+ np.array([0.0])))
+ residual = np.sqrt(prob.value)
+
+ elif not self.__triangle_rule and self.__allow_zeros:
+ if is_n_attr and is_e_attr:
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
+ constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
+ np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
+ np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
+ np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ self.__execute_cvx(prob)
+ edit_costs_new = x.value
+ residual = np.sqrt(prob.value)
+ elif is_n_attr and not is_e_attr:
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
+ constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
+ np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
+ np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
+ np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ self.__execute_cvx(prob)
+ edit_costs_new = np.concatenate((x.value, np.array([0.0])))
+ residual = np.sqrt(prob.value)
+ elif not is_n_attr and is_e_attr:
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
+ constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
+ np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
+ np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
+ np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
+ np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ self.__execute_cvx(prob)
+ edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
+ residual = np.sqrt(prob.value)
+ else:
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
+ x = cp.Variable(nb_cost_mat_new.shape[1])
+ cost_fun = 
cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), + x.value[2:], np.array([0.0]))) + residual = np.sqrt(prob.value) + elif self.__triangle_rule and self.__allow_zeros: + if is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif is_n_attr and not is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01, + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value, np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) + residual = np.sqrt(prob.value) + else: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), + x.value[2:], np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not self.__triangle_rule and not self.__allow_zeros: + if is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif is_n_attr and not is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + 
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value, np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) + residual = np.sqrt(prob.value) + else: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), + x.value[2:], np.array([0.0]))) + residual = np.sqrt(prob.value) + elif self.__triangle_rule and not self.__allow_zeros: + # c_vs <= c_vi + c_vr. + if is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif is_n_attr and not is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value, np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) + residual = np.sqrt(prob.value) + else: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), + x.value[2:], np.array([0.0]))) + residual = np.sqrt(prob.value) + + elif self.__ged_options['edit_cost'] == 'CONSTANT': # @todo: node/edge may not labeled. 
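+ # The six columns of nb_cost_mat appear to count the edit operations
+ # (c_vi, c_vr, c_vs, c_ei, c_er, c_es); the rows [1, 1, -1, 0, 0, 0]
+ # and [0, 0, 0, 1, 1, -1] used below then encode the triangle rules
+ # c_vs <= c_vi + c_vr and c_es <= c_ei + c_er.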
+ if not self.__triangle_rule and self.__allow_zeros: + x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif self.__triangle_rule and self.__allow_zeros: + x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif not self.__triangle_rule and not self.__allow_zeros: + x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif self.__triangle_rule and not self.__allow_zeros: + x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + else: + raise Exception('The edit cost "', self.__ged_options['edit_cost'], '" is not supported for update progress.') + # # method 1: simple least square method. + # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, + # rcond=None) + + # # method 2: least square method with x_i >= 0. + # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) + + # method 3: solve as a quadratic program with constraints. 
+ # P = np.dot(nb_cost_mat.T, nb_cost_mat)
+ # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
+ # G = -1 * np.identity(nb_cost_mat.shape[1])
+ # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
+ # A = np.array([1 for i in range(nb_cost_mat.shape[1])])
+ # b = 1
+ # x = cp.Variable(nb_cost_mat.shape[1])
+ # prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
+ # [G@x <= h])
+ # prob.solve()
+ # edit_costs_new = x.value
+ # residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
+
+ # G = -1 * np.identity(nb_cost_mat.shape[1])
+ # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
+ x = cp.Variable(nb_cost_mat.shape[1])
+ cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
+ constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
+ # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+ np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+ np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
+ prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+ self.__execute_cvx(prob)
+ edit_costs_new = x.value
+ residual = np.sqrt(prob.value)
+
+ # method 4:
+
+ return edit_costs_new, residual
+
+
+ def __execute_cvx(self, prob):
+ try:
+ prob.solve(verbose=(self._verbose>=2))
+ except MemoryError as error0:
+ if self._verbose >= 2:
+ print('\nUsing solver "OSQP" caused a memory error.')
+ print('the original error message is\n', error0)
+ print('solver status: ', prob.status)
+ print('trying solver "CVXOPT" instead...\n')
+ try:
+ prob.solve(solver=cp.CVXOPT, verbose=(self._verbose>=2))
+ except Exception as error1:
+ if self._verbose >= 2:
+ print('\nAn error occurred when using solver "CVXOPT".')
+ print('the original error message is\n', error1)
+ print('solver status: ', prob.status)
+ print('trying solver "MOSEK" instead. Note that this solver is commercial and a license is required.\n')
+ prob.solve(solver=cp.MOSEK, verbose=(self._verbose>=2))
+ else:
+ if self._verbose >= 2:
+ print('solver status: ', prob.status)
+ else:
+ if self._verbose >= 2:
+ print('solver status: ', prob.status)
+ if self._verbose >= 2:
+ print()
+
+
+ def __gmg_bcu(self):
+ """
+ The local search algorithm based on block coordinate update (BCU) for estimating a generalized median graph (GMG).
+
+ Returns
+ -------
+ None.
+
+ """
+ # Set up the ged environment.
+ ged_env = GEDEnv() # @todo: maybe create a ged_env as a private variable.
+ # gedlibpy.restart_env()
+ ged_env.set_edit_cost(self.__ged_options['edit_cost'], edit_cost_constants=self.__edit_cost_constants)
+ graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
+ for g in graphs:
+ ged_env.add_nx_graph(g, '')
+ graph_ids = ged_env.get_all_graph_ids()
+ set_median_id = ged_env.add_graph('set_median')
+ gen_median_id = ged_env.add_graph('gen_median')
+ ged_env.init(init_type=self.__ged_options['init_option'])
+
+ # Set up the median graph estimator.
+ self.__mge = MedianGraphEstimatorPy(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
+ self.__mge.set_refine_method(self.__ged_options['method'], self.__ged_options)
+ options = self.__mge_options.copy()
+ if not 'seed' in options:
+ options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage.
+ options['parallel'] = self.__parallel
+
+ # Select the GED algorithm. 
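+ # When the estimator itself runs in parallel, each GED computation is
+ # restricted to a single thread to avoid nested parallelism.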
+ self.__mge.set_options(mge_options_to_string(options)) + self.__mge.set_label_names(node_labels=self._dataset.node_labels, + edge_labels=self._dataset.edge_labels, + node_attrs=self._dataset.node_attrs, + edge_attrs=self._dataset.edge_attrs) + ged_options = self.__ged_options.copy() + if self.__parallel: + ged_options['threads'] = 1 + self.__mge.set_init_method(ged_options['method'], ged_options) + self.__mge.set_descent_method(ged_options['method'], ged_options) + + # Run the estimator. + self.__mge.run(graph_ids, set_median_id, gen_median_id) + + # Get SODs. + self.__sod_set_median = self.__mge.get_sum_of_distances('initialized') + self.__sod_gen_median = self.__mge.get_sum_of_distances('converged') + + # Get median graphs. + self.__set_median = ged_env.get_nx_graph(set_median_id) + self.__gen_median = ged_env.get_nx_graph(gen_median_id) + + + def __compute_distances_to_true_median(self): + # compute distance in kernel space for set median. + kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options) + kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options) + if self._kernel_options['normalize']: + kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize + kernel_sm = 1 + # @todo: not correct kernel value + gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0) + gram_with_sm = np.concatenate((np.array([[kernel_sm] + kernels_to_sm]).T, gram_with_sm), axis=1) + self.__k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)), + [1 / len(self._dataset.graphs)] * len(self._dataset.graphs), + gram_with_sm, withterm3=False) + + # compute distance in kernel space for generalized median. + kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options) + kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options) + if self._kernel_options['normalize']: + kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize + kernel_gm = 1 + gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0) + gram_with_gm = np.concatenate((np.array([[kernel_gm] + kernels_to_gm]).T, gram_with_gm), axis=1) + self.__k_dis_gen_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)), + [1 / len(self._dataset.graphs)] * len(self._dataset.graphs), + gram_with_gm, withterm3=False) + + # compute distance in kernel space for each graph in median set. 
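+		# (Note: compute_k_dis is assumed to evaluate the distance in kernel space
+		# between one graph and the mean embedding (1/N) * sum_j phi(g_j), based on
+		# gram[i, i] - (2/N) * sum_j gram[i, j]; with withterm3=False the constant
+		# term (1/N^2) * sum_{j,l} gram[j, l] is omitted, which does not change the
+		# argmin taken below.)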
+ k_dis_median_set = [] + for idx in range(len(self._dataset.graphs)): + k_dis_median_set.append(compute_k_dis(idx+1, range(1, 1+len(self._dataset.graphs)), + [1 / len(self._dataset.graphs)] * len(self._dataset.graphs), + gram_with_gm, withterm3=False)) + idx_k_dis_median_set_min = np.argmin(k_dis_median_set) + self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min] + self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy() + + if self._verbose >= 2: + print() + print('distance in kernel space for set median:', self.__k_dis_set_median) + print('distance in kernel space for generalized median:', self.__k_dis_gen_median) + print('minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset) + print('distance in kernel space for each graph in median set:', k_dis_median_set) + + +# def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): + def __clean_graph(self, G): # @todo: this may not be needed when datafile is updated. + """ + Cleans node and edge labels and attributes of the given graph. + """ + G_new = nx.Graph(**G.graph) + for nd, attrs in G.nodes(data=True): + G_new.add_node(str(nd)) # @todo: should we keep this as str()? + for l_name in self._dataset.node_labels: + G_new.nodes[str(nd)][l_name] = str(attrs[l_name]) + for a_name in self._dataset.node_attrs: + G_new.nodes[str(nd)][a_name] = str(attrs[a_name]) + for nd1, nd2, attrs in G.edges(data=True): + G_new.add_edge(str(nd1), str(nd2)) + for l_name in self._dataset.edge_labels: + G_new.edges[str(nd1), str(nd2)][l_name] = str(attrs[l_name]) + for a_name in self._dataset.edge_attrs: + G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name]) + return G_new + + + @property + def mge(self): + return self.__mge + + @property + def ged_options(self): + return self.__ged_options + + @ged_options.setter + def ged_options(self, value): + self.__ged_options = value + + + @property + def mge_options(self): + return self.__mge_options + + @mge_options.setter + def mge_options(self, value): + self.__mge_options = value + + + @property + def fit_method(self): + return self.__fit_method + + @fit_method.setter + def fit_method(self, value): + self.__fit_method = value + + + @property + def init_ecc(self): + return self.__init_ecc + + @init_ecc.setter + def init_ecc(self, value): + self.__init_ecc = value + + + @property + def set_median(self): + return self.__set_median + + + @property + def gen_median(self): + return self.__gen_median + + + @property + def best_from_dataset(self): + return self.__best_from_dataset + + + @property + def gram_matrix_unnorm(self): + return self.__gram_matrix_unnorm + + @gram_matrix_unnorm.setter + def gram_matrix_unnorm(self, value): + self.__gram_matrix_unnorm = value \ No newline at end of file From 74a6e5fad3df694e585f110f761a7d6b2d543553 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Wed, 1 Jul 2020 14:44:02 +0200 Subject: [PATCH 2/7] Add exp: compare GEDEnv with the GEDLIB version. 
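
For illustration only (this snippet is not part of the diff below), the two
result dictionaries returned by the added script can be compared along these
lines. Exact agreement of every entry is not guaranteed: BIPARTITE is a
heuristic, and the two environments are configured with slightly different
initialization options.

    import numpy as np
    from gklearn.experiments.ged.check_results_of_ged_env import xp_check_results_of_GEDEnv

    results1, results2 = xp_check_results_of_GEDEnv()
    print(np.allclose(results1['ged_mat'], results2['ged_mat']))
    print(results1['upper_bound'], results2['upper_bound'])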
---
 .../experiments/ged/check_results_of_ged_env.py   | 126 +++++++++++++++++++++
 gklearn/ged/util/lsape_solver.py                   |   6 +-
 2 files changed, 129 insertions(+), 3 deletions(-)
 create mode 100644 gklearn/experiments/ged/check_results_of_ged_env.py

diff --git a/gklearn/experiments/ged/check_results_of_ged_env.py b/gklearn/experiments/ged/check_results_of_ged_env.py
new file mode 100644
index 0000000..7c81c5d
--- /dev/null
+++ b/gklearn/experiments/ged/check_results_of_ged_env.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jun 25 11:31:46 2020
+
+@author: ljia
+"""
+
+def xp_check_results_of_GEDEnv():
+	"""Compare results of GEDEnv to GEDLIB.
+	"""
+	"""**1. Get dataset.**"""
+
+	from gklearn.utils import Dataset
+	
+	# Predefined dataset name, use dataset "MUTAG".
+	ds_name = 'MUTAG'
+	
+	# Initialize a Dataset.
+	dataset = Dataset()
+	# Load predefined dataset "MUTAG".
+	dataset.load_predefined_dataset(ds_name)
+	
+	results1 = compute_geds_by_GEDEnv(dataset)
+	results2 = compute_geds_by_GEDLIB(dataset)
+	
+	# Show results.
+	import pprint
+	pp = pprint.PrettyPrinter(indent=4) # pretty print
+	print('Results using GEDEnv:')
+	pp.pprint(results1)
+	print()
+	print('Results using GEDLIB:')
+	pp.pprint(results2)
+	
+	return results1, results2
+
+
+def compute_geds_by_GEDEnv(dataset):
+	from gklearn.ged.env import GEDEnv
+	import numpy as np
+	
+	graph1 = dataset.graphs[0]
+	graph2 = dataset.graphs[1]
+	
+	ged_env = GEDEnv() # initialize GED environment.
+	ged_env.set_edit_cost('CONSTANT', # GED cost type.
+						  edit_cost_constants=[3, 3, 1, 3, 3, 1] # edit costs.
+						  )
+	for g in dataset.graphs[0:10]:
+		ged_env.add_nx_graph(g, '')
+#	ged_env.add_nx_graph(graph1, '') # add graph1
+#	ged_env.add_nx_graph(graph2, '') # add graph2
+	listID = ged_env.get_all_graph_ids() # get list IDs of graphs
+	ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES') # initialize GED environment.
+	options = {'threads': 1 # parallel threads.
+			   }
+	ged_env.set_method('BIPARTITE', # GED method.
+					   options # options for GED method.
+					   )
+	ged_env.init_method() # initialize GED method.
+	
+	ged_mat = np.empty((10, 10))
+	for i in range(0, 10):
+		for j in range(i, 10):
+			ged_env.run_method(i, j) # run.
+			ged_mat[i, j] = ged_env.get_upper_bound(i, j)
+			ged_mat[j, i] = ged_mat[i, j]
+	
+	results = {}
+	results['pi_forward'] = ged_env.get_forward_map(listID[0], listID[1]) # forward map.
+	results['pi_backward'] = ged_env.get_backward_map(listID[0], listID[1]) # backward map.
+	results['upper_bound'] = ged_env.get_upper_bound(listID[0], listID[1]) # GED between two graphs.
+	results['runtime'] = ged_env.get_runtime(listID[0], listID[1])
+	results['init_time'] = ged_env.get_init_time()
+	results['ged_mat'] = ged_mat
+	
+	return results
+
+
+def compute_geds_by_GEDLIB(dataset):
+	from gklearn.gedlib import librariesImport, gedlibpy
+	from gklearn.ged.util import ged_options_to_string
+	import numpy as np
+	
+	graph1 = dataset.graphs[5]
+	graph2 = dataset.graphs[6]
+	
+	ged_env = gedlibpy.GEDEnv() # initialize GED environment.
+	ged_env.set_edit_cost('CONSTANT', # GED cost type.
+						  edit_cost_constant=[3, 3, 1, 3, 3, 1] # edit costs.
+						  )
+#	ged_env.add_nx_graph(graph1, '') # add graph1
+#	ged_env.add_nx_graph(graph2, '') # add graph2
+	for g in dataset.graphs[0:10]:
+		ged_env.add_nx_graph(g, '')
+	listID = ged_env.get_all_graph_ids() # get list IDs of graphs
+	ged_env.init(init_option='LAZY_WITHOUT_SHUFFLED_COPIES') # initialize GED environment.
+	options = {'initialization-method': 'RANDOM', # or 'NODE', etc.
+			   'threads': 1 # parallel threads.
+			   }
+	ged_env.set_method('BIPARTITE', # GED method.
+					   ged_options_to_string(options) # options for GED method.
+					   )
+	ged_env.init_method() # initialize GED method.
+	
+	ged_mat = np.empty((10, 10))
+	for i in range(0, 10):
+		for j in range(i, 10):
+			ged_env.run_method(i, j) # run.
+			ged_mat[i, j] = ged_env.get_upper_bound(i, j)
+			ged_mat[j, i] = ged_mat[i, j]
+	
+	results = {}
+	results['pi_forward'] = ged_env.get_forward_map(listID[0], listID[1]) # forward map.
+	results['pi_backward'] = ged_env.get_backward_map(listID[0], listID[1]) # backward map.
+	results['upper_bound'] = ged_env.get_upper_bound(listID[0], listID[1]) # GED between two graphs.
+	results['runtime'] = ged_env.get_runtime(listID[0], listID[1])
+	results['init_time'] = ged_env.get_init_time()
+	results['ged_mat'] = ged_mat
+	
+	return results
+
+
+if __name__ == '__main__':
+	results1, results2 = xp_check_results_of_GEDEnv()
\ No newline at end of file
diff --git a/gklearn/ged/util/lsape_solver.py b/gklearn/ged/util/lsape_solver.py
index 955f543..aef9c11 100644
--- a/gklearn/ged/util/lsape_solver.py
+++ b/gklearn/ged/util/lsape_solver.py
@@ -61,9 +61,9 @@ class LSAPESolver(object):
 		"""
 		self.clear_solution()
 		if self.__solve_optimally:
-			row_id, col_id = linear_sum_assignment(self.__cost_matrix) # @todo: only hungarianLSAPE ('ECBP') can be used.
-			self.__row_to_col_assignments[0] = col_id
-			self.__col_to_row_assignments[0] = np.argsort(col_id) # @todo: might be slow, can use row_id
+			row_ind, col_ind = linear_sum_assignment(self.__cost_matrix) # @todo: only hungarianLSAPE ('ECBP') can be used.
+			self.__row_to_col_assignments[0] = col_ind
+			self.__col_to_row_assignments[0] = np.argsort(col_ind) # @todo: might be slow, can use row_ind
 		self.__compute_cost_from_assignments()
 		if num_solutions > 1:
 			pass # @todo:

From 938c3d5ca430e5e35849d23c43ef7fc715cb3180 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Thu, 2 Jul 2020 18:00:53 +0200
Subject: [PATCH 3/7] Add ability to GEDEnv to use pre-defined costs between node labels.

---
 gklearn/utils/dataset.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py
index 90bb886..c692c63 100644
--- a/gklearn/utils/dataset.py
+++ b/gklearn/utils/dataset.py
@@ -535,6 +535,16 @@ class Dataset(object):
 		dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
 		# @todo: clean_labels and add other class members?
 		return dataset
+
+
+	def get_all_node_labels(self):
+		node_labels = []
+		for g in self.__graphs:
+			for n in g.nodes():
+				nl = tuple(g.nodes[n].items())
+				if nl not in node_labels:
+					node_labels.append(nl)
+		return node_labels
 
 
 	def __get_dataset_size(self):

From 731ab1d45b7cc8c33cb6c0aff21a265b860b3f34 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Thu, 2 Jul 2020 18:02:22 +0200
Subject: [PATCH 4/7] Add ability to GEDEnv to use pre-defined costs between node labels.
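
A minimal usage sketch (illustrative, not part of the diff; `graphs` stands for
any list of NetworkX graphs). Following GEDData.node_cost(), entry [i, j] of the
cost matrix is the cost of relabeling label i to label j, with index 0 reserved
for the dummy label, so [i, 0] is a deletion and [0, j] an insertion:

    import numpy as np
    from gklearn.ged.env import GEDEnv

    ged_env = GEDEnv()
    ged_env.set_edit_cost('CONSTANT', edit_cost_constants=[3, 3, 1, 3, 3, 1])
    for g in graphs:
        ged_env.add_nx_graph(g, '')
    n = ged_env.get_num_node_labels()
    costs = np.random.rand(n + 1, n + 1)  # random costs, for illustration only.
    np.fill_diagonal(costs, 0)            # keeping a label costs nothing.
    ged_env.set_label_costs(costs)
    ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES')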
--- gklearn/ged/env/ged_data.py | 37 ++++++--- gklearn/ged/env/ged_env.py | 6 ++ gklearn/ged/util/util.py | 1 + gklearn/preimage/median_preimage_generator_cml.py | 95 ++++++++++++++++------- 4 files changed, 103 insertions(+), 36 deletions(-) diff --git a/gklearn/ged/env/ged_data.py b/gklearn/ged/env/ged_data.py index 9cef41a..cf932b0 100644 --- a/gklearn/ged/env/ged_data.py +++ b/gklearn/ged/env/ged_data.py @@ -23,6 +23,7 @@ class GEDData(object): self._edit_cost = None self._node_costs = None self._edge_costs = None + self._node_label_costs = None self._node_labels = [] self._edge_labels = [] self._init_type = Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES @@ -84,15 +85,21 @@ class GEDData(object): * and 0 otherwise. */ """ - if self._eager_init(): # @todo: check if correct - return self._node_costs[label1, label2] - if label1 == label2: - return 0 - if label1 == SpecialLabel.DUMMY: # @todo: check dummy - return self._edit_cost.node_ins_cost_fun(label2) # self._node_labels[label2 - 1]) # @todo: check - if label2 == SpecialLabel.DUMMY: # @todo: check dummy - return self._edit_cost.node_del_cost_fun(label1) # self._node_labels[label1 - 1]) - return self._edit_cost.node_rel_cost_fun(label1, label2) # self._node_labels[label1 - 1], self._node_labels[label2 - 1]) + if self._node_label_costs is None: + if self._eager_init(): # @todo: check if correct + return self._node_costs[label1, label2] + if label1 == label2: + return 0 + if label1 == SpecialLabel.DUMMY: # @todo: check dummy + return self._edit_cost.node_ins_cost_fun(label2) # self._node_labels[label2 - 1]) # @todo: check + if label2 == SpecialLabel.DUMMY: # @todo: check dummy + return self._edit_cost.node_del_cost_fun(label1) # self._node_labels[label1 - 1]) + return self._edit_cost.node_rel_cost_fun(label1, label2) # self._node_labels[label1 - 1], self._node_labels[label2 - 1]) + # use pre-computed node label costs. + else: + id1 = 0 if label1 == SpecialLabel.DUMMY else self._node_label_to_id(label1) # @todo: this is slow. + id2 = 0 if label2 == SpecialLabel.DUMMY else self._node_label_to_id(label2) + return self._node_label_costs[id1, id2] def edge_cost(self, label1, label2): @@ -198,6 +205,12 @@ class GEDData(object): self._delete_edit_cost = True + def id_to_node_label(self, label_id): + if label_id > len(self._node_labels) or label_id == 0: + raise Exception('Invalid node label ID', str(label_id), '.') + return self._node_labels[label_id - 1] + + def _node_label_to_id(self, node_label): n_id = 0 for n_l in self._node_labels: @@ -208,6 +221,12 @@ class GEDData(object): return n_id + 1 + def id_to_edge_label(self, label_id): + if label_id > len(self._edge_labels) or label_id == 0: + raise Exception('Invalid edge label ID', str(label_id), '.') + return self._edge_labels[label_id - 1] + + def _edge_label_to_id(self, edge_label): e_id = 0 for e_l in self._edge_labels: diff --git a/gklearn/ged/env/ged_env.py b/gklearn/ged/env/ged_env.py index e6dc2f6..56a598a 100644 --- a/gklearn/ged/env/ged_env.py +++ b/gklearn/ged/env/ged_env.py @@ -226,6 +226,12 @@ class GEDEnv(object): */ """ return self.__ged_data._init_type + + + def set_label_costs(self, label_costs): + """Set the costs between labels. 
+		"""
+		self.__ged_data._node_label_costs = label_costs
 
 
 	def set_method(self, method, options=''):
diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py
index 91504fe..45a9fd4 100644
--- a/gklearn/ged/util/util.py
+++ b/gklearn/ged/util/util.py
@@ -55,6 +55,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
 	for g in graphs:
 		ged_env.add_nx_graph(g, '')
 	listID = ged_env.get_all_graph_ids()
+	ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None)
 	ged_env.init(init_type=options['init_option'])
 	if parallel:
 		options['threads'] = 1
diff --git a/gklearn/preimage/median_preimage_generator_cml.py b/gklearn/preimage/median_preimage_generator_cml.py
index a1eadc2..ac9361c 100644
--- a/gklearn/preimage/median_preimage_generator_cml.py
+++ b/gklearn/preimage/median_preimage_generator_cml.py
@@ -5,31 +5,26 @@ Created on Tue Jun 16 16:04:46 2020
 
 @author: ljia
 """
-
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Mar 26 18:27:22 2020
-
-@author: ljia
-"""
 import numpy as np
 import time
 import random
 import multiprocessing
 import networkx as nx
 import cvxpy as cp
+import itertools
 from gklearn.preimage import PreimageGenerator
 from gklearn.preimage.utils import compute_k_dis
-from gklearn.ged.util import compute_geds_cml, ged_options_to_string
+from gklearn.ged.util import compute_geds_cml
 from gklearn.ged.env import GEDEnv
-from gklearn.ged.median import MedianGraphEstimator
-from gklearn.ged.median import constant_node_costs,mge_options_to_string
-from gklearn.utils import Timer
+from gklearn.ged.median import MedianGraphEstimatorPy
+from gklearn.ged.median import constant_node_costs, mge_options_to_string
+from gklearn.utils import Timer, SpecialLabel
 from gklearn.utils.utils import get_graph_kernel_by_name
 
 
 class MedianPreimageGeneratorCML(PreimageGenerator):
+	"""Generate median preimages by learning cost matrices, using the pure Python version of GEDEnv. Works only for symbolically labeled graphs.
+	"""
 	
 	def __init__(self, dataset=None):
 		PreimageGenerator.__init__(self, dataset=dataset)
@@ -37,7 +32,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 		self.__mge = None
 		self.__ged_options = {}
 		self.__mge_options = {}
-		self.__fit_method = 'k-graphs'
+#		self.__fit_method = 'k-graphs'
+		self.__init_method = 'random'
 		self.__init_ecc = None
 		self.__parallel = True
 		self.__n_jobs = multiprocessing.cpu_count()
@@ -47,8 +43,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 		self.__max_itrs_without_update = 3
 		self.__epsilon_residual = 0.01
 		self.__epsilon_ec = 0.1
-		self.__allow_zeros = False
-		self.__triangle_rule = True
+		self.__allow_zeros = True
+#		self.__triangle_rule = True
 		# values to compute.
 		self.__runtime_optimize_ec = None
 		self.__runtime_generate_preimage = None
@@ -64,6 +60,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 		self.__itrs = 0
 		self.__converged = False
 		self.__num_updates_ecc = 0
+		self.__node_label_costs = None
+		self.__edge_label_costs = None
 		# values that can be set or to be computed.
self.__edit_cost_constants = []
 		self.__gram_matrix_unnorm = None
@@ -76,7 +74,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 		self._verbose = kwargs.get('verbose', 2)
 		self.__ged_options = kwargs.get('ged_options', {})
 		self.__mge_options = kwargs.get('mge_options', {})
-		self.__fit_method = kwargs.get('fit_method', 'k-graphs')
+#		self.__fit_method = kwargs.get('fit_method', 'k-graphs')
+		self.__init_method = kwargs.get('init_method', 'random')
 		self.__init_ecc = kwargs.get('init_ecc', None)
 		self.__edit_cost_constants = kwargs.get('edit_cost_constants', [])
 		self.__parallel = kwargs.get('parallel', True)
@@ -89,8 +88,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 		self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1)
 		self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
 		self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
-		self.__allow_zeros = kwargs.get('allow_zeros', False)
-		self.__triangle_rule = kwargs.get('triangle_rule', True)
+		self.__allow_zeros = kwargs.get('allow_zeros', True)
+#		self.__triangle_rule = kwargs.get('triangle_rule', True)
 	
 	
 	def run(self):
@@ -122,10 +121,10 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 			end_precompute_gm = time.time()
 			start -= self.__runtime_precompute_gm
 
-		if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset':
-			start = time.time()
-			self.__runtime_precompute_gm = 0
-			end_precompute_gm = start
+#		if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset':
+#			start = time.time()
+#			self.__runtime_precompute_gm = 0
+#			end_precompute_gm = start
 
 		# 2. optimize edit cost constants.
 		self.__optimize_edit_cost_vector()
@@ -197,7 +196,48 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 	def __optimize_edit_cost_vector(self):
 		"""Learn edit cost vector.
 		"""
-		if self.__fit_method == 'random': # random
+		if self.__init_method == 'random': # random
+			# Get list of node labels.
+			nls = self._dataset.get_all_node_labels()
+			# Generate random costs.
+			nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
+			rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
+			self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1))
+			# Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
+			i = 0
+			# Costs of insertions.
+			for row in range(1, len(nls) + 1):
+				self.__node_label_costs[row, 0] = rand_costs[i]
+				i += 1
+			# Costs of deletions.
+			for col in range(1, len(nls) + 1):
+				self.__node_label_costs[0, col] = rand_costs[i]
+				i += 1
+			# Costs of substitutions.
+			for row in range(1, len(nls) + 1):
+				for col in range(row + 1, len(nls) + 1):
+					self.__node_label_costs[row, col] = rand_costs[i]
+					self.__node_label_costs[col, row] = rand_costs[i]
+					i += 1
+			
+#			self.__node_label_costs = {}
+#			for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
+#				self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
+#			# Add costs for deletion.
+#			for j, nl in enumerate(nls):
+#				self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
+#			# Add costs for insertion.
+#			for k, nl in enumerate(nls):
+#				self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
+#			# Add self costs.
+#			for nl in nls:
+#				self.__node_label_costs[(nl, nl)] = 0
+#			self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0
+			
+			# Optimize edit cost matrices.
+ self.__optimize_ecm_by_kernel_distances() + + elif self.__fit_method == 'random': # random if self.__ged_options['edit_cost'] == 'LETTER': self.__edit_cost_constants = random.sample(range(1, 1000), 3) self.__edit_cost_constants = [item * 0.001 for item in self.__edit_cost_constants] @@ -279,6 +319,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): options['edge_labels'] = self._dataset.edge_labels options['node_attrs'] = self._dataset.node_attrs options['edge_attrs'] = self._dataset.edge_attrs + options['node_label_costs'] = self.__node_label_costs ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1)) residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] time_list = [time.time() - time0] @@ -881,8 +922,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator): ged_env.init(init_type=self.__ged_options['init_option']) # Set up the madian graph estimator. - self.__mge = MedianGraphEstimator(ged_env, constant_node_costs(self.__ged_options['edit_cost'])) - self.__mge.set_refine_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options)) + self.__mge = MedianGraphEstimatorPy(ged_env, constant_node_costs(self.__ged_options['edit_cost'])) + self.__mge.set_refine_method(self.__ged_options['method'], self.__ged_options) options = self.__mge_options.copy() if not 'seed' in options: options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage. @@ -897,8 +938,8 @@ class MedianPreimageGeneratorCML(PreimageGenerator): ged_options = self.__ged_options.copy() if self.__parallel: ged_options['threads'] = 1 - self.__mge.set_init_method(ged_options['method'], ged_options_to_string(ged_options)) - self.__mge.set_descent_method(ged_options['method'], ged_options_to_string(ged_options)) + self.__mge.set_init_method(ged_options['method'], ged_options) + self.__mge.set_descent_method(ged_options['method'], ged_options) # Run the estimator. self.__mge.run(graph_ids, set_median_id, gen_median_id) From 31a8a9c51d1e3679274f4801f890238ee578ed1e Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 3 Jul 2020 12:10:04 +0200 Subject: [PATCH 5/7] Add ability to GEDEnv to use pre-defined costs between edge labels. --- gklearn/ged/env/ged_data.py | 26 +++-- gklearn/ged/env/ged_env.py | 7 +- gklearn/ged/util/lsape_solver.py | 1 + gklearn/ged/util/util.py | 3 +- gklearn/preimage/median_preimage_generator_cml.py | 116 +++++++++++++++------- gklearn/utils/dataset.py | 10 ++ 6 files changed, 114 insertions(+), 49 deletions(-) diff --git a/gklearn/ged/env/ged_data.py b/gklearn/ged/env/ged_data.py index cf932b0..0e6881f 100644 --- a/gklearn/ged/env/ged_data.py +++ b/gklearn/ged/env/ged_data.py @@ -24,6 +24,7 @@ class GEDData(object): self._node_costs = None self._edge_costs = None self._node_label_costs = None + self._edge_label_costs = None self._node_labels = [] self._edge_labels = [] self._init_type = Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES @@ -114,15 +115,22 @@ class GEDData(object): * and 0 otherwise. 
 */
 		"""
-		if self._eager_init(): # @todo: check if correct
-			return self._node_costs[label1, label2]
-		if label1 == label2:
-			return 0
-		if label1 == SpecialLabel.DUMMY:
-			return self._edit_cost.edge_ins_cost_fun(label2) # self._edge_labels[label2 - 1])
-		if label2 == SpecialLabel.DUMMY:
-			return self._edit_cost.edge_del_cost_fun(label1) # self._edge_labels[label1 - 1])
-		return self._edit_cost.edge_rel_cost_fun(label1, label2) # self._edge_labels[label1 - 1], self._edge_labels[label2 - 1])
+		if self._edge_label_costs is None:
+			if self._eager_init(): # @todo: check if correct
+				return self._edge_costs[label1, label2]
+			if label1 == label2:
+				return 0
+			if label1 == SpecialLabel.DUMMY:
+				return self._edit_cost.edge_ins_cost_fun(label2) # self._edge_labels[label2 - 1])
+			if label2 == SpecialLabel.DUMMY:
+				return self._edit_cost.edge_del_cost_fun(label1) # self._edge_labels[label1 - 1])
+			return self._edit_cost.edge_rel_cost_fun(label1, label2) # self._edge_labels[label1 - 1], self._edge_labels[label2 - 1])
+		
+		# use pre-computed edge label costs.
+		else:
+			id1 = 0 if label1 == SpecialLabel.DUMMY else self._edge_label_to_id(label1) # @todo: this is slow.
+			id2 = 0 if label2 == SpecialLabel.DUMMY else self._edge_label_to_id(label2)
+			return self._edge_label_costs[id1, id2]
 
 
 	def compute_induced_cost(self, g, h, node_map):
diff --git a/gklearn/ged/env/ged_env.py b/gklearn/ged/env/ged_env.py
index 56a598a..b31ecb9 100644
--- a/gklearn/ged/env/ged_env.py
+++ b/gklearn/ged/env/ged_env.py
@@ -228,10 +228,13 @@ class GEDEnv(object):
 		return self.__ged_data._init_type
 	
 	
-	def set_label_costs(self, label_costs):
+	def set_label_costs(self, node_label_costs=None, edge_label_costs=None):
 		"""Set the costs between labels.
 		"""
-		self.__ged_data._node_label_costs = label_costs
+		if node_label_costs is not None:
+			self.__ged_data._node_label_costs = node_label_costs
+		if edge_label_costs is not None:
+			self.__ged_data._edge_label_costs = edge_label_costs
 	
 	
 	def set_method(self, method, options=''):
diff --git a/gklearn/ged/util/lsape_solver.py b/gklearn/ged/util/lsape_solver.py
index aef9c11..72c2776 100644
--- a/gklearn/ged/util/lsape_solver.py
+++ b/gklearn/ged/util/lsape_solver.py
@@ -8,6 +8,7 @@ Created on Mon Jun 22 15:37:36 2020
 import numpy as np
 from scipy.optimize import linear_sum_assignment
 
+
 class LSAPESolver(object):
 	
 	
diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py
index 45a9fd4..cdced21 100644
--- a/gklearn/ged/util/util.py
+++ b/gklearn/ged/util/util.py
@@ -55,7 +55,8 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True
 	for g in graphs:
 		ged_env.add_nx_graph(g, '')
 	listID = ged_env.get_all_graph_ids()
-	ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None)
+	ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None,
+						 options['edge_label_costs'] if 'edge_label_costs' in options else None)
 	ged_env.init(init_type=options['init_option'])
 	if parallel:
 		options['threads'] = 1
diff --git a/gklearn/preimage/median_preimage_generator_cml.py b/gklearn/preimage/median_preimage_generator_cml.py
index ac9361c..c4a92a6 100644
--- a/gklearn/preimage/median_preimage_generator_cml.py
+++ b/gklearn/preimage/median_preimage_generator_cml.py
@@ -196,46 +196,16 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 	def __optimize_edit_cost_vector(self):
 		"""Learn edit cost vector.
 		"""
-		if self.__init_method == 'random': # random
-			# Get list of node labels.
-			nls = self._dataset.get_all_node_labels()
-			# Generate random costs.
-			nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
-			rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
-			self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1))
-			# Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
-			i = 0
-			# Costs of insertions.
-			for row in range(1, len(nls) + 1):
-				self.__node_label_costs[row, 0] = rand_costs[i]
-				i += 1
-			# Costs of deletions.
-			for col in range(1, len(nls) + 1):
-				self.__node_label_costs[0, col] = rand_costs[i]
-				i += 1
-			# Costs of substitutions.
-			for row in range(1, len(nls) + 1):
-				for col in range(row + 1, len(nls) + 1):
-					self.__node_label_costs[row, col] = rand_costs[i]
-					self.__node_label_costs[col, row] = rand_costs[i]
-					i += 1
-			
-#			self.__node_label_costs = {}
-#			for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
-#				self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
-#			# Add costs for deletion.
-#			for j, nl in enumerate(nls):
-#				self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
-#			# Add costs for insertion.
-#			for k, nl in enumerate(nls):
-#				self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
-#			# Add self costs.
-#			for nl in nls:
-#				self.__node_label_costs[(nl, nl)] = 0
-#			self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0
+		# Initialize label costs randomly.
+		if self.__init_method == 'random':
+			# Initialize label costs.
+			self.__initialize_label_costs()
 			
 			# Optimize edit cost matrices.
 			self.__optimize_ecm_by_kernel_distances()
+		# Initialize all label costs with the same value.
+		elif self.__init_method == 'uniform': # uniform
+			pass
 		
 		elif self.__fit_method == 'random': # random
@@ -297,6 +267,77 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 			pass
 	
 	
+	def __initialize_label_costs(self):
+		self.__initialize_node_label_costs()
+		self.__initialize_edge_label_costs()
+		
+		
+	def __initialize_node_label_costs(self):
+		# Get list of node labels.
+		nls = self._dataset.get_all_node_labels()
+		# Generate random costs.
+		nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
+		rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
+		rand_costs /= np.max(rand_costs) # @todo: maybe not needed.
+		self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1))
+		# Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData.
+		i = 0
+		# Costs of insertions.
+		for row in range(1, len(nls) + 1):
+			self.__node_label_costs[row, 0] = rand_costs[i]
+			i += 1
+		# Costs of deletions.
+		for col in range(1, len(nls) + 1):
+			self.__node_label_costs[0, col] = rand_costs[i]
+			i += 1
+		# Costs of substitutions.
+		for row in range(1, len(nls) + 1):
+			for col in range(row + 1, len(nls) + 1):
+				self.__node_label_costs[row, col] = rand_costs[i]
+				self.__node_label_costs[col, row] = rand_costs[i]
+				i += 1
+		
+#		self.__node_label_costs = {}
+#		for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)):
+#			self.__node_label_costs[(nl1, nl2)] = rand_costs[i]
+#		# Add costs for deletion.
+#		for j, nl in enumerate(nls):
+#			self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j]
+#		# Add costs for insertion.
+#		for k, nl in enumerate(nls):
+#			self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k]
+#		# Add self costs.
+# for nl in nls: +# self.__node_label_costs[(nl, nl)] = 0 +# self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0 + + + def __initialize_edge_label_costs(self): + # Get list of edge labels. + els = self._dataset.get_all_edge_labels() + # Generate random costs. + nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els)) + rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el) + rand_costs /= np.max(rand_costs) # @todo: maybe not needed. + self.__edge_label_costs = np.zeros((len(els) + 1, len(els) + 1)) + # Initialize edge label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData. + i = 0 + # Costs of insertions. + for row in range(1, len(els) + 1): + self.__edge_label_costs[row, 0] = rand_costs[i] + i += 1 + # Costs of deletions. + for col in range(1, len(els) + 1): + self.__edge_label_costs[0, col] = rand_costs[i] + i += 1 + # Costs of substitutions. + for row in range(1, len(els) + 1): + for col in range(row + 1, len(els) + 1): + self.__edge_label_costs[row, col] = rand_costs[i] + self.__edge_label_costs[col, row] = rand_costs[i] + i += 1 + + def __optimize_ecm_by_kernel_distances(self): # compute distances in feature space. dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix() @@ -320,6 +361,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): options['node_attrs'] = self._dataset.node_attrs options['edge_attrs'] = self._dataset.edge_attrs options['node_label_costs'] = self.__node_label_costs + options['edge_label_costs'] = self.__edge_label_costs ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1)) residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] time_list = [time.time() - time0] diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index c692c63..19c9993 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -545,6 +545,16 @@ class Dataset(object): if nl not in node_labels: node_labels.append(nl) return node_labels + + + def get_all_edge_labels(self): + edge_labels = [] + for g in self.__graphs: + for e in g.edges(): + el = tuple(g.edges[e].items()) + if el not in edge_labels: + edge_labels.append(el) + return edge_labels def __get_dataset_size(self): From 15945eb27202999105428ee7b4c663c30d4c8e63 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 3 Jul 2020 18:23:55 +0200 Subject: [PATCH 6/7] Update the computation of numbers of edit operations for GEDs using pre-defined costs between labels. --- gklearn/ged/env/ged_env.py | 22 +++++++ gklearn/ged/util/util.py | 157 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 158 insertions(+), 21 deletions(-) diff --git a/gklearn/ged/env/ged_env.py b/gklearn/ged/env/ged_env.py index b31ecb9..572db58 100644 --- a/gklearn/ged/env/ged_env.py +++ b/gklearn/ged/env/ged_env.py @@ -357,6 +357,17 @@ class GEDEnv(object): return len(self.__ged_data._node_labels) + def get_all_node_labels(self): + """ + /*! + * @brief Returns the list of all node labels. + * @return List of pairwise different node labels contained in the environment. + * @note If @p 1 is returned, the nodes are unlabeled. + */ + """ + return self.__ged_data._node_labels + + def get_node_label(self, label_id, to_dict=True): """ /*! @@ -382,6 +393,17 @@ class GEDEnv(object): """ return len(self.__ged_data._edge_labels) + + def get_all_edge_labels(self): + """ + /*! + * @brief Returns the list of all edge labels. 
+ * @return List of pairwise different edge labels contained in the environment. + * @note If @p 1 is returned, the edges are unlabeled. + */ + """ + return self.__ged_data._edge_labels + def get_edge_label(self, label_id, to_dict=True): """ diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index cdced21..b06fadc 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -49,14 +49,17 @@ def compute_ged(g1, g2, options): def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True): + + node_label_costs = options['node_label_costs'] if 'node_label_costs' in options else None + edge_label_costs = options['edge_label_costs'] if 'edge_label_costs' in options else None + # initialize ged env. ged_env = GEDEnv() ged_env.set_edit_cost(options['edit_cost'], edit_cost_constants=options['edit_cost_constants']) for g in graphs: ged_env.add_nx_graph(g, '') listID = ged_env.get_all_graph_ids() - ged_env.set_label_costs(options['node_label_costs'] if 'node_label_costs' in options else None, - options['edge_label_costs'] if 'edge_label_costs' in options else None) + ged_env.set_label_costs(node_label_costs, edge_label_costs) ged_env.init(init_type=options['init_option']) if parallel: options['threads'] = 1 @@ -64,9 +67,13 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True ged_env.init_method() # compute ged. + # options used to compute numbers of edit operations. neo_options = {'edit_cost': options['edit_cost'], - 'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], - 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']} +# 'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], +# 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs'], + 'is_cml': True, + 'node_labels': ged_env.get_all_node_labels(), + 'edge_labels': ged_env.get_all_edge_labels()} ged_mat = np.zeros((len(graphs), len(graphs))) if parallel: len_itr = int(len(graphs) * (len(graphs) - 1) / 2) @@ -122,8 +129,7 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True n_eo_tmp = get_nb_edit_operations(graphs[i], graphs[j], pi_forward, pi_backward, **neo_options) n_edit_operations.append(n_eo_tmp) - return ged_vec, ged_mat, n_edit_operations - + return ged_vec, ged_mat, n_edit_operations def compute_geds(graphs, options={}, sort=True, parallel=False, verbose=True): @@ -237,21 +243,130 @@ def _compute_ged(env, gid1, gid2, g1, g2): return dis, pi_forward, pi_backward -def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, **kwargs): - if edit_cost == 'LETTER' or edit_cost == 'LETTER2': - return get_nb_edit_operations_letter(g1, g2, forward_map, backward_map) - elif edit_cost == 'NON_SYMBOLIC': - node_attrs = kwargs.get('node_attrs', []) - edge_attrs = kwargs.get('edge_attrs', []) - return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, - node_attrs=node_attrs, edge_attrs=edge_attrs) - elif edit_cost == 'CONSTANT': - node_labels = kwargs.get('node_labels', []) - edge_labels = kwargs.get('edge_labels', []) - return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, - node_labels=node_labels, edge_labels=edge_labels) - else: - return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map) +def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs): + if is_cml: + if edit_cost == 'CONSTANT': + node_label_costs = kwargs.get('node_label_costs') + 
edge_label_costs = kwargs.get('edge_label_costs')
+			node_labels = kwargs.get('node_labels', [])
+			edge_labels = kwargs.get('edge_labels', [])
+			return get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
+											  node_labels=node_labels, edge_labels=edge_labels)
+		else:
+			raise Exception('Edit cost "%s" is not supported.' % edit_cost)
+	else:
+		if edit_cost == 'LETTER' or edit_cost == 'LETTER2':
+			return get_nb_edit_operations_letter(g1, g2, forward_map, backward_map)
+		elif edit_cost == 'NON_SYMBOLIC':
+			node_attrs = kwargs.get('node_attrs', [])
+			edge_attrs = kwargs.get('edge_attrs', [])
+			return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
+											 node_attrs=node_attrs, edge_attrs=edge_attrs)
+		elif edit_cost == 'CONSTANT':
+			node_labels = kwargs.get('node_labels', [])
+			edge_labels = kwargs.get('edge_labels', [])
+			return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
+										  node_labels=node_labels, edge_labels=edge_labels)
+		else:
+			return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map)
+
+
+def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
+								node_labels=[], edge_labels=[]):
+	"""Compute the number of each edit operation for symbolic-labeled graphs, where the costs are different for each pair of labels.
+	
+	Returns
+	-------
+	list
+		A vector of costs between labels, formed in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs. The dummy label is the first label, and the self label costs are not included.
+	"""
+	# Initialize.
+	nb_ops_node = np.zeros((1 + len(node_labels), 1 + len(node_labels)))
+	nb_ops_edge = np.zeros((1 + len(edge_labels), 1 + len(edge_labels)))
+	
+	# For nodes.
+	nodes1 = [n for n in g1.nodes()]
+	for i, map_i in enumerate(forward_map):
+		label1 = tuple(g1.nodes[nodes1[i]].items()) # @todo: order and faster
+		idx_label1 = node_labels.index(label1) # @todo: faster
+		if map_i == np.inf: # deletions.
+			nb_ops_node[0, idx_label1 + 1] += 1
+		else: # substitutions.
+			label2 = tuple(g2.nodes[map_i].items())
+			if label1 != label2:
+				idx_label2 = node_labels.index(label2) # @todo: faster
+				nb_ops_node[idx_label1 + 1, idx_label2 + 1] += 1
+	# insertions.
+	nodes2 = [n for n in g2.nodes()]
+	for i, map_i in enumerate(backward_map):
+		if map_i == np.inf:
+			label = tuple(g2.nodes[nodes2[i]].items())
+			idx_label = node_labels.index(label) # @todo: faster
+			nb_ops_node[idx_label + 1, 0] += 1
+	
+	# For edges.
+	edges1 = [e for e in g1.edges()]
+	edges2_marked = []
+	for nf1, nt1 in edges1:
+		label1 = tuple(g1.edges[(nf1, nt1)].items())
+		idx_label1 = edge_labels.index(label1) # @todo: faster
+		idxf1 = nodes1.index(nf1) # @todo: faster
+		idxt1 = nodes1.index(nt1) # @todo: faster
+		# At least one of the nodes is removed, thus the edge is removed.
+		if forward_map[idxf1] == np.inf or forward_map[idxt1] == np.inf:
+			nb_ops_edge[0, idx_label1 + 1] += 1
+		# corresponding edge is in g2.
+		else:
+			nf2, nt2 = forward_map[idxf1], forward_map[idxt1]
+			if (nf2, nt2) in g2.edges():
+				edges2_marked.append((nf2, nt2))
+				# If edge labels are different.
+				label2 = tuple(g2.edges[(nf2, nt2)].items())
+				if label1 != label2:
+					idx_label2 = edge_labels.index(label2) # @todo: faster
+					nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1
+			# Switch nf2 and nt2, for directed graphs.
+			elif (nt2, nf2) in g2.edges():
+				edges2_marked.append((nt2, nf2))
+				# If edge labels are different.
+ label2 = tuple(g2.edges[(nt2, nf2)].items()) + if label1 != label2: + idx_label2 = edge_labels.index(label2) # @todo: faster + nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1 + # Corresponding nodes are in g2, however the edge is removed. + else: + nb_ops_edge[0, idx_label1 + 1] += 1 + # insertions. + for e in g2.edges(): + if e not in edges2_marked: + label = tuple(g2.edges[e].items()) + idx_label = edge_labels.index(label) # @todo: faster + nb_ops_edge[idx_label + 1, 0] += 1 + + # Reform the costs into a vector. + cost_vector = [] + # Add node insertion costs. + for i in range(1, len(nb_ops_node)): + cost_vector.append(nb_ops_node[i, 0]) + # Add node deletion costs. + for i in range(1, len(nb_ops_node)): + cost_vector.append(nb_ops_node[0, i]) + # Add node substitution costs. + for i in range(1, len(nb_ops_node)): + for j in range(i + 1, len(nb_ops_node)): + cost_vector.append(nb_ops_node[i, j]) + # Add edge insertion costs. + for i in range(1, len(nb_ops_edge)): + cost_vector.append(nb_ops_edge[i, 0]) + # Add edge deletion costs. + for i in range(1, len(nb_ops_edge)): + cost_vector.append(nb_ops_edge[0, i]) + # Add edge substitution costs. + for i in range(1, len(nb_ops_edge)): + for j in range(i + 1, len(nb_ops_edge)): + cost_vector.append(nb_ops_edge[i, j]) + + return cost_vector def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, From 0db57fe3cedad9c1146aac01daddb7a059c5aa6e Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 6 Jul 2020 18:21:31 +0200 Subject: [PATCH 7/7] Fix bugs in ged.util.util.get_nb_edit_operations_symbolic_cml() and add test for it. --- gklearn/ged/util/util.py | 104 ++++++++++++------ gklearn/preimage/median_preimage_generator_cml.py | 50 +-------- gklearn/tests/{ => ged}/test_ged_env.py | 0 .../test_get_nb_edit_operations_symbolic_cml.py | 122 +++++++++++++++++++++ 4 files changed, 193 insertions(+), 83 deletions(-) rename gklearn/tests/{ => ged}/test_ged_env.py (100%) create mode 100644 gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index b06fadc..0cffeba 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -49,16 +49,18 @@ def compute_ged(g1, g2, options): def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True): - - node_label_costs = options['node_label_costs'] if 'node_label_costs' in options else None - edge_label_costs = options['edge_label_costs'] if 'edge_label_costs' in options else None # initialize ged env. ged_env = GEDEnv() ged_env.set_edit_cost(options['edit_cost'], edit_cost_constants=options['edit_cost_constants']) for g in graphs: ged_env.add_nx_graph(g, '') - listID = ged_env.get_all_graph_ids() + listID = ged_env.get_all_graph_ids() + + node_labels = ged_env.get_all_node_labels() + edge_labels = ged_env.get_all_edge_labels() + node_label_costs = label_costs_to_matrix(options['node_label_costs'], len(node_labels)) if 'node_label_costs' in options else None + edge_label_costs = label_costs_to_matrix(options['edge_label_costs'], len(edge_labels)) if 'edge_label_costs' in options else None ged_env.set_label_costs(node_label_costs, edge_label_costs) ged_env.init(init_type=options['init_option']) if parallel: @@ -69,11 +71,9 @@ def compute_geds_cml(graphs, options={}, sort=True, parallel=False, verbose=True # compute ged. # options used to compute numbers of edit operations. 
neo_options = {'edit_cost': options['edit_cost'],
-#				   'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'],
-#				   'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs'],
 				   'is_cml': True,
-				   'node_labels': ged_env.get_all_node_labels(),
-				   'edge_labels': ged_env.get_all_edge_labels()}
+				   'node_labels': node_labels,
+				   'edge_labels': edge_labels}
 	ged_mat = np.zeros((len(graphs), len(graphs)))
 	if parallel:
 		len_itr = int(len(graphs) * (len(graphs) - 1) / 2)
@@ -243,11 +243,45 @@ def _compute_ged(env, gid1, gid2, g1, g2):
 	return dis, pi_forward, pi_backward
 
 
+def label_costs_to_matrix(costs, nb_labels):
+	"""Reform a label cost vector to a matrix.
+
+	Parameters
+	----------
+	costs : numpy.array
+		The vector containing the costs between labels of one type (node labels or edge labels), in the order of insertion costs, deletion costs, substitution costs.
+	nb_labels : integer
+		Number of labels.
+
+	Returns
+	-------
+	cost_matrix : numpy.array
+		The reformed label cost matrix of size (nb_labels + 1, nb_labels + 1). Each row/column of cost_matrix corresponds to a label, and the first label is the dummy label. This is the same setting as in GEDData.
+	"""
+	# Initialize label cost matrix.
+	cost_matrix = np.zeros((nb_labels + 1, nb_labels + 1))
+	i = 0
+	# Costs of insertions.
+	for col in range(1, nb_labels + 1):
+		cost_matrix[0, col] = costs[i]
+		i += 1
+	# Costs of deletions.
+	for row in range(1, nb_labels + 1):
+		cost_matrix[row, 0] = costs[i]
+		i += 1
+	# Costs of substitutions.
+	for row in range(1, nb_labels + 1):
+		for col in range(row + 1, nb_labels + 1):
+			cost_matrix[row, col] = costs[i]
+			cost_matrix[col, row] = costs[i]
+			i += 1
+
+	return cost_matrix
+
+
 def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is_cml=False, **kwargs):
 	if is_cml:
 		if edit_cost == 'CONSTANT':
-			node_label_costs = kwargs.get('node_label_costs')
-			edge_label_costs = kwargs.get('edge_label_costs')
 			node_labels = kwargs.get('node_labels', [])
 			edge_labels = kwargs.get('edge_labels', [])
 			return get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
@@ -273,12 +307,12 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, is
 
 def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
 								node_labels=[], edge_labels=[]):
-	"""Compute the number of each edit operation for symbolic-labeled graphs, where the costs are different for each pair of labels.
+	"""Compute the number of times each edit operation is used in an edit path, for symbolic-labeled graphs where the costs are different for each pair of labels.
 	
 	Returns
 	-------
 	list
-		A vector of costs between labels, formed in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs. The dummy label is the first label, and the self label costs are not included.
+		A vector of the numbers of times that the costs between labels are used in an edit path, formed in the order of node insertion costs, node deletion costs, node substitution costs, edge insertion costs, edge deletion costs, edge substitution costs. The dummy label is the first label, and the self label costs are not included.
 	"""
 	# Initialize.
nb_ops_node = np.zeros((1 + len(node_labels), 1 + len(node_labels)))
@@ -290,7 +324,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
 		label1 = tuple(g1.nodes[nodes1[i]].items()) # @todo: order and faster
 		idx_label1 = node_labels.index(label1) # @todo: faster
 		if map_i == np.inf: # deletions.
-			nb_ops_node[0, idx_label1 + 1] += 1
+			nb_ops_node[idx_label1 + 1, 0] += 1
 		else: # substitutions.
 			label2 = tuple(g2.nodes[map_i].items())
 			if label1 != label2:
@@ -302,7 +336,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
 		if map_i == np.inf:
 			label = tuple(g2.nodes[nodes2[i]].items())
 			idx_label = node_labels.index(label) # @todo: faster
-			nb_ops_node[idx_label + 1, 0] += 1
+			nb_ops_node[0, idx_label + 1] += 1
 
 	# For edges.
 	edges1 = [e for e in g1.edges()]
@@ -314,7 +348,7 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
 		idxt1 = nodes1.index(nt1) # @todo: faster
 		# At least one of the nodes is removed, thus the edge is removed.
 		if forward_map[idxf1] == np.inf or forward_map[idxt1] == np.inf:
-			nb_ops_edge[0, idx_label1 + 1] += 1
+			nb_ops_edge[idx_label1 + 1, 0] += 1
 		# corresponding edge is in g2.
 		else:
 			nf2, nt2 = forward_map[idxf1], forward_map[idxt1]
@@ -335,38 +369,38 @@ def get_nb_edit_operations_symbolic_cml(g1, g2, forward_map, backward_map,
 					nb_ops_edge[idx_label1 + 1, idx_label2 + 1] += 1
 		# Corresponding nodes are in g2, however the edge is removed.
 		else:
-			nb_ops_edge[0, idx_label1 + 1] += 1
+			nb_ops_edge[idx_label1 + 1, 0] += 1
 	# insertions.
-	for e in g2.edges():
-		if e not in edges2_marked:
-			label = tuple(g2.edges[e].items())
+	for nt, nf in g2.edges():
+		if (nt, nf) not in edges2_marked and (nf, nt) not in edges2_marked: # @todo: for directed.
+			label = tuple(g2.edges[(nt, nf)].items())
 			idx_label = edge_labels.index(label) # @todo: faster
-			nb_ops_edge[idx_label + 1, 0] += 1
+			nb_ops_edge[0, idx_label + 1] += 1
 
-	# Reform the costs into a vector.
-	cost_vector = []
-	# Add node insertion costs.
+	# Reform the numbers of edit operations into a vector.
+	nb_eo_vector = []
+	# node insertion.
 	for i in range(1, len(nb_ops_node)):
-		cost_vector.append(nb_ops_node[i, 0])
-	# Add node deletion costs.
+		nb_eo_vector.append(nb_ops_node[0, i])
+	# node deletion.
 	for i in range(1, len(nb_ops_node)):
-		cost_vector.append(nb_ops_node[0, i])
-	# Add node substitution costs.
+		nb_eo_vector.append(nb_ops_node[i, 0])
+	# node substitution.
 	for i in range(1, len(nb_ops_node)):
 		for j in range(i + 1, len(nb_ops_node)):
-			cost_vector.append(nb_ops_node[i, j])
-	# Add edge insertion costs.
+			nb_eo_vector.append(nb_ops_node[i, j])
+	# edge insertion.
 	for i in range(1, len(nb_ops_edge)):
-		cost_vector.append(nb_ops_edge[i, 0])
-	# Add edge deletion costs.
+		nb_eo_vector.append(nb_ops_edge[0, i])
+	# edge deletion.
 	for i in range(1, len(nb_ops_edge)):
-		cost_vector.append(nb_ops_edge[0, i])
-	# Add edge substitution costs.
+		nb_eo_vector.append(nb_ops_edge[i, 0])
+	# edge substitution.
for i in range(1, len(nb_ops_edge)): for j in range(i + 1, len(nb_ops_edge)): - cost_vector.append(nb_ops_edge[i, j]) + nb_eo_vector.append(nb_ops_edge[i, j]) - return cost_vector + return nb_eo_vector def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, diff --git a/gklearn/preimage/median_preimage_generator_cml.py b/gklearn/preimage/median_preimage_generator_cml.py index c4a92a6..161475a 100644 --- a/gklearn/preimage/median_preimage_generator_cml.py +++ b/gklearn/preimage/median_preimage_generator_cml.py @@ -279,37 +279,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls)) rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl) rand_costs /= np.max(rand_costs) # @todo: maybe not needed. - self.__node_label_costs = np.zeros((len(nls) + 1, len(nls) + 1)) - # Initialize node label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData. - i = 0 - # Costs of insertions. - for row in range(1, len(nls) + 1): - self.__node_label_costs[row, 0] = rand_costs[i] - i += 1 - # Costs of deletions. - for col in range(1, len(nls) + 1): - self.__node_label_costs[0, col] = rand_costs[i] - i += 1 - # Costs of substitutions. - for row in range(1, len(nls) + 1): - for col in range(row + 1, len(nls) + 1): - self.__node_label_costs[row, col] = rand_costs[i] - self.__node_label_costs[col, row] = rand_costs[i] - i += 1 - -# self.__node_label_costs = {} -# for i, (nl1, nl2) in enumerate(itertools.combinations(nls, 2)): -# self.__node_label_costs[(nl1, nl2)] = rand_costs[i] -# # Add costs for deletion. -# for j, nl in enumerate(nls): -# self.__node_label_costs[(nl1, SpecialLabel.DUMMY)] = rand_costs[i + j] -# # Add costs for insertion. -# for k, nl in enumerate(nls): -# self.__node_label_costs[(SpecialLabel.DUMMY, nl1)] = rand_costs[i + j + k] -# # Add self costs. -# for nl in nls: -# self.__node_label_costs[(nl, nl)] = 0 -# self.__node_label_costs[(SpecialLabel.DUMMY, SpecialLabel.DUMMY)] = 0 + self.__node_label_costs = rand_costs def __initialize_edge_label_costs(self): @@ -319,23 +289,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els)) rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el) rand_costs /= np.max(rand_costs) # @todo: maybe not needed. - self.__edge_label_costs = np.zeros((len(els) + 1, len(els) + 1)) - # Initialize edge label cost matrix, each row/column corresponds to a label, the first label is the dummy label. This is the same setting as in GEDData. - i = 0 - # Costs of insertions. - for row in range(1, len(els) + 1): - self.__edge_label_costs[row, 0] = rand_costs[i] - i += 1 - # Costs of deletions. - for col in range(1, len(els) + 1): - self.__edge_label_costs[0, col] = rand_costs[i] - i += 1 - # Costs of substitutions. 
-		for row in range(1, len(els) + 1):
-			for col in range(row + 1, len(els) + 1):
-				self.__edge_label_costs[row, col] = rand_costs[i]
-				self.__edge_label_costs[col, row] = rand_costs[i]
-				i += 1
+		self.__edge_label_costs = rand_costs
 	
 	
 	def __optimize_ecm_by_kernel_distances(self):
diff --git a/gklearn/tests/test_ged_env.py b/gklearn/tests/ged/test_ged_env.py
similarity index 100%
rename from gklearn/tests/test_ged_env.py
rename to gklearn/tests/ged/test_ged_env.py
diff --git a/gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py b/gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py
new file mode 100644
index 0000000..aa40cca
--- /dev/null
+++ b/gklearn/tests/ged/test_get_nb_edit_operations_symbolic_cml.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul  6 12:08:24 2020
+
+@author: ljia
+"""
+import random
+import numpy as np
+
+def test_get_nb_edit_operations_symbolic_cml():
+	"""Test get_nb_edit_operations_symbolic_cml().
+	"""
+	"""**1. Get dataset.**"""
+
+	from gklearn.utils import Dataset
+	
+	# Predefined dataset name, use dataset "MUTAG".
+	ds_name = 'MUTAG'
+	
+	# Initialize a Dataset.
+	dataset = Dataset()
+	# Load predefined dataset "MUTAG".
+	dataset.load_predefined_dataset(ds_name)
+	graph1 = dataset.graphs[0]
+	graph2 = dataset.graphs[1]
+	
+	"""**2. Compute graph edit distance.**"""
+	
+#	try:
+	# Initialize label costs randomly.
+	node_label_costs, edge_label_costs = _initialize_label_costs(dataset)
+	
+	# Compute GEDs.
+	pi_forward, pi_backward, dis, node_labels, edge_labels = _compute_ged(dataset, node_label_costs, edge_label_costs)
+	
+	
+	# Compute numbers of edit operations.
+	
+	from gklearn.ged.util.util import get_nb_edit_operations_symbolic_cml
+	
+	n_edit_operations = get_nb_edit_operations_symbolic_cml(graph1, graph2, pi_forward, pi_backward, node_labels, edge_labels)
+	
+	assert np.abs((np.dot(np.concatenate((node_label_costs, edge_label_costs)), n_edit_operations) - dis) / dis) < 10e-6
+	
+#	except Exception as exception:
+#		assert False, exception
+	
+
+def _initialize_label_costs(dataset):
+	node_label_costs = _initialize_node_label_costs(dataset)
+	edge_label_costs = _initialize_edge_label_costs(dataset)
+	return node_label_costs, edge_label_costs
+
+
+def _initialize_node_label_costs(dataset):
+	# Get list of node labels.
+	nls = dataset.get_all_node_labels()
+	# Generate random costs.
+	nb_nl = int((len(nls) * (len(nls) - 1)) / 2 + 2 * len(nls))
+	rand_costs = random.sample(range(1, 10 * nb_nl + 1), nb_nl)
+	rand_costs /= np.max(rand_costs)
+	
+	return rand_costs
+
+
+def _initialize_edge_label_costs(dataset):
+	# Get list of edge labels.
+	els = dataset.get_all_edge_labels()
+	# Generate random costs.
+	nb_el = int((len(els) * (len(els) - 1)) / 2 + 2 * len(els))
+	rand_costs = random.sample(range(1, 10 * nb_el + 1), nb_el)
+	rand_costs /= np.max(rand_costs)
+	
+	return rand_costs
+
+
+def _compute_ged(dataset, node_label_costs, edge_label_costs):
+	from gklearn.ged.env import GEDEnv
+	from gklearn.ged.util.util import label_costs_to_matrix
+	import networkx as nx
+	
+	ged_env = GEDEnv() # initialize GED environment.
+	ged_env.set_edit_cost('CONSTANT', # GED cost type.
+						  edit_cost_constants=[3, 3, 1, 3, 3, 1] # edit costs.
+						  )
+	for g in dataset.graphs:
+		ged_env.add_nx_graph(g, '') # add graphs
+	
+	node_labels = ged_env.get_all_node_labels()
+	edge_labels = ged_env.get_all_edge_labels()
+	listID = ged_env.get_all_graph_ids() # get list IDs of graphs
+	ged_env.set_label_costs(label_costs_to_matrix(node_label_costs, len(node_labels)),
+						 label_costs_to_matrix(edge_label_costs, len(edge_labels)))
+	ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES') # initialize GED environment.
+	options = {'initialization_method': 'RANDOM', # or 'NODE', etc.
+			   'threads': 1 # parallel threads.
+			   }
+	ged_env.set_method('BIPARTITE', # GED method.
+					   options # options for GED method.
+					   )
+	ged_env.init_method() # initialize GED method.
+	
+	ged_env.run_method(listID[0], listID[1]) # run.
+	
+	pi_forward = ged_env.get_forward_map(listID[0], listID[1]) # forward map.
+	pi_backward = ged_env.get_backward_map(listID[0], listID[1]) # backward map.
+	dis = ged_env.get_upper_bound(listID[0], listID[1]) # GED between two graphs.
+	
+	# Convert the maps to the original node IDs (node removals are mapped to np.inf).
+	nodes1 = [n for n in dataset.graphs[0].nodes()]
+	nodes2 = [n for n in dataset.graphs[1].nodes()]
+	nb1 = nx.number_of_nodes(dataset.graphs[0])
+	nb2 = nx.number_of_nodes(dataset.graphs[1])
+	pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
+	pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
+	
+	return pi_forward, pi_backward, dis, node_labels, edge_labels
+
+
+if __name__ == "__main__":
+	test_get_nb_edit_operations_symbolic_cml()
\ No newline at end of file
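
As a closing illustration (not part of the patches): the vector layout consumed
by label_costs_to_matrix(), sketched for two labels. The five entries are
[ins_1, ins_2, del_1, del_2, sub_12], and index 0 of the resulting matrix is
the dummy label:

    import numpy as np
    from gklearn.ged.util.util import label_costs_to_matrix

    costs = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
    m = label_costs_to_matrix(costs, 2)
    # m == [[0. , 0.1, 0.2],
    #       [0.3, 0. , 0.5],
    #       [0.4, 0.5, 0. ]]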