From d97bfe954c0d4461c7ab973fc9909a393c2d4fc2 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 26 Jun 2020 12:44:48 +0200 Subject: [PATCH] Add class MedianGraphEstimatorPy and MedianPreimageGeneratorPy which use GEDEnv implemented in pure Python. --- gklearn/ged/env/ged_data.py | 41 + gklearn/ged/env/ged_env.py | 329 ++++- gklearn/ged/median/__init__.py | 1 + gklearn/ged/median/median_graph_estimator_py.py | 1711 ++++++++++++++++++++++ gklearn/preimage/__init__.py | 3 +- gklearn/preimage/median_preimage_generator_py.py | 1035 +++++++++++++ 6 files changed, 3117 insertions(+), 3 deletions(-) create mode 100644 gklearn/ged/median/median_graph_estimator_py.py create mode 100644 gklearn/preimage/median_preimage_generator_py.py diff --git a/gklearn/ged/env/ged_data.py b/gklearn/ged/env/ged_data.py index b09805c..9cef41a 100644 --- a/gklearn/ged/env/ged_data.py +++ b/gklearn/ged/env/ged_data.py @@ -41,6 +41,17 @@ class GEDData(object): return len(self._graphs) + def graph(self, graph_id): + """ + /*! + * @brief Provides access to a graph. + * @param[in] graph_id The ID of the graph. + * @return Constant reference to the graph with ID @p graph_id. + */ + """ + return self._graphs[graph_id] + + def shuffled_graph_copies_available(self): """ /*! @@ -51,6 +62,16 @@ class GEDData(object): return (self._init_type == Options.InitType.EAGER_WITH_SHUFFLED_COPIES or self._init_type == Options.InitType.LAZY_WITH_SHUFFLED_COPIES) + def num_graphs_without_shuffled_copies(self): + """ + /*! + * @brief Returns the number of graphs in the instance without the shuffled copies. + * @return Number of graphs without shuffled copies contained in the instance. + */ + """ + return self._num_graphs_without_shuffled_copies + + def node_cost(self, label1, label2): """ /*! @@ -177,5 +198,25 @@ class GEDData(object): self._delete_edit_cost = True + def _node_label_to_id(self, node_label): + n_id = 0 + for n_l in self._node_labels: + if n_l == node_label: + return n_id + 1 + n_id += 1 + self._node_labels.append(node_label) + return n_id + 1 + + + def _edge_label_to_id(self, edge_label): + e_id = 0 + for e_l in self._edge_labels: + if e_l == edge_label: + return e_id + 1 + e_id += 1 + self._edge_labels.append(edge_label) + return e_id + 1 + + def _eager_init(self): return (self._init_type == Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES or self._init_type == Options.InitType.EAGER_WITH_SHUFFLED_COPIES) \ No newline at end of file diff --git a/gklearn/ged/env/ged_env.py b/gklearn/ged/env/ged_env.py index 9fbdd4a..e6dc2f6 100644 --- a/gklearn/ged/env/ged_env.py +++ b/gklearn/ged/env/ged_env.py @@ -63,6 +63,23 @@ class GEDEnv(object): return graph_id + def clear_graph(self, graph_id): + """ + /*! + * @brief Clears and de-initializes a graph that has previously been added to the environment. Call init() after calling this method. + * @param[in] graph_id ID of graph that has to be cleared. + */ + """ + if graph_id > self.__ged_data.num_graphs_without_shuffled_copies(): + raise Exception('The graph', self.get_graph_name(graph_id), 'has not been added to the environment.') + self.__ged_data._graphs[graph_id].clear() + self.__original_to_internal_node_ids[graph_id].clear() + self.__internal_to_original_node_ids[graph_id].clear() + self.__ged_data._strings_to_internal_node_ids[graph_id].clear() + self.__ged_data._internal_node_ids_to_strings[graph_id].clear() + self.__initialized = False + + def add_node(self, graph_id, node_id, node_label): """ /*! 
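The `_node_label_to_id` and `_edge_label_to_id` helpers added to GEDData above intern labels: they return a stable 1-based ID for a label and append unseen labels on first use (the 1-based numbering presumably keeps ID 0 free for the dummy label). A minimal standalone sketch of the same lookup-or-append pattern; `label_to_id` and the sample labels are illustrative, not part of the patch:

def label_to_id(labels, label):
    """Return the 1-based ID of `label` in `labels`, appending it if unseen."""
    for i, existing in enumerate(labels):
        if existing == label:
            return i + 1
    labels.append(label)
    return len(labels)

labels = []
assert label_to_id(labels, (('chem', '6'),)) == 1  # unseen label -> appended, ID 1
assert label_to_id(labels, (('chem', '8'),)) == 2  # unseen label -> appended, ID 2
assert label_to_id(labels, (('chem', '6'),)) == 1  # seen label -> existing ID

Note the lookup is a linear scan, so adding n distinct labels costs O(n^2) comparisons overall; a dict keyed by label would make each lookup O(1).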
@@ -80,7 +97,9 @@ class GEDEnv(object):
 		self.__internal_to_original_node_ids[graph_id][internal_node_id] = node_id
 		self.__ged_data._strings_to_internal_node_ids[graph_id][str(node_id)] = internal_node_id
 		self.__ged_data._internal_node_ids_to_strings[graph_id][internal_node_id] = str(node_id)
-		# @todo: node_label_to_id_
+		label_id = self.__ged_data._node_label_to_id(node_label)
+		# @todo: ged_data_.graphs_[graph_id].set_label
 
 
 	def add_edge(self, graph_id, nd_from, nd_to, edge_label, ignore_duplicates=True):
 		"""
 		/*!
@@ -98,7 +117,8 @@ class GEDEnv(object):
 		self.__initialized = False
 		# @todo: check ignore_duplicates.
 		self.__ged_data._graphs[graph_id].add_edge(self.__original_to_internal_node_ids[graph_id][nd_from], self.__original_to_internal_node_ids[graph_id][nd_to], label=edge_label)
-		# @todo: edge_id and label_id, edge_label_to_id_.
+		label_id = self.__ged_data._edge_label_to_id(edge_label)
+		# @todo: ged_data_.graphs_[graph_id].set_label
 
 
 	def add_nx_graph(self, g, classe, ignore_duplicates=True) :
 		"""
 		/*!
@@ -123,6 +143,40 @@ class GEDEnv(object):
 		return graph_id
 
 
+	def load_nx_graph(self, nx_graph, graph_id, graph_name='', graph_class=''):
+		"""
+		Loads a NetworkX graph into the GED environment.
+
+		Parameters
+		----------
+		nx_graph : NetworkX Graph object
+			The graph that should be loaded.
+
+		graph_id : int or None
+			The ID of a graph already contained in the environment (that graph is overwritten), or `None` to add a new graph.
+
+		graph_name : string, optional
+			The name of the newly added graph. The default is ''. Has no effect unless `graph_id` equals `None`.
+
+		graph_class : string, optional
+			The class of the newly added graph. The default is ''. Has no effect unless `graph_id` equals `None`.
+
+		Returns
+		-------
+		int
+			The ID of the newly loaded graph.
+		"""
+		if graph_id is None: # @todo: undefined.
+			graph_id = self.add_graph(graph_name, graph_class)
+		else:
+			self.clear_graph(graph_id)
+		for node in nx_graph.nodes:
+			self.add_node(graph_id, node, tuple(sorted(nx_graph.nodes[node].items(), key=lambda kv: kv[0])))
+		for edge in nx_graph.edges:
+			self.add_edge(graph_id, edge[0], edge[1], tuple(sorted(nx_graph.edges[(edge[0], edge[1])].items(), key=lambda kv: kv[0])))
+		return graph_id
+
+
 	def init(self, init_type=Options.InitType.EAGER_WITHOUT_SHUFFLED_COPIES, print_to_stdout=False):
 		if isinstance(init_type, str):
 			init_type = OptionsStringMap.InitType[init_type]
@@ -154,6 +208,26 @@ class GEDEnv(object):
 		self.__new_graph_ids.clear()
 
 
+	def is_initialized(self):
+		"""
+		/*!
+		 * @brief Check if the environment is initialized.
+		 * @return True if the environment is initialized.
+		 */
+		"""
+		return self.__initialized
+
+
+	def get_init_type(self):
+		"""
+		/*!
+		 * @brief Returns the initialization type of the last initialization.
+		 * @return Initialization type.
+		 */
+		"""
+		return self.__ged_data._init_type
+
+
 	def set_method(self, method, options=''):
 		"""
 		/*!
@@ -263,6 +337,58 @@ class GEDEnv(object):
 		self.__ged_method.init()
 
 
+	def get_num_node_labels(self):
+		"""
+		/*!
+		 * @brief Returns the number of node labels.
+		 * @return Number of pairwise different node labels contained in the environment.
+		 * @note If @p 1 is returned, the nodes are unlabeled.
+		 */
+		"""
+		return len(self.__ged_data._node_labels)
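Label IDs handed out by the environment are 1-based, so the valid range is 1 to get_num_node_labels() inclusive; the accessor below raises for anything outside it. A usage sketch, assuming `env` is an initialized GEDEnv from this module:

# List every distinct node label known to the environment.
for label_id in range(1, env.get_num_node_labels() + 1):
    print(label_id, env.get_node_label(label_id, to_dict=True))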
+	def get_node_label(self, label_id, to_dict=True):
+		"""
+		/*!
+		 * @brief Returns node label.
+		 * @param[in] label_id ID of node label that should be returned. Must be between 1 and num_node_labels().
+		 * @return Node label for selected label ID.
+		 */
+		"""
+		if label_id < 1 or label_id > self.get_num_node_labels():
+			raise Exception('The environment does not contain a node label with ID ' + str(label_id) + '.')
+		if to_dict:
+			return dict(self.__ged_data._node_labels[label_id - 1])
+		return self.__ged_data._node_labels[label_id - 1]
+
+
+	def get_num_edge_labels(self):
+		"""
+		/*!
+		 * @brief Returns the number of edge labels.
+		 * @return Number of pairwise different edge labels contained in the environment.
+		 * @note If @p 1 is returned, the edges are unlabeled.
+		 */
+		"""
+		return len(self.__ged_data._edge_labels)
+
+
+	def get_edge_label(self, label_id, to_dict=True):
+		"""
+		/*!
+		 * @brief Returns edge label.
+		 * @param[in] label_id ID of edge label that should be returned. Must be between 1 and num_edge_labels().
+		 * @return Edge label for selected label ID.
+		 */
+		"""
+		if label_id < 1 or label_id > self.get_num_edge_labels():
+			raise Exception('The environment does not contain an edge label with ID ' + str(label_id) + '.')
+		if to_dict:
+			return dict(self.__ged_data._edge_labels[label_id - 1])
+		return self.__ged_data._edge_labels[label_id - 1]
+
 
 	def get_upper_bound(self, g_id, h_id):
 		"""
 		/*!
@@ -363,6 +489,205 @@ class GEDEnv(object):
 		.. note:: I don't know how to connect the two map to reconstruct the adjacence matrix. Please come back when I know how it's work !
 		"""
 		return self.get_node_map(g_id, h_id).backward_map
+
+
+	def compute_induced_cost(self, g_id, h_id, node_map):
+		"""
+		/*!
+		 * @brief Computes the edit cost between two graphs induced by a node map.
+		 * @param[in] g_id ID of input graph.
+		 * @param[in] h_id ID of input graph.
+		 * @param[in,out] node_map Node map whose induced edit cost is to be computed.
+		 */
+		"""
+		self.__ged_data.compute_induced_cost(self.__ged_data._graphs[g_id], self.__ged_data._graphs[h_id], node_map)
+
+
+	def get_nx_graph(self, graph_id):
+		"""
+		 * @brief Returns NetworkX.Graph() representation.
+		 * @param[in] graph_id ID of the selected graph.
+		"""
+		graph = nx.Graph() # @todo: add graph attributes.
+		graph.graph['id'] = graph_id
+
+		nb_nodes = self.get_graph_num_nodes(graph_id)
+		original_node_ids = self.get_original_node_ids(graph_id)
+		node_labels = self.get_graph_node_labels(graph_id, to_dict=True)
+		graph.graph['original_node_ids'] = original_node_ids
+
+		for node_id in range(0, nb_nodes):
+			graph.add_node(node_id, **node_labels[node_id])
+
+		edges = self.get_graph_edges(graph_id, to_dict=True)
+		for (head, tail), labels in edges.items():
+			graph.add_edge(head, tail, **labels)
+
+		return graph
+
+
+	def get_graph_node_labels(self, graph_id, to_dict=True):
+		"""
+		Searches and returns all the labels of the nodes of a graph, selected by its ID.
+
+		:param graph_id: The ID of the wanted graph
+		:type graph_id: size_t
+		:return: The list of node labels of the selected graph
+		:rtype: list[dict{string : string}]
+
+		.. seealso:: get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_original_node_ids(), get_graph_edges(), get_graph_adjacence_matrix()
+		.. note:: These functions allow collecting all the information on a graph.
+		"""
+		graph = self.__ged_data.graph(graph_id)
+		node_labels = []
+		for n in graph.nodes():
+			node_labels.append(graph.nodes[n]['label'])
+		if to_dict:
+			return [dict(i) for i in node_labels]
+		return node_labels
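As the getter above shows, labels are stored internally as tuples of sorted (key, value) pairs, which makes them hashable, and `to_dict=True` converts them back to plain dicts for callers. The round trip, as a sketch with made-up attributes:

attrs = {'charge': '0', 'chem': '6'}
internal = tuple(sorted(attrs.items(), key=lambda kv: kv[0]))  # hashable form
assert internal == (('charge', '0'), ('chem', '6'))
assert dict(internal) == attrs  # to_dict=True performs this conversion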
+	def get_graph_edges(self, graph_id, to_dict=True):
+		"""
+		Searches and returns all the edges of a graph, selected by its ID.
+
+		:param graph_id: The ID of the wanted graph
+		:type graph_id: size_t
+		:return: The edges of the selected graph
+		:rtype: dict{tuple(size_t, size_t) : dict{string : string}}
+
+		.. seealso:: get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_original_node_ids(), get_graph_node_labels(), get_graph_adjacence_matrix()
+		.. note:: These functions allow collecting all the information on a graph.
+		"""
+		graph = self.__ged_data.graph(graph_id)
+		if to_dict:
+			edges = {}
+			for n1, n2, attr in graph.edges(data=True):
+				edges[(n1, n2)] = dict(attr['label'])
+			return edges
+		return {(n1, n2): attr['label'] for n1, n2, attr in graph.edges(data=True)}
+
+
+	def get_graph_name(self, graph_id):
+		"""
+		/*!
+		 * @brief Returns the graph name.
+		 * @param[in] graph_id ID of an input graph that has been added to the environment.
+		 * @return Name of the input graph.
+		 */
+		"""
+		return self.__ged_data._graph_names[graph_id]
+
+
+	def get_graph_num_nodes(self, graph_id):
+		"""
+		/*!
+		 * @brief Returns the number of nodes.
+		 * @param[in] graph_id ID of an input graph that has been added to the environment.
+		 * @return Number of nodes in the graph.
+		 */
+		"""
+		return nx.number_of_nodes(self.__ged_data.graph(graph_id))
+
+
+	def get_original_node_ids(self, graph_id):
+		"""
+		Searches and returns all the original IDs of the nodes of a graph, selected by its ID.
+
+		:param graph_id: The ID of the wanted graph
+		:type graph_id: size_t
+		:return: The list of the original node IDs of the selected graph
+		:rtype: list[string]
+
+		.. seealso:: get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_graph_node_labels(), get_graph_edges(), get_graph_adjacence_matrix()
+		.. note:: These functions allow collecting all the information on a graph.
+		"""
+		return [i for i in self.__internal_to_original_node_ids[graph_id].values()]
+
+
+	def get_node_rel_cost(self, node_label_1, node_label_2):
+		"""
+		/*!
+		 * @brief Returns node relabeling cost.
+		 * @param[in] node_label_1 First node label.
+		 * @param[in] node_label_2 Second node label.
+		 * @return Node relabeling cost for the given node labels.
+		 */
+		"""
+		if isinstance(node_label_1, dict):
+			node_label_1 = tuple(sorted(node_label_1.items(), key=lambda kv: kv[0]))
+		if isinstance(node_label_2, dict):
+			node_label_2 = tuple(sorted(node_label_2.items(), key=lambda kv: kv[0]))
+		return self.__ged_data._edit_cost.node_rel_cost_fun(node_label_1, node_label_2)
+
+
+	def get_node_del_cost(self, node_label):
+		"""
+		/*!
+		 * @brief Returns node deletion cost.
+		 * @param[in] node_label Node label.
+		 * @return Cost of deleting node with given label.
+		 */
+		"""
+		if isinstance(node_label, dict):
+			node_label = tuple(sorted(node_label.items(), key=lambda kv: kv[0]))
+		return self.__ged_data._edit_cost.node_del_cost_fun(node_label)
+
+
+	def get_node_ins_cost(self, node_label):
+		"""
+		/*!
+		 * @brief Returns node insertion cost.
+		 * @param[in] node_label Node label.
+		 * @return Cost of inserting node with given label.
+		 */
+		"""
+		if isinstance(node_label, dict):
+			node_label = tuple(sorted(node_label.items(), key=lambda kv: kv[0]))
+		return self.__ged_data._edit_cost.node_ins_cost_fun(node_label)
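The cost getters above expose the raw edit-cost functions, which makes it easy to check, for instance, whether relabeling a node is cheaper than deleting and re-inserting it. A sketch, assuming `env` is an initialized GEDEnv with edit costs set; the labels are made up:

label_1 = {'chem': '6'}
label_2 = {'chem': '8'}
rel_cost = env.get_node_rel_cost(label_1, label_2)
del_ins_cost = env.get_node_del_cost(label_1) + env.get_node_ins_cost(label_2)
print('substitute' if rel_cost <= del_ins_cost else 'delete and re-insert')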
+	def get_edge_rel_cost(self, edge_label_1, edge_label_2):
+		"""
+		/*!
+		 * @brief Returns edge relabeling cost.
+		 * @param[in] edge_label_1 First edge label.
+		 * @param[in] edge_label_2 Second edge label.
+		 * @return Edge relabeling cost for the given edge labels.
+		 */
+		"""
+		if isinstance(edge_label_1, dict):
+			edge_label_1 = tuple(sorted(edge_label_1.items(), key=lambda kv: kv[0]))
+		if isinstance(edge_label_2, dict):
+			edge_label_2 = tuple(sorted(edge_label_2.items(), key=lambda kv: kv[0]))
+		return self.__ged_data._edit_cost.edge_rel_cost_fun(edge_label_1, edge_label_2)
+
+
+	def get_edge_del_cost(self, edge_label):
+		"""
+		/*!
+		 * @brief Returns edge deletion cost.
+		 * @param[in] edge_label Edge label.
+		 * @return Cost of deleting edge with given label.
+		 */
+		"""
+		if isinstance(edge_label, dict):
+			edge_label = tuple(sorted(edge_label.items(), key=lambda kv: kv[0]))
+		return self.__ged_data._edit_cost.edge_del_cost_fun(edge_label)
+
+
+	def get_edge_ins_cost(self, edge_label):
+		"""
+		/*!
+		 * @brief Returns edge insertion cost.
+		 * @param[in] edge_label Edge label.
+		 * @return Cost of inserting edge with given label.
+		 */
+		"""
+		if isinstance(edge_label, dict):
+			edge_label = tuple(sorted(edge_label.items(), key=lambda kv: kv[0]))
+		return self.__ged_data._edit_cost.edge_ins_cost_fun(edge_label)
 
 
 	def get_all_graph_ids(self):
diff --git a/gklearn/ged/median/__init__.py b/gklearn/ged/median/__init__.py
index 9a291ae..0a96c31 100644
--- a/gklearn/ged/median/__init__.py
+++ b/gklearn/ged/median/__init__.py
@@ -1,2 +1,3 @@
 from gklearn.ged.median.median_graph_estimator import MedianGraphEstimator
+from gklearn.ged.median.median_graph_estimator_py import MedianGraphEstimatorPy
 from gklearn.ged.median.utils import constant_node_costs, mge_options_to_string
diff --git a/gklearn/ged/median/median_graph_estimator_py.py b/gklearn/ged/median/median_graph_estimator_py.py
new file mode 100644
index 0000000..41dc3c9
--- /dev/null
+++ b/gklearn/ged/median/median_graph_estimator_py.py
@@ -0,0 +1,1711 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Mar 16 18:04:55 2020
+
+@author: ljia
+"""
+import numpy as np
+from gklearn.ged.env import AlgorithmState, NodeMap
+from gklearn.ged.util import misc
+from gklearn.utils import Timer
+import time
+from tqdm import tqdm
+import sys
+import networkx as nx
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
+
+
+class MedianGraphEstimatorPy(object): # @todo: distinguish dummy_node from undefined node?
+	"""Estimate median graphs using the pure Python version of GEDEnv.
+	"""
+
+	def __init__(self, ged_env, constant_node_costs):
+		"""Constructor.
+
+		Parameters
+		----------
+		ged_env : gklearn.ged.env.ged_env.GEDEnv
+			Initialized GED environment. The edit costs must be set by the user.
+
+		constant_node_costs : Boolean
+			Set to True if the node relabeling costs are constant.
+ """ + self.__ged_env = ged_env + self.__init_method = 'BRANCH_FAST' + self.__init_options = '' + self.__descent_method = 'BRANCH_FAST' + self.__descent_options = '' + self.__refine_method = 'IPFP' + self.__refine_options = '' + self.__constant_node_costs = constant_node_costs + self.__labeled_nodes = (ged_env.get_num_node_labels() > 1) + self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1, to_dict=False)) + self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1, to_dict=False)) + self.__labeled_edges = (ged_env.get_num_edge_labels() > 1) + self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1, to_dict=False)) + self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1, to_dict=False)) + self.__init_type = 'RANDOM' + self.__num_random_inits = 10 + self.__desired_num_random_inits = 10 + self.__use_real_randomness = True + self.__seed = 0 + self.__parallel = True + self.__update_order = True + self.__sort_graphs = True # sort graphs by size when computing GEDs. + self.__refine = True + self.__time_limit_in_sec = 0 + self.__epsilon = 0.0001 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__num_inits_increase_order = 10 + self.__init_type_increase_order = 'K-MEANS++' + self.__max_itrs_increase_order = 10 + self.__print_to_stdout = 2 + self.__median_id = np.inf # @todo: check + self.__node_maps_from_median = {} + self.__sum_of_distances = 0 + self.__best_init_sum_of_distances = np.inf + self.__converged_sum_of_distances = np.inf + self.__runtime = None + self.__runtime_initialized = None + self.__runtime_converged = None + self.__itrs = [] # @todo: check: {} ? + self.__num_decrease_order = 0 + self.__num_increase_order = 0 + self.__num_converged_descents = 0 + self.__state = AlgorithmState.TERMINATED + self.__label_names = {} + + if ged_env is None: + raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') + elif not ged_env.is_initialized(): + raise Exception('The GED environment is uninitialized. Call gedlibpy.GEDEnv.init() before passing it to the constructor of MedianGraphEstimator.') + + + def set_options(self, options): + """Sets the options of the estimator. + + Parameters + ---------- + options : string + String that specifies with which options to run the estimator. + """ + self.__set_default_options() + options_map = misc.options_string_to_options_map(options) + for opt_name, opt_val in options_map.items(): + if opt_name == 'init-type': + self.__init_type = opt_val + if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN': + raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"') + elif opt_name == 'random-inits': + try: + self.__num_random_inits = int(opt_val) + self.__desired_num_random_inits = self.__num_random_inits + except: + raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') + + if self.__num_random_inits <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') + + elif opt_name == 'randomness': + if opt_val == 'PSEUDO': + self.__use_real_randomness = False + + elif opt_val == 'REAL': + self.__use_real_randomness = True + + else: + raise Exception('Invalid argument "' + opt_val + '" for option randomness. 
Usage: options = "[--randomness REAL|PSEUDO] [...]"') + + elif opt_name == 'stdout': + if opt_val == '0': + self.__print_to_stdout = 0 + + elif opt_val == '1': + self.__print_to_stdout = 1 + + elif opt_val == '2': + self.__print_to_stdout = 2 + + else: + raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') + + elif opt_name == 'parallel': + if opt_val == 'TRUE': + self.__parallel = True + + elif opt_val == 'FALSE': + self.__parallel = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option parallel. Usage: options = "[--parallel TRUE|FALSE] [...]"') + + elif opt_name == 'update-order': + if opt_val == 'TRUE': + self.__update_order = True + + elif opt_val == 'FALSE': + self.__update_order = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option update-order. Usage: options = "[--update-order TRUE|FALSE] [...]"') + + elif opt_name == 'sort-graphs': + if opt_val == 'TRUE': + self.__sort_graphs = True + + elif opt_val == 'FALSE': + self.__sort_graphs = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option sort-graphs. Usage: options = "[--sort-graphs TRUE|FALSE] [...]"') + + elif opt_name == 'refine': + if opt_val == 'TRUE': + self.__refine = True + + elif opt_val == 'FALSE': + self.__refine = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"') + + elif opt_name == 'time-limit': + try: + self.__time_limit_in_sec = float(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit ] [...]') + + elif opt_name == 'max-itrs': + try: + self.__max_itrs = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs ] [...]') + + elif opt_name == 'max-itrs-without-update': + try: + self.__max_itrs_without_update = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update ] [...]') + + elif opt_name == 'seed': + try: + self.__seed = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed ] [...]') + + elif opt_name == 'epsilon': + try: + self.__epsilon = float(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') + + if self.__epsilon <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') + + elif opt_name == 'inits-increase-order': + try: + self.__num_inits_increase_order = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') + + if self.__num_inits_increase_order <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') + + elif opt_name == 'init-type-increase-order': + self.__init_type_increase_order = opt_val + if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': + raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. 
Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') + + elif opt_name == 'max-itrs-increase-order': + try: + self.__max_itrs_increase_order = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order ] [...]') + + else: + valid_options = '[--init-type ] [--random-inits ] [--randomness ] [--seed ] [--stdout ] ' + valid_options += '[--time-limit ] [--max-itrs ] [--epsilon ] ' + valid_options += '[--inits-increase-order ] [--init-type-increase-order ] [--max-itrs-increase-order ]' + raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"') + + + def set_init_method(self, init_method, init_options={}): + """Selects method to be used for computing the initial medoid graph. + + Parameters + ---------- + init_method : string + The selected method. Default: ged::Options::GEDMethod::BRANCH_UNIFORM. + + init_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect unless "--init-type MEDOID" is passed to set_options(). + """ + self.__init_method = init_method; + self.__init_options = init_options; + + + def set_descent_method(self, descent_method, descent_options=''): + """Selects method to be used for block gradient descent.. + + Parameters + ---------- + descent_method : string + The selected method. Default: ged::Options::GEDMethod::BRANCH_FAST. + + descent_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect unless "--init-type MEDOID" is passed to set_options(). + """ + self.__descent_method = descent_method; + self.__descent_options = descent_options; + + + def set_refine_method(self, refine_method, refine_options): + """Selects method to be used for improving the sum of distances and the node maps for the converged median. + + Parameters + ---------- + refine_method : string + The selected method. Default: "IPFP". + + refine_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect if "--refine FALSE" is passed to set_options(). + """ + self.__refine_method = refine_method + self.__refine_options = refine_options + + + def run(self, graph_ids, set_median_id, gen_median_id): + """Computes a generalized median graph. + + Parameters + ---------- + graph_ids : list[integer] + The IDs of the graphs for which the median should be computed. Must have been added to the environment passed to the constructor. + + set_median_id : integer + The ID of the computed set-median. A dummy graph with this ID must have been added to the environment passed to the constructor. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). + + + gen_median_id : integer + The ID of the computed generalized median. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). + """ + # Sanity checks. + if len(graph_ids) == 0: + raise Exception('Empty vector of graph IDs, unable to compute median.') + all_graphs_empty = True + for graph_id in graph_ids: + if self.__ged_env.get_graph_num_nodes(graph_id) > 0: + all_graphs_empty = False + break + if all_graphs_empty: + raise Exception('All graphs in the collection are empty.') + + # Start timer and record start time. 
+ start = time.time() + timer = Timer(self.__time_limit_in_sec) + self.__median_id = gen_median_id + self.__state = AlgorithmState.TERMINATED + + # Get NetworkX graph representations of the input graphs. + graphs = {} + for graph_id in graph_ids: + # @todo: get_nx_graph() function may need to be modified according to the coming code. + graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id) +# print(self.__ged_env.get_graph_internal_id(0)) +# print(graphs[0].graph) +# print(graphs[0].nodes(data=True)) +# print(graphs[0].edges(data=True)) +# print(nx.adjacency_matrix(graphs[0])) + + # Construct initial medians. + medians = [] + self.__construct_initial_medians(graph_ids, timer, medians) + end_init = time.time() + self.__runtime_initialized = end_init - start +# print(medians[0].graph) +# print(medians[0].nodes(data=True)) +# print(medians[0].edges(data=True)) +# print(nx.adjacency_matrix(medians[0])) + + # Reset information about iterations and number of times the median decreases and increases. + self.__itrs = [0] * len(medians) + self.__num_decrease_order = 0 + self.__num_increase_order = 0 + self.__num_converged_descents = 0 + + # Initialize the best median. + best_sum_of_distances = np.inf + self.__best_init_sum_of_distances = np.inf + node_maps_from_best_median = {} + + # Run block gradient descent from all initial medians. + self.__ged_env.set_method(self.__descent_method, self.__descent_options) + for median_pos in range(0, len(medians)): + + # Terminate if the timer has expired and at least one SOD has been computed. + if timer.expired() and median_pos > 0: + break + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('-----------------------------------------------------------') + + # Get reference to the median. + median = medians[median_pos] + + # Load initial median into the environment. + self.__ged_env.load_nx_graph(median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + + # Compute node maps and sum of distances for initial median. +# xxx = self.__node_maps_from_median + self.__compute_init_node_maps(graph_ids, gen_median_id) +# yyy = self.__node_maps_from_median + + self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) + self.__ged_env.load_nx_graph(median, set_median_id) +# print(self.__best_init_sum_of_distances) + + # Run block gradient descent from initial median. + converged = False + itrs_without_update = 0 + while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update): + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('-----------------------------------------------------------') + + # Initialize flags that tell us what happened in the iteration. + median_modified = False + node_maps_modified = False + decreased_order = False + increased_order = False + + # Update the median. 
+ median_modified = self.__update_median(graphs, median) + if self.__update_order: + if not median_modified or self.__itrs[median_pos] == 0: + decreased_order = self.__decrease_order(graphs, median) + if not decreased_order or self.__itrs[median_pos] == 0: + increased_order = self.__increase_order(graphs, median) + + # Update the number of iterations without update of the median. + if median_modified or decreased_order or increased_order: + itrs_without_update = 0 + else: + itrs_without_update += 1 + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Loading median to environment: ... ', end='') + + # Load the median into the environment. + # @todo: should this function use the original node label? + self.__ged_env.load_nx_graph(median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Updating induced costs: ... ', end='') + + # Compute induced costs of the old node maps w.r.t. the updated median. + for graph_id in graph_ids: +# print(self.__node_maps_from_median[graph_id].induced_cost()) +# xxx = self.__node_maps_from_median[graph_id] + self.__ged_env.compute_induced_cost(gen_median_id, graph_id, self.__node_maps_from_median[graph_id]) +# print('---------------------------------------') +# print(self.__node_maps_from_median[graph_id].induced_cost()) + # @todo:!!!!!!!!!!!!!!!!!!!!!!!!!!!!This value is a slight different from the c++ program, which might be a bug! Use it very carefully! + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Update the node maps. + node_maps_modified = self.__update_node_maps() + + # Update the order of the median if no improvement can be found with the current order. + + # Update the sum of distances. + old_sum_of_distances = self.__sum_of_distances + self.__sum_of_distances = 0 + for graph_id, node_map in self.__node_maps_from_median.items(): + self.__sum_of_distances += node_map.induced_cost() +# print(self.__sum_of_distances) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Old local SOD: ', old_sum_of_distances) + print('New local SOD: ', self.__sum_of_distances) + print('Best converged SOD: ', best_sum_of_distances) + print('Modified median: ', median_modified) + print('Modified node maps: ', node_maps_modified) + print('Decreased order: ', decreased_order) + print('Increased order: ', increased_order) + print('===========================================================\n') + + converged = not (median_modified or node_maps_modified or decreased_order or increased_order) + + self.__itrs[median_pos] += 1 + + # Update the best median. + if self.__sum_of_distances < best_sum_of_distances: + best_sum_of_distances = self.__sum_of_distances + node_maps_from_best_median = self.__node_maps_from_median.copy() # @todo: this is a shallow copy, not sure if it is enough. + best_median = median + + # Update the number of converged descents. + if converged: + self.__num_converged_descents += 1 + + # Store the best encountered median. 
+ self.__sum_of_distances = best_sum_of_distances + self.__node_maps_from_median = node_maps_from_best_median + self.__ged_env.load_nx_graph(best_median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + end_descent = time.time() + self.__runtime_converged = end_descent - start + + # Refine the sum of distances and the node maps for the converged median. + self.__converged_sum_of_distances = self.__sum_of_distances + if self.__refine: + self.__improve_sum_of_distances(timer) + + # Record end time, set runtime and reset the number of initial medians. + end = time.time() + self.__runtime = end - start + self.__num_random_inits = self.__desired_num_random_inits + + # Print global information. + if self.__print_to_stdout != 0: + print('\n===========================================================') + print('Finished computation of generalized median graph.') + print('-----------------------------------------------------------') + print('Best SOD after initialization: ', self.__best_init_sum_of_distances) + print('Converged SOD: ', self.__converged_sum_of_distances) + if self.__refine: + print('Refined SOD: ', self.__sum_of_distances) + print('Overall runtime: ', self.__runtime) + print('Runtime of initialization: ', self.__runtime_initialized) + print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized) + if self.__refine: + print('Runtime of refinement: ', self.__runtime - self.__runtime_converged) + print('Number of initial medians: ', len(medians)) + total_itr = 0 + num_started_descents = 0 + for itr in self.__itrs: + total_itr += itr + if itr > 0: + num_started_descents += 1 + print('Size of graph collection: ', len(graph_ids)) + print('Number of started descents: ', num_started_descents) + print('Number of converged descents: ', self.__num_converged_descents) + print('Overall number of iterations: ', total_itr) + print('Overall number of times the order decreased: ', self.__num_decrease_order) + print('Overall number of times the order increased: ', self.__num_increase_order) + print('===========================================================\n') + + + def __improve_sum_of_distances(self, timer): # @todo: go through and test + # Use method selected for refinement phase. + self.__ged_env.set_method(self.__refine_method, self.__refine_options) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress = tqdm(desc='Improving node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + print('\n===========================================================') + print('Improving node maps and SOD for converged median.') + print('-----------------------------------------------------------') + progress.update(1) + + # Improving the node maps. 
+		nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id)
+		for graph_id, node_map in self.__node_maps_from_median.items():
+			if timer.expired():
+				if self.__state == AlgorithmState.TERMINATED:
+					self.__state = AlgorithmState.CONVERGED
+				break
+
+			nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id)
+			if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs:
+				self.__ged_env.run_method(self.__median_id, graph_id)
+				if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost():
+					self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id)
+			else:
+				self.__ged_env.run_method(graph_id, self.__median_id)
+				if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost():
+					node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id)
+					node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map
+					self.__node_maps_from_median[graph_id] = node_map_tmp
+
+			# Print information.
+			if self.__print_to_stdout == 2:
+				progress.update(1)
+
+		self.__sum_of_distances = 0.0
+		for key, val in self.__node_maps_from_median.items():
+			self.__sum_of_distances += val.induced_cost()
+
+		# Print information.
+		if self.__print_to_stdout == 2:
+			print('===========================================================\n')
+
+
+	def __median_available(self):
+		return self.__median_id != np.inf
+
+
+	def get_state(self):
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_state().')
+		return self.__state
+
+
+	def get_sum_of_distances(self, state=''):
+		"""Returns the sum of distances.
+
+		Parameters
+		----------
+		state : string
+			The state of the estimator. Can be 'initialized' or 'converged'. Default: ""
+
+		Returns
+		-------
+		float
+			The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned.
+		"""
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().')
+		if state == 'initialized':
+			return self.__best_init_sum_of_distances
+		if state == 'converged':
+			return self.__converged_sum_of_distances
+		return self.__sum_of_distances
+
+
+	def get_runtime(self, state):
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_runtime().')
+		if state == AlgorithmState.INITIALIZED:
+			return self.__runtime_initialized
+		if state == AlgorithmState.CONVERGED:
+			return self.__runtime_converged
+		return self.__runtime
+
+
+	def get_num_itrs(self):
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_num_itrs().')
+		return self.__itrs
+
+
+	def get_num_times_order_decreased(self):
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_num_times_order_decreased().')
+		return self.__num_decrease_order
+
+
+	def get_num_times_order_increased(self):
+		if not self.__median_available():
+			raise Exception('No median has been computed.
Call run() before calling get_num_times_order_increased().') + return self.__num_increase_order + + + def get_num_converged_descents(self): + if not self.__median_available(): + raise Exception('No median has been computed. Call run() before calling get_num_converged_descents().') + return self.__num_converged_descents + + + def get_ged_env(self): + return self.__ged_env + + + def __set_default_options(self): + self.__init_type = 'RANDOM' + self.__num_random_inits = 10 + self.__desired_num_random_inits = 10 + self.__use_real_randomness = True + self.__seed = 0 + self.__parallel = True + self.__update_order = True + self.__sort_graphs = True + self.__refine = True + self.__time_limit_in_sec = 0 + self.__epsilon = 0.0001 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__num_inits_increase_order = 10 + self.__init_type_increase_order = 'K-MEANS++' + self.__max_itrs_increase_order = 10 + self.__print_to_stdout = 2 + self.__label_names = {} + + + def __construct_initial_medians(self, graph_ids, timer, initial_medians): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Constructing initial median(s).') + print('-----------------------------------------------------------') + + # Compute or sample the initial median(s). + initial_medians.clear() + if self.__init_type == 'MEDOID': + self.__compute_medoid(graph_ids, timer, initial_medians) + elif self.__init_type == 'MAX': + pass # @todo +# compute_max_order_graph_(graph_ids, initial_medians) + elif self.__init_type == 'MIN': + pass # @todo +# compute_min_order_graph_(graph_ids, initial_medians) + elif self.__init_type == 'MEAN': + pass # @todo +# compute_mean_order_graph_(graph_ids, initial_medians) + else: + pass # @todo +# sample_initial_medians_(graph_ids, initial_medians) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('===========================================================') + + + def __compute_medoid(self, graph_ids, timer, initial_medians): + # Use method selected for initialization phase. + self.__ged_env.set_method(self.__init_method, self.__init_options) + + # Compute the medoid. + if self.__parallel: + # @todo: notice when parallel self.__ged_env is not modified. + sum_of_distances_list = [np.inf] * len(graph_ids) + len_itr = len(graph_ids) + itr = zip(graph_ids, range(0, len(graph_ids))) + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(ged_env_toshare): + global G_ged_env + G_ged_env = ged_env_toshare + do_fun = partial(_compute_medoid_parallel, graph_ids, self.__sort_graphs) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) + if self.__print_to_stdout == 2: + iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), + desc='Computing medoid', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_fun, itr, chunksize) + for i, dis in iterator: + sum_of_distances_list[i] = dis + pool.close() + pool.join() + + medoid_id = np.argmin(sum_of_distances_list) + best_sum_of_distances = sum_of_distances_list[medoid_id] + + initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + + else: + # Print information about current iteration. 
+ if self.__print_to_stdout == 2: + progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout) + + medoid_id = graph_ids[0] + best_sum_of_distances = np.inf + for g_id in graph_ids: + if timer.expired(): + self.__state = AlgorithmState.CALLED + break + nb_nodes_g = self.__ged_env.get_graph_num_nodes(g_id) + sum_of_distances = 0 + for h_id in graph_ids: # @todo: this can be faster, only a half is needed. + nb_nodes_h = self.__ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not self.__sort_graphs: + self.__ged_env.run_method(g_id, h_id) # @todo + sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) + else: + self.__ged_env.run_method(h_id, g_id) + sum_of_distances += self.__ged_env.get_upper_bound(h_id, g_id) + if sum_of_distances < best_sum_of_distances: + best_sum_of_distances = sum_of_distances + medoid_id = g_id + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + + def __compute_init_node_maps(self, graph_ids, gen_median_id): + # Compute node maps and sum of distances for initial median. + if self.__parallel: + # @todo: notice when parallel self.__ged_env is not modified. + self.__sum_of_distances = 0 + self.__node_maps_from_median.clear() + sum_of_distances_list = [0] * len(graph_ids) + + len_itr = len(graph_ids) + itr = graph_ids + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(ged_env_toshare): + global G_ged_env + G_ged_env = ged_env_toshare + nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self.__sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) + if self.__print_to_stdout == 2: + iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), + desc='Computing initial node maps', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_fun, itr, chunksize) + for g_id, sod, node_maps in iterator: + sum_of_distances_list[g_id] = sod + self.__node_maps_from_median[g_id] = node_maps + pool.close() + pool.join() + + self.__sum_of_distances = np.sum(sum_of_distances_list) +# xxx = self.__node_maps_from_median + + else: + # Print information about current iteration. 
+ if self.__print_to_stdout == 2: + progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout) + + self.__sum_of_distances = 0 + self.__node_maps_from_median.clear() + nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + for graph_id in graph_ids: + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(gen_median_id, graph_id) + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) + else: + self.__ged_env.run_method(graph_id, gen_median_id) + node_map_tmp = self.__ged_env.get_node_map(graph_id, gen_median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp + # print(self.__node_maps_from_median[graph_id]) + self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() + # print(self.__sum_of_distances) + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + + def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): + if self.__state == AlgorithmState.TERMINATED: + self.__state = AlgorithmState.INITIALIZED + return True + return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) + + + def __update_median(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Updating median: ', end='') + + # Store copy of the old median. + old_median = median.copy() # @todo: this is just a shallow copy. + + # Update the node labels. + if self.__labeled_nodes: + self.__update_node_labels(graphs, median) + + # Update the edges and their labels. + self.__update_edges(graphs, median) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + return not self.__are_graphs_equal(median, old_median) + + + def __update_node_labels(self, graphs, median): +# print('----------------------------') + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('nodes ... ', end='') + + # Iterate through all nodes of the median. + for i in range(0, nx.number_of_nodes(median)): +# print('i: ', i) + # Collect the labels of the substituted nodes. + node_labels = [] + for graph_id, graph in graphs.items(): +# print('graph_id: ', graph_id) +# print(self.__node_maps_from_median[graph_id]) +# print(self.__node_maps_from_median[graph_id].forward_map, self.__node_maps_from_median[graph_id].backward_map) + k = self.__node_maps_from_median[graph_id].image(i) +# print('k: ', k) + if k != np.inf: + node_labels.append(graph.nodes[k]) + + # Compute the median label and update the median. + if len(node_labels) > 0: +# median_label = self.__ged_env.get_median_node_label(node_labels) + median_label = self.__get_median_node_label(node_labels) + if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: + nx.set_node_attributes(median, {i: median_label}) + + + def __update_edges(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('edges ... 
', end='') + +# # Clear the adjacency lists of the median and reset number of edges to 0. +# median_edges = list(median.edges) +# for (head, tail) in median_edges: +# median.remove_edge(head, tail) + + # @todo: what if edge is not labeled? + # Iterate through all possible edges (i,j) of the median. + for i in range(0, nx.number_of_nodes(median)): + for j in range(i + 1, nx.number_of_nodes(median)): + + # Collect the labels of the edges to which (i,j) is mapped by the node maps. + edge_labels = [] + for graph_id, graph in graphs.items(): + k = self.__node_maps_from_median[graph_id].image(i) + l = self.__node_maps_from_median[graph_id].image(j) + if k != np.inf and l != np.inf: + if graph.has_edge(k, l): + edge_labels.append(graph.edges[(k, l)]) + + # Compute the median edge label and the overall edge relabeling cost. + rel_cost = 0 + median_label = self.__ged_env.get_edge_label(1, to_dict=True) + if median.has_edge(i, j): + median_label = median.edges[(i, j)] + if self.__labeled_edges and len(edge_labels) > 0: + new_median_label = self.__get_median_edge_label(edge_labels) + if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: + median_label = new_median_label + for edge_label in edge_labels: + rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label) + + # Update the median. + if median.has_edge(i, j): + median.remove_edge(i, j) + if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs): + median.add_edge(i, j, **median_label) +# else: +# if median.has_edge(i, j): +# median.remove_edge(i, j) + + + def __update_node_maps(self): + # Update the node maps. + if self.__parallel: + # @todo: notice when parallel self.__ged_env is not modified. + node_maps_were_modified = False +# xxx = self.__node_maps_from_median.copy() + + len_itr = len(self.__node_maps_from_median) + itr = [item for item in self.__node_maps_from_median.items()] + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(ged_env_toshare): + global G_ged_env + G_ged_env = ged_env_toshare + nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) + do_fun = partial(_update_node_maps_parallel, self.__median_id, self.__epsilon, self.__sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) + if self.__print_to_stdout == 2: + iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), + desc='Updating node maps', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_fun, itr, chunksize) + for g_id, node_map, nm_modified in iterator: + self.__node_maps_from_median[g_id] = node_map + if nm_modified: + node_maps_were_modified = True + pool.close() + pool.join() +# yyy = self.__node_maps_from_median.copy() + + else: + # Print information about current iteration. 
+ if self.__print_to_stdout == 2: + progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + + node_maps_were_modified = False + nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) + for graph_id, node_map in self.__node_maps_from_median.items(): + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(self.__median_id, graph_id) + if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon: + # xxx = self.__node_maps_from_median[graph_id] + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) + node_maps_were_modified = True + + else: + self.__ged_env.run_method(graph_id, self.__median_id) + if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost() - self.__epsilon: + node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp + node_maps_were_modified = True + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + # Return true if the node maps were modified. + return node_maps_were_modified + + + def __decrease_order(self, graphs, median): + # Print information about current iteration + if self.__print_to_stdout == 2: + print('Trying to decrease order: ... ', end='') + + if nx.number_of_nodes(median) <= 1: + if self.__print_to_stdout == 2: + print('median graph has only 1 node, skip decrease.') + return False + + # Initialize ID of the node that is to be deleted. + id_deleted_node = [None] # @todo: or np.inf + decreased_order = False + + # Decrease the order as long as the best deletion delta is negative. + while self.__compute_best_deletion_delta(graphs, median, id_deleted_node) < -self.__epsilon: + decreased_order = True + self.__delete_node_from_median(id_deleted_node[0], median) + if nx.number_of_nodes(median) <= 1: + if self.__print_to_stdout == 2: + print('decrease stopped because median graph remains only 1 node. ', end='') + break + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Return true iff the order was decreased. + return decreased_order + + + def __compute_best_deletion_delta(self, graphs, median, id_deleted_node): + best_delta = 0.0 + + # Determine node that should be deleted (if any). + for i in range(0, nx.number_of_nodes(median)): + # Compute cost delta. + delta = 0.0 + for graph_id, graph in graphs.items(): + k = self.__node_maps_from_median[graph_id].image(i) + if k == np.inf: + delta -= self.__node_del_cost + else: + delta += self.__node_ins_cost - self.__ged_env.get_node_rel_cost(median.nodes[i], graph.nodes[k]) + for j, j_label in median[i].items(): + l = self.__node_maps_from_median[graph_id].image(j) + if k == np.inf or l == np.inf: + delta -= self.__edge_del_cost + elif not graph.has_edge(k, l): + delta -= self.__edge_del_cost + else: + delta += self.__edge_ins_cost - self.__ged_env.get_edge_rel_cost(j_label, graph.edges[(k, l)]) + + # Update best deletion delta. 
+ if delta < best_delta - self.__epsilon: + best_delta = delta + id_deleted_node[0] = i +# id_deleted_node[0] = 3 # @todo: + + return best_delta + + + def __delete_node_from_median(self, id_deleted_node, median): + # Update the median. + mapping = {} + for i in range(0, nx.number_of_nodes(median)): + if i != id_deleted_node: + new_i = (i if i < id_deleted_node else (i - 1)) + mapping[i] = new_i + median.remove_node(id_deleted_node) + nx.relabel_nodes(median, mapping, copy=False) + + # Update the node maps. +# xxx = self.__node_maps_from_median + for key, node_map in self.__node_maps_from_median.items(): + new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) + is_unassigned_target_node = [True] * node_map.num_target_nodes() + for i in range(0, nx.number_of_nodes(median) + 1): + if i != id_deleted_node: + new_i = (i if i < id_deleted_node else (i - 1)) + k = node_map.image(i) + new_node_map.add_assignment(new_i, k) + if k != np.inf: + is_unassigned_target_node[k] = False + for k in range(0, node_map.num_target_nodes()): + if is_unassigned_target_node[k]: + new_node_map.add_assignment(np.inf, k) +# print(self.__node_maps_from_median[key].forward_map, self.__node_maps_from_median[key].backward_map) +# print(new_node_map.forward_map, new_node_map.backward_map + self.__node_maps_from_median[key] = new_node_map + + # Increase overall number of decreases. + self.__num_decrease_order += 1 + + + def __increase_order(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Trying to increase order: ... ', end='') + + # Initialize the best configuration and the best label of the node that is to be inserted. + best_config = {} + best_label = self.__ged_env.get_node_label(1, to_dict=True) + increased_order = False + + # Increase the order as long as the best insertion delta is negative. + while self.__compute_best_insertion_delta(graphs, best_config, best_label) < - self.__epsilon: + increased_order = True + self.__add_node_to_median(best_config, best_label, median) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Return true iff the order was increased. + return increased_order + + + def __compute_best_insertion_delta(self, graphs, best_config, best_label): + # Construct sets of inserted nodes. + no_inserted_node = True + inserted_nodes = {} + for graph_id, graph in graphs.items(): + inserted_nodes[graph_id] = [] + best_config[graph_id] = np.inf + for k in range(nx.number_of_nodes(graph)): + if self.__node_maps_from_median[graph_id].pre_image(k) == np.inf: + no_inserted_node = False + inserted_nodes[graph_id].append((k, tuple(item for item in graph.nodes[k].items()))) # @todo: can order of label names be garantteed? + + # Return 0.0 if no node is inserted in any of the graphs. + if no_inserted_node: + return 0.0 + + # Compute insertion configuration, label, and delta. + best_delta = 0.0 # @todo + if len(self.__label_names['node_labels']) == 0 and len(self.__label_names['node_attrs']) == 0: # @todo + best_delta = self.__compute_insertion_delta_unlabeled(inserted_nodes, best_config, best_label) + elif len(self.__label_names['node_labels']) > 0: # self.__constant_node_costs: + best_delta = self.__compute_insertion_delta_constant(inserted_nodes, best_config, best_label) + else: + best_delta = self.__compute_insertion_delta_generic(inserted_nodes, best_config, best_label) + + # Return the best delta. 
+ return best_delta + + + def __compute_insertion_delta_unlabeled(self, inserted_nodes, best_config, best_label): # @todo: go through and test. + # Construct the nest configuration and compute its insertion delta. + best_delta = 0.0 + best_config.clear() + for graph_id, node_set in inserted_nodes.items(): + if len(node_set) == 0: + best_config[graph_id] = np.inf + best_delta += self.__node_del_cost + else: + best_config[graph_id] = node_set[0][0] + best_delta -= self.__node_ins_cost + + # Return the best insertion delta. + return best_delta + + + def __compute_insertion_delta_constant(self, inserted_nodes, best_config, best_label): + # Construct histogram and inverse label maps. + hist = {} + inverse_label_maps = {} + for graph_id, node_set in inserted_nodes.items(): + inverse_label_maps[graph_id] = {} + for node in node_set: + k = node[0] + label = node[1] + if label not in inverse_label_maps[graph_id]: + inverse_label_maps[graph_id][label] = k + if label not in hist: + hist[label] = 1 + else: + hist[label] += 1 + + # Determine the best label. + best_count = 0 + for key, val in hist.items(): + if val > best_count: + best_count = val + best_label_tuple = key + + # get best label. + best_label.clear() + for key, val in best_label_tuple: + best_label[key] = val + + # Construct the best configuration and compute its insertion delta. + best_config.clear() + best_delta = 0.0 + node_rel_cost = self.__ged_env.get_node_rel_cost(self.__ged_env.get_node_label(1, to_dict=False), self.__ged_env.get_node_label(2, to_dict=False)) + triangle_ineq_holds = (node_rel_cost <= self.__node_del_cost + self.__node_ins_cost) + for graph_id, _ in inserted_nodes.items(): + if best_label_tuple in inverse_label_maps[graph_id]: + best_config[graph_id] = inverse_label_maps[graph_id][best_label_tuple] + best_delta -= self.__node_ins_cost + elif triangle_ineq_holds and not len(inserted_nodes[graph_id]) == 0: + best_config[graph_id] = inserted_nodes[graph_id][0][0] + best_delta += node_rel_cost - self.__node_ins_cost + else: + best_config[graph_id] = np.inf + best_delta += self.__node_del_cost + + # Return the best insertion delta. + return best_delta + + + def __compute_insertion_delta_generic(self, inserted_nodes, best_config, best_label): + # Collect all node labels of inserted nodes. + node_labels = [] + for _, node_set in inserted_nodes.items(): + for node in node_set: + node_labels.append(node[1]) + + # Compute node label medians that serve as initial solutions for block gradient descent. + initial_node_labels = [] + self.__compute_initial_node_labels(node_labels, initial_node_labels) + + # Determine best insertion configuration, label, and delta via parallel block gradient descent from all initial node labels. + best_delta = 0.0 + for node_label in initial_node_labels: + # Construct local configuration. + config = {} + for graph_id, _ in inserted_nodes.items(): + config[graph_id] = tuple((np.inf, self.__ged_env.get_node_label(1, to_dict=False))) + + # Run block gradient descent. + converged = False + itr = 0 + while not self.__insertion_termination_criterion_met(converged, itr): + converged = not self.__update_config(node_label, inserted_nodes, config, node_labels) + node_label_dict = dict(node_label) + converged = converged and (not self.__update_node_label([dict(item) for item in node_labels], node_label_dict)) # @todo: the dict is tupled again in the function, can be better. + node_label = tuple(item for item in node_label_dict.items()) # @todo: watch out: initial_node_labels[i] is not modified here. 
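+					# (One block-gradient-descent step: the insertion configuration and
+					# the inserted node label are optimized alternately; the loop
+					# terminates once neither block changes anymore, i.e. a local
+					# minimum of the insertion delta has been reached.)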
+
+					itr += 1
+
+				# Compute insertion delta of converged solution.
+				delta = 0.0
+				for _, node in config.items():
+					if node[0] == np.inf:
+						delta += self.__node_del_cost
+					else:
+						delta += self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost
+
+				# Update best delta and global configuration if improvement has been found.
+				if delta < best_delta - self.__epsilon:
+					best_delta = delta
+					best_label.clear()
+					for key, val in node_label:
+						best_label[key] = val
+					best_config.clear()
+					for graph_id, val in config.items():
+						best_config[graph_id] = val[0]
+
+		# Return the best delta.
+		return best_delta
+
+
+	def __compute_initial_node_labels(self, node_labels, median_labels):
+		median_labels.clear()
+		if self.__use_real_randomness: # @todo: may not work if parallelized.
+			rng = np.random.randint(0, high=2**32 - 1, size=1)
+			urng = np.random.RandomState(seed=rng[0])
+		else:
+			urng = np.random.RandomState(seed=self.__seed)
+
+		# Generate the initial node label medians.
+		if self.__init_type_increase_order == 'K-MEANS++':
+			# Use k-means++ heuristic to generate the initial node label medians.
+			already_selected = [False] * len(node_labels)
+			selected_label_id = urng.randint(low=0, high=len(node_labels), size=1)[0] # c++ test: 23
+			median_labels.append(node_labels[selected_label_id])
+			already_selected[selected_label_id] = True
+#			xxx = [41, 0, 18, 9, 6, 14, 21, 25, 33] for c++ test
+#			iii = 0 for c++ test
+			while len(median_labels) < self.__num_inits_increase_order:
+				weights = [np.inf] * len(node_labels)
+				for label_id in range(0, len(node_labels)):
+					if already_selected[label_id]:
+						weights[label_id] = 0
+						continue
+					for label in median_labels:
+						weights[label_id] = min(weights[label_id], self.__ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id])))
+
+				# get non-zero weights.
+				weights_p, idx_p = [], []
+				for i, w in enumerate(weights):
+					if w != 0:
+						weights_p.append(w)
+						idx_p.append(i)
+				if len(weights_p) > 0:
+					p = np.array(weights_p) / np.sum(weights_p)
+					selected_label_id = urng.choice(range(0, len(weights_p)), size=1, p=p)[0] # for c++ test: xxx[iii]
+					selected_label_id = idx_p[selected_label_id]
+#					iii += 1 for c++ test
+					median_labels.append(node_labels[selected_label_id])
+					already_selected[selected_label_id] = True
+				else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self.__num_inits_increase_order.
+					break
+		else:
+			# Compute the initial node medians as the medians of randomly generated clusters of (roughly) equal size.
+			# @todo: go through and test.
+			# Shuffle a copy of the node labels (the std::shuffle of the C++ version).
+			shuffled_node_labels = [node_labels[i] for i in urng.permutation(len(node_labels))]
+			cluster_size = len(node_labels) / self.__num_inits_increase_order
+			pos = 0
+			cluster = []
+			while len(median_labels) < self.__num_inits_increase_order - 1:
+				while pos < (len(median_labels) + 1) * cluster_size:
+					cluster.append(shuffled_node_labels[pos])
+					pos += 1
+				median_labels.append(self.__get_median_node_label(cluster))
+				cluster.clear()
+			while pos < len(shuffled_node_labels):
+				cluster.append(shuffled_node_labels[pos])
+				pos += 1
+			median_labels.append(self.__get_median_node_label(cluster))
+			cluster.clear()
+
+		# Run Lloyd's Algorithm.
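+		# (Each iteration assigns every node label to its closest median label
+		# (__update_clusters) and then recomputes the median of each cluster
+		# (__update_node_label), exactly as in k-means; the loop stops when the
+		# assignment no longer changes or the iteration limit is reached.)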
+ converged = False + closest_median_ids = [np.inf] * len(node_labels) + clusters = [[] for _ in range(len(median_labels))] + itr = 1 + while not self.__insertion_termination_criterion_met(converged, itr): + converged = not self.__update_clusters(node_labels, median_labels, closest_median_ids) + if not converged: + for cluster in clusters: + cluster.clear() + for label_id in range(0, len(node_labels)): + clusters[closest_median_ids[label_id]].append(node_labels[label_id]) + for cluster_id in range(0, len(clusters)): + node_label = dict(median_labels[cluster_id]) + self.__update_node_label([dict(item) for item in clusters[cluster_id]], node_label) # @todo: the dict is tupled again in the function, can be better. + median_labels[cluster_id] = tuple(item for item in node_label.items()) + itr += 1 + + + def __insertion_termination_criterion_met(self, converged, itr): + return converged or (itr >= self.__max_itrs_increase_order if self.__max_itrs_increase_order > 0 else False) + + + def __update_config(self, node_label, inserted_nodes, config, node_labels): + # Determine the best configuration. + config_modified = False + for graph_id, node_set in inserted_nodes.items(): + best_assignment = config[graph_id] + best_cost = 0.0 + if best_assignment[0] == np.inf: + best_cost = self.__node_del_cost + else: + best_cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(best_assignment[1])) - self.__node_ins_cost + for node in node_set: + cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost + if cost < best_cost - self.__epsilon: + best_cost = cost + best_assignment = node + config_modified = True + if self.__node_del_cost < best_cost - self.__epsilon: + best_cost = self.__node_del_cost + best_assignment = tuple((np.inf, best_assignment[1])) + config_modified = True + config[graph_id] = best_assignment + + # Collect the node labels contained in the best configuration. + node_labels.clear() + for key, val in config.items(): + if val[0] != np.inf: + node_labels.append(val[1]) + + # Return true if the configuration was modified. + return config_modified + + + def __update_node_label(self, node_labels, node_label): + if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling __update_config(). + return False + new_node_label = self.__get_median_node_label(node_labels) + if self.__ged_env.get_node_rel_cost(new_node_label, node_label) > self.__epsilon: + node_label.clear() + for key, val in new_node_label.items(): + node_label[key] = val + return True + return False + + + def __update_clusters(self, node_labels, median_labels, closest_median_ids): + # Determine the closest median for each node label. + clusters_modified = False + for label_id in range(0, len(node_labels)): + closest_median_id = np.inf + dist_to_closest_median = np.inf + for median_id in range(0, len(median_labels)): + dist_to_median = self.__ged_env.get_node_rel_cost(dict(median_labels[median_id]), dict(node_labels[label_id])) + if dist_to_median < dist_to_closest_median - self.__epsilon: + dist_to_closest_median = dist_to_median + closest_median_id = median_id + if closest_median_id != closest_median_ids[label_id]: + closest_median_ids[label_id] = closest_median_id + clusters_modified = True + + # Return true if the clusters were modified. + return clusters_modified + + + def __add_node_to_median(self, best_config, best_label, median): + # Update the median. 
+		nb_nodes_median = nx.number_of_nodes(median)
+		median.add_node(nb_nodes_median, **best_label)
+
+		# Update the node maps.
+		for graph_id, node_map in self.__node_maps_from_median.items():
+			node_map_as_rel = []
+			node_map.as_relation(node_map_as_rel)
+			new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes())
+			for assignment in node_map_as_rel:
+				new_node_map.add_assignment(assignment[0], assignment[1])
+			new_node_map.add_assignment(nx.number_of_nodes(median) - 1, best_config[graph_id])
+			self.__node_maps_from_median[graph_id] = new_node_map
+
+		# Increase the overall number of order increases.
+		self.__num_increase_order += 1
+
+
+	def __are_graphs_equal(self, g1, g2):
+		"""
+		Check if the two graphs are equal.
+
+		Parameters
+		----------
+		g1 : NetworkX graph object
+			Graph 1 to be compared.
+
+		g2 : NetworkX graph object
+			Graph 2 to be compared.
+
+		Returns
+		-------
+		bool
+			True if the two graphs are equal.
+
+		Notes
+		-----
+		This is not an identity check. Here the two graphs are equal if and only if their original_node_ids, nodes, all node labels, edges and all edge labels are equal. This function is specifically designed for class `MedianGraphEstimator` and should not be used elsewhere.
+		"""
+		# check original node ids.
+		if not g1.graph['original_node_ids'] == g2.graph['original_node_ids']:
+			return False # @todo: why check this?
+		# check nodes.
+		nlist1 = [n for n in g1.nodes(data=True)] # @todo: shallow?
+		nlist2 = [n for n in g2.nodes(data=True)]
+		if not nlist1 == nlist2:
+			return False
+		# check edges.
+		elist1 = [n for n in g1.edges(data=True)]
+		elist2 = [n for n in g2.edges(data=True)]
+		if not elist1 == elist2:
+			return False
+
+		return True
+
+
+	def compute_my_cost(self, g, h, node_map):
+		# @todo: only a stub so far; the actual cost computation is not implemented yet.
+		cost = 0.0
+		for node in g.nodes:
+			cost += 0
+		return cost
+
+
+	def set_label_names(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
+		self.__label_names = {'node_labels': node_labels, 'edge_labels': edge_labels,
+						'node_attrs': node_attrs, 'edge_attrs': edge_attrs}
+
+
+	def __get_median_node_label(self, node_labels):
+		if len(self.__label_names['node_labels']) > 0:
+			return self.__get_median_label_symbolic(node_labels)
+		elif len(self.__label_names['node_attrs']) > 0:
+			return self.__get_median_label_nonsymbolic(node_labels)
+		else:
+			raise Exception('Node label names are not given.')
+
+
+	def __get_median_edge_label(self, edge_labels):
+		if len(self.__label_names['edge_labels']) > 0:
+			return self.__get_median_label_symbolic(edge_labels)
+		elif len(self.__label_names['edge_attrs']) > 0:
+			return self.__get_median_label_nonsymbolic(edge_labels)
+		else:
+			raise Exception('Edge label names are not given.')
+
+
+	def __get_median_label_symbolic(self, labels):
+		# Construct histogram.
+		hist = {}
+		for label in labels:
+			label = tuple([kv for kv in label.items()]) # @todo: this may be slow.
+			if label not in hist:
+				hist[label] = 1
+			else:
+				hist[label] += 1
+
+		# Return the label that appears most frequently.
+		best_count = 0
+		median_label = {}
+		for label, count in hist.items():
+			if count > best_count:
+				best_count = count
+				median_label = {kv[0]: kv[1] for kv in label}
+
+		return median_label
+
+
+	def __get_median_label_nonsymbolic(self, labels):
+		if len(labels) == 0:
+			return {} # @todo
+		else:
+			# Transform the labels into coordinates and compute mean label as initial solution.
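+			# (Weiszfeld's algorithm computes the geometric median m of points
+			# x_1, ..., x_n by iterating
+			#     m <- (sum_i x_i / ||x_i - m||) / (sum_i 1 / ||x_i - m||),
+			# starting from the arithmetic mean computed here.)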
+ labels_as_coords = [] + sums = {} + for key, val in labels[0].items(): + sums[key] = 0 + for label in labels: + coords = {} + for key, val in label.items(): + label_f = float(val) + sums[key] += label_f + coords[key] = label_f + labels_as_coords.append(coords) + median = {} + for key, val in sums.items(): + median[key] = val / len(labels) + + # Run main loop of Weiszfeld's Algorithm. + epsilon = 0.0001 + delta = 1.0 + num_itrs = 0 + all_equal = False + while ((delta > epsilon) and (num_itrs < 100) and (not all_equal)): + numerator = {} + for key, val in sums.items(): + numerator[key] = 0 + denominator = 0 + for label_as_coord in labels_as_coords: + norm = 0 + for key, val in label_as_coord.items(): + norm += (val - median[key]) ** 2 + norm = np.sqrt(norm) + if norm > 0: + for key, val in label_as_coord.items(): + numerator[key] += val / norm + denominator += 1.0 / norm + if denominator == 0: + all_equal = True + else: + new_median = {} + delta = 0.0 + for key, val in numerator.items(): + this_median = val / denominator + new_median[key] = this_median + delta += np.abs(median[key] - this_median) + median = new_median + + num_itrs += 1 + + # Transform the solution to strings and return it. + median_label = {} + for key, val in median.items(): + median_label[key] = str(val) + return median_label + + +# def __get_median_edge_label_symbolic(self, edge_labels): +# pass + + +# def __get_median_edge_label_nonsymbolic(self, edge_labels): +# if len(edge_labels) == 0: +# return {} +# else: +# # Transform the labels into coordinates and compute mean label as initial solution. +# edge_labels_as_coords = [] +# sums = {} +# for key, val in edge_labels[0].items(): +# sums[key] = 0 +# for edge_label in edge_labels: +# coords = {} +# for key, val in edge_label.items(): +# label = float(val) +# sums[key] += label +# coords[key] = label +# edge_labels_as_coords.append(coords) +# median = {} +# for key, val in sums.items(): +# median[key] = val / len(edge_labels) +# +# # Run main loop of Weiszfeld's Algorithm. +# epsilon = 0.0001 +# delta = 1.0 +# num_itrs = 0 +# all_equal = False +# while ((delta > epsilon) and (num_itrs < 100) and (not all_equal)): +# numerator = {} +# for key, val in sums.items(): +# numerator[key] = 0 +# denominator = 0 +# for edge_label_as_coord in edge_labels_as_coords: +# norm = 0 +# for key, val in edge_label_as_coord.items(): +# norm += (val - median[key]) ** 2 +# norm += np.sqrt(norm) +# if norm > 0: +# for key, val in edge_label_as_coord.items(): +# numerator[key] += val / norm +# denominator += 1.0 / norm +# if denominator == 0: +# all_equal = True +# else: +# new_median = {} +# delta = 0.0 +# for key, val in numerator.items(): +# this_median = val / denominator +# new_median[key] = this_median +# delta += np.abs(median[key] - this_median) +# median = new_median +# +# num_itrs += 1 +# +# # Transform the solution to ged::GXLLabel and return it. +# median_label = {} +# for key, val in median.items(): +# median_label[key] = str(val) +# return median_label + + +def _compute_medoid_parallel(graph_ids, sort, itr): + g_id = itr[0] + i = itr[1] + # @todo: timer not considered here. 
+# if timer.expired(): +# self.__state = AlgorithmState.CALLED +# break + nb_nodes_g = G_ged_env.get_graph_num_nodes(g_id) + sum_of_distances = 0 + for h_id in graph_ids: + nb_nodes_h = G_ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not sort: + G_ged_env.run_method(g_id, h_id) + sum_of_distances += G_ged_env.get_upper_bound(g_id, h_id) + else: + G_ged_env.run_method(h_id, g_id) + sum_of_distances += G_ged_env.get_upper_bound(h_id, g_id) + return i, sum_of_distances + + +def _compute_init_node_maps_parallel(gen_median_id, sort, nb_nodes_median, itr): + graph_id = itr + nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not sort: + G_ged_env.run_method(gen_median_id, graph_id) + node_map = G_ged_env.get_node_map(gen_median_id, graph_id) +# print(self.__node_maps_from_median[graph_id]) + else: + G_ged_env.run_method(graph_id, gen_median_id) + node_map = G_ged_env.get_node_map(graph_id, gen_median_id) + node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map + sum_of_distance = node_map.induced_cost() +# print(self.__sum_of_distances) + return graph_id, sum_of_distance, node_map + + +def _update_node_maps_parallel(median_id, epsilon, sort, nb_nodes_median, itr): + graph_id = itr[0] + node_map = itr[1] + + node_maps_were_modified = False + nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not sort: + G_ged_env.run_method(median_id, graph_id) + if G_ged_env.get_upper_bound(median_id, graph_id) < node_map.induced_cost() - epsilon: + node_map = G_ged_env.get_node_map(median_id, graph_id) + node_maps_were_modified = True + else: + G_ged_env.run_method(graph_id, median_id) + if G_ged_env.get_upper_bound(graph_id, median_id) < node_map.induced_cost() - epsilon: + node_map = G_ged_env.get_node_map(graph_id, median_id) + node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map + node_maps_were_modified = True + + return graph_id, node_map, node_maps_were_modified \ No newline at end of file diff --git a/gklearn/preimage/__init__.py b/gklearn/preimage/__init__.py index 9713a65..a9284b2 100644 --- a/gklearn/preimage/__init__.py +++ b/gklearn/preimage/__init__.py @@ -11,8 +11,9 @@ __author__ = "Linlin Jia" __date__ = "March 2020" from gklearn.preimage.preimage_generator import PreimageGenerator -from gklearn.preimage.median_preimage_generator import MedianPreimageGenerator from gklearn.preimage.random_preimage_generator import RandomPreimageGenerator +from gklearn.preimage.median_preimage_generator import MedianPreimageGenerator +from gklearn.preimage.median_preimage_generator_py import MedianPreimageGeneratorPy from gklearn.preimage.median_preimage_generator_cml import MedianPreimageGeneratorCML from gklearn.preimage.kernel_knn_cv import kernel_knn_cv from gklearn.preimage.generate_random_preimages_by_class import generate_random_preimages_by_class diff --git a/gklearn/preimage/median_preimage_generator_py.py b/gklearn/preimage/median_preimage_generator_py.py new file mode 100644 index 0000000..cdc7a3c --- /dev/null +++ b/gklearn/preimage/median_preimage_generator_py.py @@ -0,0 +1,1035 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Mar 26 18:27:22 2020 + +@author: ljia +""" +import numpy as np +import time +import random +import multiprocessing +import networkx as nx +import cvxpy as cp +from gklearn.preimage import PreimageGenerator +from gklearn.preimage.utils import compute_k_dis +from gklearn.ged.util import 
compute_geds_cml +from gklearn.ged.median import MedianGraphEstimatorPy +from gklearn.ged.median import constant_node_costs, mge_options_to_string +from gklearn.ged.env import GEDEnv +from gklearn.utils import Timer +from gklearn.utils.utils import get_graph_kernel_by_name + + +class MedianPreimageGeneratorPy(PreimageGenerator): + """Generator median preimages using the pure Python version of GEDEnv. + """ + + def __init__(self, dataset=None): + PreimageGenerator.__init__(self, dataset=dataset) + # arguments to set. + self.__mge = None + self.__ged_options = {} + self.__mge_options = {} + self.__fit_method = 'k-graphs' + self.__init_ecc = None + self.__parallel = True + self.__n_jobs = multiprocessing.cpu_count() + self.__ds_name = None + self.__time_limit_in_sec = 0 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__epsilon_residual = 0.01 + self.__epsilon_ec = 0.1 + self.__allow_zeros = False + self.__triangle_rule = True + # values to compute. + self.__runtime_optimize_ec = None + self.__runtime_generate_preimage = None + self.__runtime_total = None + self.__set_median = None + self.__gen_median = None + self.__best_from_dataset = None + self.__sod_set_median = None + self.__sod_gen_median = None + self.__k_dis_set_median = None + self.__k_dis_gen_median = None + self.__k_dis_dataset = None + self.__itrs = 0 + self.__converged = False + self.__num_updates_ecc = 0 + # values that can be set or to be computed. + self.__edit_cost_constants = [] + self.__gram_matrix_unnorm = None + self.__runtime_precompute_gm = None + + + def set_options(self, **kwargs): + self._kernel_options = kwargs.get('kernel_options', {}) + self._graph_kernel = kwargs.get('graph_kernel', None) + self._verbose = kwargs.get('verbose', 2) + self.__ged_options = kwargs.get('ged_options', {}) + self.__mge_options = kwargs.get('mge_options', {}) + self.__fit_method = kwargs.get('fit_method', 'k-graphs') + self.__init_ecc = kwargs.get('init_ecc', None) + self.__edit_cost_constants = kwargs.get('edit_cost_constants', []) + self.__parallel = kwargs.get('parallel', True) + self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) + self.__ds_name = kwargs.get('ds_name', None) + self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0) + self.__max_itrs = kwargs.get('max_itrs', 100) + self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3) + self.__epsilon_residual = kwargs.get('epsilon_residual', 0.01) + self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1) + self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None) + self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None) + self.__allow_zeros = kwargs.get('allow_zeros', False) + self.__triangle_rule = kwargs.get('triangle_rule', True) + + + def run(self): + self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'], + node_labels=self._dataset.node_labels, + edge_labels=self._dataset.edge_labels, + node_attrs=self._dataset.node_attrs, + edge_attrs=self._dataset.edge_attrs, + ds_infos=self._dataset.get_dataset_infos(keys=['directed']), + kernel_options=self._kernel_options) + + # record start time. + start = time.time() + + # 1. precompute gram matrix. 
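+		# (The unnormalized Gram matrix holds the raw kernel values k(G_i, G_j) of
+		# all graph pairs; when 'normalize' is set, each entry is rescaled to
+		# k(G_i, G_j) / sqrt(k(G_i, G_i) * k(G_j, G_j)) so that all self-kernels
+		# become 1, as done explicitly for the medians further below.)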
+ if self.__gram_matrix_unnorm is None: + gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options) + self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm + end_precompute_gm = time.time() + self.__runtime_precompute_gm = end_precompute_gm - start + else: + if self.__runtime_precompute_gm is None: + raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.') + self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm + if self._kernel_options['normalize']: + self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm)) + else: + self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm) + end_precompute_gm = time.time() + start -= self.__runtime_precompute_gm + + if self.__fit_method != 'k-graphs' and self.__fit_method != 'whole-dataset': + start = time.time() + self.__runtime_precompute_gm = 0 + end_precompute_gm = start + + # 2. optimize edit cost constants. + self.__optimize_edit_cost_constants() + end_optimize_ec = time.time() + self.__runtime_optimize_ec = end_optimize_ec - end_precompute_gm + + # 3. compute set median and gen median using optimized edit costs. + if self._verbose >= 2: + print('\nstart computing set median and gen median using optimized edit costs...\n') + self.__gmg_bcu() + end_generate_preimage = time.time() + self.__runtime_generate_preimage = end_generate_preimage - end_optimize_ec + self.__runtime_total = end_generate_preimage - start + if self._verbose >= 2: + print('medians computed.') + print('SOD of the set median: ', self.__sod_set_median) + print('SOD of the generalized median: ', self.__sod_gen_median) + + # 4. compute kernel distances to the true median. + if self._verbose >= 2: + print('\nstart computing distances to true median....\n') + self.__compute_distances_to_true_median() + + # 5. print out results. 
+		if self._verbose:
+			print()
+			print('================================================================================')
+			print('Finished generation of preimages.')
+			print('--------------------------------------------------------------------------------')
+			print('The optimized edit cost constants:', self.__edit_cost_constants)
+			print('SOD of the set median:', self.__sod_set_median)
+			print('SOD of the generalized median:', self.__sod_gen_median)
+			print('Distance in kernel space for set median:', self.__k_dis_set_median)
+			print('Distance in kernel space for generalized median:', self.__k_dis_gen_median)
+			print('Minimum distance in kernel space for each graph in median set:', self.__k_dis_dataset)
+			print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
+			print('Time to optimize edit costs:', self.__runtime_optimize_ec)
+			print('Time to generate pre-images:', self.__runtime_generate_preimage)
+			print('Total time:', self.__runtime_total)
+			print('Total number of iterations for optimizing:', self.__itrs)
+			print('Total number of edit cost updates:', self.__num_updates_ecc)
+			print('Did the optimization of edit costs converge:', self.__converged)
+			print('================================================================================')
+			print()
+
+
+	def get_results(self):
+		results = {}
+		results['edit_cost_constants'] = self.__edit_cost_constants
+		results['runtime_precompute_gm'] = self.__runtime_precompute_gm
+		results['runtime_optimize_ec'] = self.__runtime_optimize_ec
+		results['runtime_generate_preimage'] = self.__runtime_generate_preimage
+		results['runtime_total'] = self.__runtime_total
+		results['sod_set_median'] = self.__sod_set_median
+		results['sod_gen_median'] = self.__sod_gen_median
+		results['k_dis_set_median'] = self.__k_dis_set_median
+		results['k_dis_gen_median'] = self.__k_dis_gen_median
+		results['k_dis_dataset'] = self.__k_dis_dataset
+		results['itrs'] = self.__itrs
+		results['converged'] = self.__converged
+		results['num_updates_ecc'] = self.__num_updates_ecc
+		results['mge'] = {}
+		results['mge']['num_decrease_order'] = self.__mge.get_num_times_order_decreased()
+		results['mge']['num_increase_order'] = self.__mge.get_num_times_order_increased()
+		results['mge']['num_converged_descents'] = self.__mge.get_num_converged_descents()
+		return results
+
+
+	def __optimize_edit_cost_constants(self):
+		"""Fit the edit cost constants.
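+
+		Depending on `self.__fit_method`, the constants are drawn at random
+		('random'), taken from expert/default values ('expert'), fitted against
+		kernel distances on the median set ('k-graphs') or on the whole dataset
+		('whole-dataset'), or used as given ('precomputed').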
+ """ + if self.__fit_method == 'random': # random + if self.__ged_options['edit_cost'] == 'LETTER': + self.__edit_cost_constants = random.sample(range(1, 1000), 3) + self.__edit_cost_constants = [item * 0.001 for item in self.__edit_cost_constants] + elif self.__ged_options['edit_cost'] == 'LETTER2': + random.seed(time.time()) + self.__edit_cost_constants = random.sample(range(1, 1000), 5) + self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants] + elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC': + self.__edit_cost_constants = random.sample(range(1, 1000), 6) + self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants] + if self._dataset.node_attrs == []: + self.__edit_cost_constants[2] = 0 + if self._dataset.edge_attrs == []: + self.__edit_cost_constants[5] = 0 + else: + self.__edit_cost_constants = random.sample(range(1, 1000), 6) + self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants] + if self._verbose >= 2: + print('edit cost constants used:', self.__edit_cost_constants) + elif self.__fit_method == 'expert': # expert + if self.__init_ecc is None: + if self.__ged_options['edit_cost'] == 'LETTER': + self.__edit_cost_constants = [0.9, 1.7, 0.75] + elif self.__ged_options['edit_cost'] == 'LETTER2': + self.__edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425] + else: + self.__edit_cost_constants = [3, 3, 1, 3, 3, 1] + else: + self.__edit_cost_constants = self.__init_ecc + elif self.__fit_method == 'k-graphs': + if self.__init_ecc is None: + if self.__ged_options['edit_cost'] == 'LETTER': + self.__init_ecc = [0.9, 1.7, 0.75] + elif self.__ged_options['edit_cost'] == 'LETTER2': + self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425] + elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC': + self.__init_ecc = [0, 0, 1, 1, 1, 0] + if self._dataset.node_attrs == []: + self.__init_ecc[2] = 0 + if self._dataset.edge_attrs == []: + self.__init_ecc[5] = 0 + else: + self.__init_ecc = [3, 3, 1, 3, 3, 1] + # optimize on the k-graph subset. + self.__optimize_ecc_by_kernel_distances() + elif self.__fit_method == 'whole-dataset': + if self.__init_ecc is None: + if self.__ged_options['edit_cost'] == 'LETTER': + self.__init_ecc = [0.9, 1.7, 0.75] + elif self.__ged_options['edit_cost'] == 'LETTER2': + self.__init_ecc = [0.675, 0.675, 0.75, 0.425, 0.425] + else: + self.__init_ecc = [3, 3, 1, 3, 3, 1] + # optimizeon the whole set. + self.__optimize_ecc_by_kernel_distances() + elif self.__fit_method == 'precomputed': + pass + + + def __optimize_ecc_by_kernel_distances(self): + # compute distances in feature space. + dis_k_mat, _, _, _ = self._graph_kernel.compute_distance_matrix() + dis_k_vec = [] + for i in range(len(dis_k_mat)): + # for j in range(i, len(dis_k_mat)): + for j in range(i + 1, len(dis_k_mat)): + dis_k_vec.append(dis_k_mat[i, j]) + dis_k_vec = np.array(dis_k_vec) + + # init ged. 
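+		# (The fit below is a linear regression: each graph pair contributes one
+		# sample whose features are the counts of each edit operation in the
+		# computed GED and whose target is the kernel distance; __update_ecc()
+		# then chooses the cost vector c minimizing ||N c - d||^2 subject to
+		# non-negativity and, optionally, triangle-rule constraints.)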
+		if self._verbose >= 2:
+			print('\ninitial:')
+		time0 = time.time()
+		graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
+		self.__edit_cost_constants = self.__init_ecc
+		options = self.__ged_options.copy()
+		options['edit_cost_constants'] = self.__edit_cost_constants # @todo
+		options['node_labels'] = self._dataset.node_labels
+		options['edge_labels'] = self._dataset.edge_labels
+		options['node_attrs'] = self._dataset.node_attrs
+		options['edge_attrs'] = self._dataset.edge_attrs
+		ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
+		residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
+		time_list = [time.time() - time0]
+		edit_cost_list = [self.__init_ecc]
+		nb_cost_mat = np.array(n_edit_operations)
+		nb_cost_mat_list = [nb_cost_mat]
+		if self._verbose >= 2:
+			print('Current edit cost constants:', self.__edit_cost_constants)
+			print('Residual list:', residual_list)
+
+		# run iteration from initial edit costs.
+		self.__converged = False
+		itrs_without_update = 0
+		self.__itrs = 0
+		self.__num_updates_ecc = 0
+		timer = Timer(self.__time_limit_in_sec)
+		while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update):
+			if self._verbose >= 2:
+				print('\niteration', self.__itrs + 1)
+			time0 = time.time()
+			# "fit" geds to distances in feature space by tuning edit costs using the least squares method.
+#			np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm',
+#					 nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
+#					 n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
+#					 ged_mat=ged_mat)
+			self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec)
+			for i in range(len(self.__edit_cost_constants)):
+				if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9:
+					self.__edit_cost_constants[i] = 0
+				if self.__edit_cost_constants[i] < 0:
+					raise ValueError('The edit cost is negative.')
+			#	for i in range(len(self.__edit_cost_constants)):
+			#		if self.__edit_cost_constants[i] < 0:
+			#			self.__edit_cost_constants[i] = 0
+
+			# compute new GEDs and numbers of edit operations.
+			options = self.__ged_options.copy() # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75])
+			options['edit_cost_constants'] = self.__edit_cost_constants # @todo
+			options['node_labels'] = self._dataset.node_labels
+			options['edge_labels'] = self._dataset.edge_labels
+			options['node_attrs'] = self._dataset.node_attrs
+			options['edge_attrs'] = self._dataset.edge_attrs
+			ged_vec, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
+			residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
+			time_list.append(time.time() - time0)
+			edit_cost_list.append(self.__edit_cost_constants)
+			nb_cost_mat = np.array(n_edit_operations)
+			nb_cost_mat_list.append(nb_cost_mat)
+
+			# check convergence.
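+			# (Convergence is tested on relative changes: an edit cost or the
+			# residual counts as changed when |new - old| / new exceeds the
+			# corresponding epsilon; values equal to zero fall back to an absolute
+			# comparison of the previous value against the epsilon.)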
+ ec_changed = False + for i, cost in enumerate(self.__edit_cost_constants): + if cost == 0: + if edit_cost_list[-2][i] > self.__epsilon_ec: + ec_changed = True + break + elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ec: + ec_changed = True + break +# if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec: +# ec_changed = True +# break + residual_changed = False + if residual_list[-1] == 0: + if residual_list[-2] > self.__epsilon_residual: + residual_changed = True + elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_residual: + residual_changed = True + self.__converged = not (ec_changed or residual_changed) + if self.__converged: + itrs_without_update += 1 + else: + itrs_without_update = 0 + self.__num_updates_ecc += 1 + + # print current states. + if self._verbose >= 2: + print() + print('-------------------------------------------------------------------------') + print('States of iteration', self.__itrs + 1) + print('-------------------------------------------------------------------------') +# print('Time spend:', self.__runtime_optimize_ec) + print('Total number of iterations for optimizing:', self.__itrs + 1) + print('Total number of updating edit costs:', self.__num_updates_ecc) + print('Was optimization of edit costs converged:', self.__converged) + print('Did edit costs change:', ec_changed) + print('Did residual change:', residual_changed) + print('Iterations without update:', itrs_without_update) + print('Current edit cost constants:', self.__edit_cost_constants) + print('Residual list:', residual_list) + print('-------------------------------------------------------------------------') + + self.__itrs += 1 + + + def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): +# if self.__state == AlgorithmState.TERMINATED: +# self.__state = AlgorithmState.INITIALIZED + return True + return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) + + + def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'): + # if self.__ds_name == 'Letter-high': + if self.__ged_options['edit_cost'] == 'LETTER': + raise Exception('Cannot compute for cost "LETTER".') + pass + # # method 1: set alpha automatically, just tune c_vir and c_eir by + # # LMS using cvxpy. + # alpha = 0.5 + # coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec) + ## if np.count_nonzero(nb_cost_mat[:,4]) == 0: + ## alpha = 0.75 + ## else: + ## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0]) + ## alpha = alpha * 0.99 + # param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1]) + # param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5]) + # nb_cost_mat_new = np.column_stack((param_vir, param_eir)) + # dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3] + # + # x = cp.Variable(nb_cost_mat_new.shape[1]) + # cost = cp.sum_squares(nb_cost_mat_new * x - dis_new) + # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] + # prob = cp.Problem(cp.Minimize(cost), constraints) + # prob.solve() + # edit_costs_new = x.value + # edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha]) + # residual = np.sqrt(prob.value) + + # # method 2: tune c_vir, c_eir and alpha by nonlinear programming by + # # scipy.optimize.minimize. 
+ # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] + # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] + # w2 = nb_cost_mat[:,3] + # w3 = dis_k_vec + # func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ + # + w2 * x[2] - w3 * x[3]) ** 2) + # bounds = ((0, None), (0., None), (0.5, 0.5), (0, None)) + # res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds) + # edit_costs_new = res.x[0:3] + # residual = res.fun + + # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy. + + + # # method 4: tune c_vir, c_eir and alpha by QP function + # # scipy.optimize.least_squares. An initial guess is required. + # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] + # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] + # w2 = nb_cost_mat[:,3] + # w3 = dis_k_vec + # func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ + # + w2 * x[2] - w3 * x[3]) ** 2 + # res = optimize.root(func, [0.9, 1.7, 0.75, 100]) + # edit_costs_new = res.x + # residual = None + elif self.__ged_options['edit_cost'] == 'LETTER2': + # # 1. if c_vi != c_vr, c_ei != c_er. + # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + # x = cp.Variable(nb_cost_mat_new.shape[1]) + # cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + ## # 1.1 no constraints. + ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] + # # 1.2 c_vs <= c_vi + c_vr. + # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + ## # 2. if c_vi == c_vr, c_ei == c_er. + ## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]] + ## nb_cost_mat_new[:,0] += nb_cost_mat[:,1] + ## nb_cost_mat_new[:,2] += nb_cost_mat[:,5] + ## x = cp.Variable(nb_cost_mat_new.shape[1]) + ## cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + ## # 2.1 no constraints. + ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] + ### # 2.2 c_vs <= c_vi + c_vr. 
+ ### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + ### np.array([2.0, -1.0, 0.0]).T@x >= 0.0] + # + # prob = cp.Problem(cp.Minimize(cost_fun), constraints) + # prob.solve() + # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] + # edit_costs_new = np.array(edit_costs_new) + # residual = np.sqrt(prob.value) + if not self.__triangle_rule and self.__allow_zeros: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif self.__triangle_rule and self.__allow_zeros: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01, + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif not self.__triangle_rule and not self.__allow_zeros: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + edit_costs_new = x.value + residual = np.sqrt(prob.value) + # elif method == 'inequality_modified': + # # c_vs <= c_vi + c_vr. + # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + # x = cp.Variable(nb_cost_mat_new.shape[1]) + # cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + # prob = cp.Problem(cp.Minimize(cost_fun), constraints) + # prob.solve() + # # use same costs for insertion and removal rather than the fitted costs. + # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] + # edit_costs_new = np.array(edit_costs_new) + # residual = np.sqrt(prob.value) + elif self.__triangle_rule and not self.__allow_zeros: + # c_vs <= c_vi + c_vr. + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif rw_constraints == '2constraints': # @todo: rearrange it later. + # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er. 
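+			# (Each row vector below encodes one linear constraint on the cost
+			# vector x = (c_vi, c_vr, c_vs, c_ei, c_er): [1, 1, -1, 0, 0] @ x >= 0
+			# is the triangle rule c_vi + c_vr >= c_vs, while [1, -1, 0, 0, 0] @ x == 0
+			# and [0, 0, 0, 1, -1] @ x == 0 enforce c_vi == c_vr and c_ei == c_er.)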
+ nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0, + np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0, + np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + edit_costs_new = x.value + residual = np.sqrt(prob.value) + + elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC': + is_n_attr = np.count_nonzero(nb_cost_mat[:,2]) + is_e_attr = np.count_nonzero(nb_cost_mat[:,5]) + + if self.__ds_name == 'SYNTHETICnew': # @todo: rearrenge this later. + # nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + nb_cost_mat_new = nb_cost_mat[:,[2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + # np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] + # constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]] + constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])], + np.array([0.0, 1.0, -1.0]).T@x == 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + prob.solve() + # print(x.value) + edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value, + np.array([0.0]))) + residual = np.sqrt(prob.value) + + elif not self.__triangle_rule and self.__allow_zeros: + if is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif is_n_attr and not is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value, np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) + residual = np.sqrt(prob.value) + else: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = 
cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), + x.value[2:], np.array([0.0]))) + residual = np.sqrt(prob.value) + elif self.__triangle_rule and self.__allow_zeros: + if is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif is_n_attr and not is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01, + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value, np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) + residual = np.sqrt(prob.value) + else: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), + x.value[2:], np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not self.__triangle_rule and not self.__allow_zeros: + if is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif is_n_attr and not is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + 
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value, np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) + residual = np.sqrt(prob.value) + else: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), + x.value[2:], np.array([0.0]))) + residual = np.sqrt(prob.value) + elif self.__triangle_rule and not self.__allow_zeros: + # c_vs <= c_vi + c_vr. + if is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif is_n_attr and not is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value, np.array([0.0]))) + residual = np.sqrt(prob.value) + elif not is_n_attr and is_e_attr: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], + np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) + residual = np.sqrt(prob.value) + else: + nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] + x = cp.Variable(nb_cost_mat_new.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), + x.value[2:], np.array([0.0]))) + residual = np.sqrt(prob.value) + + elif self.__ged_options['edit_cost'] == 'CONSTANT': # @todo: node/edge may not labeled. 
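+			# (For the 'CONSTANT' edit cost, the six components of x appear to be
+			# ordered (c_vi, c_vr, c_vs, c_ei, c_er, c_es); this is suggested by the
+			# triangle-rule constraints [1, 1, -1, 0, 0, 0] @ x >= 0 and
+			# [0, 0, 0, 1, 1, -1] @ x >= 0 used below.)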
+ if not self.__triangle_rule and self.__allow_zeros: + x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif self.__triangle_rule and self.__allow_zeros: + x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) + constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], + np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, + np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif not self.__triangle_rule and not self.__allow_zeros: + x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])]] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + elif self.__triangle_rule and not self.__allow_zeros: + x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) + else: + raise Exception('The edit cost "', self.__ged_options['edit_cost'], '" is not supported for update progress.') + # # method 1: simple least square method. + # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, + # rcond=None) + + # # method 2: least square method with x_i >= 0. + # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) + + # method 3: solve as a quadratic program with constraints. 
+	#	P = np.dot(nb_cost_mat.T, nb_cost_mat)
+	#	q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
+	#	G = -1 * np.identity(nb_cost_mat.shape[1])
+	#	h = np.array([0 for i in range(nb_cost_mat.shape[1])])
+	#	A = np.array([1 for i in range(nb_cost_mat.shape[1])])
+	#	b = 1
+	#	x = cp.Variable(nb_cost_mat.shape[1])
+	#	prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
+	#					  [G@x <= h])
+	#	prob.solve()
+	#	edit_costs_new = x.value
+	#	residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
+
+	#	G = -1 * np.identity(nb_cost_mat.shape[1])
+	#	h = np.array([0 for i in range(nb_cost_mat.shape[1])])
+			x = cp.Variable(nb_cost_mat.shape[1])
+			cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
+			constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
+	#					   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
+						   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+						   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
+			prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+			self.__execute_cvx(prob)
+			edit_costs_new = x.value
+			residual = np.sqrt(prob.value)
+
+		# method 4:
+
+		return edit_costs_new, residual
+
+
+	def __execute_cvx(self, prob):
+		try:
+			prob.solve(verbose=(self._verbose>=2))
+		except MemoryError as error0:
+			if self._verbose >= 2:
+				print('\nUsing solver "OSQP" caused a memory error.')
+				print('the original error message is\n', error0)
+				print('solver status: ', prob.status)
+				print('trying solver "CVXOPT" instead...\n')
+			try:
+				prob.solve(solver=cp.CVXOPT, verbose=(self._verbose>=2))
+			except Exception as error1:
+				if self._verbose >= 2:
+					print('\nAn error occurred when using solver "CVXOPT".')
+					print('the original error message is\n', error1)
+					print('solver status: ', prob.status)
+					print('trying solver "MOSEK" instead. Note that this solver is commercial and a license is required.\n')
+				prob.solve(solver=cp.MOSEK, verbose=(self._verbose>=2))
+			else:
+				if self._verbose >= 2:
+					print('solver status: ', prob.status)
+		else:
+			if self._verbose >= 2:
+				print('solver status: ', prob.status)
+		if self._verbose >= 2:
+			print()
+
+
+	def __gmg_bcu(self):
+		"""
+		The local search algorithm based on block coordinate update (BCU) for estimating a generalized median graph (GMG).
+
+		Returns
+		-------
+		None.
+
+		"""
+		# Set up the ged environment.
+		ged_env = GEDEnv() # @todo: maybe create a ged_env as a private variable.
+		# gedlibpy.restart_env()
+		ged_env.set_edit_cost(self.__ged_options['edit_cost'], edit_cost_constants=self.__edit_cost_constants)
+		graphs = [self.__clean_graph(g) for g in self._dataset.graphs]
+		for g in graphs:
+			ged_env.add_nx_graph(g, '')
+		graph_ids = ged_env.get_all_graph_ids()
+		set_median_id = ged_env.add_graph('set_median')
+		gen_median_id = ged_env.add_graph('gen_median')
+		ged_env.init(init_type=self.__ged_options['init_option'])
+
+		# Set up the median graph estimator.
+		self.__mge = MedianGraphEstimatorPy(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
+		self.__mge.set_refine_method(self.__ged_options['method'], self.__ged_options)
+		options = self.__mge_options.copy()
+		if 'seed' not in options:
+			options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage.
+		options['parallel'] = self.__parallel
+
+		# Select the GED algorithm.
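+		# (When the estimator runs in parallel, the number of threads per GED
+		# computation is forced to 1 below, presumably to avoid nested parallelism.)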
+
+        self.__mge.set_options(mge_options_to_string(options))
+        self.__mge.set_label_names(node_labels=self._dataset.node_labels,
+                                   edge_labels=self._dataset.edge_labels,
+                                   node_attrs=self._dataset.node_attrs,
+                                   edge_attrs=self._dataset.edge_attrs)
+        # Select the GED algorithm.
+        ged_options = self.__ged_options.copy()
+        if self.__parallel:
+            ged_options['threads'] = 1
+        self.__mge.set_init_method(ged_options['method'], ged_options)
+        self.__mge.set_descent_method(ged_options['method'], ged_options)
+
+        # Run the estimator.
+        self.__mge.run(graph_ids, set_median_id, gen_median_id)
+
+        # Get the sums of distances (SODs).
+        self.__sod_set_median = self.__mge.get_sum_of_distances('initialized')
+        self.__sod_gen_median = self.__mge.get_sum_of_distances('converged')
+
+        # Get the median graphs.
+        self.__set_median = ged_env.get_nx_graph(set_median_id)
+        self.__gen_median = ged_env.get_nx_graph(gen_median_id)
+
+
+    def __compute_distances_to_true_median(self):
+        # Compute the distance in kernel space for the set median.
+        kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
+        kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
+        if self._kernel_options['normalize']:
+            kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))]  # normalize
+            kernel_sm = 1
+        # @todo: not correct kernel value
+        gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
+        gram_with_sm = np.concatenate((np.array([[kernel_sm] + kernels_to_sm]).T, gram_with_sm), axis=1)
+        self.__k_dis_set_median = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)),
+                                                [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
+                                                gram_with_sm, withterm3=False)
+
+        # Compute the distance in kernel space for the generalized median.
+        kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
+        kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
+        if self._kernel_options['normalize']:
+            kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))]  # normalize
+            kernel_gm = 1
+        gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
+        gram_with_gm = np.concatenate((np.array([[kernel_gm] + kernels_to_gm]).T, gram_with_gm), axis=1)
+        self.__k_dis_gen_median = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)),
+                                                [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
+                                                gram_with_gm, withterm3=False)
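+
+        # # Hedged note on compute_k_dis: it is used here as the distance in
+        # # kernel space between a candidate graph g and the uniformly weighted
+        # # mean of the dataset embeddings, i.e.
+        # #   d(g)^2 = k(g, g) - (2 / N) * sum_i k(g, g_i)
+        # #            + (1 / N^2) * sum_{i,j} k(g_i, g_j),
+        # # where the last term is dropped (withterm3=False) since it is the
+        # # same for every candidate and does not affect the comparison.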
+
+        # Compute the distance in kernel space for each graph in the median set.
+        k_dis_median_set = []
+        for idx in range(len(self._dataset.graphs)):
+            k_dis_median_set.append(compute_k_dis(idx + 1, range(1, 1 + len(self._dataset.graphs)),
+                                                  [1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
+                                                  gram_with_gm, withterm3=False))
+        idx_k_dis_median_set_min = np.argmin(k_dis_median_set)
+        self.__k_dis_dataset = k_dis_median_set[idx_k_dis_median_set_min]
+        self.__best_from_dataset = self._dataset.graphs[idx_k_dis_median_set_min].copy()
+
+        if self._verbose >= 2:
+            print()
+            print('distance in kernel space for set median:', self.__k_dis_set_median)
+            print('distance in kernel space for generalized median:', self.__k_dis_gen_median)
+            print('minimum distance in kernel space over the median set:', self.__k_dis_dataset)
+            print('distance in kernel space for each graph in the median set:', k_dis_median_set)
+
+
+#    def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
+    def __clean_graph(self, G):  # @todo: this may not be needed when the datafile is updated.
+        """
+        Cleans the node and edge labels and attributes of the given graph,
+        keeping only those declared in the dataset and casting everything to
+        strings.
+        """
+        G_new = nx.Graph(**G.graph)
+        for nd, attrs in G.nodes(data=True):
+            G_new.add_node(str(nd))  # @todo: should we keep this as str()?
+            for l_name in self._dataset.node_labels:
+                G_new.nodes[str(nd)][l_name] = str(attrs[l_name])
+            for a_name in self._dataset.node_attrs:
+                G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
+        for nd1, nd2, attrs in G.edges(data=True):
+            G_new.add_edge(str(nd1), str(nd2))
+            for l_name in self._dataset.edge_labels:
+                G_new.edges[str(nd1), str(nd2)][l_name] = str(attrs[l_name])
+            for a_name in self._dataset.edge_attrs:
+                G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
+        return G_new
+
+
+    @property
+    def mge(self):
+        return self.__mge
+
+    @property
+    def ged_options(self):
+        return self.__ged_options
+
+    @ged_options.setter
+    def ged_options(self, value):
+        self.__ged_options = value
+
+
+    @property
+    def mge_options(self):
+        return self.__mge_options
+
+    @mge_options.setter
+    def mge_options(self, value):
+        self.__mge_options = value
+
+
+    @property
+    def fit_method(self):
+        return self.__fit_method
+
+    @fit_method.setter
+    def fit_method(self, value):
+        self.__fit_method = value
+
+
+    @property
+    def init_ecc(self):
+        return self.__init_ecc
+
+    @init_ecc.setter
+    def init_ecc(self, value):
+        self.__init_ecc = value
+
+
+    @property
+    def set_median(self):
+        return self.__set_median
+
+
+    @property
+    def gen_median(self):
+        return self.__gen_median
+
+
+    @property
+    def best_from_dataset(self):
+        return self.__best_from_dataset
+
+
+    @property
+    def gram_matrix_unnorm(self):
+        return self.__gram_matrix_unnorm
+
+    @gram_matrix_unnorm.setter
+    def gram_matrix_unnorm(self, value):
+        self.__gram_matrix_unnorm = value
\ No newline at end of file
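
For reference, a minimal usage sketch of the added preimage generator (hedged: it assumes MedianPreimageGeneratorPy follows the constructor and set_options/run interface of the existing MedianPreimageGenerator; `dataset`, the 'k-graphs' fit method, and all option values shown are placeholders to be replaced with real ones):

    from gklearn.preimage import MedianPreimageGeneratorPy

    mpg = MedianPreimageGeneratorPy(dataset)      # dataset: a gklearn Dataset (placeholder)
    mpg.set_options(fit_method='k-graphs',        # placeholder option values
                    ged_options=ged_options,      # e.g. {'edit_cost': 'CONSTANT', 'method': 'IPFP', ...}
                    mge_options=mge_options,      # options forwarded to MedianGraphEstimatorPy
                    verbose=2)
    mpg.run()
    set_median, gen_median = mpg.set_median, mpg.gen_median  # properties exposed above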