diff --git a/.gitignore b/.gitignore
index 0f4ea83..79e102c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ gklearn/kernels/*_sym.py
 gklearn/preimage/*
 !gklearn/preimage/*.py
+!gklearn/preimage/experiments/*.py
 __pycache__
 ##*#
diff --git a/gklearn/ged/median/median_graph_estimator.py b/gklearn/ged/median/median_graph_estimator.py
index 0b0cfe2..7cbb6d6 100644
--- a/gklearn/ged/median/median_graph_estimator.py
+++ b/gklearn/ged/median/median_graph_estimator.py
@@ -70,6 +70,7 @@ class MedianGraphEstimator(object):
 		self.__num_increase_order = 0
 		self.__num_converged_descents = 0
 		self.__state = AlgorithmState.TERMINATED
+		self.__label_names = {}
 		
 		if ged_env is None:
 			raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.')
@@ -551,6 +552,7 @@ class MedianGraphEstimator(object):
 		self.__init_type_increase_order = 'K-MEANS++'
 		self.__max_itrs_increase_order = 10
 		self.__print_to_stdout = 2
+		self.__label_names = {}
 	
 	
 	def __construct_initial_medians(self, graph_ids, timer, initial_medians):
@@ -824,19 +826,49 @@ class MedianGraphEstimator(object):
 			for node in g.nodes:
 				cost += 0
+	
+	
+	def set_label_names(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]):
+		self.__label_names = {'node_labels': node_labels, 'edge_labels': edge_labels,
+						'node_attrs': node_attrs, 'edge_attrs': edge_attrs}
+	
 	
 	def __get_median_node_label(self, node_labels):
-		if True:
+		if len(self.__label_names['node_labels']) > 0:
+			return self.__get_median_label_symbolic(node_labels)
+		elif len(self.__label_names['node_attrs']) > 0:
 			return self.__get_median_label_nonsymbolic(node_labels)
 		else:
-			return self.__get_median_node_label_symbolic(node_labels)
+			raise Exception('Node label names are not given.')
 	
 	
 	def __get_median_edge_label(self, edge_labels):
-		if True:
+		if len(self.__label_names['edge_labels']) > 0:
+			return self.__get_median_label_symbolic(edge_labels)
+		elif len(self.__label_names['edge_attrs']) > 0:
 			return self.__get_median_label_nonsymbolic(edge_labels)
 		else:
-			return self.__get_median_edge_label_symbolic(edge_labels)
+			raise Exception('Edge label names are not given.')
+	
+	
+	def __get_median_label_symbolic(self, labels):
+		# Construct histogram.
+		hist = {}
+		for label in labels:
+			label = tuple([kv for kv in label.items()]) # @todo: this may be slow.
+			if label not in hist:
+				hist[label] = 1
+			else:
+				hist[label] += 1
+		
+		# Return the label that appears most frequently.
+		best_count = 0
+		median_label = {}
+		for label, count in hist.items():
+			if count > best_count:
+				best_count = count
+				median_label = {kv[0]: kv[1] for kv in label}
+		
+		return median_label
 	
 	
 	def __get_median_label_nonsymbolic(self, labels):
@@ -896,14 +928,10 @@ class MedianGraphEstimator(object):
 			for key, val in median.items():
 				median_label[key] = str(val)
 			return median_label
-	
-	
-	def __get_median_node_label_symbolic(self, node_labels):
-		pass
 	
 	
-	def __get_median_edge_label_symbolic(self, edge_labels):
-		pass
+#	def __get_median_edge_label_symbolic(self, edge_labels):
+#		pass
 	
 	
 #	def __get_median_edge_label_nonsymbolic(self, edge_labels):
diff --git a/gklearn/ged/median/utils.py b/gklearn/ged/median/utils.py
index 741320c..f6d43bb 100644
--- a/gklearn/ged/median/utils.py
+++ b/gklearn/ged/median/utils.py
@@ -9,6 +9,10 @@ Created on Wed Apr 1 15:12:31 2020
 def constant_node_costs(edit_cost_name):
 	if edit_cost_name == 'NON_SYMBOLIC' or edit_cost_name == 'LETTER2' or edit_cost_name == 'LETTER':
 		return False
+	elif edit_cost_name == 'CONSTANT':
+		return True
+	else:
+		raise Exception('Can not recognize the given edit cost. Possible edit costs include: "NON_SYMBOLIC", "LETTER", "LETTER2", "CONSTANT".')
#	elif edit_cost_name != '':
#		# throw ged::Error("Invalid dataset " + dataset + ". Usage: ./median_tests ");
#		return False
diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py
index 22601dd..a18b0cb 100644
--- a/gklearn/ged/util/util.py
+++ b/gklearn/ged/util/util.py
@@ -58,7 +58,8 @@ def compute_geds(graphs, options={}, parallel=False):
 	ged_env.init_method()
 
 	# compute ged.
-	neo_options = {'edit_cost': options['edit_cost'],
+	neo_options = {'edit_cost': options['edit_cost'],
+				   'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'],
 				   'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']}
 	ged_mat = np.zeros((len(graphs), len(graphs)))
 	if parallel:
@@ -147,12 +148,18 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, **kwargs):
 		edge_attrs = kwargs.get('edge_attrs', [])
 		return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
										node_attrs=node_attrs, edge_attrs=edge_attrs)
+	elif edit_cost == 'CONSTANT':
+		node_labels = kwargs.get('node_labels', [])
+		edge_labels = kwargs.get('edge_labels', [])
+		return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
+									node_labels=node_labels, edge_labels=edge_labels)
 	else:
 		return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map)
 
 
-def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map):
-	"""Compute the number of each edit operations.
+def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
+									node_labels=[], edge_labels=[]):
+	"""Compute the number of each edit operations for symbolic-labeled graphs.
 	"""
 	n_vi = 0
 	n_vr = 0
@@ -165,8 +172,13 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map):
 	for i, map_i in enumerate(forward_map):
 		if map_i == np.inf:
 			n_vr += 1
-		elif g1.node[nodes1[i]]['atom'] != g2.node[map_i]['atom']:
-			n_vs += 1
+		else:
+			for nl in node_labels:
+				label1 = g1.nodes[nodes1[i]][nl]
+				label2 = g2.nodes[map_i][nl]
+				if label1 != label2:
+					n_vs += 1
+					break
 	for map_i in backward_map:
 		if map_i == np.inf:
 			n_vi += 1
@@ -185,15 +197,21 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map):
 		elif (forward_map[idx1], forward_map[idx2]) in g2.edges():
 			nb_edges2_cnted += 1
 			# edge labels are different.
-			if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \
-				!= g1.edges[(n1, n2)]['bond_type']:
+			for el in edge_labels:
+				label1 = g2.edges[((forward_map[idx1], forward_map[idx2]))][el]
+				label2 = g1.edges[(n1, n2)][el]
+				if label1 != label2:
 					n_es += 1
+					break
 		elif (forward_map[idx2], forward_map[idx1]) in g2.edges():
 			nb_edges2_cnted += 1
 			# edge labels are different.
-			if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \
-				!= g1.edges[(n1, n2)]['bond_type']:
-				n_es += 1
+			for el in edge_labels:
+				label1 = g2.edges[((forward_map[idx2], forward_map[idx1]))][el]
+				label2 = g1.edges[(n1, n2)][el]
+				if label1 != label2:
+					n_es += 1
+					break
 		# corresponding nodes are in g2, however the edge is removed.
 		else:
 			n_er += 1
diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py
index 916cf8a..6878701 100644
--- a/gklearn/preimage/median_preimage_generator.py
+++ b/gklearn/preimage/median_preimage_generator.py
@@ -262,6 +262,8 @@ class MedianPreimageGenerator(PreimageGenerator):
 			self.__edit_cost_constants = self.__init_ecc
 		options = self.__ged_options.copy()
 		options['edit_cost_constants'] = self.__edit_cost_constants # @todo
+		options['node_labels'] = self._dataset.node_labels
+		options['edge_labels'] = self._dataset.edge_labels
 		options['node_attrs'] = self._dataset.node_attrs
 		options['edge_attrs'] = self._dataset.edge_attrs
 		ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
@@ -302,6 +304,8 @@ class MedianPreimageGenerator(PreimageGenerator):
 			# compute new GEDs and numbers of edit operations.
 			options = self.__ged_options.copy() # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75])
 			options['edit_cost_constants'] = self.__edit_cost_constants # @todo
+			options['node_labels'] = self._dataset.node_labels
+			options['edge_labels'] = self._dataset.edge_labels
 			options['node_attrs'] = self._dataset.node_attrs
 			options['edge_attrs'] = self._dataset.edge_attrs
 			ged_vec, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
@@ -451,7 +455,7 @@ class MedianPreimageGenerator(PreimageGenerator):
 				nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
 				x = cp.Variable(nb_cost_mat_new.shape[1])
 				cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-				constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
+				constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
 							   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
 				prob = cp.Problem(cp.Minimize(cost_fun), constraints)
 				self.__execute_cvx(prob)
@@ -524,17 +528,17 @@ class MedianPreimageGenerator(PreimageGenerator):
 							   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
 							   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
 				prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-				prob.solve()
+				self.__execute_cvx(prob)
 				edit_costs_new = x.value
 				residual = np.sqrt(prob.value)
 			elif is_n_attr and not is_e_attr:
 				nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
 				x = cp.Variable(nb_cost_mat_new.shape[1])
 				cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
-				constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
+				constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
 							   np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
 				prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-				self.execute_cvx(prob)
+				self.__execute_cvx(prob)
 				edit_costs_new = np.concatenate((x.value, np.array([0.0])))
 				residual = np.sqrt(prob.value)
 			elif not is_n_attr and is_e_attr:
@@ -544,7 +548,7 @@ class MedianPreimageGenerator(PreimageGenerator):
 				constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
 							   np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
 				prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-				prob.solve()
+				self.__execute_cvx(prob)
 				edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
 				residual = np.sqrt(prob.value)
 			else:
@@ -553,10 +557,20 @@ class MedianPreimageGenerator(PreimageGenerator):
 				cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
 				constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
 				prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-				prob.solve()
+				self.__execute_cvx(prob)
 				edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
												 x.value[2:], np.array([0.0])))
 				residual = np.sqrt(prob.value)
+		elif self.__ged_options['edit_cost'] == 'CONSTANT': # @todo: node/edge may not labeled.
+			x = cp.Variable(nb_cost_mat.shape[1])
+			cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
+			constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
+						   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+						   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
+			prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+			self.__execute_cvx(prob)
+			edit_costs_new = x.value
+			residual = np.sqrt(prob.value)
 		else:
 			# # method 1: simple least square method.
 			# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
@@ -588,7 +602,7 @@ class MedianPreimageGenerator(PreimageGenerator):
 						   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
 						   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
 			prob = cp.Problem(cp.Minimize(cost_fun), constraints)
-			prob.solve()
+			self.__execute_cvx(prob)
 			edit_costs_new = x.value
 			residual = np.sqrt(prob.value)
@@ -647,6 +661,10 @@ class MedianPreimageGenerator(PreimageGenerator):
 		# Select the GED algorithm.
 		mge.set_options(mge_options_to_string(options))
+		mge.set_label_names(node_labels=self._dataset.node_labels,
+							edge_labels=self._dataset.edge_labels,
+							node_attrs=self._dataset.node_attrs,
+							edge_attrs=self._dataset.edge_attrs)
 		mge.set_init_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
 		mge.set_descent_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
diff --git a/gklearn/preimage/utils.py b/gklearn/preimage/utils.py
index 0cd50ef..63c8b9e 100644
--- a/gklearn/preimage/utils.py
+++ b/gklearn/preimage/utils.py
@@ -37,7 +37,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 	dataset_all.trim_dataset(edge_required=edge_required)
 	if irrelevant_labels is not None:
 		dataset_all.remove_labels(**irrelevant_labels)
-#	dataset_all.cut_graphs(range(0, 100))
+#	dataset_all.cut_graphs(range(0, 10))
 	datasets = split_dataset_by_target(dataset_all)
 	
 	if save_results:
diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py
index c90073f..0c13e0f 100644
--- a/gklearn/utils/dataset.py
+++ b/gklearn/utils/dataset.py
@@ -67,24 +67,7 @@ class Dataset(object):
 	def load_predefined_dataset(self, ds_name):
 		current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
-		if ds_name == 'Letter-high': # node non-symb
-			ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
-			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
-		elif ds_name == 'Letter-med': # node non-symb
-			ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
-			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
-		elif ds_name == 'Letter-low': # node non-symb
-			ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt'
-			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
-		elif ds_name == 'Fingerprint':
-			ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
-			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
-		elif ds_name == 'SYNTHETIC':
-			pass
-		elif ds_name == 'SYNTHETICnew':
-			ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
-			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
-		elif ds_name == 'Synthie':
+		if ds_name == 'acyclic':
 			pass
 		elif ds_name == 'COIL-DEL':
 			ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
@@ -95,9 +78,31 @@ class Dataset(object):
 		elif ds_name == 'COLORS-3':
 			ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt'
 			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
+		elif ds_name == 'Fingerprint':
+			ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
+			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
 		elif ds_name == 'FRANKENSTEIN':
 			ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt'
 			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
+		elif ds_name == 'Letter-high': # node non-symb
+			ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
+			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
+		elif ds_name == 'Letter-low': # node non-symb
+			ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt'
+			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
+		elif ds_name == 'Letter-med': # node non-symb
+			ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
+			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
+		elif ds_name == 'MUTAG':
+			ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
+			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
+		elif ds_name == 'SYNTHETIC':
+			pass
+		elif ds_name == 'SYNTHETICnew':
+			ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
+			self.__graphs, self.__targets, label_names = load_dataset(ds_file)
+		elif ds_name == 'Synthie':
+			pass
 		
 		self.__node_labels = label_names['node_labels']
 		self.__node_attrs = label_names['node_attrs']
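Note on the new symbolic median-label selection: the __get_median_label_symbolic method added to MedianGraphEstimator above picks, for each node or edge of the median graph, the label dictionary that occurs most often among the labels mapped to it. The standalone sketch below mirrors that majority-vote logic outside the class so it can be tried in isolation; the helper name and the sample 'atom' labels are hypothetical and not part of this patch.

# Minimal standalone sketch of the majority-vote logic used by the new
# __get_median_label_symbolic (hypothetical helper, illustration only).
def median_label_symbolic(labels):
	# Build a histogram keyed by the hashable tuple form of each label dict.
	hist = {}
	for label in labels:
		key = tuple(label.items())
		hist[key] = hist.get(key, 0) + 1
	# Return the label dict that appears most frequently.
	best_count = 0
	median_label = {}
	for key, count in hist.items():
		if count > best_count:
			best_count = count
			median_label = dict(key)
	return median_label

# Example: three mapped node labels, two of which agree.
print(median_label_symbolic([{'atom': 'C'}, {'atom': 'N'}, {'atom': 'C'}]))  # -> {'atom': 'C'}

Ties keep the first label encountered, which matches the strict '>' comparison used in the patched method.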