From 66e18c93e1ceda6f307333ab0522a1807c15a35a Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Wed, 8 Apr 2020 18:05:35 +0200 Subject: [PATCH 1/4] 1. add function to get median node/edge label in MedianGraphEstimator. 2. update load_tud function. 3. update MedianPreimageGenerator. --- gklearn/ged/median/median_graph_estimator.py | 147 +++++++++++++++++++++++++- gklearn/preimage/median_preimage_generator.py | 63 ++++++----- gklearn/preimage/utils.py | 10 +- gklearn/utils/dataset.py | 6 +- gklearn/utils/graph_files.py | 20 ++-- 5 files changed, 201 insertions(+), 45 deletions(-) diff --git a/gklearn/ged/median/median_graph_estimator.py b/gklearn/ged/median/median_graph_estimator.py index 84cd64d..0b0cfe2 100644 --- a/gklearn/ged/median/median_graph_estimator.py +++ b/gklearn/ged/median/median_graph_estimator.py @@ -666,7 +666,8 @@ class MedianGraphEstimator(object): # Compute the median label and update the median. if len(node_labels) > 0: - median_label = self.__ged_env.get_median_node_label(node_labels) +# median_label = self.__ged_env.get_median_node_label(node_labels) + median_label = self.__get_median_node_label(node_labels) if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: nx.set_node_attributes(median, {i: median_label}) @@ -701,7 +702,7 @@ class MedianGraphEstimator(object): if median.has_edge(i, j): median_label = median.edges[(i, j)] if self.__labeled_edges and len(edge_labels) > 0: - new_median_label = self.__ged_env.median_edge_label(edge_labels) + new_median_label = self.__get_median_edge_label(edge_labels) if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: median_label = new_median_label for edge_label in edge_labels: @@ -821,4 +822,144 @@ class MedianGraphEstimator(object): def compute_my_cost(g, h, node_map): cost = 0.0 for node in g.nodes: - cost += 0 \ No newline at end of file + cost += 0 + + + def __get_median_node_label(self, node_labels): + if True: + return 
self.__get_median_label_nonsymbolic(node_labels) + else: + return self.__get_median_node_label_symbolic(node_labels) + + + def __get_median_edge_label(self, edge_labels): + if True: + return self.__get_median_label_nonsymbolic(edge_labels) + else: + return self.__get_median_edge_label_symbolic(edge_labels) + + + def __get_median_label_nonsymbolic(self, labels): + if len(labels) == 0: + return {} # @todo + else: + # Transform the labels into coordinates and compute mean label as initial solution. + labels_as_coords = [] + sums = {} + for key, val in labels[0].items(): + sums[key] = 0 + for label in labels: + coords = {} + for key, val in label.items(): + label = float(val) + sums[key] += label + coords[key] = label + labels_as_coords.append(coords) + median = {} + for key, val in sums.items(): + median[key] = val / len(labels) + + # Run main loop of Weiszfeld's Algorithm. + epsilon = 0.0001 + delta = 1.0 + num_itrs = 0 + all_equal = False + while ((delta > epsilon) and (num_itrs < 100) and (not all_equal)): + numerator = {} + for key, val in sums.items(): + numerator[key] = 0 + denominator = 0 + for label_as_coord in labels_as_coords: + norm = 0 + for key, val in label_as_coord.items(): + norm += (val - median[key]) ** 2 + norm += np.sqrt(norm) + if norm > 0: + for key, val in label_as_coord.items(): + numerator[key] += val / norm + denominator += 1.0 / norm + if denominator == 0: + all_equal = True + else: + new_median = {} + delta = 0.0 + for key, val in numerator.items(): + this_median = val / denominator + new_median[key] = this_median + delta += np.abs(median[key] - this_median) + median = new_median + + num_itrs += 1 + + # Transform the solution to strings and return it. 
+ median_label = {} + for key, val in median.items(): + median_label[key] = str(val) + return median_label + + + def __get_median_node_label_symbolic(self, node_labels): + pass + + + def __get_median_edge_label_symbolic(self, edge_labels): + pass + + +# def __get_median_edge_label_nonsymbolic(self, edge_labels): +# if len(edge_labels) == 0: +# return {} +# else: +# # Transform the labels into coordinates and compute mean label as initial solution. +# edge_labels_as_coords = [] +# sums = {} +# for key, val in edge_labels[0].items(): +# sums[key] = 0 +# for edge_label in edge_labels: +# coords = {} +# for key, val in edge_label.items(): +# label = float(val) +# sums[key] += label +# coords[key] = label +# edge_labels_as_coords.append(coords) +# median = {} +# for key, val in sums.items(): +# median[key] = val / len(edge_labels) +# +# # Run main loop of Weiszfeld's Algorithm. +# epsilon = 0.0001 +# delta = 1.0 +# num_itrs = 0 +# all_equal = False +# while ((delta > epsilon) and (num_itrs < 100) and (not all_equal)): +# numerator = {} +# for key, val in sums.items(): +# numerator[key] = 0 +# denominator = 0 +# for edge_label_as_coord in edge_labels_as_coords: +# norm = 0 +# for key, val in edge_label_as_coord.items(): +# norm += (val - median[key]) ** 2 +# norm += np.sqrt(norm) +# if norm > 0: +# for key, val in edge_label_as_coord.items(): +# numerator[key] += val / norm +# denominator += 1.0 / norm +# if denominator == 0: +# all_equal = True +# else: +# new_median = {} +# delta = 0.0 +# for key, val in numerator.items(): +# this_median = val / denominator +# new_median[key] = this_median +# delta += np.abs(median[key] - this_median) +# median = new_median +# +# num_itrs += 1 +# +# # Transform the solution to ged::GXLLabel and return it. 
+# median_label = {} +# for key, val in median.items(): +# median_label[key] = str(val) +# return median_label \ No newline at end of file diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py index ef1d57a..98eaa81 100644 --- a/gklearn/preimage/median_preimage_generator.py +++ b/gklearn/preimage/median_preimage_generator.py @@ -96,7 +96,10 @@ class MedianPreimageGenerator(PreimageGenerator): if self.__runtime_precompute_gm is None: raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.') self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm - self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm)) + if self._kernel_options['normalize']: + self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm)) + else: + self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm) end_precompute_gm = time.time() start -= self.__runtime_precompute_gm @@ -447,31 +450,7 @@ class MedianPreimageGenerator(PreimageGenerator): constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])], np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] prob = cp.Problem(cp.Minimize(cost_fun), constraints) - try: - prob.solve(verbose=True) - except MemoryError as error0: - if self._verbose >= 2: - print('\nUsing solver "OSQP" caused a memory error.') - print('the original error message is\n', error0) - print('solver status: ', prob.status) - print('trying solver "CVXOPT" instead...\n') - try: - prob.solve(solver=cp.CVXOPT, verbose=True) - except Exception as error1: - if self._verbose >= 2: - print('\nAn error occured when using solver "CVXOPT".') - print('the original error message is\n', error1) - print('solver status: ', prob.status) - print('trying solver "MOSEK" instead. 
Notice this solver is commercial and a lisence is required.\n') - prob.solve(solver=cp.MOSEK, verbose=True) - else: - if self._verbose >= 2: - print('solver status: ', prob.status) - else: - if self._verbose >= 2: - print('solver status: ', prob.status) - if self._verbose >= 2: - print() + self.__execute_cvx(prob) edit_costs_new = x.value residual = np.sqrt(prob.value) elif rw_constraints == '2constraints': @@ -551,9 +530,7 @@ class MedianPreimageGenerator(PreimageGenerator): constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])], np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - if self._verbose >= 2: - print(x.value) + self.execute_cvx(prob) edit_costs_new = np.concatenate((x.value, np.array([0.0]))) residual = np.sqrt(prob.value) elif not is_n_attr and is_e_attr: @@ -616,6 +593,34 @@ class MedianPreimageGenerator(PreimageGenerator): return edit_costs_new, residual + def __execute_cvx(self, prob): + try: + prob.solve(verbose=(self._verbose>=2)) + except MemoryError as error0: + if self._verbose >= 2: + print('\nUsing solver "OSQP" caused a memory error.') + print('the original error message is\n', error0) + print('solver status: ', prob.status) + print('trying solver "CVXOPT" instead...\n') + try: + prob.solve(solver=cp.CVXOPT, verbose=(self._verbose>=2)) + except Exception as error1: + if self._verbose >= 2: + print('\nAn error occured when using solver "CVXOPT".') + print('the original error message is\n', error1) + print('solver status: ', prob.status) + print('trying solver "MOSEK" instead. Notice this solver is commercial and a lisence is required.\n') + prob.solve(solver=cp.MOSEK, verbose=(self._verbose>=2)) + else: + if self._verbose >= 2: + print('solver status: ', prob.status) + else: + if self._verbose >= 2: + print('solver status: ', prob.status) + if self._verbose >= 2: + print() + + def __generate_preimage_iam(self): # Set up the ged environment. 
ged_env = gedlibpy.GEDEnv() # @todo: maybe create a ged_env as a private varible. diff --git a/gklearn/preimage/utils.py b/gklearn/preimage/utils.py index a3a661e..0cd50ef 100644 --- a/gklearn/preimage/utils.py +++ b/gklearn/preimage/utils.py @@ -67,8 +67,8 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz' gmfile_exist = os.path.isfile(os.path.abspath(gm_fname)) if gmfile_exist: - gmfile = np.load(gm_fname) - gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list'] + gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe. + gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']] time_precompute_gm_list = gmfile['run_time_list'].tolist() else: gram_matrix_unnorm_list = [] @@ -87,6 +87,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged print('start generating preimage for each class of target...') + idx_offset = 0 for idx, dataset in enumerate(datasets): target = dataset.targets[0] print('\ntarget =', target, '\n') @@ -96,14 +97,15 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged num_graphs = len(dataset.graphs) if num_graphs < 2: print('\nnumber of graphs = ', num_graphs, ', skip.\n') + idx_offset += 1 continue # 2. set parameters. print('2. 
initializing mpg and setting parameters...') if load_gm: if gmfile_exist: - mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx] - mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx] + mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx - idx_offset] + mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx - idx_offset] mpg = MedianPreimageGenerator() mpg.dataset = dataset mpg.set_options(**mpg_options.copy()) diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index 6f5389c..6d5250d 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -92,9 +92,11 @@ class Dataset(object): elif ds_name == 'COIL-RAG': pass elif ds_name == 'COLORS-3': - pass + ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) elif ds_name == 'FRANKENSTEIN': - pass + ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) self.__node_labels = label_names['node_labels'] self.__node_attrs = label_names['node_attrs'] diff --git a/gklearn/utils/graph_files.py b/gklearn/utils/graph_files.py index c00149e..a713958 100644 --- a/gklearn/utils/graph_files.py +++ b/gklearn/utils/graph_files.py @@ -541,10 +541,21 @@ def load_tud(filename): content_gi = open(fgi).read().splitlines() # graph indicator content_am = open(fam).read().splitlines() # adjacency matrix - content_gl = open(fgl).read().splitlines() # graph labels + + # load targets. 
+ if 'fgl' in locals(): + content_targets = open(fgl).read().splitlines() # targets (classification) + targets = [float(i) for i in content_targets] + elif 'fga' in locals(): + content_targets = open(fga).read().splitlines() # targets (regression) + targets = [int(i) for i in content_targets] + if 'class_label_map' in locals(): + targets = [class_label_map[t] for t in targets] + else: + raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.') # create graphs and add nodes - data = [nx.Graph(name=str(i)) for i in range(0, len(content_gl))] + data = [nx.Graph(name=str(i)) for i in range(0, len(content_targets))] if 'fnl' in locals(): content_nl = open(fnl).read().splitlines() # node labels for idx, line in enumerate(content_gi): @@ -619,11 +630,6 @@ def load_tud(filename): for i, a_name in enumerate(label_names['edge_attrs']): data[g].edges[n[0], n[1]][a_name] = attrs[i] - # load targets. - targets = [int(i) for i in content_gl] - if 'class_label_map' in locals(): - targets = [class_label_map[t] for t in targets] - return data, targets, label_names From 45052568ebe2be002f8781d4c44467ee3f1f8402 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 9 Apr 2020 12:09:49 +0200 Subject: [PATCH 2/4] 1. fix bugs for function of getting numbers of edit operations for non-symbolic labels. 2. fix bugs for load_tud. 
--- gklearn/ged/util/util.py | 30 ++++++++++++++++----------- gklearn/preimage/median_preimage_generator.py | 4 ++++ gklearn/utils/dataset.py | 3 ++- gklearn/utils/graph_files.py | 6 ++++-- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index 2ff0103..22601dd 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -57,7 +57,9 @@ def compute_geds(graphs, options={}, parallel=False): ged_env.set_method(options['method'], ged_options_to_string(options)) ged_env.init_method() - # compute ged. + # compute ged. + neo_options = {'edit_cost': options['edit_cost'], + 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']} ged_mat = np.zeros((len(graphs), len(graphs))) if parallel: len_itr = int(len(graphs) * (len(graphs) - 1) / 2) @@ -74,7 +76,7 @@ def compute_geds(graphs, options={}, parallel=False): G_graphs = graphs_toshare G_ged_env = ged_env_toshare G_listID = listID_toshare - do_partial = partial(_wrapper_compute_ged_parallel, options) + do_partial = partial(_wrapper_compute_ged_parallel, neo_options) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID)) iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), desc='computing GEDs', file=sys.stdout) @@ -100,7 +102,7 @@ def compute_geds(graphs, options={}, parallel=False): ged_vec.append(dis) ged_mat[i][j] = dis ged_mat[j][i] = dis - n_eo_tmp = get_nb_edit_operations(graphs[i], graphs[j], pi_forward, pi_backward, edit_cost=options['edit_cost']) + n_eo_tmp = get_nb_edit_operations(graphs[i], graphs[j], pi_forward, pi_backward, **neo_options) n_edit_operations.append(n_eo_tmp) return ged_vec, ged_mat, n_edit_operations @@ -115,7 +117,7 @@ def _wrapper_compute_ged_parallel(options, itr): def _compute_ged_parallel(env, gid1, gid2, g1, g2, options): dis, pi_forward, pi_backward = _compute_ged(env, gid1, gid2, g1, g2) - n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, 
pi_backward, edit_cost=options['edit_cost']) # [0,0,0,0,0,0] + n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward, **options) # [0,0,0,0,0,0] return dis, n_eo_tmp @@ -137,11 +139,14 @@ def _compute_ged(env, gid1, gid2, g1, g2): return dis, pi_forward, pi_backward -def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None): +def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, **kwargs): if edit_cost == 'LETTER' or edit_cost == 'LETTER2': return get_nb_edit_operations_letter(g1, g2, forward_map, backward_map) elif edit_cost == 'NON_SYMBOLIC': - return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map) + node_attrs = kwargs.get('node_attrs', []) + edge_attrs = kwargs.get('edge_attrs', []) + return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, + node_attrs=node_attrs, edge_attrs=edge_attrs) else: return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map) @@ -242,7 +247,8 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er -def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map): +def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, + node_attrs=[], edge_attrs=[]): """Compute the number of each edit operations. 
""" n_vi = 0 @@ -261,7 +267,7 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map): else: n_vs += 1 sum_squares = 0 - for a_name in g1.graph['node_attrs']: + for a_name in node_attrs: diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name]) sum_squares += np.square(diff) sod_vs += np.sqrt(sum_squares) @@ -284,15 +290,15 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map): elif (n1_g2, n2_g2) in g2.edges(): n_es += 1 sum_squares = 0 - for a_name in g1.graph['edge_attrs']: - diff = float(g1.edges[n1, n2][a_name]) - float(g2.nodes[n1_g2, n2_g2][a_name]) + for a_name in edge_attrs: + diff = float(g1.edges[n1, n2][a_name]) - float(g2.edges[n1_g2, n2_g2][a_name]) sum_squares += np.square(diff) sod_es += np.sqrt(sum_squares) elif (n2_g2, n1_g2) in g2.edges(): n_es += 1 sum_squares = 0 - for a_name in g1.graph['edge_attrs']: - diff = float(g1.edges[n2, n1][a_name]) - float(g2.nodes[n2_g2, n1_g2][a_name]) + for a_name in edge_attrs: + diff = float(g1.edges[n2, n1][a_name]) - float(g2.edges[n2_g2, n1_g2][a_name]) sum_squares += np.square(diff) sod_es += np.sqrt(sum_squares) # corresponding nodes are in g2, however the edge is removed. 
diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py index 98eaa81..916cf8a 100644 --- a/gklearn/preimage/median_preimage_generator.py +++ b/gklearn/preimage/median_preimage_generator.py @@ -262,6 +262,8 @@ class MedianPreimageGenerator(PreimageGenerator): self.__edit_cost_constants = self.__init_ecc options = self.__ged_options.copy() options['edit_cost_constants'] = self.__edit_cost_constants # @todo + options['node_attrs'] = self._dataset.node_attrs + options['edge_attrs'] = self._dataset.edge_attrs ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel) residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] time_list = [time.time() - time0] @@ -300,6 +302,8 @@ class MedianPreimageGenerator(PreimageGenerator): # compute new GEDs and numbers of edit operations. options = self.__ged_options.copy() # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75]) options['edit_cost_constants'] = self.__edit_cost_constants # @todo + options['node_attrs'] = self._dataset.node_attrs + options['edge_attrs'] = self._dataset.edge_attrs ged_vec, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel) residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) time_list.append(time.time() - time0) diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index 6d5250d..c90073f 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -90,7 +90,8 @@ class Dataset(object): ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) elif ds_name == 'COIL-RAG': - pass + ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) elif ds_name == 'COLORS-3': ds_file = current_path + 
'../../datasets/COLORS-3/COLORS-3_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) diff --git a/gklearn/utils/graph_files.py b/gklearn/utils/graph_files.py index a713958..7f424d6 100644 --- a/gklearn/utils/graph_files.py +++ b/gklearn/utils/graph_files.py @@ -474,6 +474,7 @@ def load_tud(filename): label_names = {'node_labels': [], 'node_attrs': [], 'edge_labels': [], 'edge_attrs': []} + class_label_map = None class_label_map_strings = [] content_rm = open(frm).read().splitlines() i = 0 @@ -538,6 +539,7 @@ def load_tud(filename): else: label_names = {'node_labels': [], 'node_attrs': [], 'edge_labels': [], 'edge_attrs': []} + class_label_map = None content_gi = open(fgi).read().splitlines() # graph indicator content_am = open(fam).read().splitlines() # adjacency matrix @@ -549,7 +551,7 @@ def load_tud(filename): elif 'fga' in locals(): content_targets = open(fga).read().splitlines() # targets (regression) targets = [int(i) for i in content_targets] - if 'class_label_map' in locals(): + if class_label_map is not None: targets = [class_label_map[t] for t in targets] else: raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.') @@ -562,7 +564,7 @@ def load_tud(filename): # transfer to int first in case of unexpected blanks data[int(line) - 1].add_node(idx) labels = [l.strip() for l in content_nl[idx].split(',')] - if label_names['node_labels'] == []: + if label_names['node_labels'] == []: # @todo: need fix bug. for i, label in enumerate(labels): l_name = 'label_' + str(i) data[int(line) - 1].nodes[idx][l_name] = label From 8385bc4caea6361b48af915a111a5650a4f30f01 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 10 Apr 2020 10:35:54 +0200 Subject: [PATCH 3/4] Update MedianPreimageGenerator and MedianGraphEstimator for symbolic-labeled graphs and edit cost "CONSTANT". 
--- .gitignore | 1 + gklearn/ged/median/median_graph_estimator.py | 48 +++++++++++++++++++++------ gklearn/ged/median/utils.py | 4 +++ gklearn/ged/util/util.py | 38 +++++++++++++++------ gklearn/preimage/median_preimage_generator.py | 32 ++++++++++++++---- gklearn/preimage/utils.py | 2 +- gklearn/utils/dataset.py | 41 +++++++++++++---------- 7 files changed, 120 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 0f4ea83..79e102c 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ gklearn/kernels/*_sym.py gklearn/preimage/* !gklearn/preimage/*.py +!gklearn/preimage/experiments/*.py __pycache__ ##*# diff --git a/gklearn/ged/median/median_graph_estimator.py b/gklearn/ged/median/median_graph_estimator.py index 0b0cfe2..7cbb6d6 100644 --- a/gklearn/ged/median/median_graph_estimator.py +++ b/gklearn/ged/median/median_graph_estimator.py @@ -70,6 +70,7 @@ class MedianGraphEstimator(object): self.__num_increase_order = 0 self.__num_converged_descents = 0 self.__state = AlgorithmState.TERMINATED + self.__label_names = {} if ged_env is None: raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') @@ -551,6 +552,7 @@ class MedianGraphEstimator(object): self.__init_type_increase_order = 'K-MEANS++' self.__max_itrs_increase_order = 10 self.__print_to_stdout = 2 + self.__label_names = {} def __construct_initial_medians(self, graph_ids, timer, initial_medians): @@ -824,19 +826,49 @@ class MedianGraphEstimator(object): for node in g.nodes: cost += 0 + + def set_label_names(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): + self.__label_names = {'node_labels': node_labels, 'edge_labels': edge_labels, + 'node_attrs': node_attrs, 'edge_attrs': edge_attrs} + def __get_median_node_label(self, node_labels): - if True: + if len(self.__label_names['node_labels']) > 0: + return self.__get_median_label_symbolic(node_labels) + elif len(self.__label_names['node_attrs']) > 0: return 
self.__get_median_label_nonsymbolic(node_labels) else: - return self.__get_median_node_label_symbolic(node_labels) + raise Exception('Node label names are not given.') def __get_median_edge_label(self, edge_labels): - if True: + if len(self.__label_names['edge_labels']) > 0: + return self.__get_median_label_symbolic(edge_labels) + elif len(self.__label_names['edge_attrs']) > 0: return self.__get_median_label_nonsymbolic(edge_labels) else: - return self.__get_median_edge_label_symbolic(edge_labels) + raise Exception('Edge label names are not given.') + + + def __get_median_label_symbolic(self, labels): + # Construct histogram. + hist = {} + for label in labels: + label = tuple([kv for kv in label.items()]) # @todo: this may be slow. + if label not in hist: + hist[label] = 1 + else: + hist[label] += 1 + + # Return the label that appears most frequently. + best_count = 0 + median_label = {} + for label, count in hist.items(): + if count > best_count: + best_count = count + median_label = {kv[0]: kv[1] for kv in label} + + return median_label def __get_median_label_nonsymbolic(self, labels): @@ -896,14 +928,10 @@ class MedianGraphEstimator(object): for key, val in median.items(): median_label[key] = str(val) return median_label - - - def __get_median_node_label_symbolic(self, node_labels): - pass - def __get_median_edge_label_symbolic(self, edge_labels): - pass +# def __get_median_edge_label_symbolic(self, edge_labels): +# pass # def __get_median_edge_label_nonsymbolic(self, edge_labels): diff --git a/gklearn/ged/median/utils.py b/gklearn/ged/median/utils.py index 741320c..f6d43bb 100644 --- a/gklearn/ged/median/utils.py +++ b/gklearn/ged/median/utils.py @@ -9,6 +9,10 @@ Created on Wed Apr 1 15:12:31 2020 def constant_node_costs(edit_cost_name): if edit_cost_name == 'NON_SYMBOLIC' or edit_cost_name == 'LETTER2' or edit_cost_name == 'LETTER': return False + elif edit_cost_name == 'CONSTANT': + return True + else: + raise Exception('Can not recognize the given edit cost. 
Possible edit costs include: "NON_SYMBOLIC", "LETTER", "LETTER2", "CONSTANT".') # elif edit_cost_name != '': # # throw ged::Error("Invalid dataset " + dataset + ". Usage: ./median_tests "); # return False diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index 22601dd..a18b0cb 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -58,7 +58,8 @@ def compute_geds(graphs, options={}, parallel=False): ged_env.init_method() # compute ged. - neo_options = {'edit_cost': options['edit_cost'], + neo_options = {'edit_cost': options['edit_cost'], + 'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'], 'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']} ged_mat = np.zeros((len(graphs), len(graphs))) if parallel: @@ -147,12 +148,18 @@ def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, ** edge_attrs = kwargs.get('edge_attrs', []) return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map, node_attrs=node_attrs, edge_attrs=edge_attrs) + elif edit_cost == 'CONSTANT': + node_labels = kwargs.get('node_labels', []) + edge_labels = kwargs.get('edge_labels', []) + return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, + node_labels=node_labels, edge_labels=edge_labels) else: return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map) -def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map): - """Compute the number of each edit operations. +def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map, + node_labels=[], edge_labels=[]): + """Compute the number of each edit operations for symbolic-labeled graphs. 
""" n_vi = 0 n_vr = 0 @@ -165,8 +172,13 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map): for i, map_i in enumerate(forward_map): if map_i == np.inf: n_vr += 1 - elif g1.node[nodes1[i]]['atom'] != g2.node[map_i]['atom']: - n_vs += 1 + else: + for nl in node_labels: + label1 = g1.nodes[nodes1[i]][nl] + label2 = g2.nodes[map_i][nl] + if label1 != label2: + n_vs += 1 + break for map_i in backward_map: if map_i == np.inf: n_vi += 1 @@ -185,15 +197,21 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map): elif (forward_map[idx1], forward_map[idx2]) in g2.edges(): nb_edges2_cnted += 1 # edge labels are different. - if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \ - != g1.edges[(n1, n2)]['bond_type']: + for el in edge_labels: + label1 = g2.edges[((forward_map[idx1], forward_map[idx2]))][el] + label2 = g1.edges[(n1, n2)][el] + if label1 != label2: n_es += 1 + break elif (forward_map[idx2], forward_map[idx1]) in g2.edges(): nb_edges2_cnted += 1 # edge labels are different. - if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \ - != g1.edges[(n1, n2)]['bond_type']: - n_es += 1 + for el in edge_labels: + label1 = g2.edges[((forward_map[idx2], forward_map[idx1]))][el] + label2 = g1.edges[(n1, n2)][el] + if label1 != label2: + n_es += 1 + break # corresponding nodes are in g2, however the edge is removed. 
else: n_er += 1 diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py index 916cf8a..6878701 100644 --- a/gklearn/preimage/median_preimage_generator.py +++ b/gklearn/preimage/median_preimage_generator.py @@ -262,6 +262,8 @@ class MedianPreimageGenerator(PreimageGenerator): self.__edit_cost_constants = self.__init_ecc options = self.__ged_options.copy() options['edit_cost_constants'] = self.__edit_cost_constants # @todo + options['node_labels'] = self._dataset.node_labels + options['edge_labels'] = self._dataset.edge_labels options['node_attrs'] = self._dataset.node_attrs options['edge_attrs'] = self._dataset.edge_attrs ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel) @@ -302,6 +304,8 @@ class MedianPreimageGenerator(PreimageGenerator): # compute new GEDs and numbers of edit operations. options = self.__ged_options.copy() # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75]) options['edit_cost_constants'] = self.__edit_cost_constants # @todo + options['node_labels'] = self._dataset.node_labels + options['edge_labels'] = self._dataset.edge_labels options['node_attrs'] = self._dataset.node_attrs options['edge_attrs'] = self._dataset.edge_attrs ged_vec, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel) @@ -451,7 +455,7 @@ class MedianPreimageGenerator(PreimageGenerator): nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] x = cp.Variable(nb_cost_mat_new.shape[1]) cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])], + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] prob = cp.Problem(cp.Minimize(cost_fun), constraints) self.__execute_cvx(prob) @@ -524,17 +528,17 @@ class MedianPreimageGenerator(PreimageGenerator): np.array([1.0, 1.0, -1.0, 0.0, 0.0, 
0.0]).T@x >= 0.0, np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() + self.__execute_cvx(prob) edit_costs_new = x.value residual = np.sqrt(prob.value) elif is_n_attr and not is_e_attr: nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] x = cp.Variable(nb_cost_mat_new.shape[1]) cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])], + constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.execute_cvx(prob) + self.__execute_cvx(prob) edit_costs_new = np.concatenate((x.value, np.array([0.0]))) residual = np.sqrt(prob.value) elif not is_n_attr and is_e_attr: @@ -544,7 +548,7 @@ class MedianPreimageGenerator(PreimageGenerator): constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() + self.__execute_cvx(prob) edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) residual = np.sqrt(prob.value) else: @@ -553,10 +557,20 @@ class MedianPreimageGenerator(PreimageGenerator): cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() + self.__execute_cvx(prob) edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:], np.array([0.0]))) residual = np.sqrt(prob.value) + elif self.__ged_options['edit_cost'] == 'CONSTANT': # @todo: node/edge may not labeled. 
+ x = cp.Variable(nb_cost_mat.shape[1]) + cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost_fun), constraints) + self.__execute_cvx(prob) + edit_costs_new = x.value + residual = np.sqrt(prob.value) else: # # method 1: simple least square method. # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, @@ -588,7 +602,7 @@ class MedianPreimageGenerator(PreimageGenerator): np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() + self.__execute_cvx(prob) edit_costs_new = x.value residual = np.sqrt(prob.value) @@ -647,6 +661,10 @@ class MedianPreimageGenerator(PreimageGenerator): # Select the GED algorithm. mge.set_options(mge_options_to_string(options)) + mge.set_label_names(node_labels=self._dataset.node_labels, + edge_labels=self._dataset.edge_labels, + node_attrs=self._dataset.node_attrs, + edge_attrs=self._dataset.edge_attrs) mge.set_init_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options)) mge.set_descent_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options)) diff --git a/gklearn/preimage/utils.py b/gklearn/preimage/utils.py index 0cd50ef..63c8b9e 100644 --- a/gklearn/preimage/utils.py +++ b/gklearn/preimage/utils.py @@ -37,7 +37,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged dataset_all.trim_dataset(edge_required=edge_required) if irrelevant_labels is not None: dataset_all.remove_labels(**irrelevant_labels) -# dataset_all.cut_graphs(range(0, 100)) +# dataset_all.cut_graphs(range(0, 10)) datasets = split_dataset_by_target(dataset_all) if save_results: diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index 
c90073f..0c13e0f 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -67,24 +67,7 @@ class Dataset(object): def load_predefined_dataset(self, ds_name): current_path = os.path.dirname(os.path.realpath(__file__)) + '/' - if ds_name == 'Letter-high': # node non-symb - ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt' - self.__graphs, self.__targets, label_names = load_dataset(ds_file) - elif ds_name == 'Letter-med': # node non-symb - ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt' - self.__graphs, self.__targets, label_names = load_dataset(ds_file) - elif ds_name == 'Letter-low': # node non-symb - ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt' - self.__graphs, self.__targets, label_names = load_dataset(ds_file) - elif ds_name == 'Fingerprint': - ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' - self.__graphs, self.__targets, label_names = load_dataset(ds_file) - elif ds_name == 'SYNTHETIC': - pass - elif ds_name == 'SYNTHETICnew': - ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' - self.__graphs, self.__targets, label_names = load_dataset(ds_file) - elif ds_name == 'Synthie': + if ds_name == 'acyclic': pass elif ds_name == 'COIL-DEL': ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' @@ -95,9 +78,31 @@ class Dataset(object): elif ds_name == 'COLORS-3': ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'Fingerprint': + ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) elif ds_name == 'FRANKENSTEIN': ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'Letter-high': # node non-symb + ds_file = current_path + 
'../../datasets/Letter-high/Letter-high_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'Letter-low': # node non-symb + ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'Letter-med': # node non-symb + ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'MUTAG': + ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'SYNTHETIC': + pass + elif ds_name == 'SYNTHETICnew': + ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'Synthie': + pass self.__node_labels = label_names['node_labels'] self.__node_attrs = label_names['node_attrs'] From a9197b9b49b631ad8e54f1a1c7dc2b7f146db7a3 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 10 Apr 2020 10:56:47 +0200 Subject: [PATCH 4/4] Update .travis.yml. Test the library on more Python versions. --- .travis.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 66d519e..4ad5f90 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,10 @@ language: python python: -- '3.6.9' +- '3.0' +- '3.1' +- '3.2' +- '3.3' +- '3.4' - '3.5' - '3.6' - '3.7'