From 06c35201b264218baac2cb05dc058e2314a45858 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 26 May 2020 15:55:58 +0200 Subject: [PATCH] Add option to sort graphs by size when computing GEDs for the sake of the stability. --- gklearn/ged/env/node_map.py | 28 ++-- gklearn/ged/median/median_graph_estimator.py | 146 ++++++++++++++++----- gklearn/ged/median/utils.py | 2 + gklearn/ged/util/util.py | 24 ++-- .../tools/analyze_results_of_random_edit_costs.py | 9 +- .../experiments/xp_1nn_init10_trianglerule.py | 24 ++-- gklearn/preimage/kernel_knn_cv.py | 2 +- 7 files changed, 166 insertions(+), 69 deletions(-) diff --git a/gklearn/ged/env/node_map.py b/gklearn/ged/env/node_map.py index 4812486..dc3e3bf 100644 --- a/gklearn/ged/env/node_map.py +++ b/gklearn/ged/env/node_map.py @@ -39,14 +39,6 @@ class NodeMap(object): return np.inf - def get_forward_map(self): - return self.__forward_map - - - def get_backward_map(self): - return self.__backward_map - - def as_relation(self, relation): relation.clear() for i in range(0, len(self.__forward_map)): @@ -77,4 +69,22 @@ class NodeMap(object): def induced_cost(self): - return self.__induced_cost \ No newline at end of file + return self.__induced_cost + + + @property + def forward_map(self): + return self.__forward_map + + @forward_map.setter + def forward_map(self, value): + self.__forward_map = value + + + @property + def backward_map(self): + return self.__backward_map + + @backward_map.setter + def backward_map(self, value): + self.__backward_map = value \ No newline at end of file diff --git a/gklearn/ged/median/median_graph_estimator.py b/gklearn/ged/median/median_graph_estimator.py index c4291ce..03c7892 100644 --- a/gklearn/ged/median/median_graph_estimator.py +++ b/gklearn/ged/median/median_graph_estimator.py @@ -52,6 +52,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no self.__seed = 0 self.__parallel = True self.__update_order = True + self.__sort_graphs = True # sort graphs by size when computing GEDs. self.__refine = True self.__time_limit_in_sec = 0 self.__epsilon = 0.0001 @@ -150,6 +151,16 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no else: raise Exception('Invalid argument "' + opt_val + '" for option update-order. Usage: options = "[--update-order TRUE|FALSE] [...]"') + elif opt_name == 'sort-graphs': + if opt_val == 'TRUE': + self.__sort_graphs = True + + elif opt_val == 'FALSE': + self.__sort_graphs = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option sort-graphs. Usage: options = "[--sort-graphs TRUE|FALSE] [...]"') + elif opt_name == 'refine': if opt_val == 'TRUE': self.__refine = True @@ -537,18 +548,31 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no progress.update(1) # Improving the node maps. + nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__gen_median_id) for graph_id, node_map in self.__node_maps_from_median.items(): if time.expired(): if self.__state == AlgorithmState.TERMINATED: self.__state = AlgorithmState.CONVERGED break - self.__ged_env.run_method(self.__gen_median_id, graph_id) - if self.__ged_env.get_upper_bound(self.__gen_median_id, graph_id) < node_map.induced_cost(): - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__gen_median_id, graph_id) - self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() + + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(self.__gen_median_id, graph_id) + if self.__ged_env.get_upper_bound(self.__gen_median_id, graph_id) < node_map.induced_cost(): + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__gen_median_id, graph_id) + else: + self.__ged_env.run_method(graph_id, self.__gen_median_id) + if self.__ged_env.get_upper_bound(graph_id, self.__gen_median_id) < node_map.induced_cost(): + node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__gen_median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp + + self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() + # Print information. if self.__print_to_stdout == 2: progress.update(1) + self.__sum_of_distances = 0.0 for key, val in self.__node_maps_from_median.items(): self.__sum_of_distances += val.induced_cost() @@ -636,6 +660,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no self.__seed = 0 self.__parallel = True self.__update_order = True + self.__sort_graphs = True self.__refine = True self.__time_limit_in_sec = 0 self.__epsilon = 0.0001 @@ -695,7 +720,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - do_fun = partial(_compute_medoid_parallel, graph_ids) + do_fun = partial(_compute_medoid_parallel, graph_ids, self.__sort_graphs) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) if self.__print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), @@ -723,10 +748,16 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no if timer.expired(): self.__state = AlgorithmState.CALLED break + nb_nodes_g = self.__ged_env.get_graph_num_nodes(g_id) sum_of_distances = 0 - for h_id in graph_ids: - self.__ged_env.run_method(g_id, h_id) - sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) + for h_id in graph_ids: + nb_nodes_h = self.__ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not self.__sort_graphs: + self.__ged_env.run_method(g_id, h_id) + sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) + else: + self.__ged_env.run_method(h_id, g_id) + sum_of_distances += self.__ged_env.get_upper_bound(h_id, g_id) if sum_of_distances < best_sum_of_distances: best_sum_of_distances = sum_of_distances medoid_id = g_id @@ -760,7 +791,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - do_fun = partial(_compute_init_node_maps_parallel, gen_median_id) + nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self.__sort_graphs, nb_nodes_median) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) if self.__print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), @@ -783,9 +815,17 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no self.__sum_of_distances = 0 self.__node_maps_from_median.clear() + nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) for graph_id in graph_ids: - self.__ged_env.run_method(gen_median_id, graph_id) - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(gen_median_id, graph_id) + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) + else: + self.__ged_env.run_method(graph_id, gen_median_id) + node_map_tmp = self.__ged_env.get_node_map(graph_id, gen_median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp # print(self.__node_maps_from_median[graph_id]) self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() # print(self.__sum_of_distances) @@ -843,7 +883,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no for graph_id, graph in graphs.items(): # print('graph_id: ', graph_id) # print(self.__node_maps_from_median[graph_id]) -# print(self.__node_maps_from_median[graph_id].get_forward_map(), self.__node_maps_from_median[graph_id].get_backward_map()) +# print(self.__node_maps_from_median[graph_id].forward_map, self.__node_maps_from_median[graph_id].backward_map) k = self.__node_maps_from_median[graph_id].image(i) # print('k: ', k) if k != np.inf: @@ -920,7 +960,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no def init_worker(ged_env_toshare): global G_ged_env G_ged_env = ged_env_toshare - do_fun = partial(_update_node_maps_parallel, self.__median_id, self.__epsilon) + nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) + do_fun = partial(_update_node_maps_parallel, self.__median_id, self.__epsilon, self.__sort_graphs, nb_nodes_median) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) if self.__print_to_stdout == 2: iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), @@ -941,13 +982,25 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout) node_maps_were_modified = False + nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) for graph_id, node_map in self.__node_maps_from_median.items(): - self.__ged_env.run_method(self.__median_id, graph_id) - if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon: - # xxx = self.__node_maps_from_median[graph_id] - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) - # yyy = self.__node_maps_from_median[graph_id] - node_maps_were_modified = True + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(self.__median_id, graph_id) + if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon: + # xxx = self.__node_maps_from_median[graph_id] + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) + node_maps_were_modified = True + + else: + self.__ged_env.run_method(graph_id, self.__median_id) + if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost() - self.__epsilon: + node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp + node_maps_were_modified = True + # Print information about current iteration. if self.__print_to_stdout == 2: progress.update(1) @@ -1047,8 +1100,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no for k in range(0, node_map.num_target_nodes()): if is_unassigned_target_node[k]: new_node_map.add_assignment(np.inf, k) -# print(self.__node_maps_from_median[key].get_forward_map(), self.__node_maps_from_median[key].get_backward_map()) -# print(new_node_map.get_forward_map(), new_node_map.get_backward_map()) +# print(self.__node_maps_from_median[key].forward_map, self.__node_maps_from_median[key].backward_map) +# print(new_node_map.forward_map, new_node_map.backward_map self.__node_maps_from_median[key] = new_node_map # Increase overall number of decreases. @@ -1599,37 +1652,58 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no # return median_label -def _compute_medoid_parallel(graph_ids, itr): +def _compute_medoid_parallel(graph_ids, sort, itr): g_id = itr[0] i = itr[1] # @todo: timer not considered here. # if timer.expired(): # self.__state = AlgorithmState.CALLED # break + nb_nodes_g = G_ged_env.get_graph_num_nodes(g_id) sum_of_distances = 0 for h_id in graph_ids: - G_ged_env.run_method(g_id, h_id) - sum_of_distances += G_ged_env.get_upper_bound(g_id, h_id) + nb_nodes_h = G_ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not sort: + G_ged_env.run_method(g_id, h_id) + sum_of_distances += G_ged_env.get_upper_bound(g_id, h_id) + else: + G_ged_env.run_method(h_id, g_id) + sum_of_distances += G_ged_env.get_upper_bound(h_id, g_id) return i, sum_of_distances + - -def _compute_init_node_maps_parallel(gen_median_id, itr): +def _compute_init_node_maps_parallel(gen_median_id, sort, nb_nodes_median, itr): graph_id = itr - G_ged_env.run_method(gen_median_id, graph_id) - node_maps_from_median = G_ged_env.get_node_map(gen_median_id, graph_id) + nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not sort: + G_ged_env.run_method(gen_median_id, graph_id) + node_map = G_ged_env.get_node_map(gen_median_id, graph_id) # print(self.__node_maps_from_median[graph_id]) - sum_of_distance = node_maps_from_median.induced_cost() + else: + G_ged_env.run_method(graph_id, gen_median_id) + node_map = G_ged_env.get_node_map(graph_id, gen_median_id) + node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map + sum_of_distance = node_map.induced_cost() # print(self.__sum_of_distances) - return graph_id, sum_of_distance, node_maps_from_median - + return graph_id, sum_of_distance, node_map + -def _update_node_maps_parallel(median_id, epsilon, itr): +def _update_node_maps_parallel(median_id, epsilon, sort, nb_nodes_median, itr): graph_id = itr[0] node_map = itr[1] node_maps_were_modified = False - G_ged_env.run_method(median_id, graph_id) - if G_ged_env.get_upper_bound(median_id, graph_id) < node_map.induced_cost() - epsilon: - node_map = G_ged_env.get_node_map(median_id, graph_id) - node_maps_were_modified = True + nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not sort: + G_ged_env.run_method(median_id, graph_id) + if G_ged_env.get_upper_bound(median_id, graph_id) < node_map.induced_cost() - epsilon: + node_map = G_ged_env.get_node_map(median_id, graph_id) + node_maps_were_modified = True + else: + G_ged_env.run_method(graph_id, median_id) + if G_ged_env.get_upper_bound(graph_id, median_id) < node_map.induced_cost() - epsilon: + node_map = G_ged_env.get_node_map(graph_id, median_id) + node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map + node_maps_were_modified = True + return graph_id, node_map, node_maps_were_modified \ No newline at end of file diff --git a/gklearn/ged/median/utils.py b/gklearn/ged/median/utils.py index 5c4c52f..d27c86d 100644 --- a/gklearn/ged/median/utils.py +++ b/gklearn/ged/median/utils.py @@ -34,6 +34,8 @@ def mge_options_to_string(options): opt_str += '--parallel ' + ('TRUE' if val else 'FALSE') + ' ' elif key == 'update_order': opt_str += '--update-order ' + ('TRUE' if val else 'FALSE') + ' ' + elif key == 'sort_graphs': + opt_str += '--sort-graphs ' + ('TRUE' if val else 'FALSE') + ' ' elif key == 'refine': opt_str += '--refine ' + ('TRUE' if val else 'FALSE') + ' ' elif key == 'time_limit': diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py index c41ca86..7032345 100644 --- a/gklearn/ged/util/util.py +++ b/gklearn/ged/util/util.py @@ -46,7 +46,7 @@ def compute_ged(g1, g2, options): return dis, pi_forward, pi_backward -def compute_geds(graphs, options={}, parallel=False, verbose=True): +def compute_geds(graphs, options={}, sort=True, parallel=False, verbose=True): # initialize ged env. ged_env = gedlibpy.GEDEnv() ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants']) @@ -79,7 +79,7 @@ def compute_geds(graphs, options={}, parallel=False, verbose=True): G_graphs = graphs_toshare G_ged_env = ged_env_toshare G_listID = listID_toshare - do_partial = partial(_wrapper_compute_ged_parallel, neo_options) + do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID)) if verbose: iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), @@ -108,25 +108,31 @@ def compute_geds(graphs, options={}, parallel=False, verbose=True): for i in iterator: # for i in range(len(graphs)): for j in range(i + 1, len(graphs)): - dis, pi_forward, pi_backward = _compute_ged(ged_env, listID[i], listID[j], graphs[i], graphs[j]) + if nx.number_of_nodes(graphs[i]) <= nx.number_of_nodes(graphs[j]) or not sort: + dis, pi_forward, pi_backward = _compute_ged(ged_env, listID[i], listID[j], graphs[i], graphs[j]) + else: + dis, pi_backward, pi_forward = _compute_ged(ged_env, listID[j], listID[i], graphs[j], graphs[i]) ged_vec.append(dis) ged_mat[i][j] = dis ged_mat[j][i] = dis - n_eo_tmp = get_nb_edit_operations(graphs[i], graphs[j], pi_forward, pi_backward, **neo_options) + n_eo_tmp = get_nb_edit_operations(graphs[i], graphs[j], pi_forward, pi_backward, **neo_options) n_edit_operations.append(n_eo_tmp) - + return ged_vec, ged_mat, n_edit_operations -def _wrapper_compute_ged_parallel(options, itr): +def _wrapper_compute_ged_parallel(options, sort, itr): i = itr[0] j = itr[1] - dis, n_eo_tmp = _compute_ged_parallel(G_ged_env, G_listID[i], G_listID[j], G_graphs[i], G_graphs[j], options) + dis, n_eo_tmp = _compute_ged_parallel(G_ged_env, G_listID[i], G_listID[j], G_graphs[i], G_graphs[j], options, sort) return i, j, dis, n_eo_tmp -def _compute_ged_parallel(env, gid1, gid2, g1, g2, options): - dis, pi_forward, pi_backward = _compute_ged(env, gid1, gid2, g1, g2) +def _compute_ged_parallel(env, gid1, gid2, g1, g2, options, sort): + if nx.number_of_nodes(g1) <= nx.number_of_nodes(g2) or not sort: + dis, pi_forward, pi_backward = _compute_ged(env, gid1, gid2, g1, g2) + else: + dis, pi_backward, pi_forward = _compute_ged(env, gid2, gid1, g2, g1) n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward, **options) # [0,0,0,0,0,0] return dis, n_eo_tmp diff --git a/gklearn/preimage/experiments/tools/analyze_results_of_random_edit_costs.py b/gklearn/preimage/experiments/tools/analyze_results_of_random_edit_costs.py index 4dc20d3..bbde39e 100644 --- a/gklearn/preimage/experiments/tools/analyze_results_of_random_edit_costs.py +++ b/gklearn/preimage/experiments/tools/analyze_results_of_random_edit_costs.py @@ -82,5 +82,10 @@ def compute_for_all_experiments(data_dir): if __name__ == '__main__': # data_dir = '../results/xp_median_preimage.update_order/' - data_dir = '../../results/CRIANN/xp_median_preimage.init10/' - compute_for_all_experiments(data_dir) \ No newline at end of file + root_dir_tnz = '../../results/CRIANN/xp_median_preimage.init10/' + root_dir_ntnz = '../../results/CRIANN/xp_median_preimage.init10.no_triangle_rule/' + root_dir_tz = '../../results/CRIANN/xp_median_preimage.init10.triangle_rule.allow_zeros/' + root_dir_ntz = '../../results/CRIANN/xp_median_preimage.init10.no_triangle_rule.allow_zeros/' + data_dirs = [root_dir_tnz, root_dir_ntnz, root_dir_tz, root_dir_ntz] + for data_dir in data_dirs: + compute_for_all_experiments(data_dir) \ No newline at end of file diff --git a/gklearn/preimage/experiments/xp_1nn_init10_trianglerule.py b/gklearn/preimage/experiments/xp_1nn_init10_trianglerule.py index 9af5828..9d7809d 100644 --- a/gklearn/preimage/experiments/xp_1nn_init10_trianglerule.py +++ b/gklearn/preimage/experiments/xp_1nn_init10_trianglerule.py @@ -2958,18 +2958,6 @@ if __name__ == "__main__": # #### xp 7_1: MUTAG, StructuralSP, using CONSTANT. xp_median_preimage_7_1() -# #### xp 8_2: Monoterpenoides, PathUpToH, using CONSTANT. - xp_median_preimage_8_2() - -# #### xp 8_3: Monoterpenoides, Treelet, using CONSTANT. - xp_median_preimage_8_3() - -# #### xp 8_4: Monoterpenoides, WeisfeilerLehman, using CONSTANT. - xp_median_preimage_8_4() - -# #### xp 8_1: Monoterpenoides, StructuralSP, using CONSTANT. - xp_median_preimage_8_1() - # #### xp 9_2: MAO, PathUpToH, using CONSTANT, symbolic only. xp_median_preimage_9_2() @@ -2999,6 +2987,18 @@ if __name__ == "__main__": # #### xp 6_1: COIL-RAG, StructuralSP, using NON_SYMBOLIC. xp_median_preimage_6_1() + +# #### xp 8_2: Monoterpenoides, PathUpToH, using CONSTANT. + xp_median_preimage_8_2() + +# #### xp 8_3: Monoterpenoides, Treelet, using CONSTANT. + xp_median_preimage_8_3() + +# #### xp 8_4: Monoterpenoides, WeisfeilerLehman, using CONSTANT. + xp_median_preimage_8_4() + +# #### xp 8_1: Monoterpenoides, StructuralSP, using CONSTANT. + xp_median_preimage_8_1() # # #### xp 2_1: COIL-DEL, StructuralSP, using LETTER2, only node attrs. xp_median_preimage_2_1() diff --git a/gklearn/preimage/kernel_knn_cv.py b/gklearn/preimage/kernel_knn_cv.py index 2faf4ba..3e5e88b 100644 --- a/gklearn/preimage/kernel_knn_cv.py +++ b/gklearn/preimage/kernel_knn_cv.py @@ -151,7 +151,7 @@ def __kernel_knn_cv_median(dataset_all, ds_name, knn_options, mpg_options, kerne # write result summary for each letter. if save_results: f_summary = open(dir_save + fn_output_summary, 'a') - for i, median_type in enumerate('set-median', 'gen median', 'gen median uo'): + for i, median_type in enumerate(['set-median', 'gen median', 'gen median uo']): csv.writer(f_summary).writerow([ds_name, kernel_options['name'], train_examples + ': ' + median_type, knn_options['n_neighbors'],