#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 1 11:37:57 2020

@author: ljia
"""
import multiprocessing
import numpy as np
import networkx as nx
import os
from gklearn.preimage import RandomPreimageGenerator
from gklearn.utils import Dataset


# Root directory under which Gram matrices and results are saved.
dir_root = '../results/xp_random_preimage_generation/'


def xp_random_preimage_generation():
	"""
	Experiment similar to the one in Bakir's paper. A test to check if
	RandomPreimageGenerator class works correctly.

	For each weight alpha1 in linspace(0, 1, 11), a pre-image of the point
	alpha1 * g1 + (1 - alpha1) * g2 in kernel space is generated, where g1
	and g2 are two hand-built graphs prepended to the MUTAG dataset (all
	other dataset graphs get weight 0).

	Returns
	-------
	k_dis_datasets : list of float
		For each alpha1, the distance in kernel space between the target
		point and its nearest graph in the dataset.
	k_dis_preimages : list of float
		For each alpha1, the distance in kernel space between the target
		point and the generated pre-image.
	bests_from_dataset : list of networkx.Graph
		For each alpha1, the nearest graph from the dataset.
	preimages : list of networkx.Graph
		For each alpha1, the generated pre-image graph.
	"""
	alpha1_list = np.linspace(0, 1, 11)
	k_dis_datasets = []
	k_dis_preimages = []
	preimages = []
	bests_from_dataset = []
	for alpha1 in alpha1_list:
		print('alpha1 =', alpha1, ':\n')
		# set parameters.
		ds_name = 'MUTAG'
		rpg_options = {'k': 5,
					   'r_max': 10, #
					   'l': 500,
					   'alphas': None,
					   'parallel': True,
					   'verbose': 2}
		kernel_options = {'name': 'PathUpToH',
						  'depth': 2, #
						  'k_func': 'MinMax', #
						  'compute_method': 'trie',
						  'parallel': 'imap_unordered',
						  # 'parallel': None,
						  'n_jobs': multiprocessing.cpu_count(),
						  'normalize': True,
						  'verbose': 0}
		edge_required = True
		irrelevant_labels = {'edge_labels': ['label_0']}
		cut_range = None

		# create/get Gram matrix.
		dir_save = dir_root + ds_name + '.' + kernel_options['name'] + '/'
		# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
		os.makedirs(dir_save, exist_ok=True)
		gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
		gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
		if gmfile_exist:
			gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe.
			gram_matrix_unnorm = gmfile['gram_matrix_unnorm']
			time_precompute_gm = gmfile['run_time']

		# 1. get dataset.
		print('1. getting dataset...')
		dataset_all = Dataset()
		dataset_all.load_predefined_dataset(ds_name)
		dataset_all.trim_dataset(edge_required=edge_required)
		if irrelevant_labels is not None:
			dataset_all.remove_labels(**irrelevant_labels)
		if cut_range is not None:
			dataset_all.cut_graphs(cut_range)

		# add two "random" graphs.
		g1 = nx.Graph()
		g1.add_nodes_from(range(0, 16), label_0='0')
		g1.add_nodes_from(range(16, 25), label_0='1')
		g1.add_node(25, label_0='2')
		g1.add_nodes_from([26, 27], label_0='3')
		g1.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (5, 0), (4, 9), (12, 3), (10, 13), (13, 14), (14, 15), (15, 8), (0, 16), (1, 17), (2, 18), (12, 19), (11, 20), (13, 21), (15, 22), (7, 23), (6, 24), (14, 25), (25, 26), (25, 27)])
		g2 = nx.Graph()
		g2.add_nodes_from(range(0, 12), label_0='0')
		g2.add_nodes_from(range(12, 19), label_0='1')
		g2.add_nodes_from([19, 20, 21], label_0='2')
		g2.add_nodes_from([22, 23], label_0='3')
		g2.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 19), (19, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 20), (20, 7), (5, 0), (4, 8), (0, 12), (1, 13), (2, 14), (9, 15), (10, 16), (11, 17), (6, 18), (3, 21), (21, 22), (21, 23)])
		# g1 and g2 go first so alphas [alpha1, 1 - alpha1, 0, 0, ...] weight them.
		dataset_all.load_graphs([g1, g2] + dataset_all.graphs, targets=None)

		# 2. initialize rpg and setting parameters.
		print('2. initializing rpg and setting parameters...')
		nb_graphs = len(dataset_all.graphs) - 2
		rpg_options['alphas'] = [alpha1, 1 - alpha1] + [0] * nb_graphs
		if gmfile_exist:
			rpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm
			rpg_options['runtime_precompute_gm'] = time_precompute_gm
		rpg = RandomPreimageGenerator()
		rpg.dataset = dataset_all
		# **-unpacking already passes a fresh shallow kwargs dict to
		# set_options; an explicit .copy() would be redundant.
		rpg.set_options(**rpg_options)
		rpg.kernel_options = kernel_options.copy()

		# 3. compute preimage.
		print('3. computing preimage...')
		rpg.run()
		results = rpg.get_results()
		k_dis_datasets.append(results['k_dis_dataset'])
		k_dis_preimages.append(results['k_dis_preimage'])
		bests_from_dataset.append(rpg.best_from_dataset)
		preimages.append(rpg.preimage)

		# 4. save results.
		# write Gram matrices to file.
		if not gmfile_exist:
			np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm=rpg.gram_matrix_unnorm, run_time=results['runtime_precompute_gm'])

	print('\ncomplete.\n')

	return k_dis_datasets, k_dis_preimages, bests_from_dataset, preimages


if __name__ == '__main__':
	k_dis_datasets, k_dis_preimages, bests_from_dataset, preimages = xp_random_preimage_generation()
print out results. if self._verbose: @@ -169,11 +162,6 @@ class MedianPreimageGenerator(PreimageGenerator): print('Is optimization of edit costs converged:', self.__converged) print('================================================================================') print() - - # collect return values. -# return (sod_sm, sod_gm), \ -# (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \ -# (time_fitting, time_generating) def get_results(self): @@ -861,7 +849,15 @@ class MedianPreimageGenerator(PreimageGenerator): print() - def __generate_preimage_iam(self): + def __gmg_bcu(self): + """ + The local search algorithm based on block coordinate update (BCU) for estimating a generalized median graph (GMG). + + Returns + ------- + None. + + """ # Set up the ged environment. ged_env = gedlibpy.GEDEnv() # @todo: maybe create a ged_env as a private varible. # gedlibpy.restart_env() @@ -917,10 +913,6 @@ class MedianPreimageGenerator(PreimageGenerator): self.__k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)), [1 / len(self._dataset.graphs)] * len(self._dataset.graphs), gram_with_sm, withterm3=False) - # print(gen_median.nodes(data=True)) - # print(gen_median.edges(data=True)) - # print(set_median.nodes(data=True)) - # print(set_median.edges(data=True)) # compute distance in kernel space for generalized median. 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 29 14:29:52 2020

@author: ljia
"""

import numpy as np
import time
import random
import sys
import tqdm
import multiprocessing
import networkx as nx
from gklearn.preimage import PreimageGenerator
from gklearn.preimage.utils import compute_k_dis
from gklearn.utils import Timer
from gklearn.utils.utils import get_graph_kernel_by_name
# from gklearn.utils.dataset import Dataset

class RandomPreimageGenerator(PreimageGenerator):
	"""Random pre-image generator introduced in Bakir's paper.

	Estimates the pre-image of a point in kernel space — given as a linear
	combination (weights ``alphas``) of the dataset graphs — by randomly
	toggling edges of the nearest graphs in the dataset and keeping any
	candidate that reduces the distance in kernel space.
	"""

	def __init__(self, dataset=None):
		PreimageGenerator.__init__(self, dataset=dataset)
		# arguments to set.
		self.__k = 5 # number of nearest neighbors of phi in D_N.
		self.__r_max = 10 # maximum number of iterations.
		self.__l = 500 # numbers of graphs generated for each graph in D_k U {g_i_hat}.
		self.__alphas = None # weights of linear combinations of points in kernel space.
		self.__parallel = True
		self.__n_jobs = multiprocessing.cpu_count()
		self.__time_limit_in_sec = 0 # @todo
		self.__max_itrs = 100 # @todo
		# Fix: this attribute was read by __termination_criterion_met but
		# never assigned, which raised AttributeError when that method ran.
		self.__max_itrs_without_update = 3 # @todo
		# values to compute.
		self.__runtime_generate_preimage = None
		self.__runtime_total = None
		self.__preimage = None
		self.__best_from_dataset = None
		self.__k_dis_preimage = None
		self.__k_dis_dataset = None
		self.__itrs = 0
		self.__converged = False # @todo
		self.__num_updates = 0
		# values that can be set or to be computed.
		self.__gram_matrix_unnorm = None
		self.__runtime_precompute_gm = None


	def set_options(self, **kwargs):
		"""Set generator options from keyword arguments; missing keys fall back to defaults."""
		self._kernel_options = kwargs.get('kernel_options', {})
		self._graph_kernel = kwargs.get('graph_kernel', None)
		self._verbose = kwargs.get('verbose', 2)
		self.__k = kwargs.get('k', 5)
		self.__r_max = kwargs.get('r_max', 10)
		self.__l = kwargs.get('l', 500)
		self.__alphas = kwargs.get('alphas', None)
		self.__parallel = kwargs.get('parallel', True)
		self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
		self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
		self.__max_itrs = kwargs.get('max_itrs', 100)
		self.__max_itrs_without_update = kwargs.get('max_itrs_without_update', 3)
		self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
		self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)


	def run(self):
		"""Run the random pre-image generation.

		Steps: (1) compute or reuse the (unnormalized) Gram matrix;
		(2) find the k nearest neighbors of the target point phi in D_N;
		(3) iteratively perturb edges of the nearest graphs, keeping the
		best candidate, until r_max rounds pass without improvement.
		Results are exposed through get_results() and the properties.
		"""
		self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'],
						  node_labels=self._dataset.node_labels,
						  edge_labels=self._dataset.edge_labels,
						  node_attrs=self._dataset.node_attrs,
						  edge_attrs=self._dataset.edge_attrs,
						  ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
						  kernel_options=self._kernel_options)

		# record start time.
		start = time.time()

		# 1. precompute gram matrix.
		if self.__gram_matrix_unnorm is None:
			gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
			self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm
			end_precompute_gm = time.time()
			self.__runtime_precompute_gm = end_precompute_gm - start
		else:
			if self.__runtime_precompute_gm is None:
				raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.')
			self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm
			if self._kernel_options['normalize']:
				self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
			else:
				self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm)
			end_precompute_gm = time.time()
			# shift start back so that total runtime includes the (reused)
			# precomputation time.
			start -= self.__runtime_precompute_gm

		# 2. compute k nearest neighbors of phi in D_N.
		if self._verbose >= 2:
			print('\nstart computing k nearest neighbors of phi in D_N...\n')
		D_N = self._dataset.graphs
		if self.__alphas is None:
			self.__alphas = [1 / len(D_N)] * len(D_N)
		k_dis_list = [] # distance between g_star and each graph.
		# term3 = sum_ij alpha_i * alpha_j * K(g_i, g_j), constant over all
		# candidates, so it is computed once here.
		term3 = 0
		for i1, a1 in enumerate(self.__alphas):
			for i2, a2 in enumerate(self.__alphas):
				term3 += a1 * a2 * self._graph_kernel.gram_matrix[i1, i2]
		for idx in range(len(D_N)):
			k_dis_list.append(compute_k_dis(idx, range(0, len(D_N)), self.__alphas, self._graph_kernel.gram_matrix, term3=term3, withterm3=True))

		# sort.
		sort_idx = np.argsort(k_dis_list)
		dis_gs = [k_dis_list[idis] for idis in sort_idx[0:self.__k]] # the k shortest distances.
		nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
		g0hat_list = [D_N[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in D_N
		self.__best_from_dataset = g0hat_list[0] # get the first best graph if there are multiple.
		self.__k_dis_dataset = dis_gs[0]

		if self.__k_dis_dataset == 0: # get the exact pre-image.
			end_generate_preimage = time.time()
			self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm
			self.__runtime_total = end_generate_preimage - start
			self.__preimage = self.__best_from_dataset.copy()
			self.__k_dis_preimage = self.__k_dis_dataset
			if self._verbose:
				print()
				print('=============================================================================')
				print('The exact pre-image is found from the input dataset.')
				print('-----------------------------------------------------------------------------')
				print('Distance in kernel space for the best graph from dataset and for preimage:', self.__k_dis_dataset)
				print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
				print('Time to generate pre-images:', self.__runtime_generate_preimage)
				print('Total time:', self.__runtime_total)
				print('=============================================================================')
				print()
			return

		dhat = dis_gs[0] # the nearest distance
		Gk = [D_N[ig].copy() for ig in sort_idx[0:self.__k]] # the k nearest neighbors
		Gs_nearest = [nx.convert_node_labels_to_integers(g) for g in Gk] # [g.copy() for g in Gk]

		# 3. start iterations.
		if self._verbose >= 2:
			print('starting iterations...')
		gihat_list = []
		dihat_list = []
		r = 0
		dis_of_each_itr = [dhat]
		while r < self.__r_max:
			print('\n- r =', r)
			found = False
			dis_bests = dis_gs + dihat_list

			# compute numbers of nodes to be inserted/deleted.
			# @todo what if the log is negative? how to choose alpha (scalar)?
			# NOTE(review): if any distance in dis_bests reaches 0 during the
			# search, the division below raises ZeroDivisionError — confirm
			# whether that can happen after the exact-pre-image early return.
			fdgs_list = np.array(dis_bests)
			if np.min(fdgs_list) < 1:
				fdgs_list /= np.min(dis_bests)
			fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))]
			if np.min(fdgs_list) < 1:
				fdgs_list = np.array(fdgs_list) + 1

			for ig, gs in enumerate(Gs_nearest + gihat_list):
				if self._verbose >= 2:
					print('-- computing', ig + 1, 'graphs out of', len(Gs_nearest) + len(gihat_list))
				for trial in range(0, self.__l):
					if self._verbose >= 2:
						print('---', trial + 1, 'trial out of', self.__l)

					# add and delete edges.
					gtemp = gs.copy()
					np.random.seed() # @todo: may not work for possible parallel.
					# which edges to change.
					# @todo: should we use just half of the adjacency matrix for undirected graphs?
					nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
					# @todo: what if fdgs is bigger than nb_vpairs?
					idx_change = random.sample(range(nb_vpairs), fdgs_list[ig] if
								   fdgs_list[ig] < nb_vpairs else nb_vpairs)
					for item in idx_change:
						# decode the flat vertex-pair index into (node1, node2).
						node1 = int(item / (nx.number_of_nodes(gs) - 1))
						node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
						if node2 >= node1: # skip the self pair.
							node2 += 1
						# @todo: is the randomness correct?
						# toggle the edge: add if absent, remove if present.
						if not gtemp.has_edge(node1, node2):
							gtemp.add_edge(node1, node2)
						else:
							gtemp.remove_edge(node1, node2)

					# compute new distances.
					kernels_to_gtmp, _ = self._graph_kernel.compute(gtemp, D_N, **self._kernel_options)
					kernel_gtmp, _ = self._graph_kernel.compute(gtemp, gtemp, **self._kernel_options)
					kernels_to_gtmp = [kernels_to_gtmp[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gtmp) for i in range(len(kernels_to_gtmp))] # normalize
					# @todo: not correct kernel value
					gram_with_gtmp = np.concatenate((np.array([kernels_to_gtmp]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
					gram_with_gtmp = np.concatenate((np.array([[1] + kernels_to_gtmp]).T, gram_with_gtmp), axis=1)
					dnew = compute_k_dis(0, range(1, 1 + len(D_N)), self.__alphas, gram_with_gtmp, term3=term3, withterm3=True)

					# get the better graph preimage.
					if dnew <= dhat: # @todo: the new distance is smaller or also equal?
						if dnew < dhat:
							if self._verbose >= 2:
								print('trial =', str(trial))
								print('\nI am smaller!')
								print('index (as in D_k U {gihat} =', str(ig))
								print('distance:', dhat, '->', dnew)
							self.__num_updates += 1
						elif dnew == dhat:
							if self._verbose >= 2:
								print('I am equal!')
						dhat = dnew
						gnew = gtemp.copy()
						found = True # found better graph.

			if found:
				r = 0
				gihat_list = [gnew]
				dihat_list = [dhat]
			else:
				r += 1

			dis_of_each_itr.append(dhat)
			self.__itrs += 1
			if self._verbose >= 2:
				print('Total number of iterations is', self.__itrs)
				print('The preimage is updated', self.__num_updates, 'times.')
				print('The shortest distances for previous iterations are', dis_of_each_itr)

		# get results and print.
		end_generate_preimage = time.time()
		self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm
		self.__runtime_total = end_generate_preimage - start
		self.__preimage = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0])
		self.__k_dis_preimage = dhat
		if self._verbose:
			print()
			print('=============================================================================')
			print('Finished generalization of preimages.')
			print('-----------------------------------------------------------------------------')
			print('Distance in kernel space for the best graph from dataset:', self.__k_dis_dataset)
			print('Distance in kernel space for the preimage:', self.__k_dis_preimage)
			print('Total number of iterations for optimizing:', self.__itrs)
			print('Total number of updating preimage:', self.__num_updates)
			print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
			print('Time to generate pre-images:', self.__runtime_generate_preimage)
			print('Total time:', self.__runtime_total)
			print('=============================================================================')
			print()


	def get_results(self):
		"""Return a dict of runtimes, distances and iteration counters of the last run()."""
		results = {}
		results['runtime_precompute_gm'] = self.__runtime_precompute_gm
		results['runtime_generate_preimage'] = self.__runtime_generate_preimage
		results['runtime_total'] = self.__runtime_total
		results['k_dis_dataset'] = self.__k_dis_dataset
		results['k_dis_preimage'] = self.__k_dis_preimage
		results['itrs'] = self.__itrs
		results['num_updates'] = self.__num_updates
		return results


	def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
		# NOTE(review): currently not called by run(); kept for parity with
		# MedianPreimageGenerator. __max_itrs_without_update is now defined
		# in __init__, so this no longer raises AttributeError when used.
		if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
#			if self.__state == AlgorithmState.TERMINATED:
#				self.__state = AlgorithmState.INITIALIZED
			return True
		return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)


	@property
	def preimage(self):
		return self.__preimage


	@property
	def best_from_dataset(self):
		return self.__best_from_dataset


	@property
	def gram_matrix_unnorm(self):
		return self.__gram_matrix_unnorm

	@gram_matrix_unnorm.setter
	def gram_matrix_unnorm(self, value):
		self.__gram_matrix_unnorm = value