diff --git a/preimage/fitDistance.py b/preimage/fitDistance.py index d0d2357..42b9889 100644 --- a/preimage/fitDistance.py +++ b/preimage/fitDistance.py @@ -7,6 +7,12 @@ Created on Wed Oct 16 14:20:06 2019 """ import numpy as np from tqdm import tqdm +from itertools import combinations_with_replacement +import multiprocessing +from multiprocessing import Pool +from functools import partial +import time +import random from scipy import optimize import cvxpy as cp @@ -19,7 +25,12 @@ from utils import kernel_distance_matrix def fit_GED_to_kernel_distance(Gn, gkernel, itr_max): # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. - edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] + random.seed(1) + cost_rdm = random.sample(range(1, 10), 5) + edit_costs = cost_rdm + [0] +# edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] +# edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] +# edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0] idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0] # compute distances in feature space. @@ -34,24 +45,13 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max): edit_cost_list = [] for itr in range(itr_max): - print('iteration', itr) - ged_all = [] - n_edit_operations = [[] for i in range(len(idx_nonzeros))] + print('\niteration', itr) # compute GEDs and numbers of edit operations. edit_cost_constant = [i for i in edit_costs] edit_cost_list.append(edit_cost_constant) - for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): -# for i in range(len(Gn)): - for j in range(i, len(Gn)): - dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy', - cost='CONSTANT', method='IPFP', - edit_cost_constant=edit_cost_constant, stabilizer='min', - repeat=30) - ged_all.append(dis) - n_eo_tmp = get_nb_edit_operations(Gn[i], - Gn[j], pi_forward, pi_backward) - for idx, item in enumerate(idx_nonzeros): - n_edit_operations[idx].append(n_eo_tmp[item]) + + ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant, + idx_nonzeros, parallel=True) residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec))) residual_list.append(residual) @@ -59,23 +59,105 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max): # "fit" geds to distances in feature space by tuning edit costs using the # Least Squares Method. nb_cost_mat = np.array(n_edit_operations).T - edit_costs_new, residual = get_better_costs(nb_cost_mat, dis_k_vec) + edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec) - print(residual) + print('pseudo residual:', residual) for i in range(len(edit_costs_new)): if edit_costs_new[i] < 0: - if edit_costs_new[i] > -1e-6: + if edit_costs_new[i] > -1e-9: edit_costs_new[i] = 0 else: raise ValueError('The edit cost is negative.') for idx, item in enumerate(idx_nonzeros): edit_costs[item] = edit_costs_new[idx] + + print('edit_costs:', edit_costs) + print('residual_list:', residual_list) - return edit_costs, residual_list, edit_cost_list + return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat + + +def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False): + ged_mat = np.zeros((len(Gn), len(Gn))) + if parallel: +# print('parallel') + len_itr = int(len(Gn) * (len(Gn) + 1) / 2) + ged_all = [0 for i in range(len_itr)] + n_edit_operations = [[0 for i in range(len_itr)] for j in + range(len(idx_nonzeros))] + + itr = combinations_with_replacement(range(0, len(Gn)), 2) + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant, + idx_nonzeros) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) + iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), + desc='computing GEDs', file=sys.stdout) +# iterator = pool.imap_unordered(do_partial, itr, chunksize) + for i, j, dis, n_eo_tmp in iterator: + idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2) + ged_all[idx_itr] = dis + ged_mat[i][j] = dis + ged_mat[j][i] = dis + for idx, item in enumerate(idx_nonzeros): + n_edit_operations[idx][idx_itr] = n_eo_tmp[item] +# print('\n-------------------------------------------') +# print(i, j, idx_itr, dis) + pool.close() + pool.join() + + else: + ged_all = [] + n_edit_operations = [[] for i in range(len(idx_nonzeros))] + for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): +# for i in range(len(Gn)): + for j in range(i, len(Gn)): +# time0 = time.time() + dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy', + cost='CONSTANT', method='IPFP', + edit_cost_constant=edit_cost_constant, stabilizer='min', + repeat=50) +# time1 = time.time() - time0 +# time0 = time.time() + ged_all.append(dis) + ged_mat[i][j] = dis + ged_mat[j][i] = dis + n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward) + for idx, item in enumerate(idx_nonzeros): + n_edit_operations[idx].append(n_eo_tmp[item]) +# time2 = time.time() - time0 +# print(time1, time2, time1 / time2) + + return ged_all, ged_mat, n_edit_operations + + +def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr): + i = itr[0] + j = itr[1] + dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant, + idx_nonzeros) + return i, j, dis, n_eo_tmp + + +def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros): + dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy', + cost='CONSTANT', method='IPFP', + edit_cost_constant=edit_cost_constant, stabilizer='min', + repeat=50) + n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward) + + return dis, n_eo_tmp -def get_better_costs(nb_cost_mat, dis_k_vec): +def compute_better_costs(nb_cost_mat, dis_k_vec): # # method 1: simple least square method. # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, # rcond=None) @@ -92,13 +174,11 @@ def get_better_costs(nb_cost_mat, dis_k_vec): b = 1 x = cp.Variable(nb_cost_mat.shape[1]) prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x), - [G@x <= h, - A@x == b]) + [G@x <= h]) prob.solve() edit_costs_new = x.value residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec) -# p = program(minimize(norm2(nb_cost_mat*x-dis_k_vec)),[equals(sum(x),1),geq(x,0)]) return edit_costs_new, residual @@ -111,5 +191,8 @@ if __name__ == '__main__': remove_edges(Gn) gkernel = 'marginalizedkernel' itr_max = 10 - edit_costs, residual_list, edit_cost_list = \ - fit_GED_to_kernel_distance(Gn, gkernel, itr_max) \ No newline at end of file + time0 = time.time() + edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat = \ + fit_GED_to_kernel_distance(Gn, gkernel, itr_max) + total_time = time.time() - time0 + print('total time:', total_time) \ No newline at end of file diff --git a/pygraph/utils/parallel.py b/pygraph/utils/parallel.py index fdf2244..603164a 100644 --- a/pygraph/utils/parallel.py +++ b/pygraph/utils/parallel.py @@ -20,11 +20,10 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker # def init_worker(v_share): # global G_var # G_var = v_share - + if n_jobs == None: + n_jobs = multiprocessing.cpu_count() with Pool(processes=n_jobs, initializer=init_worker, - initargs=glbv) as pool: - if n_jobs == None: - n_jobs = multiprocessing.cpu_count() + initargs=glbv) as pool: if chunksize == None: if len_itr < 100 * n_jobs: chunksize = int(len_itr / n_jobs) + 1 @@ -35,9 +34,9 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker pool.imap_unordered(func, itr, chunksize)): func_assign(result, var_to_assign) else: + if n_jobs == None: + n_jobs = multiprocessing.cpu_count() with Pool(processes=n_jobs) as pool: - if n_jobs == None: - n_jobs = multiprocessing.cpu_count() if chunksize == None: if len_itr < 100 * n_jobs: chunksize = int(len_itr / n_jobs) + 1