@@ -7,6 +7,12 @@ Created on Wed Oct 16 14:20:06 2019
"""
import numpy as np
from tqdm import tqdm
from itertools import combinations_with_replacement
import multiprocessing
from multiprocessing import Pool
from functools import partial
import time
import random
from scipy import optimize
import cvxpy as cp
@@ -19,7 +25,12 @@ from utils import kernel_distance_matrix
def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
# c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
random.seed(1)
cost_rdm = random.sample(range(1, 10), 5)
edit_costs = cost_rdm + [0]
# edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
# edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
# edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# compute distances in feature space.
@@ -34,24 +45,13 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
edit_cost_list = []
for itr in range(itr_max):
print('iteration', itr)
ged_all = []
n_edit_operations = [[] for i in range(len(idx_nonzeros))]
print('\niteration', itr)
# compute GEDs and numbers of edit operations.
edit_cost_constant = [i for i in edit_costs]
edit_cost_list.append(edit_cost_constant)
for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
# for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
cost='CONSTANT', method='IPFP',
edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=30)
ged_all.append(dis)
n_eo_tmp = get_nb_edit_operations(Gn[i],
Gn[j], pi_forward, pi_backward)
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx].append(n_eo_tmp[item])
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
idx_nonzeros, parallel=True)
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
residual_list.append(residual)
@@ -59,23 +59,105 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
# "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method.
nb_cost_mat = np.array(n_edit_operations).T
edit_costs_new, residual = get _better_costs(nb_cost_mat, dis_k_vec)
edit_costs_new, residual = compute _better_costs(nb_cost_mat, dis_k_vec)
print(residual)
print('pseudo residual:', residual)
for i in range(len(edit_costs_new)):
if edit_costs_new[i] < 0:
if edit_costs_new[i] > -1e-6 :
if edit_costs_new[i] > -1e-9 :
edit_costs_new[i] = 0
else:
raise ValueError('The edit cost is negative.')
for idx, item in enumerate(idx_nonzeros):
edit_costs[item] = edit_costs_new[idx]
print('edit_costs:', edit_costs)
print('residual_list:', residual_list)
return edit_costs, residual_list, edit_cost_list
return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat
def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
ged_mat = np.zeros((len(Gn), len(Gn)))
if parallel:
# print('parallel')
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
ged_all = [0 for i in range(len_itr)]
n_edit_operations = [[0 for i in range(len_itr)] for j in
range(len(idx_nonzeros))]
itr = combinations_with_replacement(range(0, len(Gn)), 2)
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
idx_nonzeros)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
# iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, j, dis, n_eo_tmp in iterator:
idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
ged_all[idx_itr] = dis
ged_mat[i][j] = dis
ged_mat[j][i] = dis
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
# print('\n-------------------------------------------')
# print(i, j, idx_itr, dis)
pool.close()
pool.join()
else:
ged_all = []
n_edit_operations = [[] for i in range(len(idx_nonzeros))]
for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
# for i in range(len(Gn)):
for j in range(i, len(Gn)):
# time0 = time.time()
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
cost='CONSTANT', method='IPFP',
edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=50)
# time1 = time.time() - time0
# time0 = time.time()
ged_all.append(dis)
ged_mat[i][j] = dis
ged_mat[j][i] = dis
n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx].append(n_eo_tmp[item])
# time2 = time.time() - time0
# print(time1, time2, time1 / time2)
return ged_all, ged_mat, n_edit_operations
def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
i = itr[0]
j = itr[1]
dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
idx_nonzeros)
return i, j, dis, n_eo_tmp
def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
cost='CONSTANT', method='IPFP',
edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=50)
n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
return dis, n_eo_tmp
def get_better_costs(nb_cost_mat, dis_k_vec):
def compute _better_costs(nb_cost_mat, dis_k_vec):
# # method 1: simple least square method.
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
# rcond=None)
@@ -92,13 +174,11 @@ def get_better_costs(nb_cost_mat, dis_k_vec):
b = 1
x = cp.Variable(nb_cost_mat.shape[1])
prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
[G@x <= h,
A@x == b])
[G@x <= h])
prob.solve()
edit_costs_new = x.value
residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
# p = program(minimize(norm2(nb_cost_mat*x-dis_k_vec)),[equals(sum(x),1),geq(x,0)])
return edit_costs_new, residual
@@ -111,5 +191,8 @@ if __name__ == '__main__':
remove_edges(Gn)
gkernel = 'marginalizedkernel'
itr_max = 10
edit_costs, residual_list, edit_cost_list = \
fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
time0 = time.time()
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat = \
fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
total_time = time.time() - time0
print('total time:', total_time)