
1. Update fitDistance.py:

1.1 add parallel computation of GEDs (see the sketch after the fitDistance.py diff);
1.2 randomly initialize the edit costs instead of using a uniform initialization (see the sketch below);
1.3 remove the constraint that the edit costs must sum to 1, which pushed the fitted costs toward sparsity.
v0.1

jajupmochi committed 5 years ago
commit ee6c79603d
2 changed files with 114 additions and 32 deletions:
  1. preimage/fitDistance.py    +109 -26
  2. pygraph/utils/parallel.py  +5 -6
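
For items 1.2 and 1.3, a minimal sketch of the new initialization, mirroring the lines added to fit_GED_to_kernel_distance below; the cost order c_vi, c_vr, c_vs, c_ei, c_er, c_es comes from the comment in the file, and pinning the edge-substitution cost to 0 is the commit's choice:

    import random

    # Seeded draw, as in the commit: five distinct integer costs in [1, 9]
    # for node insertion/removal/substitution and edge insertion/removal,
    # with the edge-substitution cost pinned to 0.
    random.seed(1)
    cost_rdm = random.sample(range(1, 10), 5)
    edit_costs = cost_rdm + [0]
    print(edit_costs)  # five integers from [1, 9] followed by 0

Because the costs no longer have to sum to 1, the least-squares refit is free to scale them, avoiding the sparse solutions the old equality constraint tended to produce.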

preimage/fitDistance.py  +109 -26

@@ -7,6 +7,12 @@ Created on Wed Oct 16 14:20:06 2019
 """
 import numpy as np
 from tqdm import tqdm
+from itertools import combinations_with_replacement
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
+import time
+import random
 
 from scipy import optimize
 import cvxpy as cp
@@ -19,7 +25,12 @@ from utils import kernel_distance_matrix
 
 def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
     # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
-    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
+    random.seed(1)
+    cost_rdm = random.sample(range(1, 10), 5)
+    edit_costs = cost_rdm + [0]
+#    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
+#    edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
+#    edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
     idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
     # compute distances in feature space.
@@ -34,24 +45,13 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
     edit_cost_list = []
     for itr in range(itr_max):
-        print('iteration', itr)
-        ged_all = []
-        n_edit_operations = [[] for i in range(len(idx_nonzeros))]
+        print('\niteration', itr)
         # compute GEDs and numbers of edit operations.
         edit_cost_constant = [i for i in edit_costs]
         edit_cost_list.append(edit_cost_constant)
-        for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
-#        for i in range(len(Gn)):
-            for j in range(i, len(Gn)):
-                dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
-                    cost='CONSTANT', method='IPFP',
-                    edit_cost_constant=edit_cost_constant, stabilizer='min',
-                    repeat=30)
-                ged_all.append(dis)
-                n_eo_tmp = get_nb_edit_operations(Gn[i],
-                    Gn[j], pi_forward, pi_backward)
-                for idx, item in enumerate(idx_nonzeros):
-                    n_edit_operations[idx].append(n_eo_tmp[item])
+        ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
+            idx_nonzeros, parallel=True)
         residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
         residual_list.append(residual)
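
As an aside, the residual tracked per iteration is simply the Euclidean norm of the gap between the current GEDs and the kernel distances; a minimal equivalent, assuming ged_all and dis_k_vec are equal-length 1-D arrays:

    import numpy as np
    residual = np.linalg.norm(np.array(ged_all) - dis_k_vec)  # sqrt of the sum of squared gaps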
@@ -59,23 +59,105 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
         # "fit" geds to distances in feature space by tuning edit costs using the
         # Least Squares Method.
         nb_cost_mat = np.array(n_edit_operations).T
-        edit_costs_new, residual = get_better_costs(nb_cost_mat, dis_k_vec)
+        edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec)
 
-        print(residual)
+        print('pseudo residual:', residual)
         for i in range(len(edit_costs_new)):
             if edit_costs_new[i] < 0:
-                if edit_costs_new[i] > -1e-6:
+                if edit_costs_new[i] > -1e-9:
                     edit_costs_new[i] = 0
                 else:
                     raise ValueError('The edit cost is negative.')
         for idx, item in enumerate(idx_nonzeros):
             edit_costs[item] = edit_costs_new[idx]
         print('edit_costs:', edit_costs)
     print('residual_list:', residual_list)
-    return edit_costs, residual_list, edit_cost_list
+    return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat


+def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
+    ged_mat = np.zeros((len(Gn), len(Gn)))
+    if parallel:
+#        print('parallel')
+        len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
+        ged_all = [0 for i in range(len_itr)]
+        n_edit_operations = [[0 for i in range(len_itr)] for j in
+                             range(len(idx_nonzeros))]
+        itr = combinations_with_replacement(range(0, len(Gn)), 2)
+        n_jobs = multiprocessing.cpu_count()
+        if len_itr < 100 * n_jobs:
+            chunksize = int(len_itr / n_jobs) + 1
+        else:
+            chunksize = 100
+        def init_worker(gn_toshare):
+            global G_gn
+            G_gn = gn_toshare
+        do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
+                             idx_nonzeros)
+        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
+        iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
+                        desc='computing GEDs', file=sys.stdout)
+#        iterator = pool.imap_unordered(do_partial, itr, chunksize)
+        for i, j, dis, n_eo_tmp in iterator:
+            idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
+            ged_all[idx_itr] = dis
+            ged_mat[i][j] = dis
+            ged_mat[j][i] = dis
+            for idx, item in enumerate(idx_nonzeros):
+                n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
+#            print('\n-------------------------------------------')
+#            print(i, j, idx_itr, dis)
+        pool.close()
+        pool.join()
+    else:
+        ged_all = []
+        n_edit_operations = [[] for i in range(len(idx_nonzeros))]
+        for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
+#        for i in range(len(Gn)):
+            for j in range(i, len(Gn)):
+#                time0 = time.time()
+                dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
+                    cost='CONSTANT', method='IPFP',
+                    edit_cost_constant=edit_cost_constant, stabilizer='min',
+                    repeat=50)
+#                time1 = time.time() - time0
+#                time0 = time.time()
+                ged_all.append(dis)
+                ged_mat[i][j] = dis
+                ged_mat[j][i] = dis
+                n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
+                for idx, item in enumerate(idx_nonzeros):
+                    n_edit_operations[idx].append(n_eo_tmp[item])
+#                time2 = time.time() - time0
+#                print(time1, time2, time1 / time2)
+    return ged_all, ged_mat, n_edit_operations
+
+def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
+    i = itr[0]
+    j = itr[1]
+    dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
+                                          idx_nonzeros)
+    return i, j, dis, n_eo_tmp
+
+
+def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
+    dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
+        cost='CONSTANT', method='IPFP',
+        edit_cost_constant=edit_cost_constant, stabilizer='min',
+        repeat=50)
+    n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
+    return dis, n_eo_tmp
+
+
-def get_better_costs(nb_cost_mat, dis_k_vec):
+def compute_better_costs(nb_cost_mat, dis_k_vec):
 #    # method 1: simple least square method.
 #    edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
 #                                                     rcond=None)
@@ -92,13 +174,11 @@ def get_better_costs(nb_cost_mat, dis_k_vec):
     b = 1
     x = cp.Variable(nb_cost_mat.shape[1])
     prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
-                      [G@x <= h,
-                       A@x == b])
+                      [G@x <= h])
     prob.solve()
     edit_costs_new = x.value
     residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
-#    p = program(minimize(norm2(nb_cost_mat*x-dis_k_vec)),[equals(sum(x),1),geq(x,0)])
     return edit_costs_new, residual
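
The change above drops the equality constraint A@x == b, the old sum-to-one requirement, and keeps only G@x <= h. P, q_T, G and h are built earlier in the file, outside the shown hunk, so the following self-contained sketch reconstructs them under the usual least-squares expansion ||N x - d||^2 = x^T N^T N x - 2 d^T N x + d^T d and takes the remaining inequality to be plain non-negativity; both are assumptions, not lines from the commit:

    import numpy as np
    import cvxpy as cp

    def fit_costs_qp(nb_cost_mat, dis_k_vec):
        # Fit edit costs x by least squares: minimize ||N x - d||^2 s.t. x >= 0.
        N = np.asarray(nb_cost_mat, dtype=float)
        d = np.asarray(dis_k_vec, dtype=float)
        P = N.T @ N        # quadratic term of the expanded objective
        q = -2 * N.T @ d   # linear term; the constant d @ d is left out
        x = cp.Variable(N.shape[1])
        # Only non-negativity remains; the former sum-to-one equality is gone.
        prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q @ x), [x >= 0])
        prob.solve()
        # prob.value omits the constant term; adding it back gives the true
        # squared residual ||N x - d||^2.
        return x.value, prob.value + d @ d

Note that the commit subtracts np.dot(dis_k_vec.T, dis_k_vec) instead of adding it, which is presumably why the caller prints the result as a 'pseudo residual'.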


@@ -111,5 +191,8 @@ if __name__ == '__main__':
     remove_edges(Gn)
     gkernel = 'marginalizedkernel'
     itr_max = 10
-    edit_costs, residual_list, edit_cost_list = \
-        fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
+    time0 = time.time()
+    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat = \
+        fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
+    total_time = time.time() - time0
+    print('total time:', total_time)
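
The parallel branch of compute_geds above combines two ideas: a Pool initializer that stores the graph list in a worker-global, so each worker receives it once instead of pickling graphs into every task, and the closed-form index n*i + j - i*(i + 1)/2, which maps each pair (i, j) with i <= j to its rank in combinations_with_replacement order so unordered results can be written into a pre-sized flat list. A stripped-down sketch of the same pattern; _pair_cost and the float data are purely illustrative stand-ins for the GED call and the graph list:

    import multiprocessing
    from functools import partial
    from itertools import combinations_with_replacement

    def _init_worker(data_to_share):
        # Runs once per worker process; the shared sequence is exposed via a
        # module-level global instead of being pickled into every task.
        global _G_data
        _G_data = data_to_share

    def _pair_cost(scale, pair):
        # Illustrative stand-in for the GED call; reads the worker-global data.
        i, j = pair
        return i, j, scale * abs(_G_data[i] - _G_data[j])

    def compute_all_pairs(data, scale=1.0, n_jobs=None):
        n = len(data)
        len_itr = n * (n + 1) // 2          # number of pairs (i, j) with i <= j
        results = [0.0] * len_itr
        itr = combinations_with_replacement(range(n), 2)
        n_jobs = n_jobs or multiprocessing.cpu_count()
        # Same chunksize heuristic as the diff: split evenly when the workload
        # is small, cap chunks at 100 tasks otherwise.
        chunksize = len_itr // n_jobs + 1 if len_itr < 100 * n_jobs else 100
        pool = multiprocessing.Pool(processes=n_jobs, initializer=_init_worker,
                                    initargs=(data,))
        for i, j, cost in pool.imap_unordered(partial(_pair_cost, scale), itr,
                                              chunksize):
            # Closed-form rank of (i, j) in combinations_with_replacement
            # order, so unordered results land at the right slot.
            idx = n * i + j - i * (i + 1) // 2
            results[idx] = cost
        pool.close()
        pool.join()
        return results

    if __name__ == '__main__':
        print(compute_all_pairs([3.0, 1.0, 4.0]))

Even under spawn-based start methods, the initializer ships the data once per worker process rather than once per task, which is where the savings over passing graphs through imap_unordered come from.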

pygraph/utils/parallel.py  +5 -6

@@ -20,11 +20,10 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
 #    def init_worker(v_share):
 #        global G_var
 #        G_var = v_share
+        if n_jobs == None:
+            n_jobs = multiprocessing.cpu_count()
         with Pool(processes=n_jobs, initializer=init_worker,
-                  initargs=glbv) as pool:
-            if n_jobs == None:
-                n_jobs = multiprocessing.cpu_count()
+                  initargs=glbv) as pool:
             if chunksize == None:
                 if len_itr < 100 * n_jobs:
                     chunksize = int(len_itr / n_jobs) + 1
@@ -35,9 +34,9 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
                 pool.imap_unordered(func, itr, chunksize)):
             func_assign(result, var_to_assign)
     else:
+        if n_jobs == None:
+            n_jobs = multiprocessing.cpu_count()
         with Pool(processes=n_jobs) as pool:
-            if n_jobs == None:
-                n_jobs = multiprocessing.cpu_count()
             if chunksize == None:
                 if len_itr < 100 * n_jobs:
                     chunksize = int(len_itr / n_jobs) + 1
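
Both parallel.py hunks are the same ordering fix: n_jobs was previously given its default inside the with-block, after it had already been passed to Pool(processes=n_jobs). Pool happens to treat None as cpu_count(), so the old code still ran, but resolving the default before its first use guarantees that the pool size and the chunksize arithmetic see the same value. A minimal sketch of the corrected shape; run_parallel is a hypothetical reduction of parallel_me, keeping only the branch without an initializer:

    import multiprocessing
    from multiprocessing import Pool

    def run_parallel(func, itr, len_itr, n_jobs=None, chunksize=None):
        # Resolve the worker-count default *before* it reaches Pool, so the
        # same value drives the pool size and the chunksize heuristic below.
        if n_jobs is None:
            n_jobs = multiprocessing.cpu_count()
        with Pool(processes=n_jobs) as pool:
            if chunksize is None:
                # Even split for small workloads, chunks of 100 tasks otherwise.
                chunksize = len_itr // n_jobs + 1 if len_itr < 100 * n_jobs else 100
            return list(pool.imap_unordered(func, itr, chunksize))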

