
1. update fitDistance.py.

1.1 add parallel computation of GEDs (the worker-sharing pattern is sketched below the file summary).
1.2 randomly initialize edit costs instead of using uniform initialization.
1.3 remove the constraint that the sum of the edit costs has to equal 1, avoiding sparsity.
v0.1
jajupmochi 5 years ago
parent commit ee6c79603d

2 changed files with 114 additions and 32 deletions:
  1. preimage/fitDistance.py    +109 −26
  2. pygraph/utils/parallel.py  +5 −6
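The parallel GED computation added here (item 1.1) hands the graph list to worker processes through a Pool initializer that stores it in a module-level global, so each task ships only an index pair instead of pickled graphs. A minimal, self-contained sketch of that pattern, with a toy distance standing in for the real GED call (the numbers and the _pairwise helper are illustrative only):

import multiprocessing
from itertools import combinations_with_replacement
from functools import partial
from multiprocessing import Pool

def init_worker(gn_toshare):
    # Bind the shared list once per worker; tasks then only carry (i, j).
    global G_gn
    G_gn = gn_toshare

def _pairwise(weight, pair):
    # Placeholder for the real GED(G_gn[i], G_gn[j], ...) call.
    i, j = pair
    return i, j, weight * abs(G_gn[i] - G_gn[j])

if __name__ == '__main__':
    gn = [1.0, 2.5, 4.0]  # stand-in for a list of graphs
    pairs = combinations_with_replacement(range(len(gn)), 2)
    do_partial = partial(_pairwise, 0.5)
    with Pool(processes=multiprocessing.cpu_count(),
              initializer=init_worker, initargs=(gn,)) as pool:
        for i, j, dis in pool.imap_unordered(do_partial, pairs):
            print(i, j, dis)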

preimage/fitDistance.py  (+109 −26)

@@ -7,6 +7,12 @@ Created on Wed Oct 16 14:20:06 2019
 """
 import numpy as np
 from tqdm import tqdm
+from itertools import combinations_with_replacement
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
+import time
+import random
 
 from scipy import optimize
 import cvxpy as cp
@@ -19,7 +25,12 @@ from utils import kernel_distance_matrix
 
 
 def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
     # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
-    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
+    random.seed(1)
+    cost_rdm = random.sample(range(1, 10), 5)
+    edit_costs = cost_rdm + [0]
+#    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
+#    edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
+#    edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
     idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
     # compute distances in feature space.
@@ -34,24 +45,13 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
     edit_cost_list = []
     for itr in range(itr_max):
-        print('iteration', itr)
-        ged_all = []
-        n_edit_operations = [[] for i in range(len(idx_nonzeros))]
+        print('\niteration', itr)
         # compute GEDs and numbers of edit operations.
         edit_cost_constant = [i for i in edit_costs]
         edit_cost_list.append(edit_cost_constant)
-        for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
-#        for i in range(len(Gn)):
-            for j in range(i, len(Gn)):
-                dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
-                    cost='CONSTANT', method='IPFP',
-                    edit_cost_constant=edit_cost_constant, stabilizer='min',
-                    repeat=30)
-                ged_all.append(dis)
-                n_eo_tmp = get_nb_edit_operations(Gn[i],
-                    Gn[j], pi_forward, pi_backward)
-                for idx, item in enumerate(idx_nonzeros):
-                    n_edit_operations[idx].append(n_eo_tmp[item])
+        ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
+                                                           idx_nonzeros, parallel=True)
 
         residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
         residual_list.append(residual)
@@ -59,23 +59,105 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
         # "fit" geds to distances in feature space by tuning edit costs using the
         # Least Squares Method.
         nb_cost_mat = np.array(n_edit_operations).T
-        edit_costs_new, residual = get_better_costs(nb_cost_mat, dis_k_vec)
+        edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec)
 
-        print(residual)
+        print('pseudo residual:', residual)
         for i in range(len(edit_costs_new)):
             if edit_costs_new[i] < 0:
-                if edit_costs_new[i] > -1e-6:
+                if edit_costs_new[i] > -1e-9:
                     edit_costs_new[i] = 0
                 else:
                     raise ValueError('The edit cost is negative.')
 
         for idx, item in enumerate(idx_nonzeros):
             edit_costs[item] = edit_costs_new[idx]
+        print('edit_costs:', edit_costs)
+        print('residual_list:', residual_list)
 
-    return edit_costs, residual_list, edit_cost_list
+    return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat


+def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
+    ged_mat = np.zeros((len(Gn), len(Gn)))
+    if parallel:
+#        print('parallel')
+        len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
+        ged_all = [0 for i in range(len_itr)]
+        n_edit_operations = [[0 for i in range(len_itr)] for j in
+                             range(len(idx_nonzeros))]
+        itr = combinations_with_replacement(range(0, len(Gn)), 2)
+        n_jobs = multiprocessing.cpu_count()
+        if len_itr < 100 * n_jobs:
+            chunksize = int(len_itr / n_jobs) + 1
+        else:
+            chunksize = 100
+        def init_worker(gn_toshare):
+            global G_gn
+            G_gn = gn_toshare
+        do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
+                             idx_nonzeros)
+        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
+        iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
+                        desc='computing GEDs', file=sys.stdout)
+#        iterator = pool.imap_unordered(do_partial, itr, chunksize)
+        for i, j, dis, n_eo_tmp in iterator:
+            idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
+            ged_all[idx_itr] = dis
+            ged_mat[i][j] = dis
+            ged_mat[j][i] = dis
+            for idx, item in enumerate(idx_nonzeros):
+                n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
+#            print('\n-------------------------------------------')
+#            print(i, j, idx_itr, dis)
+        pool.close()
+        pool.join()
+    else:
+        ged_all = []
+        n_edit_operations = [[] for i in range(len(idx_nonzeros))]
+        for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
+#        for i in range(len(Gn)):
+            for j in range(i, len(Gn)):
+#                time0 = time.time()
+                dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
+                    cost='CONSTANT', method='IPFP',
+                    edit_cost_constant=edit_cost_constant, stabilizer='min',
+                    repeat=50)
+#                time1 = time.time() - time0
+#                time0 = time.time()
+                ged_all.append(dis)
+                ged_mat[i][j] = dis
+                ged_mat[j][i] = dis
+                n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
+                for idx, item in enumerate(idx_nonzeros):
+                    n_edit_operations[idx].append(n_eo_tmp[item])
+#                time2 = time.time() - time0
+#                print(time1, time2, time1 / time2)
+    return ged_all, ged_mat, n_edit_operations

+def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
+    i = itr[0]
+    j = itr[1]
+    dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
+                                          idx_nonzeros)
+    return i, j, dis, n_eo_tmp
+
+
+def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
+    dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
+        cost='CONSTANT', method='IPFP',
+        edit_cost_constant=edit_cost_constant, stabilizer='min',
+        repeat=50)
+    n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
+    return dis, n_eo_tmp




-def get_better_costs(nb_cost_mat, dis_k_vec):
+def compute_better_costs(nb_cost_mat, dis_k_vec):
 #    # method 1: simple least square method.
 #    edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
 #                                                     rcond=None)
@@ -92,13 +174,11 @@ def get_better_costs(nb_cost_mat, dis_k_vec):
     b = 1
     x = cp.Variable(nb_cost_mat.shape[1])
     prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
-                      [G@x <= h,
-                       A@x == b])
+                      [G@x <= h])
     prob.solve()
     edit_costs_new = x.value
     residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
-#    p = program(minimize(norm2(nb_cost_mat*x-dis_k_vec)),[equals(sum(x),1),geq(x,0)])
     return edit_costs_new, residual




@@ -111,5 +191,8 @@ if __name__ == '__main__':
     remove_edges(Gn)
     gkernel = 'marginalizedkernel'
     itr_max = 10
-    edit_costs, residual_list, edit_cost_list = \
-        fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
+    time0 = time.time()
+    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat = \
+        fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
+    total_time = time.time() - time0
+    print('total time:', total_time)
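Two details of the new fitDistance.py code are worth unpacking. First, the flattened pair index in the parallel branch of compute_geds, idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2), is the position of the pair (i, j) (with j >= i) in combinations_with_replacement(range(n), 2): rows 0..i-1 contribute n*i - i*(i-1)/2 earlier pairs, and j - i is the offset within row i. A quick standalone check of the formula:

from itertools import combinations_with_replacement

n = 6
for expected, (i, j) in enumerate(combinations_with_replacement(range(n), 2)):
    # Same formula as in compute_geds: position of (i, j) in the flattened
    # upper triangle (including the diagonal).
    idx_itr = int(n * i + j - i * (i + 1) / 2)
    assert idx_itr == expected
print('index formula matches for all', n * (n + 1) // 2, 'pairs')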
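Second, compute_better_costs fits the edit costs with a non-negativity-constrained least-squares QP, and per item 1.3 the former A@x == b (sum-to-1) equality is dropped. The hunk does not show how P, q_T, G, and h are built, so the sketch below reconstructs plausible forms by expanding ||N x - d||^2; treat those definitions as assumptions rather than the repository's exact code:

import numpy as np
import cvxpy as cp

def fit_costs_qp(nb_cost_mat, dis_k_vec):
    """Hedged reconstruction: minimize ||N x - d||^2 subject to x >= 0."""
    N = np.asarray(nb_cost_mat, dtype=float)
    d = np.asarray(dis_k_vec, dtype=float)
    # Expand ||N x - d||^2 = x^T (N^T N) x - 2 d^T N x + d^T d;
    # the constant d^T d is left out of the objective.
    P = N.T @ N                # assumed form, not shown in the hunk
    q_T = -2 * d.T @ N         # assumed form, not shown in the hunk
    G = -np.eye(N.shape[1])    # G @ x <= h encodes x >= 0
    h = np.zeros(N.shape[1])
    x = cp.Variable(N.shape[1])
    # No A @ x == b (sum-to-1) constraint anymore, per item 1.3.
    prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T @ x), [G @ x <= h])
    prob.solve()
    return x.value, prob.value + d @ d  # second value: fitted ||N x - d||^2

Because QP solvers satisfy x >= 0 only to numerical tolerance, the returned costs can come back very slightly negative, which is what the clamp in the calling loop (tightened from -1e-6 to -1e-9 in this commit) accounts for.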

pygraph/utils/parallel.py  (+5 −6)

@@ -20,11 +20,10 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
 #    def init_worker(v_share):
 #        global G_var
 #        G_var = v_share
+        if n_jobs == None:
+            n_jobs = multiprocessing.cpu_count()
         with Pool(processes=n_jobs, initializer=init_worker,
                   initargs=glbv) as pool:
-            if n_jobs == None:
-                n_jobs = multiprocessing.cpu_count()
             if chunksize == None:
                 if len_itr < 100 * n_jobs:
                     chunksize = int(len_itr / n_jobs) + 1
@@ -35,9 +34,9 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
                     pool.imap_unordered(func, itr, chunksize)):
                 func_assign(result, var_to_assign)
     else:
+        if n_jobs == None:
+            n_jobs = multiprocessing.cpu_count()
         with Pool(processes=n_jobs) as pool:
-            if n_jobs == None:
-                n_jobs = multiprocessing.cpu_count()
             if chunksize == None:
                 if len_itr < 100 * n_jobs:
                     chunksize = int(len_itr / n_jobs) + 1
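The parallel.py change is a bug fix in ordering: n_jobs was resolved to cpu_count() only inside the with Pool(...) block, after it had already been passed to Pool(processes=n_jobs), so the chunksize heuristic len_itr < 100 * n_jobs could compare an int against None and raise a TypeError. Resolving the default before constructing the pool fixes both uses. A minimal illustration of the corrected order (the run_parallel name and its signature are illustrative, not the library's API):

import multiprocessing
from multiprocessing import Pool

def run_parallel(func, itr, len_itr, n_jobs=None, chunksize=None):
    # Resolve defaults *before* they are used: Pool(processes=None) would
    # still work, but `100 * n_jobs` below would fail on None.
    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count()
    if chunksize is None:
        # Same heuristic as in parallel_me: small workloads get evenly
        # split chunks, large ones a fixed chunk of 100.
        chunksize = int(len_itr / n_jobs) + 1 if len_itr < 100 * n_jobs else 100
    with Pool(processes=n_jobs) as pool:
        return list(pool.imap_unordered(func, itr, chunksize))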

