Browse Source

Update fitDistance.py; constrain edit costs (c_i >= 0, sum(c_i) = 1).

v0.1
jajupmochi 5 years ago
parent
commit
2876f7520a
1 changed files with 53 additions and 41 deletions
  1. +53
    -41
      preimage/fitDistance.py

+ 53
- 41
preimage/fitDistance.py View File

@@ -8,6 +8,9 @@ Created on Wed Oct 16 14:20:06 2019
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm


from scipy import optimize
import cvxpy as cp

import sys import sys
sys.path.insert(0, "../") sys.path.insert(0, "../")
from pygraph.utils.graphfiles import loadDataset from pygraph.utils.graphfiles import loadDataset
@@ -15,12 +18,9 @@ from ged import GED, get_nb_edit_operations
from utils import kernel_distance_matrix from utils import kernel_distance_matrix


def fit_GED_to_kernel_distance(Gn, gkernel, itr_max): def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
c_vi = 1
c_vr = 1
c_vs = 1
c_ei = 1
c_er = 1
c_es = 1
# c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# compute distances in feature space. # compute distances in feature space.
dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, gkernel=gkernel) dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, gkernel=gkernel)
@@ -36,14 +36,9 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
for itr in range(itr_max): for itr in range(itr_max):
print('iteration', itr) print('iteration', itr)
ged_all = [] ged_all = []
n_vi_all = []
n_vr_all = []
n_vs_all = []
n_ei_all = []
n_er_all = []
n_es_all = []
n_edit_operations = [[] for i in range(len(idx_nonzeros))]
# compute GEDs and numbers of edit operations. # compute GEDs and numbers of edit operations.
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
edit_cost_constant = [i for i in edit_costs]
edit_cost_list.append(edit_cost_constant) edit_cost_list.append(edit_cost_constant)
for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
# for i in range(len(Gn)): # for i in range(len(Gn)):
@@ -53,41 +48,58 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
edit_cost_constant=edit_cost_constant, stabilizer='min', edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=30) repeat=30)
ged_all.append(dis) ged_all.append(dis)
n_vi, n_vr, n_vs, n_ei, n_er, n_es = get_nb_edit_operations(Gn[i],
n_eo_tmp = get_nb_edit_operations(Gn[i],
Gn[j], pi_forward, pi_backward) Gn[j], pi_forward, pi_backward)
n_vi_all.append(n_vi)
n_vr_all.append(n_vr)
n_vs_all.append(n_vs)
n_ei_all.append(n_ei)
n_er_all.append(n_er)
n_es_all.append(n_es)
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx].append(n_eo_tmp[item])
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec))) residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
residual_list.append(residual) residual_list.append(residual)
# "fit" geds to distances in feature space by tuning edit costs using the # "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method. # Least Squares Method.
nb_cost_mat = np.column_stack((np.array(n_vi_all), np.array(n_vr_all),
np.array(n_vs_all), np.array(n_ei_all),
np.array(n_er_all), np.array(n_es_all)))
edit_costs, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
rcond=None)
for i in range(len(edit_costs)):
if edit_costs[i] < 0:
if edit_costs[i] > -1e-3:
edit_costs[i] = 0
# else:
# raise ValueError('The edit cost is negative.')
c_vi = edit_costs[0]
c_vr = edit_costs[1]
c_vs = edit_costs[2]
c_ei = edit_costs[3]
c_er = edit_costs[4]
c_es = edit_costs[5]
nb_cost_mat = np.array(n_edit_operations).T
edit_costs_new, residual = get_better_costs(nb_cost_mat, dis_k_vec)

print(residual)
for i in range(len(edit_costs_new)):
if edit_costs_new[i] < 0:
if edit_costs_new[i] > -1e-6:
edit_costs_new[i] = 0
else:
raise ValueError('The edit cost is negative.')
for idx, item in enumerate(idx_nonzeros):
edit_costs[item] = edit_costs_new[idx]
return c_vi, c_vr, c_vs, c_ei, c_er, c_es, residual_list, edit_cost_list
return edit_costs, residual_list, edit_cost_list



def get_better_costs(nb_cost_mat, dis_k_vec):
    """Fit edit costs to target distances under simplex constraints.

    Solves the constrained least-squares problem

        minimize    || nb_cost_mat @ x - dis_k_vec ||^2
        subject to  x_i >= 0,  sum(x) = 1

    as a quadratic program with cvxpy.

    Parameters
    ----------
    nb_cost_mat : numpy.ndarray
        2-D matrix with one column per edit cost being fitted; each entry is
        a number of edit operations (the caller builds it as
        ``np.array(n_edit_operations).T``).
    dis_k_vec : numpy.ndarray
        Target distances, one per row of ``nb_cost_mat``.
        NOTE(review): assumed to be a 1-D vector -- confirm against caller.

    Returns
    -------
    edit_costs_new : numpy.ndarray
        The fitted costs, one per column of ``nb_cost_mat``.
    residual : float
        Sum of squared residuals ``||nb_cost_mat @ x - dis_k_vec||^2`` at the
        solution.

    Raises
    ------
    ValueError
        If the solver does not return a solution.
    """
    # Alternatives that were tried before the constrained formulation:
    #   method 1: plain least squares,
    #       np.linalg.lstsq(nb_cost_mat, dis_k_vec, rcond=None)
    #   method 2: least squares with x_i >= 0,
    #       optimize.nnls(nb_cost_mat, dis_k_vec)
    # method 3 (used): quadratic program with x_i >= 0 and sum(x) = 1.
    nb_costs = nb_cost_mat.shape[1]

    # Expanding ||N x - d||^2 gives x'Px + q'x + d'd with P = N'N, q = -2 N'd.
    # The constant d'd is left out of the objective and added back below.
    P = np.dot(nb_cost_mat.T, nb_cost_mat)
    q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)

    # Inequality constraint G x <= h encodes x >= 0.
    G = -1 * np.identity(nb_costs)
    h = np.zeros(nb_costs)
    # Equality constraint A x == b encodes sum(x) == 1.
    A = np.ones(nb_costs)
    b = 1

    x = cp.Variable(nb_costs)
    prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T @ x),
                      [G @ x <= h,
                       A @ x == b])
    prob.solve()

    edit_costs_new = x.value
    if edit_costs_new is None:
        # Fail here with a clear message rather than later on len(None).
        raise ValueError('The edit cost optimization did not return a '
                         'solution (solver status: %s).' % prob.status)

    # Add back the constant term dropped from the objective; subtracting it
    # (as was done previously) yields a wrong, typically negative, value.
    residual = prob.value + np.dot(dis_k_vec.T, dis_k_vec)
    return edit_costs_new, residual




if __name__ == '__main__': if __name__ == '__main__':
@@ -95,9 +107,9 @@ if __name__ == '__main__':
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt', ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb 'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
Gn = Gn[0:10]
# Gn = Gn[0:10]
remove_edges(Gn) remove_edges(Gn)
gkernel = 'marginalizedkernel' gkernel = 'marginalizedkernel'
itr_max = 10 itr_max = 10
c_vi, c_vr, c_vs, c_ei, c_er, c_es, residual_list, edit_cost_list = \
edit_costs, residual_list, edit_cost_list = \
fit_GED_to_kernel_distance(Gn, gkernel, itr_max) fit_GED_to_kernel_distance(Gn, gkernel, itr_max)

Loading…
Cancel
Save