Update fitDistance.py; constrain edit costs (c_i >= 0, sum(c_i) = 1).

5 years ago · 2876f7520a
--- a/preimage/fitDistance.py
+++ b/preimage/fitDistance.py
@@ -8,6 +8,9 @@ Created on Wed Oct 16 14:20:06 2019
 import numpy as np
 from tqdm import tqdm

 from scipy import optimize
 import cvxpy as cp

 import sys
 sys.path.insert(0, "../")
 from pygraph.utils.graphfiles import loadDataset
@@ -15,12 +18,9 @@ from ged import GED, get_nb_edit_operations
 from utils import kernel_distance_matrix

 def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
    c_vi = 1
    c_vr = 1
    c_vs = 1
    c_ei = 1
    c_er = 1
    c_es = 1
    # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
    idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
    
    # compute distances in feature space.
    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, gkernel=gkernel)
@@ -36,14 +36,9 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
    for itr in range(itr_max):
        print('iteration', itr)
        ged_all = []
        n_vi_all = []
        n_vr_all = []
        n_vs_all = []
        n_ei_all = []
        n_er_all = []
        n_es_all = []
        n_edit_operations = [[] for i in range(len(idx_nonzeros))]
        # compute GEDs and numbers of edit operations.
        edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
        edit_cost_constant = [i for i in edit_costs]
        edit_cost_list.append(edit_cost_constant)
        for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
 #        for i in range(len(Gn)):
@@ -53,41 +48,58 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
                    edit_cost_constant=edit_cost_constant, stabilizer='min', 
                    repeat=30)
                ged_all.append(dis)
                n_vi, n_vr, n_vs, n_ei, n_er, n_es = get_nb_edit_operations(Gn[i], 
                n_eo_tmp = get_nb_edit_operations(Gn[i], 
                    Gn[j], pi_forward, pi_backward)
                n_vi_all.append(n_vi) 
                n_vr_all.append(n_vr)
                n_vs_all.append(n_vs) 
                n_ei_all.append(n_ei) 
                n_er_all.append(n_er)
                n_es_all.append(n_es)
                for idx, item in enumerate(idx_nonzeros):
                    n_edit_operations[idx].append(n_eo_tmp[item])
                
        residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
        residual_list.append(residual)
        
        # "fit" geds to distances in feature space by tuning edit costs using the
        # Least Squares Method.
        nb_cost_mat = np.column_stack((np.array(n_vi_all), np.array(n_vr_all),
                                       np.array(n_vs_all), np.array(n_ei_all),
                                       np.array(n_er_all), np.array(n_es_all)))
        edit_costs, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
                                                     rcond=None)
        for i in range(len(edit_costs)):
            if edit_costs[i] < 0:
                if edit_costs[i] > -1e-3:
                    edit_costs[i] = 0
 #                else:
 #                    raise ValueError('The edit cost is negative.')
            
        c_vi = edit_costs[0]
        c_vr = edit_costs[1]
        c_vs = edit_costs[2]
        c_ei = edit_costs[3]
        c_er = edit_costs[4]
        c_es = edit_costs[5]
        nb_cost_mat = np.array(n_edit_operations).T
        edit_costs_new, residual = get_better_costs(nb_cost_mat, dis_k_vec)

        print(residual)
        for i in range(len(edit_costs_new)):
            if edit_costs_new[i] < 0:
                if edit_costs_new[i] > -1e-6:
                    edit_costs_new[i] = 0
                else:
                    raise ValueError('The edit cost is negative.')
        
        for idx, item in enumerate(idx_nonzeros):
            edit_costs[item] = edit_costs_new[idx]
    
    return c_vi, c_vr, c_vs, c_ei, c_er, c_es, residual_list, edit_cost_list
    return edit_costs, residual_list, edit_cost_list


 def get_better_costs(nb_cost_mat, dis_k_vec):
    """Fit edit costs to target distances by constrained least squares.

    Solves, with cvxpy, the problem

        minimize    || nb_cost_mat @ x - dis_k_vec ||_2^2
        subject to  x_i >= 0,  sum(x) == 1.

    Unconstrained least squares (``np.linalg.lstsq``) and non-negative
    least squares (``scipy.optimize.nnls``) were earlier alternatives; this
    version additionally forces the costs to sum to one.

    Parameters
    ----------
    nb_cost_mat : array-like, 2-D
        Design matrix with one column per edit cost being fitted.
        (Presumably one row per graph pair, holding the numbers of edit
        operations -- confirm against the caller.)
    dis_k_vec : array-like, 1-D
        Target distances, one entry per row of ``nb_cost_mat``.

    Returns
    -------
    edit_costs_new : numpy.ndarray
        The fitted costs ``x``; entries may be very slightly negative
        because of solver tolerance.
    residual : float
        Euclidean norm ``||nb_cost_mat @ x - dis_k_vec||_2`` at the optimum.

    Raises
    ------
    ValueError
        If the solver does not return a solution.
    """
    nb_cost_mat = np.asarray(nb_cost_mat, dtype=float)
    dis_k_vec = np.asarray(dis_k_vec, dtype=float)

    x = cp.Variable(nb_cost_mat.shape[1])
    # Express the least-squares objective directly instead of expanding it
    # to x'Px + q'x: the optimum is then the squared residual itself (no
    # constant term to add back) and no PSD check on A'A is needed.
    objective = cp.Minimize(cp.sum_squares(nb_cost_mat @ x - dis_k_vec))
    constraints = [x >= 0, cp.sum(x) == 1]
    prob = cp.Problem(objective, constraints)
    prob.solve()

    if x.value is None:
        raise ValueError('Fitting edit costs failed; solver status: '
                         + str(prob.status))

    edit_costs_new = x.value
    # Clamp at zero: the optimum can come out marginally negative due to
    # solver round-off, which would make the square root NaN.
    residual = float(np.sqrt(max(prob.value, 0.0)))
    return edit_costs_new, residual


 if __name__ == '__main__':
@@ -95,9 +107,9 @@ if __name__ == '__main__':
    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
          'extra_params': {}}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    Gn = Gn[0:10]
 #    Gn = Gn[0:10]
    remove_edges(Gn)
    gkernel = 'marginalizedkernel'
    itr_max = 10
    c_vi, c_vr, c_vs, c_ei, c_er, c_es, residual_list, edit_cost_list = \
    edit_costs, residual_list, edit_cost_list = \
        fit_GED_to_kernel_distance(Gn, gkernel, itr_max)