# OpenI / graphkit-learn

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 16 14:20:06 2019

@author: ljia
"""
import numpy as np
from tqdm import tqdm
from itertools import combinations_with_replacement, combinations
import multiprocessing
from multiprocessing import Pool
from functools import partial
import time
import random

from scipy import optimize
from scipy.optimize import minimize
import cvxpy as cp

import sys
sys.path.insert(0, "../")
from preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter
from preimage.utils import kernel_distance_matrix

def _ged_residual(ged_vec, dis_k_vec):
    """Return the Euclidean norm of ``ged_vec - dis_k_vec``."""
    return np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))


def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
                               params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT', 
                                           'method': 'IPFP', 'stabilizer': None},
                               init_costs=[3, 3, 1, 3, 3, 1],
                               dataset='monoterpenoides',
                               parallel=True):
    """Iteratively tune GED edit costs so that GEDs match kernel distances.

    The distances in feature space are computed once with
    ``kernel_distance_matrix``. Then, for ``itr_max`` iterations, the edit
    costs are re-fitted with ``update_costs`` from the current numbers of
    edit operations, and the GEDs / numbers of edit operations are
    recomputed with ``compute_geds`` under the new costs.

    Parameters
    ----------
    Gn : sequence of graphs
        Graphs to compare pairwise.
    node_label, edge_label
        Forwarded to ``kernel_distance_matrix``.
    gkernel
        Graph kernel identifier, forwarded to ``kernel_distance_matrix``.
    itr_max : int
        Number of cost-update iterations. With 0 iterations the initial
        costs are returned unchanged.
    params_ged : dict
        Keyword arguments for ``GED``. It must contain the key ``'cost'``.
        A copy is used internally, so neither the caller's dict nor the
        default is modified.
    init_costs : sequence of numbers
        Initial edit costs (c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of
        them).
    dataset : str
        Dataset name; lower-cased before use.
    parallel : bool
        Forwarded to ``compute_geds``.

    Returns
    -------
    tuple
        ``(edit_costs_new, residual_list, edit_cost_list, dis_k_mat,
        ged_mat, time_list, nb_cost_mat_list)``: the last fitted costs, and
        per-step residuals, costs, timings and edit-operation count
        matrices (the first entry of each list is the initial state),
        together with the kernel distance matrix and the last GED matrix.

    Raises
    ------
    ValueError
        If the solver returns no solution, or a fitted edit cost is
        negative beyond the 1e-9 tolerance.
    """
    dataset = dataset.lower()

    # Work on copies: the defaults are shared mutable objects, and the
    # caller's dict should not silently gain 'dataset' and
    # 'edit_cost_constant' keys.
    params_ged = dict(params_ged)
    init_costs = init_costs[:]

    # compute distances in feature space (strict upper triangle, row-major).
    dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
    nb_graphs = len(dis_k_mat)
    dis_k_vec = np.array([dis_k_mat[i, j]
                          for i in range(nb_graphs)
                          for j in range(i + 1, nb_graphs)])

    # init ged.
    print('\ninitial:')
    time0 = time.time()
    params_ged['dataset'] = dataset
    params_ged['edit_cost_constant'] = init_costs
    ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                            dataset,
                                                            parallel=parallel)
    residual_list = [_ged_residual(ged_vec_init, dis_k_vec)]
    time_list = [time.time() - time0]
    edit_cost_list = [init_costs]
    nb_cost_mat = np.array(n_edit_operations)
    nb_cost_mat_list = [nb_cost_mat]
    print('edit_costs:', init_costs)
    print('residual_list:', residual_list)

    # Defined up front so that itr_max == 0 returns the initial costs
    # instead of failing on an unbound name.
    edit_costs_new = init_costs
    for itr in range(itr_max):
        print('\niteration', itr)
        time0 = time.time()
        # "fit" geds to distances in feature space by tuning edit costs using the
        # Least Squares Method.
        edit_costs_new, _ = update_costs(nb_cost_mat, dis_k_vec,
                                         dataset=dataset, cost=params_ged['cost'])
        if edit_costs_new is None:
            raise ValueError('The solver did not return any edit costs.')
        for idx, value in enumerate(edit_costs_new):
            if -1e-9 <= value <= 1e-9:
                # snap numerical noise around zero to exactly zero.
                edit_costs_new[idx] = 0
            elif value < 0:
                raise ValueError('The edit cost is negative.')

        # compute new GEDs and numbers of edit operations.
        params_ged['edit_cost_constant'] = edit_costs_new
        ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
                                                           dataset,
                                                           parallel=parallel)
        residual_list.append(_ged_residual(ged_vec, dis_k_vec))
        time_list.append(time.time() - time0)
        edit_cost_list.append(edit_costs_new)
        nb_cost_mat = np.array(n_edit_operations)
        nb_cost_mat_list.append(nb_cost_mat)
        print('edit_costs:', edit_costs_new)
        print('residual_list:', residual_list)

    return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
        time_list, nb_cost_mat_list


def _init_worker_gn(gn_toshare):
    """Pool initializer: publish the graph list as the module global ``G_gn``.

    Defined at module level (not nested) so it can be pickled when the
    multiprocessing start method is not ``fork``.
    """
    global G_gn
    G_gn = gn_toshare


def compute_geds(Gn, params_ged, dataset, parallel=False):
    """Compute GEDs and numbers of edit operations for all pairs ``i < j``.

    Parameters
    ----------
    Gn : sequence of graphs
        Graphs to compare pairwise.
    params_ged : dict
        Keyword arguments passed to ``GED``.
    dataset : str
        When equal to ``'letter'`` the edit operations are counted with
        ``get_nb_edit_operations_letter``, otherwise with
        ``get_nb_edit_operations``.
    parallel : bool
        If true, the pairs are distributed over a process pool with one
        worker per CPU.

    Returns
    -------
    ged_vec : list
        Distances of the pairs ``(i, j)``, ``i < j``, in row-major order.
    ged_mat : numpy.ndarray
        Symmetric ``len(Gn) x len(Gn)`` matrix of the same distances, with
        a zero diagonal.
    n_edit_operations : list
        Per-pair result of the edit-operation counter, in the same order
        as ``ged_vec``.
    """
    get_nb_eo = get_nb_edit_operations_letter if dataset == 'letter' else get_nb_edit_operations
    nb_graphs = len(Gn)
    ged_mat = np.zeros((nb_graphs, nb_graphs))
    if parallel:
        len_itr = nb_graphs * (nb_graphs - 1) // 2
        ged_vec = [0] * len_itr
        n_edit_operations = [0] * len_itr
        itr = combinations(range(nb_graphs), 2)
        n_jobs = multiprocessing.cpu_count()
        if len_itr < 100 * n_jobs:
            chunksize = int(len_itr / n_jobs) + 1
        else:
            chunksize = 100
        do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo)
        pool = Pool(processes=n_jobs, initializer=_init_worker_gn, initargs=(Gn,))
        try:
            iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
                            desc='computing GEDs', total=len_itr,
                            file=sys.stdout)
            for i, j, dis, n_eo_tmp in iterator:
                # Position of pair (i, j), i < j, in the row-major strict
                # upper triangle; (i + 1) * (i + 2) is always even, so the
                # integer division is exact.
                idx_itr = nb_graphs * i + j - (i + 1) * (i + 2) // 2
                ged_vec[idx_itr] = dis
                ged_mat[i][j] = dis
                ged_mat[j][i] = dis
                n_edit_operations[idx_itr] = n_eo_tmp
        except BaseException:
            # Do not leave worker processes running behind a failure.
            pool.terminate()
            raise
        else:
            pool.close()
        finally:
            pool.join()

    else:
        ged_vec = []
        n_edit_operations = []
        for i in tqdm(range(nb_graphs), desc='computing GEDs', file=sys.stdout):
            for j in range(i + 1, nb_graphs):
                dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
                ged_vec.append(dis)
                ged_mat[i][j] = dis
                ged_mat[j][i] = dis
                n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward)
                n_edit_operations.append(n_eo_tmp)

    return ged_vec, ged_mat, n_edit_operations
                    

def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr):
    """Worker entry point: compute the GED for one index pair.

    ``itr`` holds the two indices of the graphs to compare; the graphs
    themselves are read from the module global ``G_gn``. Returns
    ``(i, j, distance, edit_operation_counts)`` so the caller can place the
    result regardless of completion order.
    """
    row, col = itr[0], itr[1]
    distance, nb_edit_ops = _compute_ged_parallel(G_gn[row], G_gn[col],
                                                  params_ged, get_nb_eo)
    return row, col, distance, nb_edit_ops


def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo):
    """Compute the GED between ``g1`` and ``g2`` and count the edit operations.

    ``GED`` is called with ``params_ged`` as keyword arguments; the two
    mappings it returns are handed to ``get_nb_eo`` together with the graphs.
    Returns ``(distance, edit_operation_counts)``.
    """
    distance, forward_map, backward_map = GED(g1, g2, **params_ged)
    return distance, get_nb_eo(g1, g2, forward_map, backward_map)


def _solve_constrained_lsq(coeff_mat, target_vec, lower_bound,
                           ineq_rows=(), eq_rows=()):
    """Minimise ``||coeff_mat @ x - target_vec||^2`` under linear constraints.

    Every component of ``x`` is bounded below by ``lower_bound``; each row
    ``r`` of ``ineq_rows`` adds ``r @ x >= 0`` and each row of ``eq_rows``
    adds ``r @ x == 0``. Returns ``(x.value, sqrt(optimal objective))``.
    """
    nb_vars = coeff_mat.shape[1]
    x = cp.Variable(nb_vars)
    # '@' is the matrix product; '*' for matrix products is deprecated in cvxpy.
    cost_fun = cp.sum_squares(coeff_mat @ x - target_vec)
    constraints = [x >= [lower_bound for _ in range(nb_vars)]]
    constraints += [np.array(row) @ x >= 0.0 for row in ineq_rows]
    constraints += [np.array(row) @ x == 0.0 for row in eq_rows]
    prob = cp.Problem(cp.Minimize(cost_fun), constraints)
    prob.solve()
    return x.value, np.sqrt(prob.value)


def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides', 
                 cost='CONSTANT', rw_constraints='2constraints'):
    """Fit edit costs to kernel distances by constrained least squares.

    Solves ``min ||nb_cost_mat @ x - dis_k_vec||^2`` with cvxpy, where each
    row of ``nb_cost_mat`` holds the numbers of edit operations of one graph
    pair and ``dis_k_vec`` the corresponding distance in feature space.

    * ``dataset`` other than ``'letter'`` (case-insensitive): all columns
      are used (six are expected, matching the constraint vectors), with
      ``x >= 0``, ``x[0] + x[1] >= x[2]`` and ``x[3] + x[4] >= x[5]``
      (presumably c_vs <= c_vi + c_vr and c_es <= c_ei + c_er).
    * ``dataset == 'letter'`` and ``cost == 'LETTER2'``: columns
      ``[0, 1, 3, 4, 5]`` are used, with ``x >= 0.01`` and
      c_vs <= c_vi + c_vr. With ``rw_constraints == '2constraints'`` the
      equalities c_vi == c_vr and c_ei == c_er are added; with
      ``'inequality'`` they are not.

    Earlier experiments (plain ``lstsq``, ``nnls``, an explicit QP form and
    several fits for the ``'LETTER'`` cost) were never enabled and are not
    implemented here.

    Parameters
    ----------
    nb_cost_mat : numpy.ndarray
        2-D array of edit-operation counts, one row per graph pair.
    dis_k_vec : numpy.ndarray
        Target distances, one per row of ``nb_cost_mat``.
    dataset : str
        Dataset name.
    cost : str
        Edit cost function name; only consulted for the letter dataset.
    rw_constraints : str
        ``'inequality'`` or ``'2constraints'``; only consulted for
        ``cost == 'LETTER2'``.

    Returns
    -------
    edit_costs_new : numpy.ndarray or None
        Value of the fitted variable (``None`` if the solver found none).
    residual : float
        Square root of the optimal objective value.

    Raises
    ------
    NotImplementedError
        For the letter dataset with ``cost == 'LETTER'``.
    ValueError
        For the letter dataset with any other unsupported ``cost`` or an
        unknown ``rw_constraints``.
    """
    if dataset.lower() != 'letter':
        return _solve_constrained_lsq(
            nb_cost_mat, dis_k_vec, 0.0,
            ineq_rows=([1.0, 1.0, -1.0, 0.0, 0.0, 0.0],
                       [0.0, 0.0, 0.0, 1.0, 1.0, -1.0]))

    # Letter dataset: reject unsupported settings explicitly instead of
    # reaching the return statement with no result computed.
    if cost == 'LETTER':
        raise NotImplementedError(
            "Updating costs for cost='LETTER' is not implemented.")
    if cost != 'LETTER2':
        raise ValueError(
            "Unsupported cost %r for the letter dataset." % (cost,))

    if rw_constraints == 'inequality':
        eq_rows = ()
    elif rw_constraints == '2constraints':
        # c_vi == c_vr, c_ei == c_er.
        eq_rows = ([1.0, -1.0, 0.0, 0.0, 0.0],
                   [0.0, 0.0, 0.0, 1.0, -1.0])
    else:
        raise ValueError(
            "Unknown rw_constraints %r." % (rw_constraints,))

    nb_cost_mat_new = nb_cost_mat[:, [0, 1, 3, 4, 5]]
    # c_vs <= c_vi + c_vr.
    return _solve_constrained_lsq(
        nb_cost_mat_new, dis_k_vec, 0.01,
        ineq_rows=([1.0, 1.0, -1.0, 0.0, 0.0],),
        eq_rows=eq_rows)


# Nothing to run here directly; point the user at the companion script.
if __name__ == "__main__":
    print("check test_fitDistance.py")