
update preimage/ged.py

v0.1
jajupmochi 5 years ago
commit 1a34c9f18e
13 changed files with 759 additions and 177 deletions
1. +2 -1  notebooks/run_marginalizedkernel.py
2. +3 -0  notebooks/run_spkernel.py
3. +2 -0  notebooks/run_structuralspkernel.py
4. +1 -0  notebooks/run_treeletkernel.py
5. +7 -6  notebooks/run_untilhpathkernel.py
6. +2 -0  notebooks/run_weisfeilerlehmankernel.py
7. +72 -110  preimage/fitDistance.py
8. +72 -26  preimage/ged.py
9. +58 -1  preimage/iam.py
10. +318 -1  preimage/test_fitDistance.py
11. +126 -1  preimage/test_iam.py
12. +33 -5  preimage/utils.py
13. +63 -26  pygraph/utils/graphfiles.py

+2 -1  notebooks/run_marginalizedkernel.py

@@ -28,6 +28,7 @@ dslist = [
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
@@ -57,7 +58,7 @@ estimator = marginalizedkernel
#param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.3, 3),
# 'n_iteration': np.linspace(1, 1, 1),
param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9),
'n_iteration': np.linspace(5, 20, 4),
'n_iteration': np.linspace(1, 19, 7),
'remove_totters': [False]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]


+3 -0  notebooks/run_spkernel.py

@@ -24,6 +24,9 @@ dslist = [
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge
# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'},
# # node nsymb symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb


+2 -0  notebooks/run_structuralspkernel.py

@@ -30,6 +30,8 @@ dslist = [
# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# # node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt'},
# # node nsymb symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb


+1 -0  notebooks/run_treeletkernel.py

@@ -26,6 +26,7 @@ dslist = [
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
# node symb/nsymb
# {'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},


+7 -6  notebooks/run_untilhpathkernel.py

@@ -27,7 +27,8 @@ dslist = [
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb
#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
@@ -54,11 +55,11 @@ dslist = [
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = untilhpathkernel
param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
'k_func': [None]} # ['MinMax', 'tanimoto'],
#param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
# 'k_func': ['MinMax'], # ['MinMax', 'tanimoto'],
# 'compute_method': ['trie']} # ['MinMax']}
#param_grid_precomputed = {'depth': np.linspace(3, 10, 8), # [2],
# 'k_func': [None]} # ['MinMax', 'tanimoto'],
param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2],
'k_func': ['MinMax', 'tanimoto'], # ['MinMax'], #
'compute_method': ['trie']} # ['MinMax']}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]




+2 -0  notebooks/run_weisfeilerlehmankernel.py

@@ -30,6 +30,8 @@ dslist = [
{'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1_A.txt'}, # node symb
{'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109_A.txt'}, # node symb
{'name': 'D&D', 'dataset': '../datasets/DD/DD_A.txt'}, # node symb
# {'name': 'monoterpenoides', 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}, # node/edge symb

#
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb


+72 -110  preimage/fitDistance.py

@@ -7,7 +7,7 @@ Created on Wed Oct 16 14:20:06 2019
""" """
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from itertools import combinations_with_replacement
from itertools import combinations_with_replacement, combinations
import multiprocessing
from multiprocessing import Pool
from functools import partial
@@ -22,110 +22,88 @@ import sys
from ged import GED, get_nb_edit_operations
from utils import kernel_distance_matrix


def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
fitkernel=None, gamma=1.0):
def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, k=4,
params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
'method': 'IPFP', 'stabilizer': None},
init_costs=[3, 3, 1, 3, 3, 1],
parallel=True):
# c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
# random.seed(1)
cost_rdm = random.sample(range(1, 10), 6)
# edit_costs = cost_rdm + [0]
edit_costs = cost_rdm
# edit_costs = [i * 0.01 for i in cost_rdm] + [0]
# edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
# edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
# edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# cost_rdm = random.sample(range(1, 10), 6)
# init_costs = cost_rdm + [0]
# init_costs = cost_rdm
init_costs = [3, 3, 1, 3, 3, 1]
# init_costs = [i * 0.01 for i in cost_rdm] + [0]
# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# compute distances in feature space.
coef_dk = 1
dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
dis_k_vec = []
for i in range(len(dis_k_mat)):
for j in range(i, len(dis_k_mat)):
# for j in range(i, len(dis_k_mat)):
for j in range(i + 1, len(dis_k_mat)):
dis_k_vec.append(dis_k_mat[i, j])
dis_k_vec = np.array(dis_k_vec)
if fitkernel == None:
dis_k_vec_ajusted = dis_k_vec
elif fitkernel == 'gaussian':
coef_dk = 1 / np.max(dis_k_vec)
idx_dk_nonzeros = np.where(dis_k_vec != 0)[0]
# remove 0's and constraint d_k between 0 and 1.
dis_k_vec = dis_k_vec[idx_dk_nonzeros] * coef_dk
dis_k_vec_ajusted = np.sqrt(-np.log(dis_k_vec) / gamma)
residual_list = []
edit_cost_list = []
time_list = []
nb_cost_mat_list = []
# init ged.
print('\ninitial:')
time0 = time.time()
params_ged['edit_cost_constant'] = init_costs
ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
parallel=parallel)
residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
time_list = [time.time() - time0]
edit_cost_list = [init_costs]
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list = [nb_cost_mat]
print('edit_costs:', init_costs)
print('residual_list:', residual_list)
for itr in range(itr_max):
print('\niteration', itr)
time0 = time.time()
# compute GEDs and numbers of edit operations.
edit_cost_constant = [i for i in edit_costs]
edit_cost_list.append(edit_cost_constant)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
idx_cost_nonzeros, parallel=True)
if fitkernel == None:
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
elif fitkernel == 'gaussian':
ged_all = np.array(ged_all)[idx_dk_nonzeros]
residual = np.sqrt(np.sum(np.square(
np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
residual_list.append(residual)
# "fit" geds to distances in feature space by tuning edit costs using the # "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method. # Least Squares Method.
nb_cost_mat = np.array(n_edit_operations).T
if fitkernel == 'gaussian':
nb_cost_mat = nb_cost_mat[idx_dk_nonzeros]
nb_cost_mat_list.append(nb_cost_mat)
edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec_ajusted)

print('pseudo residual:', residual)
edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec)
for i in range(len(edit_costs_new)):
if edit_costs_new[i] < 0:
if edit_costs_new[i] > -1e-9:
edit_costs_new[i] = 0
else:
raise ValueError('The edit cost is negative.')
for idx, item in enumerate(idx_cost_nonzeros):
edit_costs[item] = edit_costs_new[idx]
# for i in range(len(edit_costs_new)):
# if edit_costs_new[i] < 0:
# edit_costs_new[i] = 0

# compute new GEDs and numbers of edit operations.
params_ged['edit_cost_constant'] = edit_costs_new
ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
parallel=parallel)
residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
time_list.append(time.time() - time0)
print('edit_costs:', edit_costs)
edit_cost_list.append(edit_costs_new)
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list.append(nb_cost_mat)
print('edit_costs:', edit_costs_new)
print('residual_list:', residual_list)
print()
edit_cost_list.append(edit_costs)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_costs,
idx_cost_nonzeros, parallel=True)
if fitkernel == 0:
residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
elif fitkernel == 'gaussian':
ged_all = np.array(ged_all)[idx_dk_nonzeros]
residual = np.sqrt(np.sum(np.square(
np.exp(-gamma * ged_all ** 2) / coef_dk - dis_k_vec)))
residual_list.append(residual)
nb_cost_mat_list.append(np.array(n_edit_operations).T)
return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
time_list, nb_cost_mat_list, coef_dk
return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
time_list, nb_cost_mat_list




def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
def compute_geds(Gn, params_ged, parallel=False):
ged_mat = np.zeros((len(Gn), len(Gn)))
if parallel:
# print('parallel')
len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
ged_all = [0 for i in range(len_itr)]
n_edit_operations = [[0 for i in range(len_itr)] for j in
range(len(idx_nonzeros))]
itr = combinations_with_replacement(range(0, len(Gn)), 2)
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
len_itr = int(len(Gn) * (len(Gn) - 1) / 2)
ged_vec = [0 for i in range(len_itr)]
n_edit_operations = [0 for i in range(len_itr)]
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
itr = combinations(range(0, len(Gn)), 2)
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
@@ -134,68 +112,52 @@ def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
idx_nonzeros)
do_partial = partial(_wrapper_compute_ged_parallel, params_ged)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
# iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, j, dis, n_eo_tmp in iterator:
idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
ged_all[idx_itr] = dis
idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2)
ged_vec[idx_itr] = dis
ged_mat[i][j] = dis
ged_mat[j][i] = dis
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
n_edit_operations[idx_itr] = n_eo_tmp
# print('\n-------------------------------------------')
# print(i, j, idx_itr, dis)
pool.close()
pool.join()
else:
ged_all = []
n_edit_operations = [[] for i in range(len(idx_nonzeros))]
ged_vec = []
n_edit_operations = []
for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
# for i in range(len(Gn)):
for j in range(i, len(Gn)):
# time0 = time.time()
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
cost='CONSTANT', method='IPFP',
edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=50)
# time1 = time.time() - time0
# time0 = time.time()
ged_all.append(dis)
for j in range(i + 1, len(Gn)):
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
ged_vec.append(dis)
ged_mat[i][j] = dis
ged_mat[j][i] = dis
n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
for idx, item in enumerate(idx_nonzeros):
n_edit_operations[idx].append(n_eo_tmp[item])
# time2 = time.time() - time0
# print(time1, time2, time1 / time2)
n_edit_operations.append(n_eo_tmp)
return ged_all, ged_mat, n_edit_operations
return ged_vec, ged_mat, n_edit_operations
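
Note that switching from combinations_with_replacement to combinations also changes the closed-form position of a pair (i, j) in ged_vec. A quick standalone check of the new mapping (illustrative, not part of the commit):

from itertools import combinations

n = 6  # any number of graphs
for idx_ref, (i, j) in enumerate(combinations(range(n), 2)):
    # formula used in the parallel branch of compute_geds above
    idx_itr = int(n * i + j - (i + 1) * (i + 2) / 2)
    assert idx_itr == idx_ref  # matches the enumeration order of combinations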


def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
def _wrapper_compute_ged_parallel(params_ged, itr):
i = itr[0]
j = itr[1]
dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
idx_nonzeros)
dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged)
return i, j, dis, n_eo_tmp




def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
cost='CONSTANT', method='IPFP',
edit_cost_constant=edit_cost_constant, stabilizer='min',
repeat=50)
n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
def _compute_ged_parallel(g1, g2, params_ged):
dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
return dis, n_eo_tmp




def compute_better_costs(nb_cost_mat, dis_k_vec):
def update_costs(nb_cost_mat, dis_k_vec):
# # method 1: simple least square method.
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
# rcond=None)
@@ -203,7 +165,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
# # method 2: least square method with x_i >= 0.
# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
# method 3: solve as a quadratic program with constraints: x_i >= 0, sum(x) = 1.
# method 3: solve as a quadratic program with constraints.
# P = np.dot(nb_cost_mat.T, nb_cost_mat)
# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
# G = -1 * np.identity(nb_cost_mat.shape[1])
@@ -221,7 +183,7 @@ def compute_better_costs(nb_cost_mat, dis_k_vec):
# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
constraints = [x >= [0.0001 for i in range(nb_cost_mat.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
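
update_costs (method 3) thus fits the six edit costs as a constrained least-squares problem: minimize ||nb_cost_mat @ x - dis_k_vec||^2 subject to x >= 0.0001 and the triangle-like constraints c_vi + c_vr >= c_vs and c_ei + c_er >= c_es (cost order: c_vi, c_vr, c_vs, c_ei, c_er, c_es). A minimal self-contained sketch with synthetic data, assuming cvxpy is installed; the residual convention (root of the optimal objective) is an assumption here:

import numpy as np
import cvxpy as cp

# synthetic stand-ins: 10 graph pairs, 6 edit-operation counts per pair
nb_cost_mat = np.random.randint(0, 5, size=(10, 6)).astype(float)
dis_k_vec = np.random.rand(10)

x = cp.Variable(nb_cost_mat.shape[1])
cost = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= 0.0001,  # keep all six costs strictly positive
               np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]) @ x >= 0.0,  # c_vi + c_vr >= c_vs
               np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]) @ x >= 0.0]  # c_ei + c_er >= c_es
prob = cp.Problem(cp.Minimize(cost), constraints)
prob.solve()
edit_costs_new, residual = x.value, np.sqrt(prob.value)
print(edit_costs_new, residual)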


+72 -26  preimage/ged.py

@@ -13,29 +13,30 @@ import multiprocessing
from multiprocessing import Pool
from functools import partial


from gedlibpy import librariesImport, gedlibpy
from gedlibpy_linlin import librariesImport, gedlibpy


def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
edit_cost_constant=[], stabilizer='min', repeat=50):
edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
""" """
Compute GED for 2 graphs. Compute GED for 2 graphs.
""" """
if lib == 'gedlibpy':
def convertGraph(G):
"""Convert a graph to the proper NetworkX format that can be
recognized by library gedlibpy.
"""
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
def convertGraph(G):
"""Convert a graph to the proper NetworkX format that can be
recognized by library gedlibpy.
"""
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
# G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
# y=str(attrs['attributes'][1]))
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
# G_new.add_edge(str(nd1), str(nd2))
return G_new
for nd1, nd2, attrs in G.edges(data=True):
# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
G_new.add_edge(str(nd1), str(nd2))
return G_new
if lib == 'gedlibpy':
gedlibpy.restart_env()
gedlibpy.add_nx_graph(convertGraph(g1), "")
gedlibpy.add_nx_graph(convertGraph(g2), "")
@@ -43,12 +44,12 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
gedlibpy.init()
gedlibpy.set_method(method, "")
gedlibpy.set_method(method, algo_options)
gedlibpy.init_method()


g = listID[0]
h = listID[1]
if stabilizer == None:
if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
@@ -107,13 +108,57 @@ def GED(g1, g2, lib='gedlibpy', cost='CHEM_1', method='IPFP',
dis = upper
# make the map label correct (label remove map as np.inf)
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
elif lib == 'gedlib-bash':
import time
import random
import sys
import os
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9))
xparams = {'method': 'gedlib', 'graph_dir': fn_collection}
saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml',
filename=fn_collection, xparams=xparams)
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
command += 'export LD_LIBRARY_PATH\n'
command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
command += './ged_for_python_bash monoterpenoides ' + fn_collection \
+ ' \'' + algo_options + '\' '
for ec in edit_cost_constant:
command += str(ec) + ' '
# output = os.system(command)
stream = os.popen(command)
output = stream.readlines()
# print(output)
dis = float(output[0].strip())
runtime = float(output[1].strip())
size_forward = int(output[2].strip())
pi_forward = [int(item.strip()) for item in output[3:3+size_forward]]
pi_backward = [int(item.strip()) for item in output[3+size_forward:]]

# print(dis)
# print(runtime)
# print(size_forward)
# print(pi_forward)
# print(pi_backward)
# make the map label correct (label remove map as np.inf)
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
# print(pi_forward)
return dis, pi_forward, pi_backward
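
The relabelling step above turns gedlib's integer node maps into maps over the original NetworkX node ids, with np.inf marking a node that has no image (a removal or insertion). A tiny illustration with made-up values:

import numpy as np

nodes2 = ['a', 'b', 'c']    # node ids of g2
nb2 = len(nodes2)
pi_forward_raw = [2, 0, 3]  # raw gedlib indices; an index >= nb2 means "no image"
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward_raw]
# -> ['c', 'a', inf]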


@@ -149,7 +194,7 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',


g = listID[0]
h = listID[1]
if stabilizer == None:
if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
@@ -183,7 +228,8 @@ def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',


def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
'stabilizer': 'min', 'repeat': 50}, parallel=False):
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
'stabilizer': None}, parallel=False):
if parallel:
len_itr = int(len(Gn))
pi_forward_list = [[] for i in range(len_itr)]


+58 -1  preimage/iam.py

@@ -23,7 +23,8 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
connected=False, removeNodes=True, allBestInit=False, allBestNodes=False,
allBestEdges=False, allBestOutput=False,
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
'edit_cost_constant': [], 'stabilizer': 'min', 'repeat': 50}):
'edit_cost_constant': [], 'stabilizer': None,
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}):
"""See my name, then you know what I do. """See my name, then you know what I do.
""" """
# Gn_median = Gn_median[0:10] # Gn_median = Gn_median[0:10]
@@ -435,6 +436,62 @@ def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median




def iam_bash(Gn_names, edit_cost_constant, dataset='monoterpenoides',
graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'):
"""Compute the iam by c++ implementation (gedlib) through bash.
"""
import os
import time

def createCollectionFile(Gn_names, y, filename):
"""Create collection file.
"""
dirname_ds = os.path.dirname(filename)
if dirname_ds != '':
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
fgroup.write("\n<GraphCollection>")
for idx, fname in enumerate(Gn_names):
fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>")
fgroup.close()

tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9))
createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)
# graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl'
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
command += 'export LD_LIBRARY_PATH\n'
command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \
+ ' \'' + graph_dir + '\' '
if edit_cost_constant is None:
command += 'None'
else:
for ec in edit_cost_constant:
command += str(ec) + ' '
# output = os.system(command)
stream = os.popen(command)

output = stream.readlines()
# print(output)
sod_sm = float(output[0].strip())
sod_gm= float(output[1].strip())
fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
return sod_sm, sod_gm, fname_sm, fname_gm
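
For reference, the collection file written by createCollectionFile has this shape (reconstructed from the write calls above; file names and classes are placeholders):

<?xml version="1.0"?>
<!DOCTYPE GraphCollection SYSTEM "http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd">
<GraphCollection>
	<graph file="graph0.gxl" class="dummy"/>
	<graph file="graph1.gxl" class="dummy"/>
</GraphCollection>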




###############################################################################
# Old implementations.


+318 -1  preimage/test_fitDistance.py

@@ -16,6 +16,319 @@ from utils import remove_edges
from fitDistance import fit_GED_to_kernel_distance
from utils import normalize_distance_matrix



def median_paper_clcpc_python_best():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
# ds = {'name': 'monoterpenoides',
# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# _, y_all = loadDataset(ds['dataset'])
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 6
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options, 'stabilizer': None}
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'

for y in y_all:
for repeat in range(repeats):
edit_costs_output_file = open(fn_edit_costs_output, 'a')
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=True)
total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
+ y + '.repeat' + str(repeat) + '.k10..gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
for ec in edit_costs:
edit_costs_output_file.write(str(ec) + ' ')
edit_costs_output_file.write('\n')
edit_costs_output_file.close()
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
# plt.imshow(norm_dis_k_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_ged_mat = normalize_distance_matrix(ged_mat)
# plt.imshow(norm_ged_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_diff = norm_ged_mat - norm_dis_k_mat
# plt.imshow(norm_diff)
# plt.colorbar()
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
# # draw_count_bar(norm_diff)


def median_paper_clcpc_python_bash_cpp():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
# ds = {'name': 'monoterpenoides',
# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# _, y_all = loadDataset(ds['dataset'])
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 20
algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options}
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'

for y in y_all:
for repeat in range(repeats):
edit_costs_output_file = open(fn_edit_costs_output, 'a')
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=False)
total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+ y + '.repeat' + str(repeat) + '.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
for ec in edit_costs:
edit_costs_output_file.write(str(ec) + ' ')
edit_costs_output_file.write('\n')
edit_costs_output_file.close()
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
# plt.imshow(norm_dis_k_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_ged_mat = normalize_distance_matrix(ged_mat)
# plt.imshow(norm_ged_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_diff = norm_ged_mat - norm_dis_k_mat
# plt.imshow(norm_diff)
# plt.colorbar()
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
# # draw_count_bar(norm_diff)





def test_cs_leq_ci_plus_cr_python_bash_cpp():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:10]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 10
algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options}
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=False)
total_time = np.sum(time_list)
print('\nedit_costs:', edit_costs)
print('\nresidual_list:', residual_list)
print('\nedit_cost_list:', edit_cost_list)
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
print('\ntotal time:', total_time)
print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
## Gn = Gn[0:10]
## remove_edges(Gn)
# gkernel = 'untilhpathkernel'
# node_label = 'atom'
# edge_label = 'bond_type'
# itr_max = 10
# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
# gkernel, itr_max)
# total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
# np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
# edit_costs=edit_costs,
# residual_list=residual_list, edit_cost_list=edit_cost_list,
# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# dis_k_sub = pairwise_substitution(dis_k_mat)
# ged_sub = pairwise_substitution(ged_mat)
# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
# dis_k_sub=dis_k_sub, ged_sub=ged_sub)
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
# draw_count_bar(norm_diff)


def test_anycosts():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
@@ -295,8 +608,12 @@ def draw_count_bar(norm_diff):
if __name__ == '__main__':
# test_anycosts()
test_cs_leq_ci_plus_cr()
# test_cs_leq_ci_plus_cr()
# test_unfitted()
# test_cs_leq_ci_plus_cr_python_bash_cpp()
# median_paper_clcpc_python_bash_cpp()
median_paper_clcpc_python_best()

# x = np.array([[1,2,3],[4,5,6],[7,8,9]])
# xx = pairwise_substitution(x)

+126 -1  preimage/test_iam.py

@@ -22,6 +22,130 @@ from iam import iam_upgraded
from utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar
#from ged import ged_median



def test_iam_monoterpenoides_with_init40():
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
# unfitted edit costs.
c_vi = 3
c_vr = 3
c_vs = 1
c_ei = 3
c_er = 3
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.0001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
ged_stabilizer = None
# ged_repeat = 50
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'algo_options': algo_options,
'stabilizer': ged_stabilizer}

collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
# classify graphs according to classes.
time_list = []
dis_ks_min_list = []
dis_ks_set_median_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
sod_list_list = []
for y in y_all:
print('\n-------------------------------------------------------')
print('class of y:', y)
time_list.append([])
dis_ks_min_list.append([])
dis_ks_set_median_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(repeats):
# load median set.
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
Gn_candidate = [g.copy() for g in Gn_median]
time0 = time.time()
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label,
connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(G_gen_median_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_gen_median)
print('\nsmallest sod in graph space:', sod_gen_median)
sod_list_list.append(sod_list)
# # show the best graph and save it to file.
# print('one of the possible corresponding pre-images is')
# nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
# with_labels=True)
## plt.show()
# # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
## plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
## '_repeat' + str(repeat) + '_' + str(time.time()) +
## '.png', format="PNG")
# plt.clf()
# # print(G_gen_median_list[0].nodes(data=True))
# # print(G_gen_median_list[0].edges(data=True))
print('\nsods of the set median for this class:', sod_set_median_list[-1])
print('\nsods in graph space for this class:', sod_gs_list[-1])
# print('\ndistance in kernel space of set median for this class:',
# dis_ks_set_median_list[-1])
# print('\nsmallest distances in kernel space for this class:',
# dis_ks_min_list[-1])
print('\ntimes for this class:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
# dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
# dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print()
print('\nmean sods of the set median for each class:', sod_set_median_list)
print('\nmean sods in graph space for each class:', sod_gs_list)
# print('\ndistances in kernel space of set median for each class:',
# dis_ks_set_median_list)
# print('\nmean smallest distances in kernel space for each class:',
# dis_ks_min_list)
print('\nmean times for each class:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
# print('\nmean distances in kernel space of set median of all:',
# np.mean(dis_ks_set_median_list))
# print('\nmean smallest distances in kernel space of all:',
# np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))




def test_iam_monoterpenoides():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
@@ -834,9 +958,10 @@ if __name__ == '__main__':
# tests on different numbers of median-sets.
# test_iam_median_nb()
# test_iam_letter_h()
test_iam_monoterpenoides()
# test_iam_monoterpenoides()
# test_iam_mutag()
# test_iam_fitdistance()
# print("test log")
test_iam_monoterpenoides_with_init40()

+33 -5  preimage/utils.py

@@ -17,8 +17,10 @@ from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel
from pygraph.kernels.spKernel import spkernel
import functools
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel
from pygraph.kernels.structuralspKernel import structuralspkernel
from pygraph.kernels.treeletKernel import treeletkernel
from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel




def remove_edges(Gn):
@@ -46,18 +48,29 @@ def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose):
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
depth=10, k_func='MinMax', compute_method='trie',
depth=7, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _, _ = spkernel(Gn, node_label='atom', node_kernels=
Kmatrix, _, _ = spkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = structuralspkernel(Gn, node_label='atom', node_kernels=
Kmatrix, _ = structuralspkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'treeletkernel':
# pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
pkernel = functools.partial(gaussiankernel, gamma=1e-6)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
sub_kernel=pkernel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'weisfeilerlehmankernel':
Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
height=4, base_kernel='subtree',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
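
The hunk is truncated here; the diagonal copy is the first step of the usual cosine normalization of the Gram matrix. A sketch of the (unchanged, not shown in this diff) continuation, under that assumption:

import numpy as np

def normalize_gram(Kmatrix):
    # divide each entry by the geometric mean of the corresponding diagonal entries
    Kmatrix_diag = Kmatrix.diagonal().copy()
    for i in range(len(Kmatrix)):
        for j in range(i, len(Kmatrix)):
            Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
            Kmatrix[j][i] = Kmatrix[i][j]
    return Kmatrix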
@@ -79,7 +92,7 @@ def gram2distances(Kmatrix):


def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None, gkernel=None):
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix == None:
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
@@ -109,6 +122,21 @@ def get_same_item_indices(ls):
return idx_dict




def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
node_label=None, edge_label=None):
dis_k_all = [] # distance between g_star and each graph.
alpha = [1 / len(Gn)] * len(Gn)
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_all.append(dtemp)
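
As committed, this helper is incomplete: idx_gi, Gn_init and dis_all are never defined and nothing is returned. A corrected sketch of the apparent intent, keeping the original names where possible and assuming the "median" is taken over all of Gn, with dis_gstar and compute_kernel from this module:

import sys
from tqdm import tqdm

def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
                                                  node_label=None, edge_label=None):
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
    idx_gi = list(range(len(Gn)))   # assumed: median built over the whole set
    alpha = [1 / len(Gn)] * len(Gn)
    term3 = 0
    for i1, a1 in enumerate(alpha):
        for i2, a2 in enumerate(alpha):
            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    dis_k_all = []                  # distance between g_star and each graph
    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dis_k_all.append(dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3))
    return dis_k_all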


def normalize_distance_matrix(D):
max_value = np.amax(D)
min_value = np.amin(D)

+63 -26  pygraph/utils/graphfiles.py

@@ -124,21 +124,21 @@ def saveGXL(graph, filename, method='benoit'):
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
# pass
gxl_file = open(filename, 'w')
gxl_file.write("<?xml version=\"1.0\"?>\n")
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
gxl_file.write("<gxl>\n")
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True): for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">") gxl_file.write("<node id=\"_" + str(v) + "\">")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['chem']) + "</int></attr>")
gxl_file.write("</node>\n") gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True): for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">") gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">")
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>")
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['valence']) + "</int></attr>")
# gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>")
gxl_file.write("</edge>\n") gxl_file.write("</edge>\n")
gxl_file.write("</graph>\n") gxl_file.write("</graph>\n")
gxl_file.write("</gxl>\n")
gxl_file.write("</gxl>")
gxl_file.close()
elif method == 'gedlib-letter':
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
@@ -147,15 +147,15 @@ def saveGXL(graph, filename, method='benoit'):
gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") gxl_file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n") gxl_file.write("<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">")
gxl_file.write("<attr name=\"x\"><float>" + str(attrs['attributes'][0]) + "</float></attr>")
gxl_file.write("<attr name=\"y\"><float>" + str(attrs['attributes'][1]) + "</float></attr>")
gxl_file.write("</node>")
gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>")
gxl_file.write("</graph>")
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\"/>\n")
gxl_file.write("</graph>\n")
gxl_file.write("</gxl>") gxl_file.write("</gxl>")
gxl_file.close() gxl_file.close()
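For a concrete picture of the 'gedlib' branch above, a two-node, one-edge graph named 'mol0' with integer chem/valence attributes should serialize to roughly the following (a sketch of the expected output, not taken from a real run):

expected_gxl = (
    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
    "<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n"
    "<gxl xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n"
    "<graph id=\"mol0\" edgeids=\"true\" edgemode=\"undirected\">\n"
    "<node id=\"_0\"><attr name=\"chem\"><int>6</int></attr></node>\n"
    "<node id=\"_1\"><attr name=\"chem\"><int>8</int></attr></node>\n"
    "<edge from=\"_0\" to=\"_1\"><attr name=\"valence\"><int>1</int></attr></edge>\n"
    "</graph>\n"
    "</gxl>")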


@@ -466,12 +466,15 @@ def loadDataset(filename, filename_y=None, extra_params=None):
def loadFromXML(filename, extra_params):
import xml.etree.ElementTree as ET
dirname_dataset = dirname(filename)
if extra_params:
dirname_dataset = extra_params
else:
dirname_dataset = dirname(filename)
tree = ET.parse(filename)
root = tree.getroot()
data = []
y = []
for graph in root.iter('print'):
for graph in root.iter('graph'):
mol_filename = graph.attrib['file']
mol_class = graph.attrib['class']
data.append(loadGXL(dirname_dataset + '/' + mol_filename))
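The switch from root.iter('print') to root.iter('graph') matters because the collection files written by saveDataset (below) contain <graph> elements, so iterating over 'print' silently yielded no graphs. A hand-written sketch of the kind of collection file loadFromXML is expected to parse:

example_xml = (
    "<?xml version=\"1.0\"?>\n"
    "<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">\n"
    "<GraphCollection>\n"
    "\t<graph file=\"graph0.gxl\" class=\"1\"/>\n"
    "\t<graph file=\"graph1.gxl\" class=\"0\"/>\n"
    "</GraphCollection>")
# Each <graph> element contributes attrib['file'] (passed to loadGXL) and
# attrib['class'] (appended to y).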
@@ -541,15 +544,22 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
if 'graph_dir' in xparams:
graph_dir = xparams['graph_dir'] + '/'
if not os.path.exists(graph_dir):
os.makedirs(graph_dir)
else:
graph_dir = dirname_ds
if group == 'xml' and gformat == 'gxl':
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"https://dbblumenthal.github.io/gedlib/GraphCollection_8dtd_source.html\">")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
fgroup.write("\n<GraphCollection>") fgroup.write("\n<GraphCollection>")
for idx, g in enumerate(Gn): for idx, g in enumerate(Gn):
fname_tmp = "graph" + str(idx) + ".gxl" fname_tmp = "graph" + str(idx) + ".gxl"
saveGXL(g, dirname_ds + fname_tmp, method=xparams['method'])
saveGXL(g, graph_dir + fname_tmp, method=xparams['method'])
fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>") fgroup.write("\n\t<graph file=\"" + fname_tmp + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>") fgroup.write("\n</GraphCollection>")
fgroup.close() fgroup.close()
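The new 'graph_dir' option decouples where the group .xml is written from where the individual .gxl files go; when it is absent, both land in the directory derived from filename, as before. A hypothetical call (paths made up for illustration):

xparams = {'method': 'gedlib', 'graph_dir': '/tmp/mono_gxl'}
saveDataset(Gn, y, gformat='gxl', group='xml', filename='/tmp/monoterpenoides',
            xparams=xparams)
# The group file /tmp/monoterpenoides.xml references each graphN.gxl by bare
# filename, so a loader should be pointed at graph_dir -- which is what the
# new extra_params argument of loadFromXML above allows.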
@@ -558,18 +568,18 @@ def saveDataset(Gn, y, gformat='gxl', group=None, filename='gfile', xparams=None
if __name__ == '__main__':
# ### Load dataset from .ds file.
# # .ct files.
ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
# ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
# Gn, y = loadDataset(ds['dataset'])
# ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
# Gn, y = loadDataset(ds['dataset'])
# ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
# Gn, y = loadDataset(ds['dataset'])
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
# ds = {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds',
# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'}
# Gn, y = loadDataset(ds['dataset'], filename_y=ds['dataset_y'])
## ds = {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds'} # node symb
## Gn, y = loadDataset(ds['dataset'])
## ds = {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'} # node/edge symb
## Gn, y = loadDataset(ds['dataset'])
## ds = {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'} # unlabeled
## Gn, y = loadDataset(ds['dataset'])
# print(Gn[1].nodes(data=True))
# print(Gn[1].edges(data=True))
# print(y[1])
# # .gxl file.
# ds = {'name': 'monoterpenoides',
@@ -579,6 +589,33 @@ if __name__ == '__main__':
# print(Gn[1].edges(data=True))
# print(y[1])
### Convert graph from one format to another.
# .gxl file.
import networkx as nx
ds = {'name': 'monoterpenoides',
'dataset': '../../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y = loadDataset(ds['dataset'])
y = [int(i) for i in y]
print(Gn[1].nodes(data=True))
print(Gn[1].edges(data=True))
print(y[1])
# Convert each graph to a NetworkX format recognized by the gedlib library.
Gn_new = []
for G in Gn:
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
# G_new.add_edge(str(nd1), str(nd2))
Gn_new.append(G_new)
print(Gn_new[1].nodes(data=True))
print(Gn_new[1].edges(data=True))
print(Gn_new[1])
filename = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl/monoterpenoides'
xparams = {'method': 'gedlib'}
saveDataset(Gn_new, y, gformat='gxl', group='xml', filename=filename, xparams=xparams)
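A quick round-trip sanity check could follow the save (hypothetical: the exact .gxl path depends on how saveDataset derives its output directory from filename):

from os.path import dirname
g_back = loadGXL(dirname(filename) + '/graph0.gxl')
print(g_back.nodes(data=True)) # expect 'chem' node attributes
print(g_back.edges(data=True)) # expect 'valence' edge attributes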
# ds = {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# Gn, y = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
