
add the treelet kernel.

v0.1
jajupmochi, 6 years ago · parent commit 92c74448ec
12 changed files with 2782 additions and 1276 deletions
  1. notebooks/run_commonwalkkernel.py (+0, -2)
  2. notebooks/run_treeletkernel.py (+83, -0)
  3. notebooks/run_untilhpathkernel.py (+0, -1)
  4. notebooks/utils/plot_all_graphs.ipynb (+1447, -1149)
  5. preimage/gk_iam.py (+266, -89)
  6. preimage/iam.py (+448, -15)
  7. pygraph/kernels/treeletKernel.py (+430, -0)
  8. pygraph/kernels/untilHPathKernel.py (+2, -1)
  9. pygraph/utils/graphfiles.py (+19, -18)
  10. pygraph/utils/kernels.py (+25, -0)
  11. pygraph/utils/model_selection_precomputed.py (+3, -1)
  12. pygraph/utils/utils.py (+59, -0)

notebooks/run_commonwalkkernel.py (+0, -2)

@@ -8,10 +8,8 @@ Created on Fri Sep 28 17:01:13 2018

from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.kernels.commonWalkKernel import commonwalkkernel
from pygraph.utils.kernels import deltakernel, kernelproduct

dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',


notebooks/run_treeletkernel.py (+83, -0)

@@ -0,0 +1,83 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 5 19:19:33 2018

@author: ljia
"""

from libs import *
import multiprocessing

from pygraph.kernels.treeletKernel import treeletkernel
from pygraph.utils.kernels import gaussiankernel, polynomialkernel

dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression'}, # node symb
{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb

# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = treeletkernel
param_grid_precomputed = {'sub_kernel': [gaussiankernel, polynomialkernel]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task']
== 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=False,
verbose=True)
print()
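
The script above just drives the existing model-selection pipeline with the new kernel. For a quick standalone sanity check, a minimal sketch (the two toy molecules are made up for illustration; it assumes the pygraph package is importable) could look like:

import multiprocessing
import networkx as nx
from pygraph.kernels.treeletKernel import treeletkernel
from pygraph.utils.kernels import gaussiankernel

# two tiny labelled graphs carrying the 'atom'/'bond_type' attributes the kernel expects
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
g2 = g1.copy()
g2.add_node(3, atom='C')
g2.add_edge(2, 3, bond_type='1')

# treeletkernel returns the Gram matrix and the run time (see treeletKernel.py below)
Kmatrix, run_time = treeletkernel([g1, g2], sub_kernel=gaussiankernel,
                                  node_label='atom', edge_label='bond_type',
                                  n_jobs=multiprocessing.cpu_count(), verbose=False)
print(Kmatrix)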

notebooks/run_untilhpathkernel.py (+0, -1)

@@ -10,7 +10,6 @@ from libs import *
import multiprocessing

from pygraph.kernels.untilHPathKernel import untilhpathkernel
from pygraph.utils.kernels import deltakernel, kernelproduct

dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',


notebooks/utils/plot_all_graphs.ipynb (+1447, -1149)
File diff suppressed because it is too large


preimage/gk_iam.py (+266, -89)

@@ -11,13 +11,17 @@ and the iterative alternate minimizations (IAM) in reference [2].
pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
[2] Generalized median graph via iterative alternate minimization.
"""
import sys
import numpy as np
import multiprocessing
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

from iam import iam
from iam import iam, test_iam_with_more_graphs_as_init, test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel


def gk_iam(Gn, alpha):
@@ -29,58 +33,59 @@ def gk_iam(Gn, alpha):
-----
Every time a better graph is acquired, the older one is replaced by it.
"""
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat
dhat = dis_gs[0] # the nearest distance
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
gihat_list = []
# i = 1
r = 1
while r < r_max:
print('r =', r)
# found = False
Gs_nearest = Gk + gihat_list
g_tmp = iam(Gs_nearest)
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
print('I am smaller!')
dhat = dnew
g_new = g_tmp.copy() # found better graph.
gihat_list = [g_new]
dis_gs.append(dhat)
r = 0
else:
r += 1
ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
return dhat, ghat
pass
# # compute k nearest neighbors of phi in DN.
# dis_list = [] # distance between g_star and each graph.
# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
# k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
# dis_list.append(dtemp)
#
# # sort
# sort_idx = np.argsort(dis_list)
# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
# g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
# if dis_gs[0] == 0: # the exact pre-image.
# print('The exact pre-image is found from the input dataset.')
# return 0, g0hat
# dhat = dis_gs[0] # the nearest distance
# Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
# gihat_list = []
#
## i = 1
# r = 1
# while r < r_max:
# print('r =', r)
## found = False
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
#
# # compute distance between phi and the new generated graph.
# knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
# dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
# knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
# if dnew <= dhat: # the new distance is smaller
# print('I am smaller!')
# dhat = dnew
# g_new = g_tmp.copy() # found better graph.
# gihat_list = [g_new]
# dis_gs.append(dhat)
# r = 0
# else:
# r += 1
#
# ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
#
# return dhat, ghat


def gk_iam_nearest(Gn, alpha):
def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
@@ -94,10 +99,11 @@ def gk_iam_nearest(Gn, alpha):
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix)
# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
# k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha *
# (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha *
# k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6])
dis_list.append(dtemp)
# sort
@@ -108,9 +114,12 @@ def gk_iam_nearest(Gn, alpha):
print('The exact pre-image is found from the input dataset.')
return 0, g0hat
dhat = dis_gs[0] # the nearest distance
ghat = g0hat
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
Gs_nearest = Gk
ghat = g0hat.copy()
Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
for gi in Gk:
nx.draw_networkx(gi)
plt.show()
Gs_nearest = Gk.copy()
# gihat_list = []
# i = 1
@@ -119,18 +128,29 @@ def gk_iam_nearest(Gn, alpha):
print('r =', r)
# found = False
# Gs_nearest = Gk + gihat_list
g_tmp = iam(Gs_nearest)
# g_tmp = iam(Gs_nearest)
g_tmp = test_iam_with_more_graphs_as_init(Gs_nearest, Gs_nearest, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_tmp)
plt.show()
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
gi_list = [Gn[i] for i in idx_gi]
knew = compute_kernel([g_tmp] + gi_list, 'untilhpathkernel', False)
dnew = dis_gstar(0, range(1, len(gi_list) + 1), alpha, knew)
# dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
# knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
# alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
# k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
if dnew <= dhat and g_tmp != ghat: # the new distance is smaller
print('I am smaller!')
print(str(dhat) + '->' + str(dnew))
# nx.draw_networkx(ghat)
# plt.show()
# print('->')
# nx.draw_networkx(g_tmp)
# plt.show()
dhat = dnew
g_new = g_tmp.copy() # found better graph.
ghat = g_tmp.copy()
@@ -144,48 +164,205 @@ def gk_iam_nearest(Gn, alpha):
r += 1
return dhat, ghat


def dis_gstar(idx_g, idx_gi, alpha, Kmatrix):
term1 = Kmatrix[idx_g, idx_g]
term2 = 0
for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3)
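
dis_gstar computes the kernel-space distance between phi(g) and the weighted combination sum_i alpha_i * phi(g_i), i.e. sqrt(K(g,g) - 2*sum_i alpha_i*K(g,g_i) + sum_{i,j} alpha_i*alpha_j*K(g_i,g_j)). A vectorized sketch of the same quantity (a hypothetical helper, assuming alpha is index-aligned with idx_gi):

import numpy as np

def dis_gstar_vectorized(idx_g, idx_gi, alpha, Kmatrix):
    a = np.asarray(alpha)
    idx_gi = list(idx_gi)
    k_cross = Kmatrix[idx_g, idx_gi]           # K(g, g_i) for each i
    k_inner = Kmatrix[np.ix_(idx_gi, idx_gi)]  # K(g_i, g_j)
    return np.sqrt(Kmatrix[idx_g, idx_g] - 2 * a @ k_cross + a @ k_inner @ a)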


def compute_kernel(Gn, graph_kernel, verbose):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
p_quit=0.3, n_iteration=19, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type',
depth=2, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix

def gram2distances(Kmatrix):
dmatrix = np.zeros((len(Kmatrix), len(Kmatrix)))
for i1 in range(len(Kmatrix)):
for i2 in range(len(Kmatrix)):
dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
dmatrix = np.sqrt(dmatrix)
return dmatrix
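
The double loop above implements the usual kernel-to-distance identity d(i, j)^2 = K(i, i) + K(j, j) - 2*K(i, j). An equivalent vectorized sketch (a hypothetical helper, not meant as a drop-in replacement):

import numpy as np

def gram2distances_vectorized(Kmatrix):
    diag = np.diag(Kmatrix)
    # clip tiny negatives from floating-point error before the square root
    sq = np.maximum(diag[:, None] + diag[None, :] - 2 * Kmatrix, 0)
    return np.sqrt(sq)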

# --------------------------- These are tests --------------------------------#
def test_who_is_the_closest_in_kernel_space(Gn):
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# create the "median" graph.
gnew = g2.copy()
gnew.remove_node(0)
nx.draw_networkx(gnew)
plt.show()
print(gnew.nodes(data=True))
Gn = [gnew] + Gn
# compute gram matrix
Kmatrix = compute_kernel(Gn, 'untilhpathkernel', True)
# the distance matrix
dmatrix = gram2distances(Kmatrix)
print(np.sort(dmatrix[idx_gi[0] + 1]))
print(np.argsort(dmatrix[idx_gi[0] + 1]))
print(np.sort(dmatrix[idx_gi[1] + 1]))
print(np.argsort(dmatrix[idx_gi[1] + 1]))
# for all g in Gn, compute (d(g1, g) + d(g2, g)) / 2
dis_median = [(dmatrix[i, idx_gi[0] + 1] + dmatrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
print(np.sort(dis_median))
print(np.argsort(dis_median))
return


def test_who_is_the_closest_in_GED_space(Gn):
from iam import GED
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# create the "median" graph.
gnew = g2.copy()
gnew.remove_node(0)
nx.draw_networkx(gnew)
plt.show()
print(gnew.nodes(data=True))
Gn = [gnew] + Gn
# compute GEDs
ged_matrix = np.zeros((len(Gn), len(Gn)))
for i1 in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
for i2 in range(len(Gn)):
dis, _, _ = GED(Gn[i1], Gn[i2], lib='gedlib')
ged_matrix[i1, i2] = dis
print(np.sort(ged_matrix[idx_gi[0] + 1]))
print(np.argsort(ged_matrix[idx_gi[0] + 1]))
print(np.sort(ged_matrix[idx_gi[1] + 1]))
print(np.argsort(ged_matrix[idx_gi[1] + 1]))
# for all g in Gn, compute (GED(g1, g) + GED(g2, g)) / 2
dis_median = [(ged_matrix[i, idx_gi[0] + 1] + ged_matrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
print(np.sort(dis_median))
print(np.argsort(dis_median))
return


def test_will_IAM_give_the_median_graph_we_wanted(Gn):
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# del Gn[idx_gi[0]]
# del Gn[idx_gi[1] - 1]
g_median = test_iam_with_more_graphs_as_init([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
# g_median = test_iam_with_more_graphs_as_init(Gn, Gn, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_median)
plt.show()
print(g_median.nodes(data=True))
print(g_median.edges(data=True))
def test_new_IAM_allGraph_deleteNodes(Gn):
idx_gi = [0, 6]
# g1 = Gn[idx_gi[0]].copy()
# g2 = Gn[idx_gi[1]].copy()

g1 = nx.Graph(name='haha')
g1.add_nodes_from([(2, {'atom': 'C'}), (3, {'atom': 'O'}), (4, {'atom': 'C'})])
g1.add_edges_from([(2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
g2 = nx.Graph(name='hahaha')
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}),
(3, {'atom': 'O'}), (4, {'atom': 'C'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
# g2 = g1.copy()
# g2.add_nodes_from([(3, {'atom': 'O'})])
# g2.add_nodes_from([(4, {'atom': 'C'})])
# g2.add_edges_from([(1, 3, {'bond_type': '1'})])
# g2.add_edges_from([(3, 4, {'bond_type': '1'})])

# del Gn[idx_gi[0]]
# del Gn[idx_gi[1] - 1]
g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
# g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(Gn, Gn, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_median)
plt.show()
print(g_median.nodes(data=True))
print(g_median.edges(data=True))


if __name__ == '__main__':
import sys
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'extra_params': {}} # node symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:10]
# Gn = Gn[0:20]
test_new_IAM_allGraph_deleteNodes(Gn)
test_will_IAM_give_the_median_graph_we_wanted(Gn)
test_who_is_the_closest_in_GED_space(Gn)
test_who_is_the_closest_in_kernel_space(Gn)
lmbda = 0.03 # termination probability
r_max = 10 # recursions
l = 500
alpha_range = np.linspace(0.1, 0.9, 9)
k = 5 # k nearest neighbors
alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx1, idx2 = np.random.randint(0, len(Gn), 2)
g1 = Gn[idx1]
g2 = Gn[idx2]
idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# g_tmp = iam([g1, g2])
# nx.draw_networkx(g_tmp)
# plt.show()
# compute
k_list = [] # kernel between each graph and itself.
k_g1_list = [] # kernel between each graph and g1
k_g2_list = [] # kernel between each graph and g2
for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
k_list.append(ktemp[0][0, 0])
k_g1_list.append(ktemp[0][0, 1])
k_g2_list.append(ktemp[0][0, 2])
# k_list = [] # kernel between each graph and itself.
# k_g1_list = [] # kernel between each graph and g1
# k_g2_list = [] # kernel between each graph and g2
# for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
# ktemp = compute_kernel([g, g1, g2], 'marginalizedkernel', False)
# k_list.append(ktemp[0][0, 0])
# k_g1_list.append(ktemp[0][0, 1])
# k_g2_list.append(ktemp[0][0, 2])
km = compute_kernel(Gn, 'untilhpathkernel', True)
# k_list = np.diag(km) # kernel between each graph and itself.
# k_g1_list = km[idx_gi[0]] # kernel between each graph and g1
# k_g2_list = km[idx_gi[1]] # kernel between each graph and g2

g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat = gk_iam_nearest(Gn, alpha)
dhat, ghat = gk_iam_nearest(Gn, [alpha, 1 - alpha], idx_gi, km, k, r_max)
dis_best.append(dhat)
g_best.append(ghat)


preimage/iam.py (+448, -15)

@@ -16,18 +16,17 @@ import librariesImport, script
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.utils import graph_isIdentical
#from pygraph.utils.utils import graph_deepcopy


def iam(Gn, node_label='atom', edge_label='bond_type'):
def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
connected=True):
"""See my name, then you know what I do.
"""
# Gn = Gn[0:10]
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
c_er = 1
c_es = 1
c_ei = 1
# phase 1: initialize.
# compute set-median.
dis_min = np.inf
@@ -37,7 +36,7 @@ def iam(Gn, node_label='atom', edge_label='bond_type'):
dist_sum = 0
pi_all.append([])
for idx2, G_p_prime in enumerate(Gn):
dist_tmp, pi_tmp = GED(G_p, G_p_prime)
dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
pi_all[idx1].append(pi_tmp)
dist_sum += dist_tmp
if dist_sum < dis_min:
@@ -50,7 +49,7 @@ def iam(Gn, node_label='atom', edge_label='bond_type'):
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
for itr in range(0, 10):
for itr in range(0, 10): # @todo: the convergence condition?
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
@@ -138,34 +137,40 @@ def iam(Gn, node_label='atom', edge_label='bond_type'):
G_new.remove_edge(nd1, nd2)
G = G_new.copy()
# update pi_p
pi_p = []
for idx1, G_p in enumerate(Gn):
dist_tmp, pi_tmp, _ = GED(G, G_p)
pi_p.append(pi_tmp)
return G


def GED(g1, g2, lib='gedlib'):
"""
Compute GED. It is a dummy function for now.
Compute GED.
"""
if lib == 'gedlib':
# transform dataset to the 'xml' file as the GedLib required.
saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp')
script.appel()
# script.appel()
script.PyRestartEnv()
script.PyLoadGXLGraph('ged_tmp/', 'collections/tmp.xml')
script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml')
listID = script.PyGetGraphIds()
script.PySetEditCost("CHEM_1")
script.PySetEditCost("CHEM_2")
script.PyInitEnv()
script.PySetMethod("BIPARTITE", "")
script.PyInitMethod()
g = listID[0]
h = listID[1]
script.PyRunMethod(g, h)
liste = script.PyGetAllMap(g, h)
pi_forward, pi_backward = script.PyGetAllMap(g, h)
upper = script.PyGetUpperBound(g, h)
lower = script.PyGetLowerBound(g, h)
dis = upper + lower
pi = liste[0]
dis = (upper + lower) / 2
return dis, pi
return dis, pi_forward, pi_backward


def get_node_labels(Gn, node_label):
@@ -182,6 +187,434 @@ def get_edge_labels(Gn, edge_label):
return el


# --------------------------- These are tests --------------------------------#
def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1,
node_label='atom', edge_label='bond_type'):
"""See my name, then you know what I do.
"""
from tqdm import tqdm
# Gn = Gn[0:10]
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
# phase 1: initialize.
# compute set-median.
dis_min = np.inf
# pi_p = []
pi_all_forward = []
pi_all_backward = []
for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout):
dist_sum = 0
pi_all_forward.append([])
pi_all_backward.append([])
for idx2, G_p_prime in enumerate(Gn):
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime)
pi_all_forward[idx1].append(pi_tmp_forward)
pi_all_backward[idx1].append(pi_tmp_backward)
dist_sum += dist_tmp
if dist_sum <= dis_min:
dis_min = dist_sum
G = G_p.copy()
idx_min = idx1
# list of edit operations.
pi_p_forward = pi_all_forward[idx_min]
pi_p_backward = pi_all_backward[idx_min]
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
label_set = get_node_labels(Gn + [G], node_label)
for itr in range(0, 10): # @todo: the convergence condition?
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for nd in G.nodes():
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
idx_rdm = random.randint(0, len(idx_max) - 1)
G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
else: # labels are non-symbolic
for nd in G.nodes():
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
# @todo: works only for undirected graphs.
for nd1 in range(nx.number_of_nodes(G)):
for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn) * c_er / (c_er + c_ei):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
# do not change anything when equal.
G = G_new.copy()
# update pi_p
pi_p_forward = []
for G_p in Gn:
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
pi_p_forward.append(pi_tmp_forward)
return G


def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, node_label='atom',
edge_label='bond_type', connected=True):
"""See my name, then you know what I do.
"""
from tqdm import tqdm
# Gn_median = Gn_median[0:10]
# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
node_ir = sys.maxsize * 2 # maximum number for C++, corresponding to node removal and insertion.
label_r = 'thanksdanny' # the label for node removal. # @todo: make sure this label cannot collide with any real node label.
ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)

def generate_graph(G, pi_p_forward, label_set):
G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
# nx.draw_networkx(G)
# import matplotlib.pyplot as plt
# plt.show()
# print(pi_p_forward)
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# case when the node is to be removed.
h_i0_remove = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i == node_ir:
h_i0_remove += 1
h_i0_list.append(h_i0_remove)
label_list.append(label_r)
# get the best labels.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
nlabel_best = [label_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels.
G_new_list_nd = []
for g in G_new_list:
for nl in nlabel_best:
g_tmp = g.copy()
if nl == label_r:
g_tmp.remove_node(nd)
else:
g_tmp.nodes[nd][node_label] = nl
G_new_list_nd.append(g_tmp)
# nx.draw_networkx(g_tmp)
# import matplotlib.pyplot as plt
# plt.show()
# print(g_tmp.nodes(data=True))
# print(g_tmp.edges(data=True))
G_new_list = G_new_list_nd[:]

else: # labels are non-symbolic
for nd in G.nodes():
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn_median, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
# @todo: works only for undirected graphs.
nd_list = [n for n in G.nodes()]
for g_tmp in G_new_list:
for nd1i in range(nx.number_of_nodes(G)):
nd1 = nd_list[nd1i]
for nd2i in range(nd1i + 1, nx.number_of_nodes(G)):
nd2 = nd_list[nd2i]
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
# @todo: should we consider if nd1 and nd2 in g_tmp?
# or just add the edge anyway?
if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
and not g_tmp.has_edge(nd1, nd2):
g_tmp.add_edge(nd1, nd2)
elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
if g_tmp.has_edge(nd1, nd2):
g_tmp.remove_edge(nd1, nd2)
# do not change anything when equal.
# find the best graph generated in this iteration and update pi_p.
# @todo: should we update all graphs generated or just the best ones?
dis_list, pi_forward_list = median_distance(G_new_list, Gn_median)
# @todo: should we remove the identical and connectivity check?
# Don't know which is faster.
G_new_list, idx_list = remove_duplicates(G_new_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# if connected == True:
# G_new_list, idx_list = remove_disconnected(G_new_list)
# pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
# dis_min = dis_list[idx_min_tmp_list[0]]
# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
# G_new_list = [G_new_list[idx] for idx in idx_min_list]
for g in G_new_list:
import matplotlib.pyplot as plt
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
return G_new_list, pi_forward_list
def median_distance(Gn, Gn_median, measure='ged', verbose=False):
dis_list = []
pi_forward_list = []
for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
file=sys.stdout) if verbose else enumerate(Gn):
dis_sum = 0
pi_forward_list.append([])
for G_p in Gn_median:
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
pi_forward_list[idx].append(pi_tmp_forward)
dis_sum += dis_tmp
dis_list.append(dis_sum)
return dis_list, pi_forward_list
def best_median_graphs(Gn_candidate, dis_all, pi_all_forward):
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]]
pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
return G_min_list, pi_forward_min_list, dis_min
def iteration_proc(G, pi_p_forward):
G_list = [G]
pi_forward_list = [pi_p_forward]
# iterations.
for itr in range(0, 10): # @todo: the convergence condition?
# print('itr is', itr)
G_new_list = []
pi_forward_new_list = []
for idx, G in enumerate(G_list):
label_set = get_node_labels(Gn_median + [G], node_label)
G_tmp_list, pi_forward_tmp_list = generate_graph(
G, pi_forward_list[idx], label_set)
G_new_list += G_tmp_list
pi_forward_new_list += pi_forward_tmp_list
G_list = G_new_list[:]
pi_forward_list = pi_forward_new_list[:]
G_list, idx_list = remove_duplicates(G_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_list, pi_forward_list # do we return all graphs or the best ones?
def remove_duplicates(Gn):
"""Remove duplicate graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
dupl = False
for g_new in Gn_new:
if graph_isIdentical(g_new, g):
dupl = True
break
if not dupl:
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list
def remove_disconnected(Gn):
"""Remove disconnected graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
if nx.is_connected(g):
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list

# phase 1: initialize.
# compute set-median.
dis_min = np.inf
dis_all, pi_all_forward = median_distance(Gn_candidate[::-1], Gn_median)
# find all smallest distances.
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]]
# phase 2: iteration.
G_list = []
for idx_min in idx_min_list[::-1]:
# print('idx_min is', idx_min)
G = Gn_candidate[idx_min].copy()
# list of edit operations.
pi_p_forward = pi_all_forward[idx_min]
# pi_p_backward = pi_all_backward[idx_min]
Gi_list, pi_i_forward_list = iteration_proc(G, pi_p_forward)
G_list += Gi_list
G_list, _ = remove_duplicates(G_list)
if connected == True:
G_list, _ = remove_disconnected(G_list)

import matplotlib.pyplot as plt
for g in G_list:
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# get the best median graphs
dis_all, pi_all_forward = median_distance(G_list, Gn_median)
G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
G_list, dis_all, pi_all_forward)
for g in G_min_list:
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
return G_min_list


if __name__ == '__main__':
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',


pygraph/kernels/treeletKernel.py (+430, -0)

@@ -0,0 +1,430 @@
"""
@author: linlin
@references: Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47.
"""

import sys
sys.path.insert(0, "../")
import time
from collections import Counter
from itertools import chain
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np

from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm

def treeletkernel(*args,
sub_kernel,
node_label='atom',
edge_label='bond_type',
n_jobs=None,
verbose=True):
"""Calculate treelet graph kernels between graphs.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
Two graphs between which the kernel is calculated.
sub_kernel : function
The sub-kernel between 2 real number vectors. Each vector counts the
numbers of isomorphic treelets in a graph.
node_label : string
Node attribute used as label. The default node label is atom.
edge_label : string
Edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the treelet kernel between 2 graphs.
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
Kmatrix = np.zeros((len(Gn), len(Gn)))
ds_attrs = get_dataset_attributes(Gn,
attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
node_label=node_label, edge_label=edge_label)
labeled = False
if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']:
labeled = True
if not ds_attrs['node_labeled']:
for G in Gn:
nx.set_node_attributes(G, '0', 'atom')
if not ds_attrs['edge_labeled']:
for G in Gn:
nx.set_edge_attributes(G, '0', 'bond_type')
start_time = time.time()
# ---- use pool.imap_unordered to parallel and track progress. ----
# get all canonical keys of all graphs before calculating kernels to save
# time, but this may cost a lot of memory for large dataset.
pool = Pool(n_jobs)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 100 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
canonkeys = [[] for _ in range(len(Gn))]
getps_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
labeled, ds_attrs['is_directed'])
if verbose:
iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
desc='getting canonkeys', file=sys.stdout)
else:
iterator = pool.imap_unordered(getps_partial, itr, chunksize)
for i, ck in iterator:
canonkeys[i] = ck
pool.close()
pool.join()
# compute kernels.
def init_worker(canonkeys_toshare):
global G_canonkeys
G_canonkeys = canonkeys_toshare
do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(canonkeys,), n_jobs=n_jobs, verbose=verbose)
run_time = time.time() - start_time
if verbose:
print("\n --- treelet kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))
return Kmatrix, run_time


def _treeletkernel_do(canonkey1, canonkey2, sub_kernel):
"""Calculate treelet graph kernel between 2 graphs.
Parameters
----------
canonkey1, canonkey2 : list
List of canonical keys in 2 graphs, where each key is represented by a string.
Return
------
kernel : float
Treelet Kernel between 2 graphs.
"""
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys])
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys])
kernel = np.sum(np.exp(-np.square(vector1 - vector2) / 2))
# kernel = sub_kernel(vector1, vector2)
return kernel
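
A hand-worked illustration with two hypothetical key dictionaries: for canonkey1 = {'0C': 3, '0O': 1, '1C1C': 2} and canonkey2 = {'0C': 2, '0N': 1, '1C1C': 2}, the shared keys are {'0C', '1C1C'}, the count vectors are [3, 2] and [2, 2], and the returned value is exp(-(3-2)**2/2) + exp(-(2-2)**2/2) = exp(-0.5) + 1 ≈ 1.6065. Note that, as written, the fixed Gaussian on the shared counts is applied and the sub_kernel argument is only carried through (its call is commented out).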


def wrapper_treeletkernel_do(sub_kernel, itr):
i = itr[0]
j = itr[1]
return i, j, _treeletkernel_do(G_canonkeys[i], G_canonkeys[j], sub_kernel)


def get_canonkeys(G, node_label, edge_label, labeled, is_directed):
"""Generate canonical keys of all treelets in a graph.
Parameters
----------
G : NetworkX graphs
The graph in which keys are generated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.
Return
------
canonkey/canonkey_l : dict
For unlabeled graphs, canonkey is a dictionary which records the number of
occurrences of every tree pattern. For labeled graphs, canonkey_l is the
dictionary which keeps track of the number of occurrences of every treelet.
"""
patterns = {} # a dictionary which consists of lists of patterns for all graphlets.
canonkey = {} # canonical key, a dictionary which records the number of occurrences of every tree pattern.

### structural analysis ###
### In this section, a list of patterns is generated for each graphlet,
### where every pattern is represented by nodes ordered by Morgan's
### extended labeling.
# linear patterns
patterns['0'] = G.nodes()
canonkey['0'] = nx.number_of_nodes(G)
for i in range(1, 6): # for i in range(1, 6):
patterns[str(i)] = find_all_paths(G, i, is_directed)
canonkey[str(i)] = len(patterns[str(i)])

# n-star patterns
patterns['3star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3]
patterns['4star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4]
patterns['5star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5]
# n-star patterns
canonkey['6'] = len(patterns['3star'])
canonkey['8'] = len(patterns['4star'])
canonkey['d'] = len(patterns['5star'])

# pattern 7
patterns['7'] = [] # the 1st line of Table 1 in Ref [1]
for pattern in patterns['3star']:
for i in range(1, len(pattern)): # for each neighbor of node 0
if G.degree(pattern[i]) >= 2:
pattern_t = pattern[:]
# set the node with degree >= 2 as the 4th node
pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i]
for neighborx in G[pattern[i]]:
if neighborx != pattern[0]:
new_pattern = pattern_t + [neighborx]
patterns['7'].append(new_pattern)
canonkey['7'] = len(patterns['7'])

# pattern 11
patterns['11'] = [] # the 4th line of Table 1 in Ref [1]
for pattern in patterns['4star']:
for i in range(1, len(pattern)):
if G.degree(pattern[i]) >= 2:
pattern_t = pattern[:]
pattern_t[i], pattern_t[4] = pattern_t[4], pattern_t[i]
for neighborx in G[pattern[i]]:
if neighborx != pattern[0]:
new_pattern = pattern_t + [ neighborx ]
patterns['11'].append(new_pattern)
canonkey['b'] = len(patterns['11'])

# pattern 12
patterns['12'] = [] # the 5th line of Table 1 in Ref [1]
rootlist = [] # a list of root nodes, whose extended labels are 3
for pattern in patterns['3star']:
if pattern[0] not in rootlist: # prevent counting the same pattern twice from each of the two root nodes
rootlist.append(pattern[0])
for i in range(1, len(pattern)):
if G.degree(pattern[i]) >= 3:
rootlist.append(pattern[i])
pattern_t = pattern[:]
pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i]
for neighborx1 in G[pattern[i]]:
if neighborx1 != pattern[0]:
for neighborx2 in G[pattern[i]]:
if neighborx1 > neighborx2 and neighborx2 != pattern[0]:
new_pattern = pattern_t + [neighborx1] + [neighborx2]
# new_patterns = [ pattern + [neighborx1] + [neighborx2] for neighborx1 in G[pattern[i]] if neighborx1 != pattern[0] for neighborx2 in G[pattern[i]] if (neighborx1 > neighborx2 and neighborx2 != pattern[0]) ]
patterns['12'].append(new_pattern)
canonkey['c'] = int(len(patterns['12']) / 2)

# pattern 9
patterns['9'] = [] # the 2nd line of Table 1 in Ref [1]
for pattern in patterns['3star']:
for pairs in [ [neighbor1, neighbor2] for neighbor1 in G[pattern[0]] if G.degree(neighbor1) >= 2 \
for neighbor2 in G[pattern[0]] if G.degree(neighbor2) >= 2 if neighbor1 > neighbor2 ]:
pattern_t = pattern[:]
# move nodes with extended labels 4 to specific position to correspond to their children
pattern_t[pattern_t.index(pairs[0])], pattern_t[2] = pattern_t[2], pattern_t[pattern_t.index(pairs[0])]
pattern_t[pattern_t.index(pairs[1])], pattern_t[3] = pattern_t[3], pattern_t[pattern_t.index(pairs[1])]
for neighborx1 in G[pairs[0]]:
if neighborx1 != pattern[0]:
for neighborx2 in G[pairs[1]]:
if neighborx2 != pattern[0]:
new_pattern = pattern_t + [neighborx1] + [neighborx2]
patterns['9'].append(new_pattern)
canonkey['9'] = len(patterns['9'])

# pattern 10
patterns['10'] = [] # the 3rd line of Table 1 in Ref [1]
for pattern in patterns['3star']:
for i in range(1, len(pattern)):
if G.degree(pattern[i]) >= 2:
for neighborx in G[pattern[i]]:
if neighborx != pattern[0] and G.degree(neighborx) >= 2:
pattern_t = pattern[:]
pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i]
new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ]
patterns['10'].extend(new_patterns)
canonkey['a'] = len(patterns['10'])

### labeling information ###
### In this section, a list of canonical keys is generated for every
### pattern obtained in the structural analysis section above, which is a
### string corresponding to a unique treelet. A dictionary is built to keep
### track of the amount of every treelet.
if labeled == True:
canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet.

# linear patterns
canonkey_t = Counter(list(nx.get_node_attributes(G, node_label).values()))
for key in canonkey_t:
canonkey_l['0' + key] = canonkey_t[key]

for i in range(1, 6): # for i in range(1, 6):
treelet = []
for pattern in patterns[str(i)]:
canonlist = list(chain.from_iterable((G.node[node][node_label], \
G[node][pattern[idx+1]][edge_label]) for idx, node in enumerate(pattern[:-1])))
canonlist.append(G.node[pattern[-1]][node_label])
canonkey_t = ''.join(canonlist)
canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1]
treelet.append(str(i) + canonkey_t)
canonkey_l.update(Counter(treelet))

# n-star patterns
for i in range(3, 6):
treelet = []
for pattern in patterns[str(i) + 'star']:
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:] ]
canonlist.sort()
canonkey_t = ('d' if i == 5 else str(i * 2)) + G.node[pattern[0]][node_label] + ''.join(canonlist)
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

# pattern 7
treelet = []
for pattern in patterns['7']:
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ]
canonlist.sort()
canonkey_t = '7' + G.node[pattern[0]][node_label] + ''.join(canonlist) \
+ G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label]
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

# pattern 11
treelet = []
for pattern in patterns['11']:
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:4] ]
canonlist.sort()
canonkey_t = 'b' + G.node[pattern[0]][node_label] + ''.join(canonlist) \
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[0]][edge_label] \
+ G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label]
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

# pattern 10
treelet = []
for pattern in patterns['10']:
canonkey4 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label]
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ]
canonlist.sort()
canonkey0 = ''.join(canonlist)
canonkey_t = 'a' + G.node[pattern[3]][node_label] \
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] \
+ G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \
+ canonkey4 + canonkey0
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

# pattern 12
treelet = []
for pattern in patterns['12']:
canonlist0 = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ]
canonlist0.sort()
canonlist3 = [ G.node[leaf][node_label] + G[leaf][pattern[3]][edge_label] for leaf in pattern[4:6] ]
canonlist3.sort()
# 2 possible key can be generated from 2 nodes with extended label 3, select the one with lower lexicographic order.
canonkey_t1 = 'c' + G.node[pattern[0]][node_label] \
+ ''.join(canonlist0) \
+ G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \
+ ''.join(canonlist3)

canonkey_t2 = 'c' + G.node[pattern[3]][node_label] \
+ ''.join(canonlist3) \
+ G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \
+ ''.join(canonlist0)

treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2)
canonkey_l.update(Counter(treelet))

# pattern 9
treelet = []
for pattern in patterns['9']:
canonkey2 = G.node[pattern[4]][node_label] + G[pattern[4]][pattern[2]][edge_label]
canonkey3 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[3]][edge_label]
prekey2 = G.node[pattern[2]][node_label] + G[pattern[2]][pattern[0]][edge_label]
prekey3 = G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label]
if prekey2 + canonkey2 < prekey3 + canonkey3:
canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \
+ prekey2 + prekey3 + canonkey2 + canonkey3
else:
canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \
+ prekey3 + prekey2 + canonkey3 + canonkey2
treelet.append('9' + G.node[pattern[0]][node_label] + canonkey_t)
canonkey_l.update(Counter(treelet))

return canonkey_l

return canonkey
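
Worked by hand for a hypothetical labeled 3-node path C - O - C with every bond_type equal to '1', the returned canonkey_l would be:

# '0C': 2, '0O': 1     # single nodes, keyed by node label
# '1C1O': 2            # the two edges; 'C1O' is kept over its reverse 'O1C'
# '2C1O1C': 1          # the single path of length 2 (a palindrome)
# no star-based keys ('6', '7', '8', '9', 'a', 'b', 'c', 'd' prefixes) appear,
# since no node has degree >= 3.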


def wrapper_get_canonkeys(node_label, edge_label, labeled, is_directed, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, get_canonkeys(g, node_label, edge_label, labeled, is_directed)

def find_paths(G, source_node, length):
"""Find all paths with a certain length those start from a source node.
A recursive depth first search is applied.
Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
source_node : integer
The node from which all paths start.
length : integer
The length of paths.
Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
if length == 0:
return [[source_node]]
path = [[source_node] + path for neighbor in G[source_node] \
for path in find_paths(G, neighbor, length - 1) if source_node not in path]
return path
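
Worked by hand on a hypothetical 3-node path graph 0 - 1 - 2:

# find_paths(G, 0, 2) -> [[0, 1, 2]]
# find_paths(G, 1, 1) -> [[1, 0], [1, 2]]
# find_paths(G, 1, 2) -> []   (no simple path of length 2 starts at the middle node)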


def find_all_paths(G, length, is_directed):
"""Find all paths with a certain length in a graph. A recursive depth first
search is applied.
Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
length : integer
The length of paths.
Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
all_paths = []
for node in G:
all_paths.extend(find_paths(G, node, length))
if not is_directed:
# For each path, two representations are retrieved, one from each of its two extremities.
# Remove one of them.
all_paths_r = [path[::-1] for path in all_paths]
for idx, path in enumerate(all_paths[:-1]):
for path2 in all_paths_r[idx+1::]:
if path == path2:
all_paths[idx] = []
break
all_paths = list(filter(lambda a: a != [], all_paths))
return all_paths
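
The quadratic reverse-duplicate removal above can also be done in one pass by keeping a canonical orientation of each path in a set; a sketch (a hypothetical helper):

def _dedup_reversed(all_paths):
    seen = set()
    unique = []
    for p in all_paths:
        key = min(tuple(p), tuple(reversed(p)))  # one canonical orientation per path
        if key not in seen:
            seen.add(key)
            unique.append(p)
    return unique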

pygraph/kernels/untilHPathKernel.py (+2, -1)

@@ -31,6 +31,7 @@ def untilhpathkernel(*args,
n_jobs=None,
verbose=True):
"""Calculate path graph kernels up to depth/hight h between graphs.
Parameters
----------
Gn : List of NetworkX graph
@@ -124,7 +125,7 @@ def untilhpathkernel(*args,
def init_worker(trie_toshare):
global G_trie
G_trie = trie_toshare
do_partial = partial(wrapper_uhpath_do_trie, k_func)
do_partial = partial(wrapper_uhpath_do_trie, k_func)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
else:


pygraph/utils/graphfiles.py (+19, -18)

@@ -84,7 +84,7 @@ def loadGXL(filename):
return g


def saveGXL(graph, filename, method='benoit'):
def saveGXL(graph, filename, method='gedlib'):
if method == 'benoit':
import xml.etree.ElementTree as ET
root_node = ET.Element('gxl')
@@ -124,23 +124,24 @@ def saveGXL(graph, filename, method='benoit'):
tree.write(filename)
elif method == 'gedlib':
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
pass
# gxl_file = open(filename, 'w')
# gxl_file.write("<?xml version=\"1.0\"?>\n")
# gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
# gxl_file.write("<gxl>\n")
# gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
# for v in graph:
# gxl_file.write("<node id=\"_" + str(v) + "\">\n")
# gxl_file.write("<attr name=\"chem\"><int>" + str(self.node_labels[node]) + "</int></attr>\n")
# gxl_file.write("</node>\n")
# for edge in self.edge_list:
# gxl_file.write("<edge from=\"_" + str(edge[0]) + "\" to=\"_" + str(edge[1]) + "\">\n")
# gxl_file.write("<attr name=\"valence\"><int>1</int></attr>\n")
# gxl_file.write("</edge>\n")
# gxl_file.write("</graph>\n")
# gxl_file.write("</gxl>\n")
# gxl_file.close()
# pass
gxl_file = open(filename, 'w')
gxl_file.write("<?xml version=\"1.0\"?>\n")
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
gxl_file.write("<gxl>\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">\n")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>\n")
gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">\n")
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>\n")
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>\n")
gxl_file.write("</edge>\n")
gxl_file.write("</graph>\n")
gxl_file.write("</gxl>\n")
gxl_file.close()


def loadSDF(filename):


pygraph/utils/kernels.py (+25, -0)

@@ -57,6 +57,27 @@ def gaussiankernel(x, y, gamma=None):
return kernel


def polynomialkernel(x, y, d=1, c=0):
"""Polynomial kernel.
Compute the polynomial kernel between x and y:

K(x, y) = (x^Ty)^d + c.

Parameters
----------
x, y : array

d : integer, default 1
c : float, default 0

Returns
-------
kernel : float
"""
return np.dot(x, y) ** d + c


def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1):
"""Sum of a pair of kernels.

@@ -110,3 +131,7 @@ def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1):
else:
kernel = lamda * k1(d11, d12) * k2(d21, d22)
return kernel


if __name__ == '__main__':
o = polynomialkernel([1, 2], [3, 4], 2, 3)
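
For the check above, the value follows directly from the definition: np.dot([1, 2], [3, 4]) = 11, so o = 11 ** 2 + 3 = 124.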

pygraph/utils/model_selection_precomputed.py (+3, -1)

@@ -145,7 +145,8 @@ def model_selection_for_precomputed_kernel(datafile,
# Kmatrix = np.random.rand(2250, 2250)
# current_run_time = 0.1
# remove graphs whose kernels with themselves are zeros
# remove graphs whose kernels with themselves are zeros
# @todo: y not changed accordingly?
Kmatrix_diag = Kmatrix.diagonal().copy()
nb_g_ignore = 0
for idxk, diag in enumerate(Kmatrix_diag):
@@ -154,6 +155,7 @@ def model_selection_for_precomputed_kernel(datafile,
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
nb_g_ignore += 1
# normalization
# @todo: works only for undirected graph?
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):


pygraph/utils/utils.py (+59, -0)

@@ -1,5 +1,6 @@
import networkx as nx
import numpy as np
from copy import deepcopy
#from itertools import product

# from tqdm import tqdm
@@ -183,3 +184,61 @@ def direct_product(G1, G2, node_label, edge_label):
# gt = nx.convert_node_labels_to_integers(
# gt, first_label=0, label_attribute='label_orignal')
return gt


def graph_deepcopy(G):
"""Deep copy a graph, including deep copy of all nodes, edges and
attributes of the graph, nodes and edges.
Note
----
It is the same as the NetworkX function graph.copy(), as far as I know.
"""
# add graph attributes.
labels = {}
for k, v in G.graph.items():
labels[k] = deepcopy(v)
if G.is_directed():
G_copy = nx.DiGraph(**labels)
else:
G_copy = nx.Graph(**labels)
# add nodes
for nd, attrs in G.nodes(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_node(nd, **labels)
# add edges.
for nd1, nd2, attrs in G.edges(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_edge(nd1, nd2, **labels)
return G_copy
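
A quick usage sketch of the helper above (the toy graph is hypothetical; it assumes the networkx 2.x G.nodes accessor). Because attribute values are passed through deepcopy, mutating the copy leaves the original untouched:

g = nx.Graph(name='toy')
g.add_node(0, attributes=[1.0, 2.0])
g.add_node(1, atom='C')
g.add_edge(0, 1, bond_type='1')
g_copy = graph_deepcopy(g)
g_copy.nodes[0]['attributes'][0] = 99.0
print(g.nodes[0]['attributes'])   # still [1.0, 2.0]: the list itself was copied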


def graph_isIdentical(G1, G2):
"""Check if two graphs are identical, including: same nodes, edges, node
labels/attributes, edge labels/attributes.
Notes
----
1. The types of the two graphs have to be the same.
2. Global/Graph attributes are neglected as they may contain names for graphs.
"""
# check nodes.
nlist1 = [n for n in G1.nodes(data=True)]
nlist2 = [n for n in G2.nodes(data=True)]
if not nlist1 == nlist2:
return False
# check edges.
elist1 = [n for n in G1.edges(data=True)]
elist2 = [n for n in G2.edges(data=True)]
if not elist1 == elist2:
return False
# check graph attributes.
return True
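
A matching sketch for graph_isIdentical (toy graphs again hypothetical): identical nodes, edges and attributes give True, and changing a single label flips the result:

g1 = nx.Graph()
g1.add_node(0, atom='C')
g1.add_node(1, atom='O')
g1.add_edge(0, 1, bond_type='1')
g2 = graph_deepcopy(g1)
print(graph_isIdentical(g1, g2))   # True
g2.nodes[1]['atom'] = 'N'
print(graph_isIdentical(g1, g2))   # False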
