#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul  4 12:20:16 2019

@author: ljia
"""
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

import sys
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import loadDataset
from median import draw_Letter_graph


# --------------------------- These are tests --------------------------------#
    
def test_who_is_the_closest_in_kernel_space(Gn):
    """Print which graphs are closest to two references in kernel space.

    A candidate "median" is built by copying ``Gn[6]`` and removing its node
    ``0``; it is drawn, its nodes are printed, and it is prepended to a new
    graph list.  Because of that prepended graph, every graph of ``Gn`` sits
    one position later in the matrices computed below.

    The Gram matrix of the extended list is computed with
    ``compute_kernel(..., 'untilhpathkernel', True)`` and turned into a
    distance matrix by ``gram2distances``.  For each reference graph
    (``Gn[0]`` and ``Gn[6]``) the sorted distance row and the corresponding
    ``argsort`` are printed, followed by the same two prints for the mean
    distance of every graph to both references.

    Parameters
    ----------
    Gn : list
        Graph objects supporting ``copy()`` and ``remove_node()`` (networkx
        graphs are what ``nx.draw_networkx`` is fed with).  Needs at least
        seven entries.  The list object itself is not resized; a new list is
        built for the extended set.

    Returns
    -------
    None
    """
    # Fix: the original body called both helpers without ever importing
    # them, which raises NameError.  compute_kernel is imported from gk_iam
    # by the other tests in this file.
    # NOTE(review): gram2distances is assumed to live in gk_iam as well --
    # confirm the module before relying on this import.
    from gk_iam import compute_kernel, gram2distances

    idx_gi = [0, 6]

    # create the "median" graph.
    gnew = Gn[idx_gi[1]].copy()
    gnew.remove_node(0)
    nx.draw_networkx(gnew)
    plt.show()
    print(gnew.nodes(data=True))
    graphs = [gnew] + Gn

    # Gram matrix, then the distance matrix derived from it.
    Kmatrix = compute_kernel(graphs, 'untilhpathkernel', True)
    dmatrix = gram2distances(Kmatrix)

    # +1 compensates for the candidate graph inserted at position 0.
    ref_rows = [idx + 1 for idx in idx_gi]
    for row in ref_rows:
        print(np.sort(dmatrix[row]))
        print(np.argsort(dmatrix[row]))

    # for all g in graphs, compute (d(g1, g) + d(g2, g)) / 2
    dis_median = [(dmatrix[i, ref_rows[0]] + dmatrix[i, ref_rows[1]]) / 2
                  for i in range(len(graphs))]
    print(np.sort(dis_median))
    print(np.argsort(dis_median))


def test_who_is_the_closest_in_GED_space(Gn):
    """Print which graphs are closest to two references in GED space.

    A candidate "median" is made from a copy of ``Gn[6]`` with node ``0``
    removed; it is drawn, its nodes are printed, and it is put in front of
    the other graphs, so every graph of ``Gn`` is shifted by one position.

    ``GED(..., lib='gedlib')`` is then evaluated for every ordered pair of
    graphs and the first element of its result is stored in a square matrix.
    The sorted rows (and their ``argsort``) of the two reference graphs
    ``Gn[0]`` and ``Gn[6]`` are printed, followed by the sorted mean distance
    of every graph to both references.

    Parameters
    ----------
    Gn : list
        Graph objects supporting ``copy()`` and ``remove_node()``; at least
        seven entries are required.

    Returns
    -------
    None
    """
    from iam import GED

    first_ref, second_ref = 0, 6

    # Build the hand-made "median" candidate from the second reference.
    candidate = Gn[second_ref].copy()
    candidate.remove_node(0)
    nx.draw_networkx(candidate)
    plt.show()
    print(candidate.nodes(data=True))
    graphs = [candidate] + Gn

    # Pairwise GEDs, including the pairs (g, g) and both orderings.
    n_graphs = len(graphs)
    ged_matrix = np.zeros((n_graphs, n_graphs))
    for row in tqdm(range(n_graphs), desc='computing GEDs', file=sys.stdout):
        for col in range(n_graphs):
            dis, _, _ = GED(graphs[row], graphs[col], lib='gedlib')
            ged_matrix[row, col] = dis

    # The references moved one slot to the right when the candidate was
    # prepended.
    row_a = first_ref + 1
    row_b = second_ref + 1
    for ref_row in (row_a, row_b):
        print(np.sort(ged_matrix[ref_row]))
        print(np.argsort(ged_matrix[ref_row]))

    # (GED(g1, g) + GED(g2, g)) / 2 for every g, taken column-wise.
    dis_median = (ged_matrix[:, row_a] + ged_matrix[:, row_b]) / 2
    print(np.sort(dis_median))
    print(np.argsort(dis_median))


def test_will_IAM_give_the_median_graph_we_wanted(Gn):
    """Run the IAM helper on two reference graphs and display its result.

    Copies of ``Gn[0]`` and ``Gn[6]`` are handed to
    ``test_iam_with_more_graphs_as_init`` as both positional arguments, with
    the keyword arguments ``c_ei``, ``c_er`` and ``c_es`` all set to 1.  The
    value it returns is drawn with ``nx.draw_networkx`` and its nodes and
    edges are printed.

    Parameters
    ----------
    Gn : list
        Graph objects supporting ``copy()``; at least seven entries are
        required.  Only copies are passed on.

    Returns
    -------
    None
    """
    # Fix: the helper was called without being imported anywhere in this
    # file, which raises NameError.
    # NOTE(review): assumed to be defined in the iam module, like the other
    # IAM helpers this file imports -- confirm.
    from iam import test_iam_with_more_graphs_as_init

    idx_gi = [0, 6]
    g1 = Gn[idx_gi[0]].copy()
    g2 = Gn[idx_gi[1]].copy()

    # Alternative that was tried: pass the whole dataset, i.e.
    # test_iam_with_more_graphs_as_init(Gn, Gn, c_ei=1, c_er=1, c_es=1).
    g_median = test_iam_with_more_graphs_as_init([g1, g2], [g1, g2],
                                                 c_ei=1, c_er=1, c_es=1)
    nx.draw_networkx(g_median)
    plt.show()
    print(g_median.nodes(data=True))
    print(g_median.edges(data=True))
    
    
def test_new_IAM_allGraph_deleteNodes(Gn):
    """Run the node-deleting IAM variant on two hand-built 5-node graphs.

    Two graphs with identical structure (path 0-1-2 with nodes 3 and 4 both
    attached to node 2, every edge carrying ``bond_type`` ``'1'``) are
    created.  Nodes 0-2 are ``'C'`` in both; nodes 3 and 4 are ``'S'`` in the
    first graph and ``'O'`` in the second.  Both graphs are drawn and
    printed, passed to
    ``test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations``
    as both positional arguments with ``c_ei = c_er = c_es = 1``, and every
    graph it returns is drawn and printed.

    Parameters
    ----------
    Gn : list
        Not used by the current body; kept so existing callers keep working.
        (An earlier variant passed ``Gn`` itself to the IAM helper.)

    Returns
    -------
    None
    """
    # Fix 1: the helper was used without an import (NameError).  The other
    # test in this file that uses it imports it from iam.
    from iam import test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations

    g1 = nx.Graph(name='haha')
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
                       (3, {'atom': 'S'}), (4, {'atom': 'S'})])
    g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
                       (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
    g2 = nx.Graph(name='hahaha')
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
                       (3, {'atom': 'O'}), (4, {'atom': 'O'})])
    g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
                       (2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])

    # Show the two inputs.
    for g in (g1, g2):
        nx.draw_networkx(g)
        plt.show()
        print(g.nodes(data=True))
        print(g.edges(data=True))

    # Fix 2: the other caller in this file unpacks the helper's result as
    # (list of graphs, distance); drawing that tuple directly cannot work.
    # NOTE(review): relies on the helper returning that pair -- confirm
    # against the iam module.
    g_median_list, _ = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
        [g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
    for g_median in g_median_list:
        nx.draw_networkx(g_median)
        plt.show()
        print(g_median.nodes(data=True))
        print(g_median.edges(data=True))
    
    
def test_the_simple_two(Gn, gkernel):
    """Search pre-images between ``Gn[0]`` and ``Gn[6]`` with the full set.

    The Gram matrix is computed over copies of every graph in ``Gn`` followed
    by copies of the two reference graphs, so the references occupy the last
    two rows/columns.  ``gk_iam_nearest_multi`` is then called once per value
    of ``alpha`` (a single value, 0.5) with weights ``[alpha, 1 - alpha]``,
    and the distance and graphs it returns are printed and drawn.

    Parameters
    ----------
    Gn : list
        Graph objects supporting ``copy()``; at least seven entries.
    gkernel : str
        Kernel name forwarded to ``compute_kernel`` and
        ``gk_iam_nearest_multi``.

    Returns
    -------
    None
    """
    from gk_iam import gk_iam_nearest_multi, compute_kernel

    # Settings.  (The original also defined lmbda = 0.03, a termination
    # probability, and l = 500; neither was ever read.)
    max_recursions = 10  # recursions
    n_neighbors = 2  # k nearest neighbors
    alpha_range = np.linspace(0.5, 0.5, 1)

    # The seed is still set although the two graphs are fixed rather than
    # drawn with np.random.randint(0, len(Gn), 2).
    np.random.seed(1)
    g1, g2 = Gn[0], Gn[6]

    # Whole dataset first, then the two references at the end.
    Gn_mix = [g.copy() for g in Gn] + [g1.copy(), g2.copy()]
    km = compute_kernel(Gn_mix, gkernel, True)

    # One search per alpha.
    dis_best = []
    g_best = []
    for alpha in alpha_range:
        print('alpha =', alpha)
        median_rows = range(len(Gn), len(Gn) + 2)
        dhat, ghat_list = gk_iam_nearest_multi(
            Gn, [g1, g2], [alpha, 1 - alpha], median_rows, km,
            n_neighbors, max_recursions, gkernel)
        dis_best.append(dhat)
        g_best.append(ghat_list)

    # Report what was found for each alpha.
    for item, shortest, preimages in zip(alpha_range, dis_best, g_best):
        print('when alpha is', item, 'the shortest distance is', shortest)
        print('the corresponding pre-images are')
        for g in preimages:
            nx.draw_networkx(g)
            plt.show()
            print(g.nodes(data=True))
            print(g.edges(data=True))
            
    
def test_remove_bests(Gn, gkernel):
    """Search pre-images between ``Gn[0]`` and ``Gn[6]`` without those two.

    The two reference graphs are left out of the working set, so the search
    cannot simply return one of them.  The Gram matrix is computed over
    copies of the remaining graphs followed by copies of the two references
    (last two rows/columns).  ``gk_iam_nearest_multi`` is called once per
    ``alpha`` (a single value, 0.5) with weights ``[alpha, 1 - alpha]``; the
    distance it returns is printed and each returned graph is drawn with
    ``draw_Letter_graph`` and printed.

    Parameters
    ----------
    Gn : list
        Graph objects supporting ``copy()``; at least seven entries.  The
        caller's list is no longer modified: the original implementation
        deleted the two reference graphs from it in place, silently shrinking
        the dataset for any code that ran afterwards.
    gkernel : str
        Kernel name forwarded to ``compute_kernel`` and
        ``gk_iam_nearest_multi``.

    Returns
    -------
    None
    """
    from gk_iam import gk_iam_nearest_multi, compute_kernel

    r_max = 10  # recursions
    alpha_range = np.linspace(0.5, 0.5, 1)
    k = 20  # k nearest neighbors

    # The seed is still set although the two graphs are fixed rather than
    # drawn with np.random.randint(0, len(Gn), 2).
    np.random.seed(1)
    idx_gi = [0, 6]
    g1 = Gn[idx_gi[0]]
    g2 = Gn[idx_gi[1]]

    # Remove the best 2 graphs -- on a local list, not on the argument.
    excluded = set(idx_gi)
    Gn_rest = [g for pos, g in enumerate(Gn) if pos not in excluded]

    Gn_mix = [g.copy() for g in Gn_rest]
    Gn_mix.append(g1.copy())
    Gn_mix.append(g2.copy())

    # compute
    km = compute_kernel(Gn_mix, gkernel, True)
    g_best = []
    dis_best = []
    for alpha in alpha_range:
        print('alpha =', alpha)
        dhat, ghat_list = gk_iam_nearest_multi(
            Gn_rest, [g1, g2], [alpha, 1 - alpha],
            range(len(Gn_rest), len(Gn_rest) + 2), km,
            k, r_max, gkernel)
        dis_best.append(dhat)
        g_best.append(ghat_list)

    for item, shortest, preimages in zip(alpha_range, dis_best, g_best):
        print('when alpha is', item, 'the shortest distance is', shortest)
        print('the corresponding pre-images are')
        for g in preimages:
            draw_Letter_graph(g)
            print(g.nodes(data=True))
            print(g.edges(data=True))
            
            
def test_gkiam_letter_h():
    """Run the GK-IAM pre-image search once per class of Letter-high.

    The dataset is loaded from ``'../datasets/Letter-high/Letter-high_A.txt'``
    and its graphs are grouped by target value.  For each group:

    * the Gram matrix (``'structuralspkernel'``) is computed over the group
      followed by a second set of copies of the same graphs;
    * ``gk_iam_nearest_multi`` is called with the group as both graph
      arguments, a weight of ``1 / len(group)`` per graph, the row range of
      the second set of copies, and ``c_ei = c_er = c_es = 1.7``;
    * the elapsed time of those two steps is recorded;
    * the returned graphs are drawn with ``draw_Letter_graph`` (save path
      ``'results/gk_iam/'``) and printed;
    * ``median_distance`` is evaluated on the returned graphs against the
      group, and its first return value and the minimum of it are recorded.

    The collected values and the timings are printed at the end.

    Returns
    -------
    None
    """
    from gk_iam import gk_iam_nearest_multi, compute_kernel
    from iam import median_distance

    # node nsymb.  Alternative that was tried: Letter-med,
    # '../datasets/Letter-med/Letter-med_A.txt'.
    ds = {
        'name': 'Letter-high',
        'dataset': '../datasets/Letter-high/Letter-high_A.txt',
        'extra_params': {},
    }
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    gkernel = 'structuralspkernel'

    # (The original also set lmbda = 0.03, a termination probability that
    # was never read.)
    max_recursions = 3  # recursions
    n_neighbors = 10  # k nearest neighbors

    # classify graphs according to letters.
    idx_dict = get_same_item_indices(y_all)
    time_list, sod_list, sod_min_list = [], [], []
    for member_indices in idx_dict.values():
        print('\n-------------------------------------------------------\n')
        Gn_let = [Gn[i].copy() for i in member_indices]
        Gn_mix = Gn_let + [g.copy() for g in Gn_let]

        # A single alpha: equal weight for every graph of the group.
        alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)

        # Timed part: Gram matrix plus the search itself.
        time0 = time.time()
        km = compute_kernel(Gn_mix, gkernel, True)
        dis_best = []
        g_best = []
        for alpha in alpha_range:
            print('alpha =', alpha)
            dhat, ghat_list = gk_iam_nearest_multi(
                Gn_let, Gn_let, [alpha] * len(Gn_let),
                range(len(Gn_let), len(Gn_mix)), km,
                n_neighbors, max_recursions, gkernel,
                c_ei=1.7, c_er=1.7, c_es=1.7)
            dis_best.append(dhat)
            g_best.append(ghat_list)
        time_list.append(time.time() - time0)

        # show best graphs and save them to file.
        for item, shortest, preimages in zip(alpha_range, dis_best, g_best):
            print('when alpha is', item, 'the shortest distance is', shortest)
            print('the corresponding pre-images are')
            for g in preimages:
                draw_Letter_graph(g, savepath='results/gk_iam/')
                print(g.nodes(data=True))
                print(g.edges(data=True))

        # compute the corresponding sod in graph space. (alpha range not
        # considered: only the first entry of g_best is used.)
        sod_tmp, _ = median_distance(g_best[0], Gn_let)
        sod_list.append(sod_tmp)
        sod_min_list.append(np.min(sod_tmp))

    print('\nsods in graph space: ', sod_list)
    print('\nsmallest sod in graph space for each letter: ', sod_min_list)
    print('\ntimes:', time_list)
                
                
def get_same_item_indices(ls):
    """Group the positions of equal items of a sequence.

    Parameters
    ----------
    ls : iterable
        Items to group; each item must be hashable because it is used as a
        dictionary key.

    Returns
    -------
    dict
        Maps every distinct item to the list of positions at which it
        occurs, in increasing order.  Keys appear in order of first
        occurrence.
    """
    positions_by_item = {}
    for position, value in enumerate(ls):
        # setdefault creates the list on the first sighting of a value.
        positions_by_item.setdefault(value, []).append(position)
    return positions_by_item


#def compute_letter_median_by_average(Gn):
#    return g_median
    

def test_iam_letter_h():
    """Run the node-deleting IAM variant once per class of Letter-high.

    The dataset is loaded from ``'../datasets/Letter-high/Letter-high_A.txt'``
    and its graphs are grouped by target value.  For each group:

    * ``test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations``
      is called with the group as both positional arguments and
      ``c_ei = c_er = c_es = 1.7``; the call is timed;
    * the returned graphs are drawn with ``draw_Letter_graph`` (save path
      ``'results/iam/'``) and printed;
    * a Gram matrix (``'structuralspkernel'``) is computed over the returned
      graphs followed by the group, and ``dis_gstar`` is evaluated for each
      returned graph against the rows of the group with a weight of
      ``1 / len(group)`` per graph and ``withterm3=False``; the values and
      their minimum are recorded.

    The collected values and the timings are printed at the end.

    Returns
    -------
    None
    """
    from iam import test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
    from gk_iam import dis_gstar, compute_kernel

    # node nsymb.  Alternative that was tried: Letter-med,
    # '../datasets/Letter-med/Letter-med_A.txt'.
    ds = {
        'name': 'Letter-high',
        'dataset': '../datasets/Letter-high/Letter-high_A.txt',
        'extra_params': {},
    }
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])

    # Kernel used for the kernel-space evaluation below.  (The original also
    # set lmbda = 0.03, a termination probability that was never read.)
    gkernel = 'structuralspkernel'

    # classify graphs according to letters.
    idx_dict = get_same_item_indices(y_all)
    time_list, sod_list, sod_min_list = [], [], []
    for member_indices in idx_dict.values():
        Gn_let = [Gn[i].copy() for i in member_indices]

        # A single alpha: equal weight for every graph of the group.
        alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)

        # Timed part: the IAM runs.
        dis_best = []
        g_best = []
        time0 = time.time()
        for alpha in alpha_range:
            print('alpha =', alpha)
            ghat_list, dhat = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
                Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7)
            dis_best.append(dhat)
            g_best.append(ghat_list)
        time_list.append(time.time() - time0)

        # show best graphs and save them to file.
        for item, shortest, preimages in zip(alpha_range, dis_best, g_best):
            print('when alpha is', item, 'the shortest distance is', shortest)
            print('the corresponding pre-images are')
            for g in preimages:
                draw_Letter_graph(g, savepath='results/iam/')
                print(g.nodes(data=True))
                print(g.edges(data=True))

        # compute the corresponding sod in kernel space. (alpha range not
        # considered: only the first entry of g_best is used.)
        candidates = g_best[0]
        Gn_mix = candidates + Gn_let
        km = compute_kernel(Gn_mix, gkernel, True)
        sod_tmp = []
        for ig, _ in tqdm(enumerate(candidates), desc='computing kernel sod',
                          file=sys.stdout):
            # Rows of the group start right after the candidate graphs.
            group_rows = range(len(candidates), len(Gn_mix))
            weights = [alpha_range[0]] * len(Gn_let)
            sod_tmp.append(dis_gstar(ig, group_rows, weights, km,
                                     withterm3=False))
        sod_list.append(sod_tmp)
        sod_min_list.append(np.min(sod_tmp))

    print('\nsods in kernel space: ', sod_list)
    print('\nsmallest sod in kernel space for each letter: ', sod_min_list)
    print('\ntimes:', time_list)
        

if __name__ == '__main__':
    # Dataset loaded for the Gn-based experiments listed further down.
    # Alternatives that were used before:
    #   MUTAG   (node/edge symb): '../datasets/MUTAG/MUTAG_A.txt'
    #   Acyclic:                  '../datasets/monoterpenoides/trainset_9.ds'
    #   Acyclic (node symb):      '../datasets/acyclic/dataset_bps.ds'
    # all with 'extra_params': {}.
    ds = dict(
        name='Letter-high',
        dataset='../datasets/Letter-high/Letter-high_A.txt',
        extra_params={},  # node nsymb
    )
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    # Optional: work on a subset only.
    # Gn = Gn[0:20]

    # Scratch check of networkx isomorphism with a categorical node match:
    # import networkx.algorithms.isomorphism as iso
    # G1 = nx.MultiDiGraph()
    # G2 = nx.MultiDiGraph()
    # G1.add_nodes_from([1,2,3], fill='red')
    # G2.add_nodes_from([10,20,30,40], fill='red')
    # nx.add_path(G1, [1,2,3,4], weight=3, linewidth=2.5)
    # nx.add_path(G2, [10,20,30,40], weight=3)
    # nm = iso.categorical_node_match('fill', 'red')
    # print(nx.is_isomorphic(G1, G2, node_match=nm))

    # Experiments that take the loaded graphs; enable as needed.
    # test_new_IAM_allGraph_deleteNodes(Gn)
    # test_will_IAM_give_the_median_graph_we_wanted(Gn)
    # test_who_is_the_closest_in_GED_space(Gn)
    # test_who_is_the_closest_in_kernel_space(Gn)
    # test_the_simple_two(Gn, 'untilhpathkernel')
    # test_remove_bests(Gn, 'untilhpathkernel')

    # Experiments that load the Letter-high dataset themselves.
    test_gkiam_letter_h()
    # test_iam_letter_h()