# OpenI / graphkit-learn

"""
@author: linlin

@references:

    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between 
    labeled graphs. In Proceedings of the 20th International Conference on 
    Machine Learning, Washington, DC, United States, 2003.

    [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and 
    Jean-Philippe Vert. Extensions of marginalized graph kernels. In 
    Proceedings of the twenty-first international conference on Machine 
    learning, page 70. ACM, 2004.
"""

import sys
import time
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
tqdm.monitor_interval = 0
#import traceback

import networkx as nx
import numpy as np

from gklearn.utils.kernels import deltakernel
from gklearn.utils.utils import untotterTransformation
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm
sys.path.insert(0, "../")


def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       verbose=True):
    """Calculate marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated. Passed as
        the single positional argument.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated. Passed as two
        positional arguments instead of Gn.

    node_label : string
        Node attribute used as symbolic label. The default node label is 'atom'.

    edge_label : string
        Edge attribute used as symbolic label. The default edge label is 'bond_type'.

    p_quit : float
        The termination probability in the random walks generating step.

    n_iteration : integer
        Number of iterations used to calculate R_inf.

    remove_totters : boolean
        Whether to remove totterings by method introduced in [2]. The default
        value is False.

    n_jobs : int
        Number of jobs for parallelization. If None, the CPU count reported
        by multiprocessing.cpu_count() is used.

    verbose : boolean
        Whether to print the timing summary and show progress output.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel between
        2 graphs.

    run_time : float
        Wall-clock time in seconds spent on untottering (if requested) and on
        building the kernel matrix.
    """
    from multiprocessing import cpu_count

    # pre-process
    n_iteration = int(n_iteration)
    # The chunk size arithmetic below needs an actual integer; fall back to
    # the CPU count when the caller did not choose a number of jobs.
    if n_jobs is None:
        n_jobs = cpu_count()
    graphs = args[0] if len(args) == 1 else [args[0], args[1]]
    # Work on copies so that the default labels set below and the untottering
    # never modify the caller's graphs.
    Gn = [g.copy() for g in graphs]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    # Unlabeled data: give every node / edge the same dummy label '0'.
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label)
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        # The context manager releases the worker processes even if a task
        # raises while the results are being consumed.
        with Pool(n_jobs) as pool:
            for i, g in tqdm(
                    pool.imap_unordered(
                        untotter_partial, range(len(Gn)), chunksize),
                    desc='removing tottering',
                    file=sys.stdout,
                    disable=not verbose):
                # Results arrive out of order; the returned index says which
                # graph each transformed graph replaces.
                Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        # Store the graph list in the module-level global that
        # wrapper_marg_do reads, so tasks only need to carry index pairs.
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time


def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):
    """Calculate marginalized graph kernel between 2 graphs.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.
    p_quit : float
        the termination probability in the random walks generating step.
    n_iteration : integer
        number of iterations used to calculate R_inf.

    Return
    ------
    kernel : float
        Marginalized Kernel between 2 graphs.

    Raises
    ------
    ZeroDivisionError
        If either graph has no nodes (the uniform initial probability
        1 / |G| is undefined).
    """
    # init parameters
    kernel = 0
    num_nodes_G1 = nx.number_of_nodes(g1)
    num_nodes_G2 = nx.number_of_nodes(g2)
    # the initial probability distribution in the random walks generating step
    # (uniform distribution over |G|)
    p_init_G1 = 1 / num_nodes_G1
    p_init_G2 = 1 / num_nodes_G2

    # probability that both walks stop at the current pair of nodes
    r1 = p_quit * p_quit

    R_inf = {}  # dict to save all the R_inf for all pairs of nodes
    # initial R_inf, the 1st iteration. A node without neighbors contributes
    # a factor 1 instead of p_quit, so the value is r1 when both nodes have
    # neighbors, p_quit when exactly one has, and 1 when neither has.
    for node1 in g1.nodes():
        quit_n1 = p_quit if len(g1[node1]) > 0 else 1
        for node2 in g2.nodes():
            quit_n2 = p_quit if len(g2[node2]) > 0 else 1
            R_inf[(node1, node2)] = quit_n1 * quit_n2

    # compute all transition probability first; they do not change between
    # iterations, so they are only needed when at least one update is run.
    t_dict = {}
    if n_iteration > 1:
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # the transition probability distribution in the random walks
            # generating step (uniform distribution over the vertices adjacent
            # to the current vertex)
            if len(neighbor_n1) > 0:
                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                # `G.nodes[...]` rather than the removed
                                # `G.node[...]` accessor.
                                t_dict[(node1, node2, neighbor1, neighbor2)] = \
                                    p_trans_n1 * p_trans_n2 * \
                                    deltakernel(g1.nodes[neighbor1][node_label],
                                                g2.nodes[neighbor2][node_label]) * \
                                    deltakernel(
                                        neighbor_n1[neighbor1][edge_label],
                                        neighbor_n2[neighbor2][edge_label])

    # calculate R_inf with a simple iterative method
    for _ in range(2, n_iteration + 1):
        # every update of this round must read the values of the previous one
        R_inf_old = R_inf.copy()

        # calculate R_inf for each pair of nodes; pairs in which a node has
        # no neighbors keep their initial value.
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            if len(neighbor_n1) > 0:
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        R_inf[(node1, node2)] = r1
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                R_inf[(node1, node2)] += \
                                    (t_dict[(node1, node2, neighbor1, neighbor2)] *
                                     R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)

    # add elements of R_inf up and calculate kernel
    for (n1, n2), value in R_inf.items():
        s = p_init_G1 * p_init_G2 * deltakernel(
            g1.nodes[n1][node_label], g2.nodes[n2][node_label])
        kernel += s * value  # ref [1] equation (6)

    return kernel
        
        
def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
    """Compute one kernel entry for the index pair held in `itr`.

    The graphs are looked up in the module-level global `G_gn`. The two
    indices are returned together with the kernel value.
    """
    row, col = itr[0], itr[1]
    value = _marginalizedkernel_do(
        G_gn[row], G_gn[col], node_label, edge_label, p_quit, n_iteration)
    return row, col, value
    

def wrapper_untotter(Gn, node_label, edge_label, i):
    """Apply untotterTransformation to Gn[i] and return it paired with i."""
    transformed = untotterTransformation(Gn[i], node_label, edge_label)
    return i, transformed