
weisfeilerLehmanKernel.py 23 kB

  1. """
  2. @author: linlin
  3. @references:
  4. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
  5. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research.
  6. 2011;12(Sep):2539-61.
  7. """
  8. import sys
  9. from collections import Counter
  10. sys.path.insert(0, "../")
  11. from functools import partial
  12. import time
  13. #from multiprocessing import Pool
  14. from tqdm import tqdm
  15. import networkx as nx
  16. import numpy as np
  17. #from pygraph.kernels.pathKernel import pathkernel
  18. from pygraph.utils.graphdataset import get_dataset_attributes
  19. from pygraph.utils.parallel import parallel_gm
  20. # @todo: support edge kernel, sp kernel, user-defined kernel.
def weisfeilerlehmankernel(*args,
                           node_label='atom',
                           edge_label='bond_type',
                           height=0,
                           base_kernel='subtree',
                           parallel=None,
                           n_jobs=None,
                           verbose=True):
    """Calculate Weisfeiler-Lehman kernels between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    height : int
        Subtree height.
    base_kernel : string
        Base kernel used in each iteration of the WL kernel. Only the default
        'subtree' kernel can be applied for now.
    parallel : None
        Parallelization method applied to compute the kernel. No
        parallelization can be applied for now.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when a
        parallelization method is applied and can be ignored for now.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.

    Notes
    -----
    This function now supports the WL subtree kernel only.
    """
    # pre-process
    base_kernel = base_kernel.lower()
    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
                                      node_label=node_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', node_label)

    start_time = time.time()

    # for WL subtree kernel
    if base_kernel == 'subtree':
        Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel,
                                n_jobs, verbose)
    # for WL shortest path kernel
    elif base_kernel == 'sp':
        Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
    # for WL edge kernel
    elif base_kernel == 'edge':
        Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
    # for user-defined base kernel
    else:
        Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height,
                                    base_kernel)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in "
              "%s seconds ---" % (base_kernel, len(Gn), run_time))

    return Kmatrix, run_time
def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs,
                  verbose):
    """Calculate Weisfeiler-Lehman subtree kernels between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        WL height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # initial for height = 0
    all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration
    # for each graph
    for G in Gn:
        # get the set of original labels
        labels_ori = list(nx.get_node_attributes(G, node_label).values())
        # number of occurrences of each label in G
        all_num_of_each_label.append(dict(Counter(labels_ori)))
    # calculate subtree kernel with the 0th iteration and add it to the final kernel
    compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel,
                          n_jobs, False)
    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of labels that have occurred as node labels at least once in all graphs
        all_num_of_each_label = []  # number of occurrences of each label in G

        # @todo: parallelize this part.
        for G in Gn:
            all_multisets = []
            for node, attrs in G.nodes(data=True):
                # multiset-label determination
                multiset = [G.nodes[neighbor][node_label]
                            for neighbor in G[node]]
                # sort each multiset
                multiset.sort()
                # add the root label as prefix
                multiset = [attrs[node_label]] + multiset
                all_multisets.append(tuple(multiset))

            # label compression
            set_unique = list(set(all_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign (number of labels occurred + 1) as the compressed label
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occured + 1)
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes()):
                G.nodes[node][node_label] = set_compressed[all_multisets[idx]]

            # get the set of compressed labels
            labels_comp = list(nx.get_node_attributes(G, node_label).values())
            all_num_of_each_label.append(dict(Counter(labels_comp)))

        # calculate subtree kernel with h iterations and add it to the final kernel
        compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel,
                              n_jobs, False)

    return Kmatrix
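
# For illustration: in one WL iteration, a node labeled 'C' whose neighbors
# are labeled 'H', 'H' and 'O' receives the multiset label
# ('C', 'H', 'H', 'O'). Because all_set_compressed is shared across graphs
# within an iteration, identical multisets in different graphs map to the
# same compressed label, which makes the per-iteration label counts
# comparable between graphs.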
def wl_iteration(G, node_label):
    """Compute the multiset label of every node of G for one WL iteration."""
    all_multisets = []
    for node, attrs in G.nodes(data=True):
        # multiset-label determination
        multiset = [G.nodes[neighbor][node_label] for neighbor in G[node]]
        # sort each multiset
        multiset.sort()
        # add the root label as prefix
        multiset = [attrs[node_label]] + multiset
        all_multisets.append(tuple(multiset))
    return all_multisets
def wrapper_wl_iteration(node_label, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    all_multisets = wl_iteration(g, node_label)
    return i, all_multisets
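
# For instance, on a path graph a-b-c whose nodes carry labels 'a', 'b', 'c',
# wl_iteration returns [('a', 'b'), ('b', 'a', 'c'), ('c', 'b')]: each tuple
# is a node's own label followed by its sorted neighbor labels.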
def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel,
                          n_jobs, verbose):
    """Compute the kernel matrix using the base kernel."""
    if parallel == 'imap_unordered':
        # compute kernels
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare
        do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(all_num_of_each_label,), n_jobs=n_jobs,
                    verbose=verbose)
    elif parallel is None:
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] = compute_subtree_kernel(
                    all_num_of_each_label[i], all_num_of_each_label[j],
                    Kmatrix[i][j])
                Kmatrix[j][i] = Kmatrix[i][j]
def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
    """Compute the subtree kernel from two graphs' label-count dicts."""
    labels = set(list(num_of_each_label1.keys())
                 + list(num_of_each_label2.keys()))
    vector1 = np.array([(num_of_each_label1[label]
                         if (label in num_of_each_label1) else 0)
                        for label in labels])
    vector2 = np.array([(num_of_each_label2[label]
                         if (label in num_of_each_label2) else 0)
                        for label in labels])
    kernel += np.dot(vector1, vector2)
    return kernel
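
# A worked example of the dot product above: for label counts {'1': 2, '2': 1}
# and {'1': 1, '3': 3}, the label union is {'1', '2', '3'}, the count vectors
# are [2, 1, 0] and [1, 0, 3], and the kernel increment is 2*1 + 1*0 + 0*3 = 2.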
def wrapper_compute_subtree_kernel(Kmatrix, itr):
    i = itr[0]
    j = itr[1]
    return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j],
                                        Kmatrix[i][j])
def _wl_spkernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman shortest path kernels between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    from pygraph.utils.utils import getSPGraph

    # init
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # get shortest path graphs of Gn
    Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if (e1[2]['cost'] != 0
                            and e1[2]['cost'] == e2[2]['cost']
                            and ((e1[0] == e2[0] and e1[1] == e2[1])
                                 or (e1[0] == e2[1] and e1[1] == e2[0]))):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node, attrs in G.nodes(data=True):
                # multiset-label determination
                multiset = [G.nodes[neighbor][node_label]
                            for neighbor in G[node]]
                # sort each multiset
                multiset.sort()
                # concatenate to a string and add the root label as prefix
                multiset = attrs[node_label] + ''.join(multiset)
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign (number of labels occurred + 1) as the compressed label
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occured + 1)
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, (node, attrs) in enumerate(G.nodes(data=True)):
                attrs[node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if (e1[2]['cost'] != 0
                                and e1[2]['cost'] == e2[2]['cost']
                                and ((e1[0] == e2[0] and e1[1] == e2[1])
                                     or (e1[0] == e2[1] and e1[1] == e2[0]))):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix
def _wl_edgekernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman edge kernels between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    # init
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if (e1[2][edge_label] == e2[2][edge_label]
                            and ((e1[0] == e2[0] and e1[1] == e2[1])
                                 or (e1[0] == e2[1] and e1[1] == e2[0]))):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node, attrs in G.nodes(data=True):
                # multiset-label determination
                multiset = [G.nodes[neighbor][node_label]
                            for neighbor in G[node]]
                # sort each multiset
                multiset.sort()
                # concatenate to a string and add the root label as prefix
                multiset = attrs[node_label] + ''.join(multiset)
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign (number of labels occurred + 1) as the compressed label
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occured + 1)
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, (node, attrs) in enumerate(G.nodes(data=True)):
                attrs[node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if (e1[2][edge_label] == e2[2][edge_label]
                                and ((e1[0] == e2[0] and e1[1] == e2[1])
                                     or (e1[0] == e2[1] and e1[1] == e2[0]))):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix
def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
    """Calculate Weisfeiler-Lehman kernels based on a user-defined base
    kernel between graphs.

    Parameters
    ----------
    Gn : list of NetworkX graphs
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.
    base_kernel : function
        Base kernel function used in each iteration of the WL kernel. It
        returns a Numpy matrix, each element of which is the user-defined
        Weisfeiler-Lehman kernel between two graphs.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    # init
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # initial for height = 0
    Kmatrix = base_kernel(Gn, node_label, edge_label)

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = 0  # number of labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node, attrs in G.nodes(data=True):
                # multiset-label determination
                multiset = [G.nodes[neighbor][node_label]
                            for neighbor in G[node]]
                # sort each multiset
                multiset.sort()
                # concatenate to a string and add the root label as prefix
                multiset = attrs[node_label] + ''.join(multiset)
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign (number of labels occurred + 1) as the compressed label
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occured + 1)
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, (node, attrs) in enumerate(G.nodes(data=True)):
                attrs[node_label] = set_compressed[set_multisets[idx]]

        # calculate kernel with h iterations and add it to the final kernel
        Kmatrix += base_kernel(Gn, node_label, edge_label)

    return Kmatrix
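
# A minimal usage sketch (hedged: it assumes the pygraph utilities imported
# at the top of this file are resolvable via the sys.path entry above). It
# builds two toy molecule-like graphs and computes their WL subtree kernel
# matrix without parallelization.
if __name__ == '__main__':
    G1 = nx.Graph()
    G1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}),
                       (2, {'atom': 'C'})])
    G1.add_edges_from([(0, 1), (1, 2)])

    G2 = nx.Graph()
    G2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    G2.add_edges_from([(0, 1)])

    Kmatrix, run_time = weisfeilerlehmankernel(
        G1, G2, node_label='atom', height=2, base_kernel='subtree',
        parallel=None, verbose=True)
    print(Kmatrix)  # 2x2 symmetric kernel matrix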

A Python package for graph kernels, graph edit distances and the graph pre-image problem.