You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

weisfeilerLehmanKernel.py 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. """
  2. @author: linlin
  3. @references:
  4. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
  5. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research.
  6. 2011;12(Sep):2539-61.
  7. """
  8. import sys
  9. from collections import Counter
  10. sys.path.insert(0, "../")
  11. from functools import partial
  12. import time
  13. #from multiprocessing import Pool
  14. from tqdm import tqdm
  15. import networkx as nx
  16. import numpy as np
  17. #from pygraph.kernels.pathKernel import pathkernel
  18. from pygraph.utils.graphdataset import get_dataset_attributes
  19. from pygraph.utils.parallel import parallel_gm
  20. # @todo: support edge kernel, sp kernel, user-defined kernel.
  21. def weisfeilerlehmankernel(*args,
  22. node_label='atom',
  23. edge_label='bond_type',
  24. height=0,
  25. base_kernel='subtree',
  26. parallel=None,
  27. n_jobs=None,
  28. verbose=True):
  29. """Calculate Weisfeiler-Lehman kernels between graphs.
  30. Parameters
  31. ----------
  32. Gn : List of NetworkX graph
  33. List of graphs between which the kernels are calculated.
  34. /
  35. G1, G2 : NetworkX graphs
  36. 2 graphs between which the kernel is calculated.
  37. node_label : string
  38. node attribute used as label. The default node label is atom.
  39. edge_label : string
  40. edge attribute used as label. The default edge label is bond_type.
  41. height : int
  42. subtree height
  43. base_kernel : string
  44. base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel. For user-defined kernel, base_kernel is the name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs.
  45. Return
  46. ------
  47. Kmatrix : Numpy matrix
  48. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  49. Notes
  50. -----
  51. This function now supports WL subtree kernel only.
  52. """
  53. # pre-process
  54. base_kernel = base_kernel.lower()
  55. Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
  56. Gn = [g.copy() for g in Gn]
  57. ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
  58. node_label=node_label)
  59. if not ds_attrs['node_labeled']:
  60. for G in Gn:
  61. nx.set_node_attributes(G, '0', 'atom')
  62. start_time = time.time()
  63. # for WL subtree kernel
  64. if base_kernel == 'subtree':
  65. Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose)
  66. # for WL shortest path kernel
  67. elif base_kernel == 'sp':
  68. Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
  69. # for WL edge kernel
  70. elif base_kernel == 'edge':
  71. Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
  72. # for user defined base kernel
  73. else:
  74. Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)
  75. run_time = time.time() - start_time
  76. if verbose:
  77. print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
  78. % (base_kernel, len(args[0]), run_time))
  79. return Kmatrix, run_time
  80. def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose):
  81. """Calculate Weisfeiler-Lehman kernels between graphs.
  82. Parameters
  83. ----------
  84. Gn : List of NetworkX graph
  85. List of graphs between which the kernels are calculated.
  86. node_label : string
  87. node attribute used as label.
  88. edge_label : string
  89. edge attribute used as label.
  90. height : int
  91. wl height.
  92. Return
  93. ------
  94. Kmatrix : Numpy matrix
  95. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  96. """
  97. height = int(height)
  98. Kmatrix = np.zeros((len(Gn), len(Gn)))
  99. # initial for height = 0
  100. all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration
  101. # for each graph
  102. for G in Gn:
  103. # get the set of original labels
  104. labels_ori = list(nx.get_node_attributes(G, node_label).values())
  105. # number of occurence of each label in G
  106. all_num_of_each_label.append(dict(Counter(labels_ori)))
  107. # calculate subtree kernel with the 0th iteration and add it to the final kernel
  108. compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)
  109. # iterate each height
  110. for h in range(1, height + 1):
  111. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  112. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  113. # all_labels_ori = set() # all unique orignal labels in all graphs in this iteration
  114. all_num_of_each_label = [] # number of occurence of each label in G
  115. # # for each graph
  116. # # ---- use pool.imap_unordered to parallel and track progress. ----
  117. # pool = Pool(n_jobs)
  118. # itr = zip(Gn, range(0, len(Gn)))
  119. # if len(Gn) < 100 * n_jobs:
  120. # chunksize = int(len(Gn) / n_jobs) + 1
  121. # else:
  122. # chunksize = 100
  123. # all_multisets_list = [[] for _ in range(len(Gn))]
  124. ## set_unique_list = [[] for _ in range(len(Gn))]
  125. # get_partial = partial(wrapper_wl_iteration, node_label)
  126. ## if verbose:
  127. ## iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
  128. ## desc='wl iteration', file=sys.stdout)
  129. ## else:
  130. # iterator = pool.imap_unordered(get_partial, itr, chunksize)
  131. # for i, all_multisets in iterator:
  132. # all_multisets_list[i] = all_multisets
  133. ## set_unique_list[i] = set_unique
  134. ## all_set_unique = all_set_unique | set(set_unique)
  135. # pool.close()
  136. # pool.join()
  137. # all_set_unique = set()
  138. # for uset in all_multisets_list:
  139. # all_set_unique = all_set_unique | set(uset)
  140. #
  141. # all_set_unique = list(all_set_unique)
  142. ## # a dictionary mapping original labels to new ones.
  143. ## set_compressed = {}
  144. ## for idx, uset in enumerate(all_set_unique):
  145. ## set_compressed.update({uset: idx})
  146. #
  147. # for ig, G in enumerate(Gn):
  148. #
  149. ## # a dictionary mapping original labels to new ones.
  150. ## set_compressed = {}
  151. ## # if a label occured before, assign its former compressed label,
  152. ## # else assign the number of labels occured + 1 as the compressed label.
  153. ## for value in set_unique_list[i]:
  154. ## if uset in all_set_unique:
  155. ## set_compressed.update({uset: all_set_compressed[value]})
  156. ## else:
  157. ## set_compressed.update({value: str(num_of_labels_occured + 1)})
  158. ## num_of_labels_occured += 1
  159. #
  160. ## all_set_compressed.update(set_compressed)
  161. #
  162. # # relabel nodes
  163. # for idx, node in enumerate(G.nodes()):
  164. # G.nodes[node][node_label] = all_set_unique.index(all_multisets_list[ig][idx])
  165. #
  166. # # get the set of compressed labels
  167. # labels_comp = list(nx.get_node_attributes(G, node_label).values())
  168. ## all_labels_ori.update(labels_comp)
  169. # all_num_of_each_label[ig] = dict(Counter(labels_comp))
  170. # all_set_unique = list(all_set_unique)
  171. # @todo: parallel this part.
  172. for idx, G in enumerate(Gn):
  173. all_multisets = []
  174. for node, attrs in G.nodes(data=True):
  175. # Multiset-label determination.
  176. multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]]
  177. # sorting each multiset
  178. multiset.sort()
  179. multiset = [attrs[node_label]] + multiset # add the prefix
  180. all_multisets.append(tuple(multiset))
  181. # label compression
  182. set_unique = list(set(all_multisets)) # set of unique multiset labels
  183. # a dictionary mapping original labels to new ones.
  184. set_compressed = {}
  185. # if a label occured before, assign its former compressed label,
  186. # else assign the number of labels occured + 1 as the compressed label.
  187. for value in set_unique:
  188. if value in all_set_compressed.keys():
  189. set_compressed.update({value: all_set_compressed[value]})
  190. else:
  191. set_compressed.update({value: str(num_of_labels_occured + 1)})
  192. num_of_labels_occured += 1
  193. all_set_compressed.update(set_compressed)
  194. # relabel nodes
  195. for idx, node in enumerate(G.nodes()):
  196. G.nodes[node][node_label] = set_compressed[all_multisets[idx]]
  197. # get the set of compressed labels
  198. labels_comp = list(nx.get_node_attributes(G, node_label).values())
  199. # all_labels_ori.update(labels_comp)
  200. all_num_of_each_label.append(dict(Counter(labels_comp)))
  201. # calculate subtree kernel with h iterations and add it to the final kernel
  202. compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)
  203. return Kmatrix
  204. def wl_iteration(G, node_label):
  205. all_multisets = []
  206. for node, attrs in G.nodes(data=True):
  207. # Multiset-label determination.
  208. multiset = [G.nodes[neighbors][node_label] for neighbors in G[node]]
  209. # sorting each multiset
  210. multiset.sort()
  211. multiset = [attrs[node_label]] + multiset # add the prefix
  212. all_multisets.append(tuple(multiset))
  213. # # label compression
  214. # set_unique = list(set(all_multisets)) # set of unique multiset labels
  215. return all_multisets
  216. # # a dictionary mapping original labels to new ones.
  217. # set_compressed = {}
  218. # # if a label occured before, assign its former compressed label,
  219. # # else assign the number of labels occured + 1 as the compressed label.
  220. # for value in set_unique:
  221. # if value in all_set_compressed.keys():
  222. # set_compressed.update({value: all_set_compressed[value]})
  223. # else:
  224. # set_compressed.update({value: str(num_of_labels_occured + 1)})
  225. # num_of_labels_occured += 1
  226. #
  227. # all_set_compressed.update(set_compressed)
  228. #
  229. # # relabel nodes
  230. # for idx, node in enumerate(G.nodes()):
  231. # G.nodes[node][node_label] = set_compressed[all_multisets[idx]]
  232. #
  233. # # get the set of compressed labels
  234. # labels_comp = list(nx.get_node_attributes(G, node_label).values())
  235. # all_labels_ori.update(labels_comp)
  236. # all_num_of_each_label.append(dict(Counter(labels_comp)))
  237. # return
  238. def wrapper_wl_iteration(node_label, itr_item):
  239. g = itr_item[0]
  240. i = itr_item[1]
  241. all_multisets = wl_iteration(g, node_label)
  242. return i, all_multisets
  243. def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose):
  244. """Compute kernel matrix using the base kernel.
  245. """
  246. if parallel == 'imap_unordered':
  247. # compute kernels.
  248. def init_worker(alllabels_toshare):
  249. global G_alllabels
  250. G_alllabels = alllabels_toshare
  251. do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
  252. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  253. glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose)
  254. elif parallel == None:
  255. for i in range(len(Kmatrix)):
  256. for j in range(i, len(Kmatrix)):
  257. Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i],
  258. all_num_of_each_label[j], Kmatrix[i][j])
  259. Kmatrix[j][i] = Kmatrix[i][j]
  260. def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
  261. """Compute the subtree kernel.
  262. """
  263. labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
  264. vector1 = np.array([(num_of_each_label1[label]
  265. if (label in num_of_each_label1.keys()) else 0)
  266. for label in labels])
  267. vector2 = np.array([(num_of_each_label2[label]
  268. if (label in num_of_each_label2.keys()) else 0)
  269. for label in labels])
  270. kernel += np.dot(vector1, vector2)
  271. return kernel
  272. def wrapper_compute_subtree_kernel(Kmatrix, itr):
  273. i = itr[0]
  274. j = itr[1]
  275. return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j])
  276. def _wl_spkernel_do(Gn, node_label, edge_label, height):
  277. """Calculate Weisfeiler-Lehman shortest path kernels between graphs.
  278. Parameters
  279. ----------
  280. Gn : List of NetworkX graph
  281. List of graphs between which the kernels are calculated.
  282. node_label : string
  283. node attribute used as label.
  284. edge_label : string
  285. edge attribute used as label.
  286. height : int
  287. subtree height.
  288. Return
  289. ------
  290. Kmatrix : Numpy matrix
  291. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  292. """
  293. pass
  294. from pygraph.utils.utils import getSPGraph
  295. # init.
  296. height = int(height)
  297. Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
  298. Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn
  299. # initial for height = 0
  300. for i in range(0, len(Gn)):
  301. for j in range(i, len(Gn)):
  302. for e1 in Gn[i].edges(data = True):
  303. for e2 in Gn[j].edges(data = True):
  304. if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  305. Kmatrix[i][j] += 1
  306. Kmatrix[j][i] = Kmatrix[i][j]
  307. # iterate each height
  308. for h in range(1, height + 1):
  309. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  310. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  311. for G in Gn: # for each graph
  312. set_multisets = []
  313. for node in G.nodes(data = True):
  314. # Multiset-label determination.
  315. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  316. # sorting each multiset
  317. multiset.sort()
  318. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  319. set_multisets.append(multiset)
  320. # label compression
  321. set_unique = list(set(set_multisets)) # set of unique multiset labels
  322. # a dictionary mapping original labels to new ones.
  323. set_compressed = {}
  324. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  325. for value in set_unique:
  326. if value in all_set_compressed.keys():
  327. set_compressed.update({ value : all_set_compressed[value] })
  328. else:
  329. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  330. num_of_labels_occured += 1
  331. all_set_compressed.update(set_compressed)
  332. # relabel nodes
  333. for node in G.nodes(data = True):
  334. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  335. # calculate subtree kernel with h iterations and add it to the final kernel
  336. for i in range(0, len(Gn)):
  337. for j in range(i, len(Gn)):
  338. for e1 in Gn[i].edges(data = True):
  339. for e2 in Gn[j].edges(data = True):
  340. if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  341. Kmatrix[i][j] += 1
  342. Kmatrix[j][i] = Kmatrix[i][j]
  343. return Kmatrix
  344. def _wl_edgekernel_do(Gn, node_label, edge_label, height):
  345. """Calculate Weisfeiler-Lehman edge kernels between graphs.
  346. Parameters
  347. ----------
  348. Gn : List of NetworkX graph
  349. List of graphs between which the kernels are calculated.
  350. node_label : string
  351. node attribute used as label.
  352. edge_label : string
  353. edge attribute used as label.
  354. height : int
  355. subtree height.
  356. Return
  357. ------
  358. Kmatrix : Numpy matrix
  359. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  360. """
  361. pass
  362. # init.
  363. height = int(height)
  364. Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
  365. # initial for height = 0
  366. for i in range(0, len(Gn)):
  367. for j in range(i, len(Gn)):
  368. for e1 in Gn[i].edges(data = True):
  369. for e2 in Gn[j].edges(data = True):
  370. if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  371. Kmatrix[i][j] += 1
  372. Kmatrix[j][i] = Kmatrix[i][j]
  373. # iterate each height
  374. for h in range(1, height + 1):
  375. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  376. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  377. for G in Gn: # for each graph
  378. set_multisets = []
  379. for node in G.nodes(data = True):
  380. # Multiset-label determination.
  381. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  382. # sorting each multiset
  383. multiset.sort()
  384. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  385. set_multisets.append(multiset)
  386. # label compression
  387. set_unique = list(set(set_multisets)) # set of unique multiset labels
  388. # a dictionary mapping original labels to new ones.
  389. set_compressed = {}
  390. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  391. for value in set_unique:
  392. if value in all_set_compressed.keys():
  393. set_compressed.update({ value : all_set_compressed[value] })
  394. else:
  395. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  396. num_of_labels_occured += 1
  397. all_set_compressed.update(set_compressed)
  398. # relabel nodes
  399. for node in G.nodes(data = True):
  400. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  401. # calculate subtree kernel with h iterations and add it to the final kernel
  402. for i in range(0, len(Gn)):
  403. for j in range(i, len(Gn)):
  404. for e1 in Gn[i].edges(data = True):
  405. for e2 in Gn[j].edges(data = True):
  406. if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
  407. Kmatrix[i][j] += 1
  408. Kmatrix[j][i] = Kmatrix[i][j]
  409. return Kmatrix
  410. def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
  411. """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs.
  412. Parameters
  413. ----------
  414. Gn : List of NetworkX graph
  415. List of graphs between which the kernels are calculated.
  416. node_label : string
  417. node attribute used as label.
  418. edge_label : string
  419. edge attribute used as label.
  420. height : int
  421. subtree height.
  422. base_kernel : string
  423. Name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs.
  424. Return
  425. ------
  426. Kmatrix : Numpy matrix
  427. Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.
  428. """
  429. pass
  430. # init.
  431. height = int(height)
  432. Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
  433. # initial for height = 0
  434. Kmatrix = base_kernel(Gn, node_label, edge_label)
  435. # iterate each height
  436. for h in range(1, height + 1):
  437. all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
  438. num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
  439. for G in Gn: # for each graph
  440. set_multisets = []
  441. for node in G.nodes(data = True):
  442. # Multiset-label determination.
  443. multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
  444. # sorting each multiset
  445. multiset.sort()
  446. multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
  447. set_multisets.append(multiset)
  448. # label compression
  449. set_unique = list(set(set_multisets)) # set of unique multiset labels
  450. # a dictionary mapping original labels to new ones.
  451. set_compressed = {}
  452. # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label
  453. for value in set_unique:
  454. if value in all_set_compressed.keys():
  455. set_compressed.update({ value : all_set_compressed[value] })
  456. else:
  457. set_compressed.update({ value : str(num_of_labels_occured + 1) })
  458. num_of_labels_occured += 1
  459. all_set_compressed.update(set_compressed)
  460. # relabel nodes
  461. for node in G.nodes(data = True):
  462. node[1][node_label] = set_compressed[set_multisets[node[0]]]
  463. # calculate kernel with h iterations and add it to the final kernel
  464. Kmatrix += base_kernel(Gn, node_label, edge_label)
  465. return Kmatrix

A Python package for graph kernels, graph edit distances and graph pre-image problem.