OpenI
/
graphkit-learn

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 27 10:56:23 2018

@author: linlin
@references: Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""

import sys
import time
from itertools import combinations, combinations_with_replacement, product
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np

from pygraph.utils.graphdataset import get_dataset_attributes

# Prepend the parent directory (relative to the current working directory)
# to the module search path.
# NOTE(review): this runs after the `pygraph` import above, so it cannot
# influence how that import is resolved -- confirm whether it is still needed.
sys.path.insert(0, "../")


def _ssp_chunksize(n_items, n_jobs):
    """Return the chunk size handed to ``Pool.imap_unordered``.

    Small workloads are split evenly across the workers; larger ones are
    capped at 1000 items per chunk.
    """
    if n_items < 1000 * n_jobs:
        return n_items // n_jobs + 1
    return 1000


def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       n_jobs=None):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. When it is
        None, missing from the first graph, or not numeric, all edges are
        given weight 1.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    node_kernels: dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two nodes. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when nodes are unlabeled.
    edge_kernels: dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
        for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two edges. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when edges are unlabeled.
    n_jobs : int or None
        Number of worker processes. None means one worker per CPU as reported
        by ``multiprocessing.cpu_count()``.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix of shape (len(Gn), len(Gn)), each element of which is
        the mean average structural shortest path kernel between 2 graphs.
    run_time : float
        Wall-clock seconds spent computing the shortest paths and the matrix.
    """
    # A single positional argument is the graph list; two are a graph pair.
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # Pool(None) would pick a worker count by itself, but the chunk-size
    # arithmetic below needs a real integer.
    if n_jobs is None:
        from multiprocessing import cpu_count
        n_jobs = cpu_count()

    # Decide whether the requested edge attribute can serve as a weight by
    # inspecting one value taken from the first graph.
    weight = None
    if edge_weight is None:
        print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            # No edge of the first graph carries this attribute.
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()
    n_graphs = len(Gn)

    # ---- step 1: shortest paths of every graph, computed in parallel. ----
    # Results arrive out of order, so each one carries its graph index.
    splist = [None] * n_graphs
    getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
    pool = Pool(n_jobs)
    try:
        for i, sp in tqdm(
                pool.imap_unordered(getsp_partial, zip(Gn, range(n_graphs)),
                                    _ssp_chunksize(n_graphs, n_jobs)),
                desc='getting shortest paths',
                file=sys.stdout):
            splist[i] = sp
    except BaseException:
        # Do not leave workers grinding through queued tasks after a failure.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    # ---- step 2: kernel of every unordered graph pair, in parallel. ----
    Kmatrix = np.zeros((n_graphs, n_graphs))
    do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
                         node_kernels, edge_kernels)
    # Three iterators advancing in lock-step: graph pairs, their path lists
    # and their (i, j) positions in the matrix.
    itr = zip(combinations_with_replacement(Gn, 2),
              combinations_with_replacement(splist, 2),
              combinations_with_replacement(range(n_graphs), 2))
    n_pairs = n_graphs * (n_graphs + 1) // 2
    pool = Pool(n_jobs)
    try:
        for i, j, kernel in tqdm(
                pool.imap_unordered(do_partial, itr,
                                    _ssp_chunksize(n_pairs, n_jobs)),
                desc='calculating kernels',
                file=sys.stdout):
            # The kernel is symmetric; fill both triangles.
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel
    except BaseException:
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (n_graphs, run_time))

    return Kmatrix, run_time


def _ssp_node_kernel_table(g1, g2, ds_attrs, node_label, node_kernels):
    """Map every node pair (n1, n2) of g1 x g2 to its node kernel value.

    Returns an empty dict when nodes carry neither symbolic labels nor
    attributes.
    """
    has_symbolic = ds_attrs['node_labeled']
    has_attributes = ds_attrs['node_attr_dim'] > 0
    if has_symbolic and has_attributes:
        kn = node_kernels['mix']
        return {(n1, n2): kn(d1[node_label], d2[node_label],
                             d1['attributes'], d2['attributes'])
                for (n1, d1), (n2, d2) in product(g1.nodes(data=True),
                                                  g2.nodes(data=True))}
    if has_symbolic:
        kn = node_kernels['symb']
        return {(n1, n2): kn(d1[node_label], d2[node_label])
                for (n1, d1), (n2, d2) in product(g1.nodes(data=True),
                                                  g2.nodes(data=True))}
    if has_attributes:
        kn = node_kernels['nsymb']
        return {(n1, n2): kn(d1['attributes'], d2['attributes'])
                for (n1, d1), (n2, d2) in product(g1.nodes(data=True),
                                                  g2.nodes(data=True))}
    return {}


def _ssp_edge_kernel_table(g1, g2, ds_attrs, edge_label, edge_kernels):
    """Map every edge pair of g1 x g2 to its edge kernel value.

    Keys are ((u1, v1), (u2, v2)); each pair is stored under all four
    orientation combinations so a path may traverse an edge either way.
    Returns an empty dict when edges carry neither symbolic labels nor
    attributes.
    """
    has_symbolic = ds_attrs['edge_labeled']
    has_attributes = ds_attrs['edge_attr_dim'] > 0
    if has_symbolic and has_attributes:
        ke = edge_kernels['mix']

        def edge_value(d1, d2):
            return ke(d1[edge_label], d2[edge_label],
                      d1['attributes'], d2['attributes'])
    elif has_symbolic:
        ke = edge_kernels['symb']

        def edge_value(d1, d2):
            return ke(d1[edge_label], d2[edge_label])
    elif has_attributes:
        # Must be the *edge* kernel: nodes may be unlabeled here, in which
        # case no node kernel exists at all.
        ke = edge_kernels['nsymb']

        def edge_value(d1, d2):
            return ke(d1['attributes'], d2['attributes'])
    else:
        return {}

    table = {}
    for (u1, v1, d1), (u2, v2, d2) in product(g1.edges(data=True),
                                              g2.edges(data=True)):
        value = edge_value(d1, d2)
        for e1 in ((u1, v1), (v1, u1)):
            for e2 in ((u2, v2), (v2, u2)):
                table[(e1, e2)] = value
    return table


def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
                          node_kernels, edge_kernels):
    """Compute the mean structural shortest path kernel between two graphs.

    Every pair of equally long paths (one from each graph) contributes the
    product of the kernels of its aligned nodes and aligned edges; the sum
    is divided by the total number of path pairs.

    Parameters
    ----------
    g1, g2 : graph objects
        Must provide ``nodes(data=True)`` and ``edges(data=True)``. Node/edge
        data dicts are read under `node_label` / `edge_label` and, when the
        corresponding ``*_attr_dim`` is positive, under 'attributes'.
    spl1, spl2 : list of list
        Paths of g1 and g2, each path given as a list of nodes.
    ds_attrs : dict
        Must contain 'node_labeled', 'node_attr_dim', 'edge_labeled' and
        'edge_attr_dim'; they select which kernel functions are used.
    node_label, edge_label : string
        Names of the symbolic node / edge label attributes.
    node_kernels, edge_kernels : dict
        Kernel functions under the keys 'symb', 'nsymb' and 'mix'. Only the
        entry selected by `ds_attrs` is looked up.

    Return
    ------
    kernel : number
        Mean kernel value over all len(spl1) * len(spl2) path pairs, or 0
        when either path list is empty.
    """
    n_pairs = len(spl1) * len(spl2)
    if n_pairs == 0:
        # No path pairs to average over; avoid dividing by zero.
        return 0

    # Pre-compute kernels between all node pairs and all edge pairs once, so
    # the path loop below only does dictionary lookups. This suits sparse
    # graphs; for dense graphs the edge table gets large and slow.
    vk_dict = _ssp_node_kernel_table(g1, g2, ds_attrs, node_label,
                                     node_kernels)
    ek_dict = _ssp_edge_kernel_table(g1, g2, ds_attrs, edge_label,
                                     edge_kernels)

    kernel = 0
    # An empty table means that kind of label is absent (kernel value 1).
    if vk_dict and ek_dict:
        for p1, p2 in product(spl1, spl2):
            if len(p1) != len(p2):
                continue
            kpath = vk_dict[(p1[0], p2[0])]
            if not kpath:
                continue
            for idx in range(1, len(p1)):
                kpath *= vk_dict[(p1[idx], p2[idx])] * \
                    ek_dict[((p1[idx - 1], p1[idx]),
                             (p2[idx - 1], p2[idx]))]
                if not kpath:
                    break  # product is already zero
            kernel += kpath  # add up kernels of all paths
    elif vk_dict:
        for p1, p2 in product(spl1, spl2):
            if len(p1) != len(p2):
                continue
            kpath = vk_dict[(p1[0], p2[0])]
            if not kpath:
                continue
            for idx in range(1, len(p1)):
                kpath *= vk_dict[(p1[idx], p2[idx])]
                if not kpath:
                    break
            kernel += kpath
    elif ek_dict:
        for p1, p2 in product(spl1, spl2):
            if len(p1) != len(p2):
                continue
            kpath = 1
            for idx in range(len(p1) - 1):
                kpath *= ek_dict[((p1[idx], p1[idx + 1]),
                                  (p2[idx], p2[idx + 1]))]
                if not kpath:
                    break
            kernel += kpath
    else:
        # Nothing is labeled: every equally long pair counts as 1.
        for p1, p2 in product(spl1, spl2):
            if len(p1) == len(p2):
                kernel += 1

    return kernel / n_pairs  # calculate mean average


def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, 
                   edge_kernels, itr_item):
    """Unpack one work item and compute its kernel value.

    The fixed arguments come first so they can be bound with
    ``functools.partial``, leaving `itr_item` as the only per-task argument.

    `itr_item` is indexed as ``(graphs, paths, indices)``, where each of the
    three entries holds two elements: the two graphs, their path lists and
    their positions (i, j).

    Returns ``(i, j, kernel)`` so the caller can place results that arrive
    out of order.
    """
    graph_pair = itr_item[0]
    path_pair = itr_item[1]
    index_pair = itr_item[2]
    first_graph, second_graph = graph_pair[0], graph_pair[1]
    first_paths, second_paths = path_pair[0], path_pair[1]
    row, col = index_pair[0], index_pair[1]
    value = structuralspkernel_do(
        first_graph, second_graph, first_paths, second_paths, ds_attrs,
        node_label, edge_label, node_kernels, edge_kernels)
    return row, col, value


def get_shortest_paths(G, weight, directed):
    """Collect all shortest paths of a graph.

    Parameters
    ----------
    G : NetworkX graphs
        The graph whose paths are calculated.
    weight : string/None
        Edge attribute used as weight when searching for shortest paths.
    directed: boolean
        Whether graph is directed. When False, every path found is also
        stored in reversed order.

    Return
    ------
    sp : list of list
        Shortest paths of the graph, each given as a list of nodes. Every
        node additionally appears as a single-node path.
    """
    # NOTE(review): node pairs are enumerated in one orientation only, so for
    # a directed graph paths from the second node to the first are never
    # searched -- confirm this is intended.
    paths = []
    for source, target in combinations(G.nodes(), 2):
        try:
            found = list(
                nx.all_shortest_paths(G, source, target, weight=weight))
        except nx.NetworkXNoPath:
            # The two nodes are not connected; nothing to record.
            continue
        paths.extend(found)
        if not directed:
            # Count each walk from both of its end nodes.
            paths.extend(path[::-1] for path in found)

    # Single nodes count as paths of length 0.
    paths.extend([node] for node in G.nodes())
    return paths


def wrapper_getSP(weight, directed, itr_item):
    """Compute the shortest paths of one indexed graph.

    `weight` and `directed` come first so they can be bound with
    ``functools.partial``. `itr_item` is indexed as ``(graph, index)``.

    Returns ``(index, paths)`` so results arriving out of order can be put
    back in place.
    """
    graph, position = itr_item[0], itr_item[1]
    paths = get_shortest_paths(graph, weight, directed)
    return position, paths