
structuralspKernel.py 16 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 27 10:56:23 2018
@author: linlin
@references: Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
    Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""

import sys
import time
from itertools import combinations, combinations_with_replacement, product
from functools import partial
from joblib import Parallel, delayed
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

import networkx as nx
import numpy as np

# make the local pygraph package importable before importing from it
sys.path.insert(0, "../")
from pygraph.utils.graphdataset import get_dataset_attributes
def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       n_jobs=None):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
        'mix' for both. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters: a symbolic and
        a non-symbolic label for each of the two nodes. Each non-symbolic
        label is in the form of a 2-D array (n_samples, n_features). Each
        function returns a number as the kernel value. Ignored when nodes are
        unlabeled.
    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, and
        'mix' for both. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters: a symbolic and
        a non-symbolic label for each of the two edges. Each non-symbolic
        label is in the form of a 2-D array (n_samples, n_features). Each
        function returns a number as the kernel value. Ignored when edges are
        unlabeled.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    if n_jobs is None:  # fall back to all available CPU cores
        n_jobs = cpu_count()

    weight = None
    if edge_weight is None:
        print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name "%s" is not float or integer. Set all weights to 1.\n'
                    % edge_weight)
        except Exception:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()

    # get all shortest paths of each graph in Gn
    splist = [[] for _ in range(len(Gn))]
    pool = Pool(n_jobs)
    getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
    if len(Gn) < 1000 * n_jobs:
        # # use default chunksize as pool.map when iterable is less than 100
        # chunksize, extra = divmod(len(Gn), n_jobs * 4)
        # if extra:
        #     chunksize += 1
        chunksize = int(len(Gn) / n_jobs) + 1
    else:
        chunksize = 1000
        # chunksize = 300  # int(len(list(itr)) / n_jobs)
    for i, sp in tqdm(
            pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
            desc='getting shortest paths',
            file=sys.stdout):
        splist[i] = sp
    pool.close()
    pool.join()

    # # ---- use pool.map to parallel ----
    # result_sp = pool.map(getsp_partial, range(0, len(Gn)))
    # for i in result_sp:
    #     Gn[i[0]] = i[1]
    # # or
    # getsp_partial = partial(wrap_getSP, Gn, weight)
    # for i, g in tqdm(
    #         pool.map(getsp_partial, range(0, len(Gn))),
    #         desc='getting sp graphs',
    #         file=sys.stdout):
    #     Gn[i] = g

    # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) ----
    # sp_ml = [0] * len(Gn)  # shortest path matrices
    # for i in result_sp:
    #     sp_ml[i[0]] = i[1]
    # edge_x_g = [[] for i in range(len(sp_ml))]
    # edge_y_g = [[] for i in range(len(sp_ml))]
    # edge_w_g = [[] for i in range(len(sp_ml))]
    # for idx, item in enumerate(sp_ml):
    #     for i1 in range(len(item)):
    #         for i2 in range(i1 + 1, len(item)):
    #             if item[i1, i2] != np.inf:
    #                 edge_x_g[idx].append(i1)
    #                 edge_y_g[idx].append(i2)
    #                 edge_w_g[idx].append(item[i1, i2])
    # print(len(edge_x_g[0]))
    # print(len(edge_y_g[0]))
    # print(len(edge_w_g[0]))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    pool = Pool(n_jobs)
    do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
                         node_label, edge_label, node_kernels, edge_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
    if len_itr < 1000 * n_jobs:
        chunksize = int(len_itr / n_jobs) + 1
    else:
        chunksize = 1000
    for i, j, kernel in tqdm(
            pool.imap_unordered(do_partial, itr, chunksize),
            desc='calculating kernels',
            file=sys.stdout):
        Kmatrix[i][j] = kernel
        Kmatrix[j][i] = kernel
    pool.close()
    pool.join()

    # # ---- use pool.map to parallel. ----
    # # result_perf = pool.map(do_partial, itr)
    # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # for i, j, kernel in tqdm(
    #         pool.map(do_partial, itr), desc='calculating kernels',
    #         file=sys.stdout):
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel
    # pool.close()
    # pool.join()

    # # ---- use joblib.Parallel to parallel and track progress. ----
    # result_perf = Parallel(
    #     n_jobs=n_jobs, verbose=10)(
    #         delayed(do_partial)(ij)
    #         for ij in combinations_with_replacement(range(0, len(Gn)), 2))
    # result_perf = [
    #     do_partial(ij)
    #     for ij in combinations_with_replacement(range(0, len(Gn)), 2)
    # ]
    # for i in result_perf:
    #     Kmatrix[i[0]][i[1]] = i[2]
    #     Kmatrix[i[1]][i[0]] = i[2]

    # # ---- direct running, normally use single CPU core. ----
    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
    #     i, j, kernel = structuralspkernel_do(Gn, splist, ds_attrs,
    #             node_label, edge_label, node_kernels, edge_kernels, gs)
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time
def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
                          node_kernels, edge_kernels, ij):

    iglobal = ij[0]
    jglobal = ij[1]
    g1 = Gn[iglobal]
    g2 = Gn[jglobal]
    spl1 = splist[iglobal]
    spl2 = splist[jglobal]
    kernel = 0

    try:
        # First, compute kernels between all pairs of nodes; the method is
        # borrowed from FCSP.
        if ds_attrs['node_labeled']:
            # node symb and non-symb labeled
            if ds_attrs['node_attr_dim'] > 0:
                kn = node_kernels['mix']
                vk_dict = {}  # node kernel dict
                for n1, n2 in product(
                        g1.nodes(data=True), g2.nodes(data=True)):
                    vk_dict[(n1[0], n2[0])] = kn(
                        n1[1][node_label], n2[1][node_label],
                        [n1[1]['attributes']], [n2[1]['attributes']])
            # node symb labeled
            else:
                kn = node_kernels['symb']
                vk_dict = {}  # node kernel dict
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                     n2[1][node_label])
        else:
            # node non-symb labeled
            if ds_attrs['node_attr_dim'] > 0:
                kn = node_kernels['nsymb']
                vk_dict = {}  # node kernel dict
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
                                                     [n2[1]['attributes']])
            # node unlabeled
            else:
                vk_dict = {}

        # Then, compute kernels between all pairs of edges; the idea is an
        # extension of FCSP. It suits sparse graphs, which is the most common
        # case here. For dense graphs, it would be slow.
        if ds_attrs['edge_labeled']:
            # edge symb and non-symb labeled
            if ds_attrs['edge_attr_dim'] > 0:
                ke = edge_kernels['mix']
                ek_dict = {}  # edge kernel dict
                for e1, e2 in product(
                        g1.edges(data=True), g2.edges(data=True)):
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
                        e1[2][edge_label], e2[2][edge_label],
                        [e1[2]['attributes']], [e2[2]['attributes']])
            # edge symb labeled
            else:
                ke = edge_kernels['symb']
                ek_dict = {}
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
                            e1[2][edge_label], e2[2][edge_label])
        else:
            # edge non-symb labeled
            if ds_attrs['edge_attr_dim'] > 0:
                ke = edge_kernels['nsymb']
                ek_dict = {}
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        # use the edge kernel ke here (the original code
                        # mistakenly called the node kernel kn)
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
                            [e1[2]['attributes']], [e2[2]['attributes']])
            # edge unlabeled
            else:
                ek_dict = {}

        # compute graph kernels
        if vk_dict:
            if ek_dict:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kpath = vk_dict[(p1[0], p2[0])]
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= vk_dict[(p1[idx], p2[idx])] * \
                                    ek_dict[((p1[idx-1], p1[idx]),
                                             (p2[idx-1], p2[idx]))]
                                if not kpath:
                                    break
                            kernel += kpath  # add up kernels of all paths
            else:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kpath = vk_dict[(p1[0], p2[0])]
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= vk_dict[(p1[idx], p2[idx])]
                                if not kpath:
                                    break
                            kernel += kpath  # add up kernels of all paths
        else:
            if ek_dict:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        if len(p1) == 0:
                            kernel += 1
                        else:
                            kpath = 1
                            for idx in range(0, len(p1) - 1):
                                kpath *= ek_dict[((p1[idx], p1[idx+1]),
                                                  (p2[idx], p2[idx+1]))]
                                if not kpath:
                                    break
                            kernel += kpath  # add up kernels of all paths
            else:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kernel += 1

        kernel = kernel / (len(spl1) * len(spl2))  # calculate mean average

        # # ---- exact implementation of the Fast Computation of Shortest Path
        # # Kernel (FCSP), reference [2]; sadly it is slower than the current
        # # implementation.
        # # compute vertex kernel matrix
        # try:
        #     vk_mat = np.zeros((nx.number_of_nodes(g1),
        #                        nx.number_of_nodes(g2)))
        #     g1nl = enumerate(g1.nodes(data=True))
        #     g2nl = enumerate(g2.nodes(data=True))
        #     for i1, n1 in g1nl:
        #         for i2, n2 in g2nl:
        #             vk_mat[i1][i2] = kn(
        #                 n1[1][node_label], n2[1][node_label],
        #                 [n1[1]['attributes']], [n2[1]['attributes']])
        #     range1 = range(0, len(edge_w_g[i]))
        #     range2 = range(0, len(edge_w_g[j]))
        #     for i1 in range1:
        #         x1 = edge_x_g[i][i1]
        #         y1 = edge_y_g[i][i1]
        #         w1 = edge_w_g[i][i1]
        #         for i2 in range2:
        #             x2 = edge_x_g[j][i2]
        #             y2 = edge_y_g[j][i2]
        #             w2 = edge_w_g[j][i2]
        #             ke = (w1 == w2)
        #             if ke > 0:
        #                 kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
        #                 kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
        #                 Kmatrix += kn1 + kn2

    except KeyError:  # missing labels or attributes
        pass

    return iglobal, jglobal, kernel
def get_shortest_paths(G, weight, directed):
    """Get all shortest paths of a graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph whose paths are calculated.
    weight : string/None
        Edge attribute used as weight to calculate the shortest paths.
    directed : boolean
        Whether the graph is directed.

    Return
    ------
    sp : list of list
        List of shortest paths of the graph, where each path is represented
        by a list of nodes.
    """
    sp = []
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
            sp += spltemp
            # in the undirected case each path is counted twice, once from
            # each of its two end nodes.
            if not directed:
                sp += [sptemp[::-1] for sptemp in spltemp]
        except nx.NetworkXNoPath:  # nodes not connected
            # sp.append([])
            pass
    # add single nodes as length-0 paths.
    sp += [[n] for n in G.nodes()]
    return sp
def wrap_getSP(Gn, weight, directed, i):
    return i, get_shortest_paths(Gn[i], weight, directed)
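
Below is a minimal usage sketch of structuralspkernel on two toy labeled graphs. It is not part of the original file: the deltakernel, gaussiankernel and mixkernel helpers are illustrative stand-ins for whatever sub-kernels the caller prefers, arranged in the 'symb'/'nsymb'/'mix' dictionary layout described in the docstring above, and the import path is an assumption that may need adjusting to a local checkout of this repository.

# usage sketch -- illustrative only
# assumed module path; adjust to your checkout of the pygraph package
from pygraph.kernels.structuralspKernel import structuralspkernel
import networkx as nx
import numpy as np

def deltakernel(x, y):
    # Dirac kernel on symbolic labels: 1 if the labels are equal, else 0.
    return 1 if x == y else 0

def gaussiankernel(x, y, gamma=1.0):
    # Gaussian (RBF) kernel on non-symbolic attribute vectors.
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    return np.exp(-gamma * np.sum((x - y) ** 2))

def mixkernel(l1, l2, a1, a2):
    # product of the symbolic and non-symbolic kernels, for the 'mix' case.
    return deltakernel(l1, l2) * gaussiankernel(a1, a2)

sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}

if __name__ == '__main__':
    # two toy "molecules" with symbolic node and edge labels
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
    g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    g2.add_edges_from([(0, 1, {'bond_type': '2'})])

    # returns the 2x2 kernel matrix and the elapsed time
    Kmatrix, run_time = structuralspkernel(
        [g1, g2],
        node_label='atom', edge_label='bond_type',
        node_kernels=sub_kernels, edge_kernels=sub_kernels,
        n_jobs=2)
    print(Kmatrix)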

A Python package for graph kernels, graph edit distances and the graph pre-image problem.