# OpenI / graphkit-learn

"""
@author: linlin <jajupmochi@gmail.com>
@references:
    [1] Tamás Horváth, Thomas Gärtner, and Stefan Wrobel. Cyclic pattern kernels for predictive graph mining. In Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining, pages 158–167. ACM, 2004.
    [2]	Hopcroft, J.; Tarjan, R. (1973). “Efficient algorithms for graph manipulation”. Communications of the ACM 16: 372–378. doi:10.1145/362248.362272.
    [3] Finding all the elementary circuits of a directed graph. D. B. Johnson, SIAM Journal on Computing 4, no. 1, 77-84, 1975. http://dx.doi.org/10.1137/0204007
"""

import sys
import pathlib
sys.path.insert(0, "../")
import time

import networkx as nx
import numpy as np

from tqdm import tqdm


def cyclicpatternkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):
    """Build the cyclic pattern kernel matrix for a collection of graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated. Used when
        exactly one positional argument is given.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated. Used when the call
        does not have exactly one positional argument; only the first two
        positional arguments are read.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Forwarded unchanged to get_patterns. The default is True.
    cycle_bound : integer or None
        Forwarded unchanged to get_patterns. The default is None.

    Return
    ------
    Kmatrix : Numpy matrix
        Symmetric kernel matrix; entry (i, j) is the value returned by
        _cyclicpatternkernel_do for the patterns of graphs i and j.
    run_time : float
        Wall-clock seconds spent retrieving patterns and filling the matrix.
    """
    # arrange all graphs in a list
    if len(args) == 1:
        Gn = args[0]
    else:
        Gn = [args[0], args[1]]
    n_graphs = len(Gn)
    Kmatrix = np.zeros((n_graphs, n_graphs))

    start_time = time.time()

    # Patterns of every graph are extracted once up front so that each pair
    # comparison only needs a lookup; this saves time but may consume a lot
    # of memory for a large dataset.
    all_patterns = []
    for idx in tqdm(range(0, n_graphs), desc='retrieve patterns', file=sys.stdout):
        all_patterns.append(
            get_patterns(Gn[idx], node_label=node_label, edge_label=edge_label,
                         labeled=labeled, cycle_bound=cycle_bound))

    # Only the upper triangle is computed; the lower triangle is mirrored.
    for row in tqdm(range(0, n_graphs), desc='calculate kernels', file=sys.stdout):
        for col in range(row, n_graphs):
            Kmatrix[row][col] = _cyclicpatternkernel_do(all_patterns[row], all_patterns[col])
            Kmatrix[col][row] = Kmatrix[row][col]

    run_time = time.time() - start_time
    print("\n --- kernel matrix of cyclic pattern kernel of size %d built in %s seconds ---" % (n_graphs, run_time))

    return Kmatrix, run_time


def _cyclicpatternkernel_do(patterns1, patterns2):
    """Calculate path graph kernels up to depth d between 2 graphs.

    Parameters
    ----------
    paths1, paths2 : list
        List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
    k_func : function
        A kernel function used using different notions of fingerprint similarity.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.

    Return
    ------
    kernel : float
        Treelet Kernel between 2 graphs.
    """
    return len(set(patterns1) & set(patterns2))


def get_patterns(G, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):
    """Find the cyclic patterns in a graph.

    The graph is split into biconnected components. For every component with
    more than one edge, its simple cycles are enumerated and each cycle is
    converted to a canonical string key. Components with a single edge
    (bridges) are collected separately; tree patterns derived from them are
    not implemented yet, so only cyclic patterns are returned.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which patterns are searched.
    node_label : string
        node attribute used as label. The default node label is atom.
        Label values are concatenated, so they must be strings.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
        Label values are concatenated, so they must be strings.
    labeled : boolean
        Currently unused by this function; labels are always read from G.
    cycle_bound : integer or None
        Upper bound on the total number of simple cycles enumerated over all
        components. When the bound is exceeded an empty list is returned.
        None (the default) disables the check.

    Return
    ------
    patterns : list
        Sorted list of the distinct canonical cycle keys (strings), or an
        empty list when cycle_bound is exceeded.
    """
    number_simplecycles = 0
    bridges = nx.Graph()
    patterns = set()

    # all biconnected components of G. this function use algorithm in
    # reference [2], which (i guess) is slightly different from the one used
    # in paper [1]
    for subgraph in nx.biconnected_component_subgraphs(G):
        if nx.number_of_edges(subgraph) <= 1:
            # A single-edge component is a bridge; keep it for tree patterns.
            bridges.add_edges_from(subgraph.edges(data=True))
            continue

        # Enumerate the cycles of THIS component only. Enumerating the whole
        # graph here would recompute the same cycles for every component and
        # count them repeatedly against cycle_bound.
        # this function use algorithm in reference [3], which has time
        # complexity O((n+e)(N+1)) for n nodes, e edges and N simple cycles.
        # Which might be slower than the algorithm applied in paper [1]
        # NOTE(review): on the directed copy every undirected edge also shows
        # up as a 2-node cycle and every cycle once per orientation; all of
        # these count towards cycle_bound - confirm this is intended.
        simple_cycles = list(nx.simple_cycles(subgraph.to_directed()))

        # in paper [1], when applying another algorithm (subroutine RT), this
        # becomes len(simple_cycles) == cycle_bound - number_simplecycles + 1,
        # check again.
        number_simplecycles += len(simple_cycles)
        if cycle_bound is not None and number_simplecycles > cycle_bound:
            return []

        patterns.update(_cycle_canonkey(G, cycle, node_label, edge_label)
                        for cycle in simple_cycles)

    # TODO: compute canonical representations of the connected components of
    # `bridges` (tree patterns) and add them: patterns += pi(bridges)
    return sorted(patterns)


def _cycle_canonkey(G, cycle, node_label, edge_label):
    """Return the canonical string key of one simple cycle of G."""
    # Accept both cycle conventions: closed lists that repeat the start node
    # ([a, b, c, a]) and open lists ([a, b, c]). A simple cycle never repeats
    # a node otherwise, so equal end points can only mean a closed list.
    # NOTE(review): which form nx.simple_cycles yields depends on the
    # networkx version - confirm against the installed release.
    if len(cycle) > 1 and cycle[0] == cycle[-1]:
        cycle = cycle[:-1]
    size = len(cycle)

    # One token per node: its label followed by the label of the edge to the
    # next node on the cycle (wrapping around to close the cycle).
    tokens = [G.node[node][node_label] + G[node][cycle[(pos + 1) % size]][edge_label]
              for pos, node in enumerate(cycle)]

    # The canonical key is the smallest string over all rotations, each
    # compared with its character-wise reversal.
    canonkey = None
    for shift in range(size):
        key = ''.join(tokens[shift:] + tokens[:shift])
        key = min(key, key[::-1])
        if canonkey is None or key < canonkey:
            canonkey = key
    return '' if canonkey is None else canonkey