From ea31d25e9f39af26d9d80a9bc3916d7a59d91784 Mon Sep 17 00:00:00 2001
From: linlin
Date: Mon, 5 Oct 2020 16:37:27 +0200
Subject: [PATCH] New translations structuralspKernel.py (French)

---
 lang/fr/gklearn/kernels/structuralspKernel.py | 863 ++++++++++++++++++++++++++
 1 file changed, 863 insertions(+)
 create mode 100644 lang/fr/gklearn/kernels/structuralspKernel.py

diff --git a/lang/fr/gklearn/kernels/structuralspKernel.py b/lang/fr/gklearn/kernels/structuralspKernel.py
new file mode 100644
index 0000000..fb8dbf9
--- /dev/null
+++ b/lang/fr/gklearn/kernels/structuralspKernel.py
@@ -0,0 +1,863 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Sep 27 10:56:23 2018
+
+@author: linlin
+
+@references:
+
+    [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
+    Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
+"""
+
+import sys
+import time
+from itertools import combinations, product
+from functools import partial
+from multiprocessing import Pool, cpu_count
+from tqdm import tqdm
+
+import networkx as nx
+import numpy as np
+
+from gklearn.utils.graphdataset import get_dataset_attributes
+from gklearn.utils.parallel import parallel_gm
+from gklearn.utils.trie import Trie
+
+
+def structuralspkernel(*args,
+                       node_label='atom',
+                       edge_weight=None,
+                       edge_label='bond_type',
+                       node_kernels=None,
+                       edge_kernels=None,
+                       compute_method='naive',
+                       parallel='imap_unordered',
+                       n_jobs=None,
+                       chunksize=None,
+                       verbose=True):
+    """Calculate mean average structural shortest path kernels between graphs.
+
+    Parameters
+    ----------
+    Gn : List of NetworkX graph
+        List of graphs between which the kernels are calculated.
+
+    G1, G2 : NetworkX graphs
+        Two graphs between which the kernel is calculated.
+
+    node_label : string
+        Node attribute used as label. The default node label is 'atom'.
+
+    edge_weight : string
+        Edge attribute name corresponding to the edge weight. Applied for the
+        computation of the shortest paths.
+
+    edge_label : string
+        Edge attribute used as label. The default edge label is 'bond_type'.
+
+    node_kernels : dict
+        A dictionary of kernel functions for nodes, including 3 items: 'symb'
+        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
+        'mix' for both labels. The first 2 functions take two node labels as
+        parameters, and the 'mix' function takes 4 parameters: a symbolic and
+        a non-symbolic label for each of the two nodes. Each label is in the
+        form of a 2-D array (n_samples, n_features). Each function returns a
+        number as the kernel value. Ignored when nodes are unlabeled.
+
+    edge_kernels : dict
+        A dictionary of kernel functions for edges, including 3 items: 'symb'
+        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, and
+        'mix' for both labels. The first 2 functions take two edge labels as
+        parameters, and the 'mix' function takes 4 parameters: a symbolic and
+        a non-symbolic label for each of the two edges. Each label is in the
+        form of a 2-D array (n_samples, n_features). Each function returns a
+        number as the kernel value. Ignored when edges are unlabeled.
+
+    compute_method : string
+        Computation method used to store the shortest paths and compute the
+        graph kernel. The following choices are available:
+
+        'trie': store paths as tries.
+
+        'naive': store paths in lists.
+
+    parallel : string/None
+        Parallelization scheme: 'imap_unordered' (default) or None for
+        single-core execution.
+
+    n_jobs : int
+        Number of jobs for parallelization.
+
+    chunksize : int
+        Chunk size for the parallel workers; computed automatically if None.
+
+    verbose : boolean
+        Whether to print progress and timing information.
+
+    Return
+    ------
+    Kmatrix : Numpy matrix
+        Kernel matrix, each element of which is the mean average structural
+        shortest path kernel between two graphs.
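+
+    Example
+    -------
+    A minimal sketch (the Dirac-style lambdas are illustrative stand-ins,
+    not kernels shipped with the library):
+
+    >>> import networkx as nx
+    >>> g1 = nx.Graph()
+    >>> g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
+    >>> g1.add_edge(0, 1, bond_type='1')
+    >>> g2 = g1.copy()
+    >>> dirac = lambda x, y: 1 if x == y else 0
+    >>> kn = {'symb': dirac, 'nsymb': dirac,
+    ...       'mix': lambda x, y, a, b: dirac(x, y) * dirac(a, b)}
+    >>> K, run_time = structuralspkernel([g1, g2], node_kernels=kn,
+    ...                                  edge_kernels=kn, parallel=None,
+    ...                                  verbose=False)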
+    """
+    # pre-process
+    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
+    Gn = [g.copy() for g in Gn]
+    weight = None
+    if edge_weight is None:
+        if verbose:
+            print('\n No edge weight specified. Setting all weights to 1.\n')
+    else:
+        try:
+            some_weight = list(
+                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
+            if isinstance(some_weight, (float, int)):
+                weight = edge_weight
+            else:
+                if verbose:
+                    print('\n Edge weight with name "%s" is not a float or an'
+                          ' integer. Setting all weights to 1.\n' % edge_weight)
+        except Exception:
+            if verbose:
+                print('\n Edge weight with name "%s" is not found in the edge'
+                      ' attributes. Setting all weights to 1.\n' % edge_weight)
+    ds_attrs = get_dataset_attributes(
+        Gn,
+        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
+                    'edge_attr_dim', 'is_directed'],
+        node_label=node_label, edge_label=edge_label)
+
+    start_time = time.time()
+
+    # get shortest paths of each graph in Gn
+    if parallel == 'imap_unordered':
+        if n_jobs is None:
+            n_jobs = cpu_count()
+        splist = [None] * len(Gn)
+        pool = Pool(n_jobs)
+        itr = zip(Gn, range(0, len(Gn)))
+        if chunksize is None:
+            if len(Gn) < 100 * n_jobs:
+                chunksize = int(len(Gn) / n_jobs) + 1
+            else:
+                chunksize = 100
+        # get shortest path graphs of Gn
+        if compute_method == 'trie':
+            getsp_partial = partial(wrapper_getSP_trie, weight,
+                                    ds_attrs['is_directed'])
+        else:
+            getsp_partial = partial(wrapper_getSP_naive, weight,
+                                    ds_attrs['is_directed'])
+        if verbose:
+            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
+                            desc='getting shortest paths', file=sys.stdout)
+        else:
+            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
+        for i, sp in iterator:
+            splist[i] = sp
+        pool.close()
+        pool.join()
+    # ---- direct running, normally uses a single CPU core. ----
+    elif parallel is None:
+        splist = []
+        if verbose:
+            iterator = tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
+        else:
+            iterator = Gn
+        if compute_method == 'trie':
+            for g in iterator:
+                splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed']))
+        else:
+            for g in iterator:
+                splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))
+
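+    # Illustration (not executed): with compute_method='naive' and an
+    # undirected path graph 0-1-2, the corresponding entry of splist is
+    #     [[0, 1], [1, 0], [0, 1, 2], [2, 1, 0], [1, 2], [2, 1], [0], [1], [2]],
+    # i.e. every shortest path in both directions plus all single-node paths.
+    # With compute_method='trie', the same paths are stored in a Trie instead.
+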
+    Kmatrix = np.zeros((len(Gn), len(Gn)))
+
+    # ---- use pool.imap_unordered to parallelize and track progress. ----
+    if parallel == 'imap_unordered':
+        def init_worker(spl_toshare, gs_toshare):
+            # make the path lists and the graphs available as globals in
+            # every worker process.
+            global G_spl, G_gs
+            G_spl = spl_toshare
+            G_gs = gs_toshare
+        if compute_method == 'trie':
+            do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label,
+                                 edge_label, node_kernels, edge_kernels)
+        else:
+            do_partial = partial(wrapper_ssp_do, ds_attrs, node_label,
+                                 edge_label, node_kernels, edge_kernels)
+        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                    glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize,
+                    verbose=verbose)
+    # ---- direct running, normally uses a single CPU core. ----
+    elif parallel is None:
+        from itertools import combinations_with_replacement
+        itr = combinations_with_replacement(range(0, len(Gn)), 2)
+        if verbose:
+            iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout)
+        else:
+            iterator = itr
+        if compute_method == 'trie':
+            for i, j in iterator:
+                kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j],
+                                     ds_attrs, node_label, edge_label,
+                                     node_kernels, edge_kernels)
+                Kmatrix[i][j] = kernel
+                Kmatrix[j][i] = kernel
+        else:
+            for i, j in iterator:
+                kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i],
+                                               splist[j], ds_attrs, node_label,
+                                               edge_label, node_kernels,
+                                               edge_kernels)
+                Kmatrix[i][j] = kernel
+                Kmatrix[j][i] = kernel
+
+    run_time = time.time() - start_time
+    if verbose:
+        print("\n --- structural shortest path kernel matrix of size %d built"
+              " in %s seconds ---" % (len(Gn), run_time))
+
+    return Kmatrix, run_time
+
+
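+# For two paths of equal length, p1 = (u_0, ..., u_n) and p2 = (v_0, ..., v_n),
+# the path kernel accumulated below is the product
+#     kv(u_0, v_0) * prod_{i=1..n} [kv(u_i, v_i) * ke((u_{i-1}, u_i), (v_{i-1}, v_i))],
+# where kv/ke are the node/edge kernels; the graph kernel is the mean of this
+# product over all pairs of shortest paths. Pairs of different lengths
+# contribute 0.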
+def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
+                          node_kernels, edge_kernels):
+
+    kernel = 0
+
+    # First, compute kernels between all pairs of nodes; the method is
+    # borrowed from FCSP.
+    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
+    # Then, compute kernels between all pairs of edges, an extension of the
+    # FCSP idea. This suits sparse graphs, which are by far the most common
+    # case we encountered; for dense graphs it would be slow.
+    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)
+
+    # compute graph kernels
+    if vk_dict:
+        if ek_dict:
+            for p1, p2 in product(spl1, spl2):
+                if len(p1) == len(p2):
+                    kpath = vk_dict[(p1[0], p2[0])]
+                    if kpath:
+                        for idx in range(1, len(p1)):
+                            kpath *= vk_dict[(p1[idx], p2[idx])] * \
+                                ek_dict[((p1[idx-1], p1[idx]),
+                                         (p2[idx-1], p2[idx]))]
+                            if not kpath:
+                                break
+                        kernel += kpath  # add up kernels of all paths
+        else:
+            for p1, p2 in product(spl1, spl2):
+                if len(p1) == len(p2):
+                    kpath = vk_dict[(p1[0], p2[0])]
+                    if kpath:
+                        for idx in range(1, len(p1)):
+                            kpath *= vk_dict[(p1[idx], p2[idx])]
+                            if not kpath:
+                                break
+                        kernel += kpath  # add up kernels of all paths
+    else:
+        if ek_dict:
+            for p1, p2 in product(spl1, spl2):
+                if len(p1) == len(p2):
+                    if len(p1) == 0:
+                        kernel += 1
+                    else:
+                        kpath = 1
+                        for idx in range(0, len(p1) - 1):
+                            kpath *= ek_dict[((p1[idx], p1[idx+1]),
+                                              (p2[idx], p2[idx+1]))]
+                            if not kpath:
+                                break
+                        kernel += kpath  # add up kernels of all paths
+        else:
+            for p1, p2 in product(spl1, spl2):
+                if len(p1) == len(p2):
+                    kernel += 1
+
+    try:
+        kernel = kernel / (len(spl1) * len(spl2))  # calculate mean average
+    except ZeroDivisionError:
+        print(spl1, spl2)
+        print(g1.nodes(data=True))
+        print(g1.edges(data=True))
+        raise
+
+    # Note: an exact implementation of the Fast Computation of Shortest Path
+    # Kernel (FCSP) was also tried here, but it proved slower than the
+    # current implementation.
+    return kernel
+
+
+def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
+                   edge_kernels, itr):
+    # G_gs and G_spl are globals set in each worker by init_worker.
+    i = itr[0]
+    j = itr[1]
+    return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
+                                       ds_attrs, node_label, edge_label,
+                                       node_kernels, edge_kernels)
+
+
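+# Illustration (not executed) of the trie layout assumed by the traversal
+# functions below, matching the way gklearn.utils.trie.Trie is accessed in
+# this module:
+#     t = Trie()
+#     t.insertWord([0, 1, 2])
+#     node = t.root['children'][0]['children'][1]['children'][2]
+#     node['isEndOfWord']   # True: the path (0, 1, 2) ends at this node
+
+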
+def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
+                node_kernels, edge_kernels):
+
+    # a one-element list, so that the traversal helpers can accumulate into
+    # it in place.
+    kernel = [0]
+
+    # First, compute kernels between all pairs of nodes; the method is
+    # borrowed from FCSP.
+    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
+    # Then, compute kernels between all pairs of edges, an extension of the
+    # FCSP idea. This suits sparse graphs, which are by far the most common
+    # case we encountered; for dense graphs it would be slow.
+    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)
+
+    # compute graph kernels; trie[0] is the Trie, trie[1] the number of
+    # paths stored in it.
+    if vk_dict:
+        if ek_dict:
+            traverseBothTriem(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
+        else:
+            traverseBothTriev(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
+    else:
+        if ek_dict:
+            traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
+        else:
+            traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
+
+    kernel = kernel[0] / (trie1[1] * trie2[1])  # calculate mean average
+
+    return kernel
+
+
+def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels,
+                        edge_kernels, itr):
+    # G_gs and G_spl are globals set in each worker by init_worker.
+    i = itr[0]
+    j = itr[1]
+    return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs,
+                             node_label, edge_label, node_kernels,
+                             edge_kernels)
+
+
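+# Illustration (not executed): vk_dict maps pairs of node indices across
+# g1 x g2 to scalar node-kernel values, e.g. vk_dict[(u, v)]; ek_dict does
+# the same for pairs of edges, keyed as ek_dict[((u1, u2), (v1, v2))], with
+# every orientation of each undirected edge stored explicitly.
+
+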
+def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):
+    # compute kernels between all pairs of nodes; the method is borrowed
+    # from FCSP.
+    vk_dict = {}  # dict of node-kernel values
+    if ds_attrs['node_labeled']:
+        # nodes are symbolically and non-symbolically labeled
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['mix']
+            for n1, n2 in product(
+                    g1.nodes(data=True), g2.nodes(data=True)):
+                vk_dict[(n1[0], n2[0])] = kn(
+                    n1[1][node_label], n2[1][node_label],
+                    n1[1]['attributes'], n2[1]['attributes'])
+        # nodes are symbolically labeled only
+        else:
+            kn = node_kernels['symb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
+                                                 n2[1][node_label])
+    else:
+        # nodes are non-symbolically labeled only
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['nsymb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
+                                                 n2[1]['attributes'])
+        # nodes are unlabeled
+        else:
+            pass
+
+    return vk_dict
+
+
+def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):
+    # compute kernels between all pairs of edges, an extension of the FCSP
+    # idea. This suits sparse graphs, which are by far the most common case
+    # we encountered; for dense graphs it would be slow.
+    ek_dict = {}  # dict of edge-kernel values
+    if ds_attrs['edge_labeled']:
+        # edges are symbolically and non-symbolically labeled
+        if ds_attrs['edge_attr_dim'] > 0:
+            ke = edge_kernels['mix']
+            for e1, e2 in product(
+                    g1.edges(data=True), g2.edges(data=True)):
+                ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
+                             e1[2]['attributes'], e2[2]['attributes'])
+                # store the kernel under both orientations of each edge
+                ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
+                ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
+                ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
+                ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
+        # edges are symbolically labeled only
+        else:
+            ke = edge_kernels['symb']
+            for e1 in g1.edges(data=True):
+                for e2 in g2.edges(data=True):
+                    ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
+                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
+    else:
+        # edges are non-symbolically labeled only
+        if ds_attrs['edge_attr_dim'] > 0:
+            ke = edge_kernels['nsymb']
+            for e1 in g1.edges(data=True):
+                for e2 in g2.edges(data=True):
+                    ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
+                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
+        # edges are unlabeled
+        else:
+            pass
+
+    return ek_dict
+
+
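+# The four traverseBothTrie*/traverseTrie2* pairs below implement the same
+# depth-first traversal for the four labeling cases: 'm' uses node and edge
+# kernels (mixed labels), 'v' node kernels only, 'e' edge kernels only, and
+# 'u' handles fully unlabeled graphs.
+
+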
+# traverse all paths in graph1. Depth-first search is applied.
+def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
+    if pcurrent is None:
+        pcurrent = []
+    for key, node in root['children'].items():
+        pcurrent.append(key)
+        if node['isEndOfWord']:
+            traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
+                           pcurrent=[])
+        if node['children'] != {}:
+            traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
+        else:
+            del pcurrent[-1]
+    if pcurrent != []:
+        del pcurrent[-1]
+
+
+# traverse all paths in graph2 and compute the path kernel between each of
+# them and the path p1 from graph1. Depth-first search is applied.
+def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
+    if pcurrent is None:
+        pcurrent = []
+    for key, node in root['children'].items():
+        pcurrent.append(key)
+        if node['isEndOfWord']:
+            if len(p1) == len(pcurrent):
+                kpath = vk_dict[(p1[0], pcurrent[0])]
+                if kpath:
+                    for idx in range(1, len(p1)):
+                        kpath *= vk_dict[(p1[idx], pcurrent[idx])] * \
+                            ek_dict[((p1[idx-1], p1[idx]),
+                                     (pcurrent[idx-1], pcurrent[idx]))]
+                        if not kpath:
+                            break
+                    kernel[0] += kpath  # add up kernels of all paths
+        if node['children'] != {}:
+            traverseTrie2m(node, p1, kernel, vk_dict, ek_dict, pcurrent)
+        else:
+            del pcurrent[-1]
+    if pcurrent != []:
+        del pcurrent[-1]
+
+
+# traverse all paths in graph1. Depth-first search is applied.
+def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
+    if pcurrent is None:
+        pcurrent = []
+    for key, node in root['children'].items():
+        pcurrent.append(key)
+        if node['isEndOfWord']:
+            traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
+                           pcurrent=[])
+        if node['children'] != {}:
+            traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
+        else:
+            del pcurrent[-1]
+    if pcurrent != []:
+        del pcurrent[-1]
+
+
+# traverse all paths in graph2 and compute the path kernel between each of
+# them and the path p1 from graph1, using node kernels only. Depth-first
+# search is applied.
+def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
+    if pcurrent is None:
+        pcurrent = []
+    for key, node in root['children'].items():
+        pcurrent.append(key)
+        if node['isEndOfWord']:
+            if len(p1) == len(pcurrent):
+                kpath = vk_dict[(p1[0], pcurrent[0])]
+                if kpath:
+                    for idx in range(1, len(p1)):
+                        kpath *= vk_dict[(p1[idx], pcurrent[idx])]
+                        if not kpath:
+                            break
+                    kernel[0] += kpath  # add up kernels of all paths
+        if node['children'] != {}:
+            traverseTrie2v(node, p1, kernel, vk_dict, ek_dict, pcurrent)
+        else:
+            del pcurrent[-1]
+    if pcurrent != []:
+        del pcurrent[-1]
+
+
+# traverse all paths in graph1. Depth-first search is applied.
+def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
+    if pcurrent is None:
+        pcurrent = []
+    for key, node in root['children'].items():
+        pcurrent.append(key)
+        if node['isEndOfWord']:
+            traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
+                           pcurrent=[])
+        if node['children'] != {}:
+            traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
+        else:
+            del pcurrent[-1]
+    if pcurrent != []:
+        del pcurrent[-1]
+
+
+# traverse all paths in graph2 and compute the path kernel between each of
+# them and the path p1 from graph1, using edge kernels only; vk_dict is
+# unused and kept for a uniform signature. Depth-first search is applied.
+def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
+    if pcurrent is None:
+        pcurrent = []
+    for key, node in root['children'].items():
+        pcurrent.append(key)
+        if node['isEndOfWord']:
+            if len(p1) == len(pcurrent):
+                if len(p1) == 0:
+                    kernel[0] += 1
+                else:
+                    kpath = 1
+                    for idx in range(0, len(p1) - 1):
+                        kpath *= ek_dict[((p1[idx], p1[idx+1]),
+                                          (pcurrent[idx], pcurrent[idx+1]))]
+                        if not kpath:
+                            break
+                    kernel[0] += kpath  # add up kernels of all paths
+        if node['children'] != {}:
+            traverseTrie2e(node, p1, kernel, vk_dict, ek_dict, pcurrent)
+        else:
+            del pcurrent[-1]
+    if pcurrent != []:
+        del pcurrent[-1]
+
+
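+# Illustration (not executed): if both tries hold the paths [0], [1] and
+# [0, 1], the unlabeled traversal below counts the same-length pairs
+# ([0], [0]), ([0], [1]), ([1], [0]), ([1], [1]) and ([0, 1], [0, 1]),
+# leaving kernel[0] == 5 before normalization.
+
+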
+# traverse all paths in graph1. Depth-first search is applied.
+def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
+    if pcurrent is None:
+        pcurrent = []
+    for key, node in root['children'].items():
+        pcurrent.append(key)
+        if node['isEndOfWord']:
+            traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
+                           pcurrent=[])
+        if node['children'] != {}:
+            traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
+        else:
+            del pcurrent[-1]
+    if pcurrent != []:
+        del pcurrent[-1]
+
+
+# traverse all paths in graph2 and count those with the same length as the
+# path p1 from graph1; vk_dict and ek_dict are unused and kept for a uniform
+# signature. Depth-first search is applied.
+def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
+    if pcurrent is None:
+        pcurrent = []
+    for key, node in root['children'].items():
+        pcurrent.append(key)
+        if node['isEndOfWord']:
+            if len(p1) == len(pcurrent):
+                kernel[0] += 1
+        if node['children'] != {}:
+            traverseTrie2u(node, p1, kernel, vk_dict, ek_dict, pcurrent)
+        else:
+            del pcurrent[-1]
+    if pcurrent != []:
+        del pcurrent[-1]
+
+
+def get_shortest_paths(G, weight, directed):
+    """Get all shortest paths of a graph.
+
+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph whose paths are calculated.
+    weight : string/None
+        Edge attribute used as weight to calculate the shortest paths.
+    directed : boolean
+        Whether the graph is directed.
+
+    Return
+    ------
+    sp : list of list
+        List of shortest paths of the graph, where each path is represented
+        by a list of nodes.
+    """
+    sp = []
+    for n1, n2 in combinations(G.nodes(), 2):
+        try:
+            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
+        except nx.NetworkXNoPath:  # nodes not connected
+            pass
+        else:
+            sp += spltemp
+            # each path is counted twice, once from each of its end nodes.
+            if not directed:
+                sp += [sptemp[::-1] for sptemp in spltemp]
+
+    # add single nodes as length-0 paths.
+    sp += [[n] for n in G.nodes()]
+    return sp
+
+
+def wrapper_getSP_naive(weight, directed, itr_item):
+    g = itr_item[0]
+    i = itr_item[1]
+    return i, get_shortest_paths(g, weight, directed)
+
+
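+# Illustration (not executed):
+#     >>> G = nx.path_graph(3)  # edges 0-1 and 1-2
+#     >>> sorted(get_shortest_paths(G, None, False))
+#     [[0], [0, 1], [0, 1, 2], [1], [1, 0], [1, 2], [2], [2, 1], [2, 1, 0]]
+
+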
+def get_sps_as_trie(G, weight, directed):
+    """Get all shortest paths of a graph and insert them into a trie.
+
+    Parameters
+    ----------
+    G : NetworkX graph
+        The graph whose paths are calculated.
+    weight : string/None
+        Edge attribute used as weight to calculate the shortest paths.
+    directed : boolean
+        Whether the graph is directed.
+
+    Return
+    ------
+    sptrie : Trie
+        Trie containing all shortest paths of the graph.
+    lensp : int
+        Total number of paths stored, single-node paths included.
+    """
+    sptrie = Trie()
+    lensp = 0
+    for n1, n2 in combinations(G.nodes(), 2):
+        try:
+            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
+        except nx.NetworkXNoPath:  # nodes not connected
+            pass
+        else:
+            lensp += len(spltemp)
+            if not directed:
+                lensp += len(spltemp)
+            for sp in spltemp:
+                sptrie.insertWord(sp)
+                # each path is counted twice, once from each of its end nodes.
+                if not directed:
+                    sptrie.insertWord(sp[::-1])
+
+    # add single nodes as length-0 paths.
+    for n in G.nodes():
+        sptrie.insertWord([n])
+
+    return sptrie, lensp + nx.number_of_nodes(G)
+
+
+def wrapper_getSP_trie(weight, directed, itr_item):
+    g = itr_item[0]
+    i = itr_item[1]
+    return i, get_sps_as_trie(g, weight, directed)
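+
+
+# A minimal smoke-test sketch (an illustrative addition; the Dirac-style
+# lambdas below are stand-ins, not kernels shipped with the library):
+if __name__ == '__main__':
+    g1 = nx.Graph()
+    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}),
+                       (2, {'atom': 'C'})])
+    g1.add_edge(0, 1, bond_type='1')
+    g1.add_edge(1, 2, bond_type='2')
+    g2 = g1.copy()
+    dirac = lambda x, y: 1 if x == y else 0
+    mix = lambda x, y, a, b: dirac(x, y) * dirac(a, b)
+    kernels = {'symb': dirac, 'nsymb': dirac, 'mix': mix}
+    K, t = structuralspkernel([g1, g2], node_kernels=kernels,
+                              edge_kernels=kernels, compute_method='naive',
+                              parallel=None, verbose=False)
+    print(K)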