#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 27 10:56:23 2018

@author: linlin
@references: Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths for
    Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""

import sys
import time
from itertools import combinations, product
from functools import partial
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

import networkx as nx
import numpy as np

# the path must be on sys.path before the pygraph imports below can succeed.
sys.path.insert(0, "../")
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm
from pygraph.utils.trie import Trie


def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       n_jobs=None,
                       verbose=True):
    """Compute mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. If None, all
        edge weights are set to 1.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each of the two nodes. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.
    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
        for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each of the two edges. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when edges are unlabeled.
    compute_method : string
        Computation method, 'naive' (paths stored as lists) or 'trie' (paths
        stored in tries). The default is 'naive'.
    n_jobs : int
        Number of parallel jobs. If None, all CPU cores are used.
    verbose : boolean
        Whether to print progress and status messages.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    run_time : float
        Time in seconds spent building the kernel matrix.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print(
                        '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
                        % edge_weight)
        except IndexError:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()

    # get shortest paths of each graph in Gn
    if n_jobs is None:
        # Pool(None) would default to cpu_count() anyway, but n_jobs is also
        # used below to compute the chunksize.
        n_jobs = cpu_count()
    splist = [None] * len(Gn)
    pool = Pool(n_jobs)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 100 * n_jobs:
        chunksize = int(len(Gn) / n_jobs) + 1
    else:
        chunksize = 100
    # get shortest path graphs of Gn
    if compute_method == 'trie':
        getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed'])
    else:
        getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed'])
    if verbose:
        iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                        desc='getting shortest paths', file=sys.stdout)
    else:
        iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
    for i, sp in iterator:
        splist[i] = sp
    pool.close()
    pool.join()

    # # ---- direct running, normally use single CPU core. ----
    # splist = []
    # if compute_method == 'trie':
    #     for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
    #         splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed']))
    # else:
    #     for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
    #         splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    def init_worker(spl_toshare, gs_toshare):
        global G_spl, G_gs
        G_spl = spl_toshare
        G_gs = gs_toshare
    if compute_method == 'trie':
        do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label,
                             edge_label, node_kernels, edge_kernels)
    else:
        do_partial = partial(wrapper_ssp_do, ds_attrs, node_label,
                             edge_label, node_kernels, edge_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)


    # # ---- direct running, normally use single CPU core. ----
    # from itertools import combinations_with_replacement
    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # if compute_method == 'trie':
    #     for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
    #         kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j], ds_attrs,
    #                              node_label, edge_label, node_kernels, edge_kernels)
    #         Kmatrix[i][j] = kernel
    #         Kmatrix[j][i] = kernel
    # else:
    #     for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
    #         kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
    #                                        ds_attrs, node_label, edge_label,
    #                                        node_kernels, edge_kernels)
    #         Kmatrix[i][j] = kernel
    #         Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    if verbose:
        print("\n --- structural shortest path kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
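

# A minimal usage sketch (illustrative, not part of the library API): the
# delta and Gaussian kernels below are assumptions standing in for whatever
# node/edge kernels a caller would supply, and the toy graphs mimic the
# default 'atom'/'bond_type' labels.
def _example_structuralspkernel():
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
    g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    g2.add_edges_from([(0, 1, {'bond_type': '2'})])

    def delta(x, y):  # kernel for symbolic labels
        return 1.0 if x == y else 0.0

    def gaussian(x, y):  # kernel for non-symbolic (vector) labels
        x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
        return float(np.exp(-np.sum((x - y) ** 2)))

    sub_kernels = {'symb': delta, 'nsymb': gaussian,
                   'mix': lambda s1, s2, ns1, ns2: delta(s1, s2) * gaussian(ns1, ns2)}
    Kmatrix, run_time = structuralspkernel(g1, g2, node_label='atom',
                                           edge_label='bond_type',
                                           node_kernels=sub_kernels,
                                           edge_kernels=sub_kernels,
                                           compute_method='naive',
                                           n_jobs=1, verbose=False)
    return Kmatrix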


def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
                          node_kernels, edge_kernels):

    kernel = 0

    # First, compute kernels between all pairs of nodes, using the method
    # borrowed from FCSP.
    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
    # Then, compute kernels between all pairs of edges, an extension of the
    # FCSP idea. This suits sparse graphs, which covers most cases we
    # encountered; for dense graphs it would be slow.
    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)

    # compute graph kernels
    if vk_dict:
        if ek_dict:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kpath = vk_dict[(p1[0], p2[0])]
                    if kpath:
                        for idx in range(1, len(p1)):
                            kpath *= vk_dict[(p1[idx], p2[idx])] * \
                                ek_dict[((p1[idx-1], p1[idx]),
                                         (p2[idx-1], p2[idx]))]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
        else:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kpath = vk_dict[(p1[0], p2[0])]
                    if kpath:
                        for idx in range(1, len(p1)):
                            kpath *= vk_dict[(p1[idx], p2[idx])]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
    else:
        if ek_dict:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    if len(p1) == 0:
                        kernel += 1
                    else:
                        kpath = 1
                        for idx in range(0, len(p1) - 1):
                            kpath *= ek_dict[((p1[idx], p1[idx+1]),
                                              (p2[idx], p2[idx+1]))]
                            if not kpath:
                                break
                        kernel += kpath  # add up kernels of all paths
        else:
            for p1, p2 in product(spl1, spl2):
                if len(p1) == len(p2):
                    kernel += 1

    kernel = kernel / (len(spl1) * len(spl2))  # mean over all pairs of paths

    # Note: an exact implementation of the Fast Computation of Shortest Path
    # kernel (FCSP) was also tried here, but it turned out slower than the
    # implementation above.
    return kernel
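
# For reference, the quantity accumulated in structuralspkernel_do: for two
# equal-length paths p = (v_0, ..., v_h) and q = (w_0, ..., w_h), the path
# kernel is
#     k(p, q) = k_v(v_0, w_0) * prod_{i=1}^{h} [ k_v(v_i, w_i)
#               * k_e((v_{i-1}, v_i), (w_{i-1}, w_i)) ],
# with the node factor k_v and/or the edge factor k_e dropped when the graphs
# carry no node/edge labels. The graph kernel is the mean over all pairs of
# shortest paths: k(G1, G2) = sum_{p, q} k(p, q) / (|P1| * |P2|).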


def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
                   edge_kernels, itr):
    i, j = itr
    return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
                                       ds_attrs, node_label, edge_label,
                                       node_kernels, edge_kernels)


def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
                node_kernels, edge_kernels):
    kernel = [0]

    # First, compute kernels between all pairs of nodes, using the method
    # borrowed from FCSP.
    vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs)
    # Then, compute kernels between all pairs of edges, an extension of the
    # FCSP idea. This suits sparse graphs, which covers most cases we
    # encountered; for dense graphs it would be slow.
    ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs)

    # compute graph kernels by traversing both tries, dispatching on which
    # label types are present.
    if vk_dict:
        if ek_dict:
            traverseBothTriem(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
        else:
            traverseBothTriev(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
    else:
        if ek_dict:
            traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
        else:
            traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)

    kernel = kernel[0] / (trie1[1] * trie2[1])  # mean over all pairs of paths

    return kernel


def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels,
                        edge_kernels, itr):
    i, j = itr
    return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs,
                             node_label, edge_label, node_kernels, edge_kernels)


def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):
    # compute kernels between all pairs of nodes, a method borrowed from FCSP.
    vk_dict = {}  # node kernel dict, keyed by (node in g1, node in g2)
    if ds_attrs['node_labeled']:
        # nodes are symbolically and non-symbolically labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['mix']
            for n1, n2 in product(
                    g1.nodes(data=True), g2.nodes(data=True)):
                vk_dict[(n1[0], n2[0])] = kn(
                    n1[1][node_label], n2[1][node_label],
                    n1[1]['attributes'], n2[1]['attributes'])
        # nodes are only symbolically labeled
        else:
            kn = node_kernels['symb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                 n2[1][node_label])
    else:
        # nodes are only non-symbolically labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['nsymb']
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
                                                 n2[1]['attributes'])
        # nodes are unlabeled
        else:
            pass

    return vk_dict
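
# Shape of the result (an illustrative note): vk_dict maps node-id pairs to
# kernel values, e.g. with a delta kernel on 'atom' labels, vk_dict[(0, 0)]
# is 1.0 when node 0 of g1 and node 0 of g2 carry the same atom symbol and
# 0.0 otherwise; an empty dict signals that nodes are unlabeled.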


def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):
    # compute kernels between all pairs of edges, an extension of the FCSP
    # idea. This suits sparse graphs, which covers most cases we encountered;
    # for dense graphs it would be slow.
    ek_dict = {}  # edge kernel dict
    if ds_attrs['edge_labeled']:
        # edges are symbolically and non-symbolically labeled
        if ds_attrs['edge_attr_dim'] > 0:
            ke = edge_kernels['mix']
            for e1, e2 in product(
                    g1.edges(data=True), g2.edges(data=True)):
                ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
                             e1[2]['attributes'], e2[2]['attributes'])
                ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
        # edges are only symbolically labeled
        else:
            ke = edge_kernels['symb']
            for e1 in g1.edges(data=True):
                for e2 in g2.edges(data=True):
                    ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
    else:
        # edges are only non-symbolically labeled
        if ds_attrs['edge_attr_dim'] > 0:
            ke = edge_kernels['nsymb']
            for e1 in g1.edges(data=True):
                for e2 in g2.edges(data=True):
                    ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
        # edges are unlabeled
        else:
            pass

    return ek_dict
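
# Note: the four insertions per edge pair above make lookups independent of
# edge orientation, so for undirected graphs ek_dict[((u, v), (w, x))],
# ek_dict[((v, u), (w, x))], ek_dict[((u, v), (x, w))] and
# ek_dict[((v, u), (x, w))] all return the same value, and path traversal
# never needs to normalize edge direction.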


# traverse all paths in graph1's trie; depth-first search is applied.
def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:  # avoid the shared mutable default argument
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2's trie and accumulate, for each equal-length
# pair, the kernel between path p1 of graph1 and the current path.
# Depth-first search is applied.
def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:  # avoid the shared mutable default argument
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            if len(p1) == len(pcurrent):
                kpath = vk_dict[(p1[0], pcurrent[0])]
                if kpath:
                    for idx in range(1, len(p1)):
                        kpath *= vk_dict[(p1[idx], pcurrent[idx])] * \
                            ek_dict[((p1[idx-1], p1[idx]),
                                     (pcurrent[idx-1], pcurrent[idx]))]
                        if not kpath:
                            break
                    kernel[0] += kpath  # add up kernels of all paths
        if node['children'] != {}:
            traverseTrie2m(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph1's trie; depth-first search is applied.
def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:  # avoid the shared mutable default argument
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2's trie and accumulate, for each equal-length
# pair, the node-kernel product between path p1 of graph1 and the current
# path. Depth-first search is applied.
def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:  # avoid the shared mutable default argument
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            if len(p1) == len(pcurrent):
                kpath = vk_dict[(p1[0], pcurrent[0])]
                if kpath:
                    for idx in range(1, len(p1)):
                        kpath *= vk_dict[(p1[idx], pcurrent[idx])]
                        if not kpath:
                            break
                    kernel[0] += kpath  # add up kernels of all paths
        if node['children'] != {}:
            traverseTrie2v(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph1's trie; depth-first search is applied.
def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:  # avoid the shared mutable default argument
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2's trie and accumulate, for each equal-length
# pair, the edge-kernel product between path p1 of graph1 and the current
# path. Depth-first search is applied.
def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:  # avoid the shared mutable default argument
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            if len(p1) == len(pcurrent):
                if len(p1) == 0:
                    kernel[0] += 1
                else:
                    kpath = 1
                    for idx in range(0, len(p1) - 1):
                        kpath *= ek_dict[((p1[idx], p1[idx+1]),
                                          (pcurrent[idx], pcurrent[idx+1]))]
                        if not kpath:
                            break
                    kernel[0] += kpath  # add up kernels of all paths
        if node['children'] != {}:
            traverseTrie2e(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph1's trie; depth-first search is applied.
def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:  # avoid the shared mutable default argument
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
                           pcurrent=[])
        if node['children'] != {}:
            traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


# traverse all paths in graph2's trie and count, for the unlabeled case, the
# pairs of equal-length paths. Depth-first search is applied.
def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=None):
    if pcurrent is None:  # avoid the shared mutable default argument
        pcurrent = []
    for key, node in root['children'].items():
        pcurrent.append(key)
        if node['isEndOfWord']:
            if len(p1) == len(pcurrent):
                kernel[0] += 1
        if node['children'] != {}:
            traverseTrie2u(node, p1, kernel, vk_dict, ek_dict, pcurrent)
        else:
            del pcurrent[-1]
    if pcurrent != []:
        del pcurrent[-1]


def get_shortest_paths(G, weight, directed):
    """Get all shortest paths of a graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph whose shortest paths are computed.
    weight : string/None
        Edge attribute used as weight to compute shortest paths.
    directed : boolean
        Whether the graph is directed.

    Return
    ------
    sp : list of list
        List of shortest paths of the graph, where each path is represented
        by a list of nodes.
    """
    sp = []
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:  # nodes not connected
            pass
        else:
            sp += spltemp
            # each path is counted twice, once from each of its end nodes.
            if not directed:
                sp += [sptemp[::-1] for sptemp in spltemp]

    # add single nodes as length-0 paths.
    sp += [[n] for n in G.nodes()]
    return sp
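
# An illustrative example: for the unweighted, undirected path graph
# 0 - 1 - 2, get_shortest_paths(G, None, False) returns the paths
# [0, 1], [1, 0], [0, 1, 2], [2, 1, 0], [1, 2], [2, 1] and the single-node
# paths [0], [1], [2]: nine in total, since each path appears once per
# traversal direction.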


def wrapper_getSP_naive(weight, directed, itr_item):
    g, i = itr_item
    return i, get_shortest_paths(g, weight, directed)


def get_sps_as_trie(G, weight, directed):
    """Get all shortest paths of a graph and insert them into a trie.

    Parameters
    ----------
    G : NetworkX graph
        The graph whose shortest paths are computed.
    weight : string/None
        Edge attribute used as weight to compute shortest paths.
    directed : boolean
        Whether the graph is directed.

    Return
    ------
    sptrie : Trie
        Trie containing all shortest paths of the graph.
    lensp : int
        Total number of shortest paths, including the single-node paths of
        length 0.
    """
    sptrie = Trie()
    lensp = 0
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
        except nx.NetworkXNoPath:  # nodes not connected
            pass
        else:
            lensp += len(spltemp)
            if not directed:
                lensp += len(spltemp)
            for sp in spltemp:
                sptrie.insertWord(sp)
                # each path is counted twice, once from each of its end nodes.
                if not directed:
                    sptrie.insertWord(sp[::-1])

    # add single nodes as length-0 paths.
    for n in G.nodes():
        sptrie.insertWord([n])

    return sptrie, lensp + nx.number_of_nodes(G)
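
# The returned count agrees with the naive representation: for the path graph
# 0 - 1 - 2 used in the example above, get_sps_as_trie(G, None, False)[1] is
# 9, the same number of paths get_shortest_paths produces, while the trie
# stores them compactly by sharing common prefixes such as [0, 1] and
# [0, 1, 2].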


def wrapper_getSP_trie(weight, directed, itr_item):
    g, i = itr_item
    return i, get_sps_as_trie(g, weight, directed)
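

if __name__ == '__main__':
    # Run the hedged usage sketch defined above and print the resulting
    # 2x2 kernel matrix for the two toy graphs.
    print(_example_structuralspkernel())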