- """
- @author: linlin
- @references: Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In: Proceedings of the Fifth IEEE International Conference on Data Mining (ICDM 2005), Nov 2005, 8 pp. IEEE.
- """
-
- import sys
- sys.path.insert(0, "../")
- import time
- from itertools import combinations_with_replacement, product
- from functools import partial
- from multiprocessing import Pool, cpu_count
- 
- from tqdm import tqdm
-
- import networkx as nx
- import numpy as np
-
- from pygraph.utils.utils import getSPGraph
- from pygraph.utils.graphdataset import get_dataset_attributes
-
-
- def spkernel(*args,
- node_label='atom',
- edge_weight=None,
- node_kernels=None,
- n_jobs=None):
-     """Calculate shortest-path kernels between graphs.
- 
-     Parameters
-     ----------
-     Gn : List of NetworkX graphs
-         List of graphs between which the kernels are calculated.
-     /
-     G1, G2 : NetworkX graphs
-         Two graphs between which the kernel is calculated.
-     node_label : string
-         Node attribute used as the label. The default node label is 'atom'.
-     edge_weight : string
-         Edge attribute name corresponding to the edge weight. If None, all
-         edge weights are set to 1.
-     node_kernels : dict
-         A dictionary of kernel functions for nodes, including 3 items: 'symb'
-         for symbolic node labels, 'nsymb' for non-symbolic node labels, and
-         'mix' for both labels. The first 2 functions take two node labels as
-         parameters, and the 'mix' function takes 4 parameters: a symbolic and
-         a non-symbolic label for each of the two nodes. Each non-symbolic
-         label is in the form of a 2-D array (n_samples, n_features). Each
-         function returns a number as the kernel value. Ignored when nodes
-         are unlabeled.
-     n_jobs : int
-         Number of parallel processes. If None, all available CPU cores are
-         used.
- 
-     Returns
-     -------
-     Kmatrix : Numpy matrix
-         Kernel matrix, each element of which is the shortest-path kernel
-         between 2 graphs.
-     run_time : float
-         Time spent on building the kernel matrix, in seconds.
-     idx : list
-         Indices of the graphs that remain after graphs with no edges are
-         removed.
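- 
-     Examples
-     --------
-     A minimal sketch of a ``node_kernels`` dict; the delta and Gaussian
-     kernels below are illustrative assumptions, not requirements:
- 
-     >>> import numpy as np
-     >>> def delta(x, y):
-     ...     return 1 if x == y else 0
-     >>> def gaussian(x, y, gamma=1.0):
-     ...     x, y = np.array(x, dtype=float), np.array(y, dtype=float)
-     ...     return float(np.exp(-gamma * np.sum((x - y) ** 2)))
-     >>> node_kernels = {'symb': delta,
-     ...                 'nsymb': gaussian,
-     ...                 'mix': lambda s1, s2, a1, a2: delta(s1, s2) * gaussian(a1, a2)}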
- """
- # pre-process
- Gn = args[0] if len(args) == 1 else [args[0], args[1]]
-
-     weight = None
-     if edge_weight is None:
-         print('\n No edge weight is specified. All edge weights are set to 1.\n')
-     else:
-         try:
-             some_weight = list(
-                 nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
-             if isinstance(some_weight, (float, int)):
-                 weight = edge_weight
-             else:
-                 print(
-                     '\n The edge weight with name "%s" is not a float or integer. All edge weights are set to 1.\n'
-                     % edge_weight)
-         except IndexError:
-             print(
-                 '\n The edge weight with name "%s" is not found in the edge attributes. All edge weights are set to 1.\n'
-                 % edge_weight)
- ds_attrs = get_dataset_attributes(
- Gn,
- attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
- node_label=node_label)
-
-     # remove graphs with no edges, as no shortest path can be found in their
-     # structures, so the kernel between such a graph and itself will be zero.
- len_gn = len(Gn)
- Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
- idx = [G[0] for G in Gn]
- Gn = [G[1] for G in Gn]
- if len(Gn) != len_gn:
- print('\n %d graphs are removed as they don\'t contain edges.\n' %
- (len_gn - len(Gn)))
-
- start_time = time.time()
-
-     if n_jobs is None:
-         n_jobs = cpu_count()
-     pool = Pool(n_jobs)
- # get shortest path graphs of Gn
-     getsp_partial = partial(wrap_getSPGraph, Gn, weight)
-     if len(Gn) < 100:
-         # use the same chunksize heuristic as pool.map when the iterable
-         # holds fewer than 100 items
-         chunksize, extra = divmod(len(Gn), n_jobs * 4)
-         if extra:
-             chunksize += 1
-     else:
-         chunksize = 100
- for i, g in tqdm(
- pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
- desc='getting sp graphs',
- file=sys.stdout):
- Gn[i] = g
-
- # # ---- use pool.map to parallel ----
- # result_sp = pool.map(getsp_partial, range(0, len(Gn)))
- # for i in result_sp:
- # Gn[i[0]] = i[1]
- # or
- # getsp_partial = partial(wrap_getSPGraph, Gn, edge_weight)
- # for i, g in tqdm(
- # pool.map(getsp_partial, range(0, len(Gn))),
- # desc='getting sp graphs',
- # file=sys.stdout):
- # Gn[i] = g
-
- # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
- # sp_ml = [0] * len(Gn) # shortest path matrices
- # for i in result_sp:
- # sp_ml[i[0]] = i[1]
- # edge_x_g = [[] for i in range(len(sp_ml))]
- # edge_y_g = [[] for i in range(len(sp_ml))]
- # edge_w_g = [[] for i in range(len(sp_ml))]
- # for idx, item in enumerate(sp_ml):
- # for i1 in range(len(item)):
- # for i2 in range(i1 + 1, len(item)):
- # if item[i1, i2] != np.inf:
- # edge_x_g[idx].append(i1)
- # edge_y_g[idx].append(i2)
- # edge_w_g[idx].append(item[i1, i2])
- # print(len(edge_x_g[0]))
- # print(len(edge_y_g[0]))
- # print(len(edge_w_g[0]))
-
- Kmatrix = np.zeros((len(Gn), len(Gn)))
-
- # ---- use pool.imap_unordered to parallel and track progress. ----
- do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
- itr = combinations_with_replacement(range(0, len(Gn)), 2)
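-     # number of unordered pairs of graphs, diagonal included: n * (n + 1) / 2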
- len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
- if len_itr < 100:
- chunksize, extra = divmod(len_itr, n_jobs * 4)
- if extra:
- chunksize += 1
- else:
- chunksize = 100
- for i, j, kernel in tqdm(
- pool.imap_unordered(do_partial, itr, chunksize),
- desc='calculating kernels',
- file=sys.stdout):
- Kmatrix[i][j] = kernel
- Kmatrix[j][i] = kernel
- pool.close()
- pool.join()
-
- # # ---- use pool.map to parallel. ----
- # # result_perf = pool.map(do_partial, itr)
- # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
- # itr = combinations_with_replacement(range(0, len(Gn)), 2)
- # for i, j, kernel in tqdm(
- # pool.map(do_partial, itr), desc='calculating kernels',
- # file=sys.stdout):
- # Kmatrix[i][j] = kernel
- # Kmatrix[j][i] = kernel
- # pool.close()
- # pool.join()
-
-     # # ---- use joblib.Parallel to parallel and track progress. ----
-     # from joblib import Parallel, delayed
- # result_perf = Parallel(
- # n_jobs=n_jobs, verbose=10)(
- # delayed(do_partial)(ij)
- # for ij in combinations_with_replacement(range(0, len(Gn)), 2))
- # result_perf = [
- # do_partial(ij)
- # for ij in combinations_with_replacement(range(0, len(Gn)), 2)
- # ]
- # for i in result_perf:
- # Kmatrix[i[0]][i[1]] = i[2]
- # Kmatrix[i[1]][i[0]] = i[2]
-
- # # ---- direct running, normally use single CPU core. ----
- # itr = combinations_with_replacement(range(0, len(Gn)), 2)
- # for gs in tqdm(itr, desc='calculating kernels', file=sys.stdout):
- # i, j, kernel = spkernel_do(Gn, ds_attrs, node_label, node_kernels, gs)
- # Kmatrix[i][j] = kernel
- # Kmatrix[j][i] = kernel
-
- run_time = time.time() - start_time
- print(
- "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
- % (len(Gn), run_time))
-
- return Kmatrix, run_time, idx
-
-
- def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):
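-     """Compute the shortest-path kernel between the single pair of graphs
-     Gn[i] and Gn[j], with (i, j) = ij; a helper made for parallelization.
-     Returns i, j and the kernel value.
-     """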
-     i, j = ij
-     g1 = Gn[i]
-     g2 = Gn[j]
-     Kmatrix = 0
-
-     try:
-         # precompute all pairwise vertex kernel values first, a trick
-         # borrowed from the Fast Computation of Shortest Path Kernel (FCSP).
-         if ds_attrs['node_labeled']:
-             # nodes labeled with both symbolic and non-symbolic labels
-             if ds_attrs['node_attr_dim'] > 0:
-                 kn = node_kernels['mix']
-                 vk_dict = {}  # vertex kernel values dict
- for n1, n2 in product(
- g1.nodes(data=True), g2.nodes(data=True)):
- vk_dict[(n1[0], n2[0])] = kn(
- n1[1][node_label], n2[1][node_label],
- [n1[1]['attributes']], [n2[1]['attributes']])
-             # nodes labeled with symbolic labels only
-             else:
-                 kn = node_kernels['symb']
-                 vk_dict = {}  # vertex kernel values dict
- for n1 in g1.nodes(data=True):
- for n2 in g2.nodes(data=True):
- vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
- n2[1][node_label])
-         else:
-             # nodes labeled with non-symbolic labels only
-             if ds_attrs['node_attr_dim'] > 0:
-                 kn = node_kernels['nsymb']
-                 vk_dict = {}  # vertex kernel values dict
- for n1 in g1.nodes(data=True):
- for n2 in g2.nodes(data=True):
- vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
- [n2[1]['attributes']])
- # node unlabeled
- else:
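-                 # with unlabeled nodes every vertex kernel is 1, so the kernel
-                 # reduces to counting pairs of shortest paths of equal length.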
-                 for e1, e2 in product(
-                         g1.edges(data=True), g2.edges(data=True)):
- if e1[2]['cost'] == e2[2]['cost']:
- Kmatrix += 1
- return i, j, Kmatrix
-
- # compute graph kernels
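-         # the kernel is a sum over all pairs of shortest-path edges with equal
-         # cost, each pair weighted by the vertex kernels at its endpoints
-         # (Borgwardt & Kriegel, 2005).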
- if ds_attrs['is_directed']:
- for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
- if e1[2]['cost'] == e2[2]['cost']:
-                     # for directed graphs, each pair of edges is matched in a
-                     # single orientation only.
-                     nk11 = vk_dict[(e1[0], e2[0])]
-                     nk22 = vk_dict[(e1[1], e2[1])]
-                     Kmatrix += nk11 * nk22
- else:
- for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
- if e1[2]['cost'] == e2[2]['cost']:
-                     # each pair of edges is matched in both orientations, since
-                     # an undirected shortest path can be walked starting from
-                     # either of its end nodes.
-                     nk11 = vk_dict[(e1[0], e2[0])]
-                     nk12 = vk_dict[(e1[0], e2[1])]
-                     nk21 = vk_dict[(e1[1], e2[0])]
-                     nk22 = vk_dict[(e1[1], e2[1])]
- kn1 = nk11 * nk22
- kn2 = nk12 * nk21
- Kmatrix += kn1 + kn2
-
-         # # ---- the exact implementation of the Fast Computation of Shortest
-         # # Path Kernel (FCSP); sadly it is slower than the implementation above.
- # # compute vertex kernel matrix
- # try:
- # vk_mat = np.zeros((nx.number_of_nodes(g1),
- # nx.number_of_nodes(g2)))
- # g1nl = enumerate(g1.nodes(data=True))
- # g2nl = enumerate(g2.nodes(data=True))
- # for i1, n1 in g1nl:
- # for i2, n2 in g2nl:
- # vk_mat[i1][i2] = kn(
- # n1[1][node_label], n2[1][node_label],
- # [n1[1]['attributes']], [n2[1]['attributes']])
-
- # range1 = range(0, len(edge_w_g[i]))
- # range2 = range(0, len(edge_w_g[j]))
- # for i1 in range1:
- # x1 = edge_x_g[i][i1]
- # y1 = edge_y_g[i][i1]
- # w1 = edge_w_g[i][i1]
- # for i2 in range2:
- # x2 = edge_x_g[j][i2]
- # y2 = edge_y_g[j][i2]
- # w2 = edge_w_g[j][i2]
- # ke = (w1 == w2)
- # if ke > 0:
- # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
- # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
- # Kmatrix += kn1 + kn2
- except KeyError: # missing labels or attributes
- pass
-
- return i, j, Kmatrix
-
-
- def wrap_getSPGraph(Gn, weight, i):
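-     # carry the graph index through imap_unordered so the caller can write
-     # each shortest-path graph back to its position in Gn.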
- return i, getSPGraph(Gn[i], edge_weight=weight)
- # return i, nx.floyd_warshall_numpy(Gn[i], weight=weight)
-
-
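- 
- 
- if __name__ == '__main__':
-     # A minimal usage sketch, not part of the kernel implementation: build two
-     # tiny node-labeled graphs and compute their shortest-path kernel matrix.
-     # The delta and Gaussian node kernels below are illustrative choices, not
-     # requirements of spkernel. Note: with the 'spawn' multiprocessing start
-     # method (e.g. on Windows), move these kernel functions to module level so
-     # that worker processes can import them.
- 
-     def deltakernel(x, y):
-         # 1 if the two symbolic labels are equal, else 0
-         return 1 if x == y else 0
- 
-     def gaussiankernel(x, y, gamma=1.0):
-         # Gaussian (RBF) kernel between two attribute vectors
-         x, y = np.array(x, dtype=float), np.array(y, dtype=float)
-         return float(np.exp(-gamma * np.sum((x - y) ** 2)))
- 
-     def mixkernel(s1, s2, a1, a2):
-         # combine the symbolic and non-symbolic kernels by product
-         return deltakernel(s1, s2) * gaussiankernel(a1, a2)
- 
-     node_kernels = {
-         'symb': deltakernel,
-         'nsymb': gaussiankernel,
-         'mix': mixkernel
-     }
- 
-     g1 = nx.Graph()
-     g1.add_nodes_from([(0, {'atom': 'C', 'attributes': [0.0, 1.0]}),
-                        (1, {'atom': 'O', 'attributes': [1.0, 0.0]}),
-                        (2, {'atom': 'C', 'attributes': [0.5, 0.5]})])
-     g1.add_edges_from([(0, 1), (1, 2)])
-     g2 = nx.Graph()
-     g2.add_nodes_from([(0, {'atom': 'C', 'attributes': [0.0, 1.0]}),
-                        (1, {'atom': 'C', 'attributes': [1.0, 1.0]})])
-     g2.add_edge(0, 1)
- 
-     Kmatrix, run_time, idx = spkernel(
-         [g1, g2], node_label='atom', node_kernels=node_kernels, n_jobs=2)
-     print(Kmatrix)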