diff --git a/lang/fr/gklearn/kernels/spKernel.py b/lang/fr/gklearn/kernels/spKernel.py
new file mode 100644
index 0000000..b48a905
--- /dev/null
+++ b/lang/fr/gklearn/kernels/spKernel.py
@@ -0,0 +1,326 @@
+"""
+@author: linlin
+
+@references:
+
+    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data
+    Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
+"""
+
+import sys
+import time
+from itertools import product
+from functools import partial
+from multiprocessing import Pool, cpu_count
+from tqdm import tqdm
+
+import networkx as nx
+import numpy as np
+
+from gklearn.utils.utils import getSPGraph
+from gklearn.utils.graphdataset import get_dataset_attributes
+from gklearn.utils.parallel import parallel_gm
+
+
+def spkernel(*args,
+             node_label='atom',
+             edge_weight=None,
+             node_kernels=None,
+             parallel='imap_unordered',
+             n_jobs=None,
+             chunksize=None,
+             verbose=True):
+    """Calculate shortest-path kernels between graphs.
+
+    Parameters
+    ----------
+    Gn : List of NetworkX graphs
+        List of graphs between which the kernels are calculated.
+
+    G1, G2 : NetworkX graphs
+        Two graphs between which the kernel is calculated.
+
+    node_label : string
+        Node attribute used as label. The default node label is 'atom'.
+
+    edge_weight : string
+        Edge attribute name corresponding to the edge weight. If None, or if
+        the attribute is missing or non-numeric, all edge weights are set
+        to 1.
+
+    node_kernels : dict
+        A dictionary of kernel functions for nodes, including 3 items: 'symb'
+        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
+        'mix' for both. The first 2 functions take two node labels as
+        parameters, and the 'mix' function takes 4 parameters: a symbolic and
+        a non-symbolic label for each of the two nodes. Each non-symbolic
+        label is in the form of a 2-D array (n_samples, n_features). Each
+        function returns a number as the kernel value. Ignored when nodes are
+        unlabeled.
+
+    parallel : string or None
+        Parallelization method. 'imap_unordered' (the default) uses a
+        multiprocessing pool; None disables parallelization of the
+        preprocessing step.
+
+    n_jobs : int
+        Number of jobs for parallelization. If None, all available CPU cores
+        are used.
+
+    chunksize : int
+        Chunk size for the multiprocessing pool. If None, a suitable size is
+        chosen automatically.
+
+    verbose : bool
+        Whether to print progress and status information.
+
+    Returns
+    -------
+    Kmatrix : Numpy matrix
+        Kernel matrix, each element of which is the sp kernel between two
+        graphs.
+    run_time : float
+        Time in seconds spent building the kernel matrix.
+    idx : list
+        Original indices of the graphs that were kept (graphs without edges
+        are removed; see below).
+    """
+    # pre-process
+    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
+    Gn = [g.copy() for g in Gn]
+    weight = None
+    if edge_weight is None:
+        if verbose:
+            print('\n No edge weight specified. Set all weights to 1.\n')
+    else:
+        try:
+            some_weight = list(
+                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
+            if isinstance(some_weight, (float, int)):
+                weight = edge_weight
+            else:
+                if verbose:
+                    print('\n Edge weight with name "%s" is not float or '
+                          'integer. Set all weights to 1.\n' % edge_weight)
+        except IndexError:
+            if verbose:
+                print('\n Edge weight with name "%s" is not found in the '
+                      'edge attributes. Set all weights to 1.\n' % edge_weight)
+    ds_attrs = get_dataset_attributes(
+        Gn,
+        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
+        node_label=node_label)
+
+    # remove graphs with no edges, as no sp can be found in their structures,
+    # so the kernel between such a graph and itself would be zero.
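+    # (The original positions of the surviving graphs are collected in 'idx'
+    # below and returned to the caller, so that external labels or targets
+    # can be re-aligned with the filtered graph list.)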
+    len_gn = len(Gn)
+    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
+    idx = [G[0] for G in Gn]
+    Gn = [G[1] for G in Gn]
+    if len(Gn) != len_gn:
+        if verbose:
+            print('\n %d graphs are removed as they don\'t contain edges.\n' %
+                  (len_gn - len(Gn)))
+
+    start_time = time.time()
+
+    if parallel == 'imap_unordered':
+        if n_jobs is None:
+            n_jobs = cpu_count()
+        pool = Pool(n_jobs)
+        # get shortest path graphs of Gn
+        getsp_partial = partial(wrapper_getSPGraph, weight)
+        itr = zip(Gn, range(0, len(Gn)))
+        if chunksize is None:
+            if len(Gn) < 100 * n_jobs:
+                # # use the same default chunksize as pool.map when the
+                # # iterable is short
+                # chunksize, extra = divmod(len(Gn), n_jobs * 4)
+                # if extra:
+                #     chunksize += 1
+                chunksize = int(len(Gn) / n_jobs) + 1
+            else:
+                chunksize = 100
+        if verbose:
+            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
+                            desc='getting sp graphs', file=sys.stdout)
+        else:
+            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
+        for i, g in iterator:
+            Gn[i] = g
+        pool.close()
+        pool.join()
+
+    elif parallel is None:
+        pass
+#        # ---- direct running, normally uses a single CPU core. ----
+#        for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout):
+#            i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i))
+
+    # # ---- use pool.map to parallelize ----
+    # result_sp = pool.map(getsp_partial, range(0, len(Gn)))
+    # for i in result_sp:
+    #     Gn[i[0]] = i[1]
+    # or
+    # getsp_partial = partial(wrap_getSPGraph, Gn, weight)
+    # for i, g in tqdm(
+    #         pool.map(getsp_partial, range(0, len(Gn))),
+    #         desc='getting sp graphs',
+    #         file=sys.stdout):
+    #     Gn[i] = g
+
+    # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
+    # sp_ml = [0] * len(Gn)  # shortest path matrices
+    # for i in result_sp:
+    #     sp_ml[i[0]] = i[1]
+    # edge_x_g = [[] for i in range(len(sp_ml))]
+    # edge_y_g = [[] for i in range(len(sp_ml))]
+    # edge_w_g = [[] for i in range(len(sp_ml))]
+    # for idx, item in enumerate(sp_ml):
+    #     for i1 in range(len(item)):
+    #         for i2 in range(i1 + 1, len(item)):
+    #             if item[i1, i2] != np.inf:
+    #                 edge_x_g[idx].append(i1)
+    #                 edge_y_g[idx].append(i2)
+    #                 edge_w_g[idx].append(item[i1, i2])
+    # print(len(edge_x_g[0]))
+    # print(len(edge_y_g[0]))
+    # print(len(edge_w_g[0]))
+
+    Kmatrix = np.zeros((len(Gn), len(Gn)))
+
+    # ---- use pool.imap_unordered to parallelize and track progress. ----
+    def init_worker(gn_toshare):
+        # share the graph list with the worker processes once, instead of
+        # pickling both graphs for every pair.
+        global G_gn
+        G_gn = gn_toshare
+    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
+    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
+
+    # # ---- use pool.map to parallelize. ----
+    # # result_perf = pool.map(do_partial, itr)
+    # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
+    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
+    # for i, j, kernel in tqdm(
+    #         pool.map(do_partial, itr), desc='calculating kernels',
+    #         file=sys.stdout):
+    #     Kmatrix[i][j] = kernel
+    #     Kmatrix[j][i] = kernel
+    # pool.close()
+    # pool.join()
+
+    # # ---- use joblib.Parallel to parallelize and track progress. ----
+    # result_perf = Parallel(
+    #     n_jobs=n_jobs, verbose=10)(
+    #         delayed(do_partial)(ij)
+    #         for ij in combinations_with_replacement(range(0, len(Gn)), 2))
+    # result_perf = [
+    #     do_partial(ij)
+    #     for ij in combinations_with_replacement(range(0, len(Gn)), 2)
+    # ]
+    # for i in result_perf:
+    #     Kmatrix[i[0]][i[1]] = i[2]
+    #     Kmatrix[i[1]][i[0]] = i[2]
+
+#    # ---- direct running, normally uses a single CPU core. ----
+#    from itertools import combinations_with_replacement
+#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
+#    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
+#        kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
+#        Kmatrix[i][j] = kernel
+#        Kmatrix[j][i] = kernel
+
+    run_time = time.time() - start_time
+    if verbose:
+        print("\n --- shortest path kernel matrix of size %d built in %s seconds ---"
+              % (len(Gn), run_time))
+
+    return Kmatrix, run_time, idx
+
+
+def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
+
+    kernel = 0
+
+    # compute vertex kernels for all node pairs first, method borrowed
+    # from FCSP.
+    vk_dict = {}  # vertex kernel values, keyed by node pairs
+    if ds_attrs['node_labeled']:
+        # nodes are symb and non-symb labeled
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['mix']
+            for n1, n2 in product(g1.nodes(data=True), g2.nodes(data=True)):
+                vk_dict[(n1[0], n2[0])] = kn(
+                    n1[1][node_label], n2[1][node_label],
+                    n1[1]['attributes'], n2[1]['attributes'])
+        # nodes are symb labeled only
+        else:
+            kn = node_kernels['symb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
+                                                 n2[1][node_label])
+    else:
+        # nodes are non-symb labeled only
+        if ds_attrs['node_attr_dim'] > 0:
+            kn = node_kernels['nsymb']
+            for n1 in g1.nodes(data=True):
+                for n2 in g2.nodes(data=True):
+                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
+                                                 n2[1]['attributes'])
+        # nodes are unlabeled
+        else:
+            # without node information, only shortest path lengths matter:
+            # count the pairs of shortest paths of equal length and return.
+            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+                if e1[2]['cost'] == e2[2]['cost']:
+                    kernel += 1
+            return kernel
+
+    # compute the graph kernel from the vertex kernels
+    if ds_attrs['is_directed']:
+        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+            if e1[2]['cost'] == e2[2]['cost']:
+                nk11 = vk_dict[(e1[0], e2[0])]
+                nk22 = vk_dict[(e1[1], e2[1])]
+                kernel += nk11 * nk22
+    else:
+        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+            if e1[2]['cost'] == e2[2]['cost']:
+                # each edge walk is counted twice, starting from both of its
+                # end nodes.
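+                # For example, matching shortest-path edges e1 = (u, v) and
+                # e2 = (x, y) contribute k(u, x) * k(v, y) + k(u, y) * k(v, x),
+                # so the result is independent of the storage orientation of
+                # the undirected edges.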
+                nk11 = vk_dict[(e1[0], e2[0])]
+                nk12 = vk_dict[(e1[0], e2[1])]
+                nk21 = vk_dict[(e1[1], e2[0])]
+                nk22 = vk_dict[(e1[1], e2[1])]
+                kn1 = nk11 * nk22
+                kn2 = nk12 * nk21
+                kernel += kn1 + kn2
+
+    # # ---- exact implementation of the Fast Computation of Shortest Path
+    # # Kernel (FCSP); sadly it is slower than the current implementation.
+    # # compute vertex kernels
+    # try:
+    #     vk_mat = np.zeros((nx.number_of_nodes(g1),
+    #                        nx.number_of_nodes(g2)))
+    #     g1nl = enumerate(g1.nodes(data=True))
+    #     g2nl = enumerate(g2.nodes(data=True))
+    #     for i1, n1 in g1nl:
+    #         for i2, n2 in g2nl:
+    #             vk_mat[i1][i2] = kn(
+    #                 n1[1][node_label], n2[1][node_label],
+    #                 [n1[1]['attributes']], [n2[1]['attributes']])
+
+    #     range1 = range(0, len(edge_w_g[i]))
+    #     range2 = range(0, len(edge_w_g[j]))
+    #     for i1 in range1:
+    #         x1 = edge_x_g[i][i1]
+    #         y1 = edge_y_g[i][i1]
+    #         w1 = edge_w_g[i][i1]
+    #         for i2 in range2:
+    #             x2 = edge_x_g[j][i2]
+    #             y2 = edge_y_g[j][i2]
+    #             w2 = edge_w_g[j][i2]
+    #             ke = (w1 == w2)
+    #             if ke > 0:
+    #                 kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
+    #                 kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
+    #                 kernel += kn1 + kn2
+
+    return kernel
+
+
+def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label,
+                             node_kernels)
+
+
+#def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item):
+#    g1 = itr_item[0][0]
+#    g2 = itr_item[0][1]
+#    i = itr_item[1][0]
+#    j = itr_item[1][1]
+#    return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels)
+
+
+def wrapper_getSPGraph(weight, itr_item):
+    g = itr_item[0]
+    i = itr_item[1]
+    return i, getSPGraph(g, edge_weight=weight)
+    # return i, nx.floyd_warshall_numpy(g, weight=weight)
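+
+
+if __name__ == '__main__':
+    # A minimal usage sketch, not part of the kernel itself. It assumes that
+    # gklearn.utils.kernels provides deltakernel, gaussiankernel and
+    # kernelproduct (as used elsewhere in this repository); the two toy
+    # 'molecules' below are purely illustrative.
+    import functools
+    from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
+
+    g1 = nx.Graph()
+    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}),
+                       (2, {'atom': 'O'})])
+    g1.add_edges_from([(0, 1), (1, 2)])
+    g2 = nx.Graph()
+    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
+    g2.add_edge(0, 1)
+
+    # one kernel per label type; 'mix' combines the symbolic and
+    # non-symbolic kernels. Only 'symb' is used for these toy graphs.
+    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+    sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel,
+                   'mix': mixkernel}
+
+    Kmatrix, run_time, idx = spkernel([g1, g2], node_label='atom',
+                                      node_kernels=sub_kernels, n_jobs=2)
+    print(Kmatrix)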