From e2af9432627ee64645e193d1f78d7c8b6b01548a Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 14 Dec 2020 10:28:53 +0100 Subject: [PATCH] [Very important!!!] fix bugs in ssp kernel functions, before this update symbolic/discrete edge labels were ignored! --- gklearn/kernels/structural_sp.py | 4 +- gklearn/kernels/structuralspKernel.py | 194 +++++++++++++++++----------------- 2 files changed, 100 insertions(+), 98 deletions(-) diff --git a/gklearn/kernels/structural_sp.py b/gklearn/kernels/structural_sp.py index 1464807..ba98a6c 100644 --- a/gklearn/kernels/structural_sp.py +++ b/gklearn/kernels/structural_sp.py @@ -252,6 +252,7 @@ class StructuralSP(GraphKernel): if not kpath: break kernel += kpath # add up kernels of all paths +# print(kernel, ',', p1, ',', p2) else: for p1, p2 in product(spl1, spl2): if len(p1) == len(p2): @@ -398,6 +399,7 @@ class StructuralSP(GraphKernel): if not kpath: break kernel += kpath # add up kernels of all paths +# print(kernel, ',', p1, ',', p2) else: for p1, p2 in product(spl1, spl2): if len(p1) == len(p2): @@ -495,4 +497,4 @@ class StructuralSP(GraphKernel): else: pass - return ek_dict \ No newline at end of file + return ek_dict \ No newline at end of file diff --git a/gklearn/kernels/structuralspKernel.py b/gklearn/kernels/structuralspKernel.py index cfafc8c..a1d2539 100644 --- a/gklearn/kernels/structuralspKernel.py +++ b/gklearn/kernels/structuralspKernel.py @@ -5,9 +5,9 @@ Created on Thu Sep 27 10:56:23 2018 @author: linlin -@references: +@references: - [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For + [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). """ @@ -43,7 +43,7 @@ def structuralspkernel(*args, ---------- Gn : List of NetworkX graph List of graphs between which the kernels are computed. - + G1, G2 : NetworkX graphs Two graphs between which the kernel is computed. 
@@ -51,25 +51,25 @@ def structuralspkernel(*args, Node attribute used as label. The default node label is atom. edge_weight : string - Edge attribute name corresponding to the edge weight. Applied for the + Edge attribute name corresponding to the edge weight. Applied for the computation of the shortest paths. edge_label : string Edge attribute used as label. The default edge label is bond_type. node_kernels : dict - A dictionary of kernel functions for nodes, including 3 items: 'symb' - for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' - for both labels. The first 2 functions take two node labels as + A dictionary of kernel functions for nodes, including 3 items: 'symb' + for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' + for both labels. The first 2 functions take two node labels as parameters, and the 'mix' function takes 4 parameters, a symbolic and a non-symbolic label for each the two nodes. Each label is in form of 2-D dimension array (n_samples, n_features). Each function returns a number as the kernel value. Ignored when nodes are unlabeled. edge_kernels : dict - A dictionary of kernel functions for edges, including 3 items: 'symb' - for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix' - for both labels. The first 2 functions take two edge labels as + A dictionary of kernel functions for edges, including 3 items: 'symb' + for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix' + for both labels. The first 2 functions take two edge labels as parameters, and the 'mix' function takes 4 parameters, a symbolic and a non-symbolic label for each the two edges. Each label is in form of 2-D dimension array (n_samples, n_features). 
Each function returns a number @@ -89,7 +89,7 @@ def structuralspkernel(*args, Return ------ Kmatrix : Numpy matrix - Kernel matrix, each element of which is the mean average structural + Kernel matrix, each element of which is the mean average structural shortest path kernel between 2 praphs. """ # pre-process @@ -135,9 +135,9 @@ def structuralspkernel(*args, chunksize = 100 # get shortest path graphs of Gn if compute_method == 'trie': - getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed']) + getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed']) else: - getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed']) + getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed']) if verbose: iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize), desc='getting shortest paths', file=sys.stdout) @@ -161,17 +161,17 @@ def structuralspkernel(*args, else: for g in iterator: splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) - + # ss = 0 # ss += sys.getsizeof(splist) # for spss in splist: # ss += sys.getsizeof(spss) # for spp in spss: # ss += sys.getsizeof(spp) - - + + # time.sleep(20) - + # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) @@ -194,21 +194,21 @@ def structuralspkernel(*args, Kmatrix = np.zeros((len(Gn), len(Gn))) - # ---- use pool.imap_unordered to parallel and track progress. ---- + # ---- use pool.imap_unordered to parallel and track progress. 
---- if parallel == 'imap_unordered': def init_worker(spl_toshare, gs_toshare): global G_spl, G_gs G_spl = spl_toshare - G_gs = gs_toshare - if compute_method == 'trie': - do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label, - node_kernels, edge_kernels) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) - else: - do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, - node_kernels, edge_kernels) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + G_gs = gs_toshare + if compute_method == 'trie': + do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label, + node_kernels, edge_kernels) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + else: + do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, + node_kernels, edge_kernels) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) # ---- direct running, normally use single CPU core. ---- elif parallel is None: @@ -232,10 +232,10 @@ def structuralspkernel(*args, # print("error here ") Kmatrix[i][j] = kernel Kmatrix[j][i] = kernel - + # # ---- use pool.map to parallel. ---- # pool = Pool(n_jobs) -# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, +# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, # node_kernels, edge_kernels) # itr = zip(combinations_with_replacement(Gn, 2), # combinations_with_replacement(splist, 2), @@ -249,7 +249,7 @@ def structuralspkernel(*args, # pool.join() # # ---- use pool.imap_unordered to parallel and track progress. 
---- -# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, +# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, # node_kernels, edge_kernels) # itr = zip(combinations_with_replacement(Gn, 2), # combinations_with_replacement(splist, 2), @@ -282,7 +282,7 @@ def structuralspkernel(*args, def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label, node_kernels, edge_kernels): - + kernel = 0 # First, compute shortest path matrices, method borrowed from FCSP. @@ -373,25 +373,25 @@ def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label, return kernel -def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, +def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, edge_kernels, itr): i = itr[0] j = itr[1] - return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j], - ds_attrs, node_label, edge_label, + return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j], + ds_attrs, node_label, edge_label, node_kernels, edge_kernels) - - + + def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, node_kernels, edge_kernels): - + # # traverse all paths in graph1. Deep-first search is applied. # def traverseBothTrie(root, trie2, kernel, pcurrent=[]): # for key, node in root['children'].items(): # pcurrent.append(key) # if node['isEndOfWord']: # # print(node['count']) -# traverseTrie2(trie2.root, pcurrent, kernel, +# traverseTrie2(trie2.root, pcurrent, kernel, # pcurrent=[]) # if node['children'] != {}: # traverseBothTrie(node, trie2, kernel, pcurrent) @@ -399,14 +399,14 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, # del pcurrent[-1] # if pcurrent != []: # del pcurrent[-1] -# -# -# # traverse all paths in graph2 and find out those that are not in -# # graph1. Deep-first search is applied. +# +# +# # traverse all paths in graph2 and find out those that are not in +# # graph1. Deep-first search is applied. 
# def traverseTrie2(root, p1, kernel, pcurrent=[]): # for key, node in root['children'].items(): # pcurrent.append(key) -# if node['isEndOfWord']: +# if node['isEndOfWord']: # # print(node['count']) # kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict) # if node['children'] != {}: @@ -415,8 +415,8 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, # del pcurrent[-1] # if pcurrent != []: # del pcurrent[-1] -# -# +# +# # kernel = [0] # # # First, compute shortest path matrices, method borrowed from FCSP. @@ -437,7 +437,7 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, # pcurrent.append(key) # if node['isEndOfWord']: # # print(node['count']) -# traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict, +# traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict, # pcurrent=[]) # if node['children'] != {}: # traverseBothTrie(node, trie2, kernel, vk_dict, ek_dict, pcurrent) @@ -445,14 +445,14 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, # del pcurrent[-1] # if pcurrent != []: # del pcurrent[-1] -# -# -# # traverse all paths in graph2 and find out those that are not in -# # graph1. Deep-first search is applied. +# +# +# # traverse all paths in graph2 and find out those that are not in +# # graph1. Deep-first search is applied. # def traverseTrie2(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): # for key, node in root['children'].items(): # pcurrent.append(key) -# if node['isEndOfWord']: +# if node['isEndOfWord']: # # print(node['count']) # kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict) # if node['children'] != {}: @@ -461,8 +461,8 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, # del pcurrent[-1] # if pcurrent != []: # del pcurrent[-1] - - + + kernel = [0] # First, compute shortest path matrices, method borrowed from FCSP. 
@@ -483,20 +483,20 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, if ek_dict: traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) else: - traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) kernel = kernel[0] / (trie1[1] * trie2[1]) # Compute mean average return kernel -def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels, +def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels, edge_kernels, itr): i = itr[0] j = itr[1] - return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs, + return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs, node_label, edge_label, node_kernels, edge_kernels) - + def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs): # compute shortest path matrices, method borrowed from FCSP. @@ -528,7 +528,7 @@ def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs): # node unlabeled else: pass - + return vk_dict @@ -573,17 +573,17 @@ def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs): # edge unlabeled else: pass - - return ek_dict - - + + return ek_dict + + # traverse all paths in graph1. Deep-first search is applied. def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): for key, node in root['children'].items(): pcurrent.append(key) if node['isEndOfWord']: # print(node['count']) - traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict, pcurrent=[]) if node['children'] != {}: traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent) @@ -591,14 +591,14 @@ def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): del pcurrent[-1] if pcurrent != []: del pcurrent[-1] - - -# traverse all paths in graph2 and find out those that are not in -# graph1. Deep-first search is applied. 
+ + +# traverse all paths in graph2 and find out those that are not in +# graph1. Deep-first search is applied. def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): for key, node in root['children'].items(): pcurrent.append(key) - if node['isEndOfWord']: + if node['isEndOfWord']: # print(node['count']) if len(p1) == len(pcurrent): kpath = vk_dict[(p1[0], pcurrent[0])] @@ -616,7 +616,7 @@ def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): del pcurrent[-1] if pcurrent != []: del pcurrent[-1] - + # traverse all paths in graph1. Deep-first search is applied. def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): @@ -624,7 +624,7 @@ def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): pcurrent.append(key) if node['isEndOfWord']: # print(node['count']) - traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict, pcurrent=[]) if node['children'] != {}: traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent) @@ -632,14 +632,14 @@ def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): del pcurrent[-1] if pcurrent != []: del pcurrent[-1] - - -# traverse all paths in graph2 and find out those that are not in -# graph1. Deep-first search is applied. + + +# traverse all paths in graph2 and find out those that are not in +# graph1. Deep-first search is applied. def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): for key, node in root['children'].items(): pcurrent.append(key) - if node['isEndOfWord']: + if node['isEndOfWord']: # print(node['count']) if len(p1) == len(pcurrent): kpath = vk_dict[(p1[0], pcurrent[0])] @@ -655,15 +655,15 @@ def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): del pcurrent[-1] if pcurrent != []: del pcurrent[-1] - - + + # traverse all paths in graph1. Deep-first search is applied. 
def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): for key, node in root['children'].items(): pcurrent.append(key) if node['isEndOfWord']: # print(node['count']) - traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict, pcurrent=[]) if node['children'] != {}: traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent) @@ -671,14 +671,14 @@ def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): del pcurrent[-1] if pcurrent != []: del pcurrent[-1] - - -# traverse all paths in graph2 and find out those that are not in -# graph1. Deep-first search is applied. + + +# traverse all paths in graph2 and find out those that are not in +# graph1. Deep-first search is applied. def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): for key, node in root['children'].items(): pcurrent.append(key) - if node['isEndOfWord']: + if node['isEndOfWord']: # print(node['count']) if len(p1) == len(pcurrent): if len(p1) == 0: @@ -697,15 +697,15 @@ def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): del pcurrent[-1] if pcurrent != []: del pcurrent[-1] - - + + # traverse all paths in graph1. Deep-first search is applied. def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): for key, node in root['children'].items(): pcurrent.append(key) if node['isEndOfWord']: # print(node['count']) - traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict, pcurrent=[]) if node['children'] != {}: traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent) @@ -713,14 +713,14 @@ def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): del pcurrent[-1] if pcurrent != []: del pcurrent[-1] - - -# traverse all paths in graph2 and find out those that are not in -# graph1. Deep-first search is applied. 
+ + +# traverse all paths in graph2 and find out those that are not in +# graph1. Deep-first search is applied. def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): for key, node in root['children'].items(): pcurrent.append(key) - if node['isEndOfWord']: + if node['isEndOfWord']: # print(node['count']) if len(p1) == len(pcurrent): kernel[0] += 1 @@ -730,8 +730,8 @@ def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): del pcurrent[-1] if pcurrent != []: del pcurrent[-1] - - + + #def computePathKernel(p1, p2, vk_dict, ek_dict): # kernel = 0 # if vk_dict: @@ -771,7 +771,7 @@ def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): # else: # if len(p1) == len(p2): # kernel += 1 -# +# # return kernel @@ -804,7 +804,7 @@ def get_shortest_paths(G, weight, directed): # each edge walk is counted twice, starting from both its extreme nodes. if not directed: sp += [sptemp[::-1] for sptemp in spltemp] - + # add single nodes as length 0 paths. sp += [[n] for n in G.nodes()] return sp @@ -849,7 +849,7 @@ def get_sps_as_trie(G, weight, directed): # each edge walk is counted twice, starting from both its extreme nodes. if not directed: sptrie.insertWord(sp[::-1]) - + # add single nodes as length 0 paths. for n in G.nodes(): sptrie.insertWord([n])