From 123391d3a1e0b95ef361240dc40f4e04235f841a Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 19 Feb 2019 18:21:32 +0100 Subject: [PATCH] add comments in run_spkernel.py. --- .gitignore | 12 + notebooks/run_degree_differs_uhp.py | 5 +- notebooks/run_spkernel.py | 45 +-- notebooks/run_structuralspkernel.py | 3 +- notebooks/run_vertex_differs_uhp.py | 3 +- pygraph/kernels/randomWalkKernel.py | 4 +- pygraph/kernels/structuralspKernel.py | 613 +++++++++++++++++++++++++++------- pygraph/kernels/untilHPathKernel.py | 2 +- 8 files changed, 530 insertions(+), 157 deletions(-) diff --git a/.gitignore b/.gitignore index 3815c26..b6bbda3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,11 +2,23 @@ .ipynb_checkpoints datasets/* !datasets/ds.py +!datasets/Alkane/* +!datasets/acyclic/* +!datasets/MAO/* +!datasets/PAH/* +!datasets/MUTAG/* +!datasets/Letter-med/* +!datasets/ENZYMES_txt/* notebooks/results/* +notebooks/check_gm/* +notebooks/test_parallel/* requirements/* +pygraph/model.py +pygraph/kernels/*_sym.py *.npy *.eps *.dat +*.pyc __pycache__ ##*# diff --git a/notebooks/run_degree_differs_uhp.py b/notebooks/run_degree_differs_uhp.py index 122627e..ec17fa6 100644 --- a/notebooks/run_degree_differs_uhp.py +++ b/notebooks/run_degree_differs_uhp.py @@ -27,7 +27,8 @@ def run_ms(dataset, y, ds): from pygraph.kernels.untilHPathKernel import untilhpathkernel estimator = untilhpathkernel param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], - 'k_func': ['MinMax', 'tanimoto']} # ['MinMax']} + 'k_func': ['MinMax', 'tanimoto'], + 'compute_method': ['trie']} # ['MinMax']} param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha': np.logspace(-10, 10, num=41, base=10)}] @@ -58,7 +59,7 @@ for ds in dslist: ave_time = [] std_time = [] ave_degree = [] - for piece in range(1, 5): + for piece in range(0, 5): print('piece', str(piece), ':') Gn_p = Gn[len_1piece * piece:len_1piece * (piece + 1)] y_all_p = y_all[len_1piece * piece:len_1piece * (piece + 1)] diff --git a/notebooks/run_spkernel.py b/notebooks/run_spkernel.py index 2fb55a3..f4b9193 100644 --- a/notebooks/run_spkernel.py +++ b/notebooks/run_spkernel.py @@ -6,6 +6,7 @@ from pygraph.kernels.spKernel import spkernel from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct #from pygraph.utils.model_selection_precomputed import trial_do +# datasets dslist = [ {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb @@ -20,31 +21,31 @@ dslist = [ # node nsymb {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb + # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # # node/edge symb # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', # 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb - - # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb - # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb - # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb - # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, - # - # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb - # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb - # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb - # # {'name': 'MSRC21', 
'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb - # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb - - # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb - # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb - # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb - # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', - # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb - # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', - # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb - # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', - # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb +# +# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb +# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb +# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb +# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, +# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb +# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb +# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb +# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb +# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb +# +# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb +# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb +# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb +# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', +# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb +# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', +# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb +# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', +# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb # # not working below # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, @@ -53,12 +54,14 @@ dslist = [ # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, ] estimator = spkernel +# hyper-parameters mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) param_grid_precomputed = {'node_kernels': [ {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]} param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha': np.logspace(-10, 10, num=41, base=10)}] +# for each dataset, do model selection. 
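
The 'symb', 'nsymb' and 'mix' entries of this grid are the building blocks the
kernel picks from per dataset: a Dirac kernel for symbolic node labels, a
Gaussian kernel for real-valued node attributes, and their product when a graph
carries both. A minimal sketch of what these callables are assumed to compute;
the real implementations live in pygraph.utils.kernels and may differ in
details such as the Gaussian bandwidth, and the kernelproduct argument order is
only inferred from how node_kernels['mix'] is called in structuralspKernel.py:

    import functools
    import numpy as np

    def deltakernel(x, y):
        # Dirac kernel on symbolic labels: 1 if they match, 0 otherwise.
        return 1 if x == y else 0

    def gaussiankernel(x, y, gamma=1.0):
        # Gaussian (RBF) kernel on real-valued attribute vectors.
        d = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
        return np.exp(-gamma * np.dot(d, d))

    def kernelproduct(k1, k2, x1, x2, y1, y2):
        # Product of two base kernels; invoked as mix(label1, label2, attrs1, attrs2).
        return k1(x1, x2) * k2(y1, y2)

    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    print(mixkernel('C', 'N', [0.1, 0.2], [0.1, 0.3]))  # 0.0, labels differ
    print(mixkernel('C', 'C', [0.1, 0.2], [0.1, 0.3]))  # > 0, labels match
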
for ds in dslist: print() print(ds['name']) diff --git a/notebooks/run_structuralspkernel.py b/notebooks/run_structuralspkernel.py index 1164cc9..f8fd1a0 100644 --- a/notebooks/run_structuralspkernel.py +++ b/notebooks/run_structuralspkernel.py @@ -64,7 +64,8 @@ mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}], 'edge_kernels': - [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]} + [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}], + 'compute_method': ['trie']} param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha': np.logspace(-10, 10, num=41, base=10)}] diff --git a/notebooks/run_vertex_differs_uhp.py b/notebooks/run_vertex_differs_uhp.py index f4cea6a..cdfbe15 100644 --- a/notebooks/run_vertex_differs_uhp.py +++ b/notebooks/run_vertex_differs_uhp.py @@ -38,7 +38,8 @@ def run_ms(dataset, y, ds): from pygraph.kernels.untilHPathKernel import untilhpathkernel estimator = untilhpathkernel param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], - 'k_func': ['MinMax', 'tanimoto']} # ['MinMax']} + 'k_func': ['MinMax', 'tanimoto'], + 'compute_method': ['trie']} # ['MinMax']} param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha': np.logspace(-10, 10, num=41, base=10)}] diff --git a/pygraph/kernels/randomWalkKernel.py b/pygraph/kernels/randomWalkKernel.py index d6295fa..ae510ed 100644 --- a/pygraph/kernels/randomWalkKernel.py +++ b/pygraph/kernels/randomWalkKernel.py @@ -319,7 +319,7 @@ def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda): - # Frist, ompute kernels between all pairs of nodes, method borrowed + # Frist, compute kernels between all pairs of nodes, method borrowed # from FCSP. It is faster than directly computing all edge kernels # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the # graphs compared, which is the most case we went though. For very @@ -417,7 +417,7 @@ def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda): - # Frist, ompute kernels between all pairs of nodes, method borrowed + # Frist, compute kernels between all pairs of nodes, method borrowed # from FCSP. It is faster than directly computing all edge kernels # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the # graphs compared, which is the most case we went though. For very diff --git a/pygraph/kernels/structuralspKernel.py b/pygraph/kernels/structuralspKernel.py index d047fa9..0925179 100644 --- a/pygraph/kernels/structuralspKernel.py +++ b/pygraph/kernels/structuralspKernel.py @@ -20,6 +20,7 @@ import numpy as np from pygraph.utils.graphdataset import get_dataset_attributes from pygraph.utils.parallel import parallel_gm +from pygraph.utils.trie import Trie sys.path.insert(0, "../") @@ -30,6 +31,7 @@ def structuralspkernel(*args, edge_label='bond_type', node_kernels=None, edge_kernels=None, + compute_method='trie', n_jobs=None): """Calculate mean average structural shortest path kernels between graphs. 
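
The new compute_method keyword is the switch that run_structuralspkernel.py now
fixes to 'trie' in its parameter grid; any other value falls back to the
original list-of-paths ('naive') code path. A hedged sketch of a direct call,
using only keyword arguments visible in this patch; Gn is a placeholder for a
list of NetworkX graphs loaded elsewhere, and the exact return value is not
shown in this hunk (the timing code later in the file at least computes the
Gram matrix and the run time):

    import functools
    from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct
    from pygraph.kernels.structuralspKernel import structuralspkernel

    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}

    result = structuralspkernel(Gn,                      # list of nx.Graph objects
                                edge_label='bond_type',
                                node_kernels=sub_kernels,
                                edge_kernels=sub_kernels,
                                compute_method='trie',   # 'naive' keeps path lists
                                n_jobs=4)
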
@@ -99,14 +101,16 @@ def structuralspkernel(*args, # get shortest paths of each graph in Gn splist = [None] * len(Gn) pool = Pool(n_jobs) - # get shortest path graphs of Gn - getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) itr = zip(Gn, range(0, len(Gn))) if len(Gn) < 100 * n_jobs: chunksize = int(len(Gn) / n_jobs) + 1 else: chunksize = 100 - # chunksize = 300 # int(len(list(itr)) / n_jobs) + # get shortest path graphs of Gn + if compute_method == 'trie': + getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed']) + else: + getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed']) for i, sp in tqdm( pool.imap_unordered(getsp_partial, itr, chunksize), desc='getting shortest paths', @@ -117,27 +121,6 @@ def structuralspkernel(*args, pool.join() -# # get shortest paths of each graph in Gn -# splist = [[] for _ in range(len(Gn))] -# # get shortest path graphs of Gn -# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) -# itr = zip(Gn, range(0, len(Gn))) -# if len(Gn) < 1000 * n_jobs: -# chunksize = int(len(Gn) / n_jobs) + 1 -# else: -# chunksize = 1000 -# # chunksize = 300 # int(len(list(itr)) / n_jobs) -# from contextlib import closing -# with closing(Pool(n_jobs)) as pool: -## for i, sp in tqdm( -# res = pool.imap_unordered(getsp_partial, itr, 10) -## desc='getting shortest paths', -## file=sys.stdout): -## splist[i] = sp -## time.sleep(10) -# pool.close() -# pool.join() - # ss = 0 # ss += sys.getsizeof(splist) # for spss in splist: @@ -150,8 +133,12 @@ def structuralspkernel(*args, # # ---- direct running, normally use single CPU core. ---- # splist = [] -# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout): -# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) +# if compute_method == 'trie': +# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout): +# splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed'])) +# else: +# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout): +# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) # sp_ml = [0] * len(Gn) # shortest path matrices @@ -177,33 +164,17 @@ def structuralspkernel(*args, def init_worker(spl_toshare, gs_toshare): global G_spl, G_gs G_spl = spl_toshare - G_gs = gs_toshare - do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, - node_kernels, edge_kernels) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(splist, Gn), n_jobs=n_jobs) - - -# # ---- use pool.imap_unordered to parallel and track progress. 
---- -# pool = Pool(n_jobs) -# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, -# node_kernels, edge_kernels) -# itr = zip(combinations_with_replacement(Gn, 2), -# combinations_with_replacement(splist, 2), -# combinations_with_replacement(range(0, len(Gn)), 2)) -# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) -# if len_itr < 1000 * n_jobs: -# chunksize = int(len_itr / n_jobs) + 1 -# else: -# chunksize = 1000 -# for i, j, kernel in tqdm( -# pool.imap_unordered(do_partial, itr, chunksize), -# desc='calculating kernels', -# file=sys.stdout): -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel -# pool.close() -# pool.join() + G_gs = gs_toshare + if compute_method == 'trie': + do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label, + node_kernels, edge_kernels) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(splist, Gn), n_jobs=n_jobs) + else: + do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, + node_kernels, edge_kernels) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(splist, Gn), n_jobs=n_jobs) # # ---- use pool.map to parallel. ---- # pool = Pool(n_jobs) @@ -244,14 +215,22 @@ def structuralspkernel(*args, # # ---- direct running, normally use single CPU core. ---- +# from itertools import combinations_with_replacement # itr = combinations_with_replacement(range(0, len(Gn)), 2) -# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): -# kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j], -# ds_attrs, node_label, edge_label, node_kernels, edge_kernels) -## if(kernel > 1): -## print("error here ") -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel +# if compute_method == 'trie': +# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): +# kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j], +# ds_attrs, node_label, edge_label, node_kernels, edge_kernels) +# Kmatrix[i][j] = kernel +# Kmatrix[j][i] = kernel +# else: +# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): +# kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j], +# ds_attrs, node_label, edge_label, node_kernels, edge_kernels) +# # if(kernel > 1): +# # print("error here ") +# Kmatrix[i][j] = kernel +# Kmatrix[j][i] = kernel run_time = time.time() - start_time print( @@ -267,75 +246,11 @@ def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label, kernel = 0 # First, compute shortest path matrices, method borrowed from FCSP. 
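
What "borrowed from FCSP" means here: the node kernel is evaluated once for
every vertex pair of the two graphs and cached in a dictionary, so every
path-versus-path comparison afterwards reduces to products of lookups instead
of repeated kernel calls. A runnable toy version of that pattern (the 'atom'
label and the tiny graphs are invented for illustration; the refactored code
delegates the caching to getAllNodeKernels and getAllEdgeKernels):

    import networkx as nx
    from pygraph.utils.kernels import deltakernel

    g1 = nx.path_graph(3)
    nx.set_node_attributes(g1, {0: 'C', 1: 'O', 2: 'C'}, 'atom')
    g2 = nx.path_graph(3)
    nx.set_node_attributes(g2, {0: 'C', 1: 'O', 2: 'N'}, 'atom')

    # FCSP: evaluate the node kernel once per vertex pair and cache it.
    vk_dict = {(u, v): deltakernel(g1.nodes[u]['atom'], g2.nodes[v]['atom'])
               for u in g1.nodes() for v in g2.nodes()}

    # Comparing two equal-length shortest paths is then a product of lookups.
    p1, p2 = [0, 1, 2], [0, 1, 2]
    kpath = vk_dict[(p1[0], p2[0])]
    for idx in range(1, len(p1)):
        kpath *= vk_dict[(p1[idx], p2[idx])]
        if not kpath:       # stop as soon as one factor is zero
            break
    print(kpath)            # 0 here: the final atoms 'C' and 'N' differ
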
- vk_dict = {} # shortest path matrices dict - if ds_attrs['node_labeled']: - # node symb and non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['mix'] - for n1, n2 in product( - g1.nodes(data=True), g2.nodes(data=True)): - vk_dict[(n1[0], n2[0])] = kn( - n1[1][node_label], n2[1][node_label], - n1[1]['attributes'], n2[1]['attributes']) - # node symb labeled - else: - kn = node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], - n2[1][node_label]) - else: - # node non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], - n2[1]['attributes']) - # node unlabeled - else: - pass - - # Then, compute kernels between all pairs of edges, which idea is an + vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs) + # Then, compute kernels between all pairs of edges, which is an idea of # extension of FCSP. It suits sparse graphs, which is the most case we # went though. For dense graphs, this would be slow. - ek_dict = {} # dict of edge kernels - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1, e2 in product( - g1.edges(data=True), g2.edges(data=True)): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = kn(e1[2]['attributes'], e2[2]['attributes']) - ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp - # edge unlabeled - else: - pass + ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs) # compute graph kernels if vk_dict: @@ -420,6 +335,399 @@ def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs, node_label, edge_label, node_kernels, edge_kernels) + + +def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, + node_kernels, edge_kernels): + +# # traverse all paths in graph1. Deep-first search is applied. 
+# def traverseBothTrie(root, trie2, kernel, pcurrent=[]): +# for key, node in root['children'].items(): +# pcurrent.append(key) +# if node['isEndOfWord']: +# # print(node['count']) +# traverseTrie2(trie2.root, pcurrent, kernel, +# pcurrent=[]) +# if node['children'] != {}: +# traverseBothTrie(node, trie2, kernel, pcurrent) +# else: +# del pcurrent[-1] +# if pcurrent != []: +# del pcurrent[-1] +# +# +# # traverse all paths in graph2 and find out those that are not in +# # graph1. Deep-first search is applied. +# def traverseTrie2(root, p1, kernel, pcurrent=[]): +# for key, node in root['children'].items(): +# pcurrent.append(key) +# if node['isEndOfWord']: +# # print(node['count']) +# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict) +# if node['children'] != {}: +# traverseTrie2(node, p1, kernel, pcurrent) +# else: +# del pcurrent[-1] +# if pcurrent != []: +# del pcurrent[-1] +# +# +# kernel = [0] +# +# # First, compute shortest path matrices, method borrowed from FCSP. +# vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs) +# # Then, compute kernels between all pairs of edges, which is an idea of +# # extension of FCSP. It suits sparse graphs, which is the most case we +# # went though. For dense graphs, this would be slow. +# ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs) +# +# # compute graph kernels +# traverseBothTrie(trie1[0].root, trie2[0], kernel) +# +# kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average + +# # traverse all paths in graph1. Deep-first search is applied. +# def traverseBothTrie(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): +# for key, node in root['children'].items(): +# pcurrent.append(key) +# if node['isEndOfWord']: +# # print(node['count']) +# traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict, +# pcurrent=[]) +# if node['children'] != {}: +# traverseBothTrie(node, trie2, kernel, vk_dict, ek_dict, pcurrent) +# else: +# del pcurrent[-1] +# if pcurrent != []: +# del pcurrent[-1] +# +# +# # traverse all paths in graph2 and find out those that are not in +# # graph1. Deep-first search is applied. +# def traverseTrie2(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): +# for key, node in root['children'].items(): +# pcurrent.append(key) +# if node['isEndOfWord']: +# # print(node['count']) +# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict) +# if node['children'] != {}: +# traverseTrie2(node, p1, kernel, vk_dict, ek_dict, pcurrent) +# else: +# del pcurrent[-1] +# if pcurrent != []: +# del pcurrent[-1] + + + kernel = [0] + + # First, compute shortest path matrices, method borrowed from FCSP. + vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs) + # Then, compute kernels between all pairs of edges, which is an idea of + # extension of FCSP. It suits sparse graphs, which is the most case we + # went though. For dense graphs, this would be slow. 
+ ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs) + + # compute graph kernels +# traverseBothTrie(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + if vk_dict: + if ek_dict: + traverseBothTriem(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + else: + traverseBothTriev(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + else: + if ek_dict: + traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + else: + traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + + kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average + + return kernel + + +def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels, + edge_kernels, itr): + i = itr[0] + j = itr[1] + return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs, + node_label, edge_label, node_kernels, edge_kernels) + + +def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs): + # compute shortest path matrices, method borrowed from FCSP. + vk_dict = {} # shortest path matrices dict + if ds_attrs['node_labeled']: + # node symb and non-synb labeled + if ds_attrs['node_attr_dim'] > 0: + kn = node_kernels['mix'] + for n1, n2 in product( + g1.nodes(data=True), g2.nodes(data=True)): + vk_dict[(n1[0], n2[0])] = kn( + n1[1][node_label], n2[1][node_label], + n1[1]['attributes'], n2[1]['attributes']) + # node symb labeled + else: + kn = node_kernels['symb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], + n2[1][node_label]) + else: + # node non-synb labeled + if ds_attrs['node_attr_dim'] > 0: + kn = node_kernels['nsymb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], + n2[1]['attributes']) + # node unlabeled + else: + pass + + return vk_dict + + +def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs): + # compute kernels between all pairs of edges, which is an idea of + # extension of FCSP. It suits sparse graphs, which is the most case we + # went though. For dense graphs, this would be slow. 
+ ek_dict = {} # dict of edge kernels + if ds_attrs['edge_labeled']: + # edge symb and non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['mix'] + for e1, e2 in product( + g1.edges(data=True), g2.edges(data=True)): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label], + e1[2]['attributes'], e2[2]['attributes']) + ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp + # edge symb labeled + else: + ke = edge_kernels['symb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) + ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp + else: + # edge non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['nsymb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) + ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp + # edge unlabeled + else: + pass + + return ek_dict + + +# traverse all paths in graph1. Deep-first search is applied. +def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + pcurrent=[]) + if node['children'] != {}: + traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + +# traverse all paths in graph2 and find out those that are not in +# graph1. Deep-first search is applied. +def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + if len(p1) == len(pcurrent): + kpath = vk_dict[(p1[0], pcurrent[0])] + if kpath: + for idx in range(1, len(p1)): + kpath *= vk_dict[(p1[idx], pcurrent[idx])] * \ + ek_dict[((p1[idx-1], p1[idx]), + (pcurrent[idx-1], pcurrent[idx]))] + if not kpath: + break + kernel[0] += kpath # add up kernels of all paths + if node['children'] != {}: + traverseTrie2m(node, p1, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + +# traverse all paths in graph1. Deep-first search is applied. +def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + pcurrent=[]) + if node['children'] != {}: + traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + +# traverse all paths in graph2 and find out those that are not in +# graph1. Deep-first search is applied. 
+def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + if len(p1) == len(pcurrent): + kpath = vk_dict[(p1[0], pcurrent[0])] + if kpath: + for idx in range(1, len(p1)): + kpath *= vk_dict[(p1[idx], pcurrent[idx])] + if not kpath: + break + kernel[0] += kpath # add up kernels of all paths + if node['children'] != {}: + traverseTrie2v(node, p1, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + +# traverse all paths in graph1. Deep-first search is applied. +def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + pcurrent=[]) + if node['children'] != {}: + traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + +# traverse all paths in graph2 and find out those that are not in +# graph1. Deep-first search is applied. +def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + if len(p1) == len(pcurrent): + if len(p1) == 0: + kernel += 1 + else: + kpath = 1 + for idx in range(0, len(p1) - 1): + kpath *= ek_dict[((p1[idx], p1[idx+1]), + (pcurrent[idx], pcurrent[idx+1]))] + if not kpath: + break + kernel[0] += kpath # add up kernels of all paths + if node['children'] != {}: + traverseTrie2e(node, p1, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + +# traverse all paths in graph1. Deep-first search is applied. +def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + pcurrent=[]) + if node['children'] != {}: + traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + +# traverse all paths in graph2 and find out those that are not in +# graph1. Deep-first search is applied. 
+def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + if len(p1) == len(pcurrent): + kernel[0] += 1 + if node['children'] != {}: + traverseTrie2u(node, p1, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + +#def computePathKernel(p1, p2, vk_dict, ek_dict): +# kernel = 0 +# if vk_dict: +# if ek_dict: +# if len(p1) == len(p2): +# kpath = vk_dict[(p1[0], p2[0])] +# if kpath: +# for idx in range(1, len(p1)): +# kpath *= vk_dict[(p1[idx], p2[idx])] * \ +# ek_dict[((p1[idx-1], p1[idx]), +# (p2[idx-1], p2[idx]))] +# if not kpath: +# break +# kernel += kpath # add up kernels of all paths +# else: +# if len(p1) == len(p2): +# kpath = vk_dict[(p1[0], p2[0])] +# if kpath: +# for idx in range(1, len(p1)): +# kpath *= vk_dict[(p1[idx], p2[idx])] +# if not kpath: +# break +# kernel += kpath # add up kernels of all paths +# else: +# if ek_dict: +# if len(p1) == len(p2): +# if len(p1) == 0: +# kernel += 1 +# else: +# kpath = 1 +# for idx in range(0, len(p1) - 1): +# kpath *= ek_dict[((p1[idx], p1[idx+1]), +# (p2[idx], p2[idx+1]))] +# if not kpath: +# break +# kernel += kpath # add up kernels of all paths +# else: +# if len(p1) == len(p2): +# kernel += 1 +# +# return kernel def get_shortest_paths(G, weight, directed): @@ -457,7 +765,54 @@ def get_shortest_paths(G, weight, directed): return sp -def wrapper_getSP(weight, directed, itr_item): +def wrapper_getSP_naive(weight, directed, itr_item): + g = itr_item[0] + i = itr_item[1] + return i, get_shortest_paths(g, weight, directed) + + +def get_sps_as_trie(G, weight, directed): + """Get all shortest paths of a graph and insert them into a trie. + + Parameters + ---------- + G : NetworkX graphs + The graphs whose paths are calculated. + weight : string/None + edge attribute used as weight to calculate the shortest path. + directed: boolean + Whether graph is directed. + + Return + ------ + sp : list of list + List of shortest paths of the graph, where each path is represented by a list of nodes. + """ + sptrie = Trie() + lensp = 0 + for n1, n2 in combinations(G.nodes(), 2): + try: + spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) + except nx.NetworkXNoPath: # nodes not connected + pass + else: + lensp += len(spltemp) + if not directed: + lensp += len(spltemp) + for sp in spltemp: + sptrie.insertWord(sp) + # each edge walk is counted twice, starting from both its extreme nodes. + if not directed: + sptrie.insertWord(sp[::-1]) + + # add single nodes as length 0 paths. + for n in G.nodes(): + sptrie.insertWord([n]) + + return sptrie, lensp + nx.number_of_nodes(G) + + +def wrapper_getSP_trie(weight, directed, itr_item): g = itr_item[0] i = itr_item[1] - return i, get_shortest_paths(g, weight, directed) \ No newline at end of file + return i, get_sps_as_trie(g, weight, directed) \ No newline at end of file diff --git a/pygraph/kernels/untilHPathKernel.py b/pygraph/kernels/untilHPathKernel.py index 8b80f99..a82378f 100644 --- a/pygraph/kernels/untilHPathKernel.py +++ b/pygraph/kernels/untilHPathKernel.py @@ -27,7 +27,7 @@ def untilhpathkernel(*args, edge_label='bond_type', depth=10, k_func='tanimoto', - compute_method='naive', + compute_method='trie', n_jobs=None): """Calculate path graph kernels up to depth/hight h between graphs. Parameters
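
Both structuralspKernel.py and untilHPathKernel.py (whose compute_method
default switches to 'trie' above) rely on pygraph.utils.trie.Trie. The
traversal functions only touch a handful of its fields, so a minimal stand-in
consistent with those accesses looks roughly as follows; the real class may
carry more state, and the 'count' bookkeeping is only a guess at what the
commented-out print(node['count']) lines refer to:

    class Trie:
        """Minimal stand-in for pygraph.utils.trie.Trie: a prefix tree over
        node-id sequences, where each trie node is a dict with the keys read
        by the traversals above: 'children', 'isEndOfWord' and 'count'."""

        def __init__(self):
            self.root = {'children': {}, 'isEndOfWord': False, 'count': 0}

        def insertWord(self, word):
            node = self.root
            for item in word:             # e.g. a shortest path such as [3, 0, 7]
                node = node['children'].setdefault(
                    item, {'children': {}, 'isEndOfWord': False, 'count': 0})
            node['isEndOfWord'] = True    # a complete path ends at this node
            node['count'] += 1            # number of identical paths inserted

get_sps_as_trie pairs such a trie with the number of paths it holds (each
shortest path in both directions for undirected graphs, plus one length-0 path
per node), and ssp_do_trie divides the accumulated path-kernel sum by the
product of the two counts, so the trie variant keeps the same mean-average
normalization as the list-based structuralspkernel_do.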