From 6c1e8a7692a90e8ea28aeb57ee0a7d9f31cf5b05 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Fri, 27 Mar 2020 11:39:19 +0100
Subject: [PATCH] clear repo: remove useless files.

---
 gklearn/kernels/.tags | 188 -
 gklearn/kernels/else/rwalk_sym.py | 842 ---
 gklearn/kernels/else/sp_sym.py | 200 -
 gklearn/kernels/else/ssp_sym.py | 464 --
 gklearn/kernels/unfinished/cyclicPatternKernel.py | 147 -
 gklearn/kernels/unfinished/pathKernel.py | 234 -
 gklearn/kernels/unfinished/treePatternKernel.py | 241 -
 .../kernels/unfinished/weisfeilerLehmanKernel.py | 403 -
 gklearn/preimage/common_types.py | 17 -
 gklearn/preimage/cpp2python.py | 134 -
 gklearn/preimage/find_best_k.py | 170 -
 gklearn/preimage/fitDistance.py | 430 --
 gklearn/preimage/ged.py | 467 --
 gklearn/preimage/iam.py | 775 --
 gklearn/preimage/knn.py | 114 -
 gklearn/preimage/libs.py | 6 -
 gklearn/preimage/median.py | 218 -
 gklearn/preimage/median_benoit.py | 201 -
 gklearn/preimage/median_graph_estimator.py | 826 --
 gklearn/preimage/median_linlin.py | 215 -
 gklearn/preimage/median_preimage_generator.py | 15 -
 gklearn/preimage/misc.py | 108 -
 gklearn/preimage/pathfrequency.py | 201 -
 gklearn/preimage/preimage_generator.py | 12 -
 gklearn/preimage/preimage_iam.py | 705 --
 gklearn/preimage/preimage_random.py | 309 -
 gklearn/preimage/python_code.py | 122 -
 gklearn/preimage/test.py | 83 -
 gklearn/preimage/test_fitDistance.py | 648 --
 gklearn/preimage/test_ged.py | 520 --
 gklearn/preimage/test_iam.py | 964 ---
 gklearn/preimage/test_k_closest_graphs.py | 462 --
 gklearn/preimage/test_median_graph_estimator.py | 91 -
 gklearn/preimage/test_others.py | 686 --
 gklearn/preimage/test_preimage_iam.py | 620 --
 gklearn/preimage/test_preimage_mix.py | 539 --
 gklearn/preimage/test_preimage_random.py | 398 -
 gklearn/preimage/timer.py | 40 -
 gklearn/preimage/utils.py | 151 -
 gklearn/preimage/visualization.py | 585 --
 gklearn/preimage/xp_fit_method.py | 935 ---
 gklearn/preimage/xp_letter_h.py | 476 --
 gklearn/preimage/xp_monoterpenoides.py | 249 -
 gklearn/utils/isNotebook.py | 16 -
 gklearn/utils/logger2file.py | 27 -
 .../else/compute_spkernel_for_syntheticnew.py | 52 -
 .../else/compute_sspkernel_for_syntheticnew.py | 54 -
 notebooks/else/job_graphkernels.sl | 19 -
 notebooks/else/job_test.sl | 12 -
 notebooks/else/run_rwalk_symonly.py | 70 -
 notebooks/else/run_sp_symonly.py | 61 -
 notebooks/else/run_ssp_symonly.py | 47 -
 notebooks/preimage/results.gm.npz | Bin 17288 -> 0 bytes
 notebooks/unfinished/run_cyclicpatternkernel.ipynb | 1329 ----
 .../unfinished/run_treeletkernel_acyclic.ipynb | 786 --
 notebooks/unfinished/run_treepatternkernel.ipynb | 7966 --------------------
 .../unfinished/run_weisfeilerLehmankernel.ipynb | 3812 ----
 notebooks/unfinished/test_mpi.py | 47 -
 58 files changed, 29509 deletions(-)
 delete mode 100644 gklearn/kernels/.tags
 delete mode 100644 gklearn/kernels/else/rwalk_sym.py
 delete mode 100644 gklearn/kernels/else/sp_sym.py
 delete mode 100644 gklearn/kernels/else/ssp_sym.py
 delete mode 100644 gklearn/kernels/unfinished/cyclicPatternKernel.py
 delete mode 100644 gklearn/kernels/unfinished/pathKernel.py
 delete mode 100644 gklearn/kernels/unfinished/treePatternKernel.py
 delete mode 100644 gklearn/kernels/unfinished/weisfeilerLehmanKernel.py
 delete mode 100644 gklearn/preimage/common_types.py
 delete mode 100644 gklearn/preimage/cpp2python.py
 delete mode 100644 gklearn/preimage/find_best_k.py
 delete mode 100644 gklearn/preimage/fitDistance.py
 delete mode 100644 gklearn/preimage/ged.py
 delete mode 100644 gklearn/preimage/iam.py
 delete mode 100644
gklearn/preimage/knn.py delete mode 100644 gklearn/preimage/libs.py delete mode 100644 gklearn/preimage/median.py delete mode 100644 gklearn/preimage/median_benoit.py delete mode 100644 gklearn/preimage/median_graph_estimator.py delete mode 100644 gklearn/preimage/median_linlin.py delete mode 100644 gklearn/preimage/median_preimage_generator.py delete mode 100644 gklearn/preimage/misc.py delete mode 100644 gklearn/preimage/pathfrequency.py delete mode 100644 gklearn/preimage/preimage_generator.py delete mode 100644 gklearn/preimage/preimage_iam.py delete mode 100644 gklearn/preimage/preimage_random.py delete mode 100644 gklearn/preimage/python_code.py delete mode 100644 gklearn/preimage/test.py delete mode 100644 gklearn/preimage/test_fitDistance.py delete mode 100644 gklearn/preimage/test_ged.py delete mode 100644 gklearn/preimage/test_iam.py delete mode 100644 gklearn/preimage/test_k_closest_graphs.py delete mode 100644 gklearn/preimage/test_median_graph_estimator.py delete mode 100644 gklearn/preimage/test_others.py delete mode 100644 gklearn/preimage/test_preimage_iam.py delete mode 100644 gklearn/preimage/test_preimage_mix.py delete mode 100644 gklearn/preimage/test_preimage_random.py delete mode 100644 gklearn/preimage/timer.py delete mode 100644 gklearn/preimage/utils.py delete mode 100644 gklearn/preimage/visualization.py delete mode 100644 gklearn/preimage/xp_fit_method.py delete mode 100644 gklearn/preimage/xp_letter_h.py delete mode 100644 gklearn/preimage/xp_monoterpenoides.py delete mode 100644 gklearn/utils/isNotebook.py delete mode 100644 gklearn/utils/logger2file.py delete mode 100644 notebooks/else/compute_spkernel_for_syntheticnew.py delete mode 100644 notebooks/else/compute_sspkernel_for_syntheticnew.py delete mode 100644 notebooks/else/job_graphkernels.sl delete mode 100644 notebooks/else/job_test.sl delete mode 100644 notebooks/else/run_rwalk_symonly.py delete mode 100644 notebooks/else/run_sp_symonly.py delete mode 100644 notebooks/else/run_ssp_symonly.py delete mode 100644 notebooks/preimage/results.gm.npz delete mode 100644 notebooks/unfinished/run_cyclicpatternkernel.ipynb delete mode 100644 notebooks/unfinished/run_treeletkernel_acyclic.ipynb delete mode 100644 notebooks/unfinished/run_treepatternkernel.ipynb delete mode 100644 notebooks/unfinished/run_weisfeilerLehmankernel.ipynb delete mode 100644 notebooks/unfinished/test_mpi.py diff --git a/gklearn/kernels/.tags b/gklearn/kernels/.tags deleted file mode 100644 index 0663235..0000000 --- a/gklearn/kernels/.tags +++ /dev/null @@ -1,188 +0,0 @@ -!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/ -!_TAG_FILE_SORTED 0 /0=unsorted, 1=sorted, 2=foldcase/ -!_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/ -!_TAG_PROGRAM_NAME Exuberant Ctags // -!_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/ -!_TAG_PROGRAM_VERSION 5.9~svn20110310 // -commonwalkkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def commonwalkkernel(*args,$/;" function line:23 -compute_method /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ compute_method = compute_method.lower()$/;" variable line:67 -Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Gn = args[0] if len(args) == 1 else [args[0], args[1]]$/;" variable line:69 -len_gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ len_gn = len(Gn)$/;" variable line:72 
-Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]$/;" variable line:73 -idx /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ idx = [G[0] for G in Gn]$/;" variable line:74 -Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Gn = [G[1] for G in Gn]$/;" variable line:75 -ds_attrs /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ ds_attrs = get_dataset_attributes($/;" variable line:81 -attr_names /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ attr_names=['node_labeled', 'edge_labeled', 'is_directed'],$/;" variable line:83 -Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Gn = [G.to_directed() for G in Gn]$/;" variable line:92 -start_time /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ start_time = time.time()$/;" variable line:94 -Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Kmatrix = np.zeros((len(Gn), len(Gn)))$/;" variable line:96 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ def init_worker(gn_toshare):$/;" function line:99 -run_time /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ run_time = time.time() - start_time$/;" variable line:173 -_commonwalkkernel_exp /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta):$/;" function line:181 -wrapper_cw_exp /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def wrapper_cw_exp(node_label, edge_label, beta, itr):$/;" function line:249 -_commonwalkkernel_geo /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma):$/;" function line:255 -wrapper_cw_geo /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def wrapper_cw_geo(node_label, edge_label, gama, itr):$/;" function line:290 -_commonwalkkernel_brute /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def _commonwalkkernel_brute(walks1,$/;" function line:296 -find_all_walks_until_length /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def find_all_walks_until_length(G,$/;" function line:336 -find_walks /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def find_walks(G, source_node, length):$/;" function line:388 -find_all_walks /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def find_all_walks(G, length):$/;" function line:412 -randomwalkkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def randomwalkkernel(*args,$/;" function line:27 -_sylvester_equation /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs):$/;" function line:150 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^ def init_worker(Awl_toshare):$/;" function line:184 
function:_sylvester_equation -wrapper_se_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_se_do(lmda, itr):$/;" function line:214 -_se_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _se_do(A_wave1, A_wave2, lmda):$/;" function line:220 -_conjugate_gradient /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, $/;" function line:236 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^ def init_worker(gn_toshare):$/;" function line:280 function:_conjugate_gradient -wrapper_cg_unlabled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_cg_unlabled_do(lmda, itr):$/;" function line:302 -_cg_unlabled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _cg_unlabled_do(A_wave1, A_wave2, lmda):$/;" function line:308 -wrapper_cg_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, $/;" function line:320 -_cg_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, $/;" function line:328 -_fixed_point /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, $/;" function line:351 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^ def init_worker(gn_toshare):$/;" function line:408 function:_fixed_point -wrapper_fp_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, $/;" function line:418 -_fp_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label, $/;" function line:426 -func_fp /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def func_fp(x, p_times, lmda, w_times):$/;" function line:448 -_spectral_decomposition /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs):$/;" function line:456 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^ def init_worker(q_T_toshare, P_toshare, D_toshare):$/;" function line:492 function:_spectral_decomposition -wrapper_sd_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_sd_do(weight, sub_kernel, itr):$/;" function line:516 -_sd_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): $/;" function line:523 -_randomwalkkernel_kron /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _randomwalkkernel_kron(G1, G2, node_label, edge_label):$/;" function line:540 -getLabels /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def getLabels(Gn, node_label, edge_label, directed):$/;" function line:561 
-filterGramMatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def filterGramMatrix(gmt, label_dict, label, directed):$/;" function line:581 -computeVK /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def computeVK(g1, g2, ds_attrs, node_kernels, node_label):$/;" function line:593 -computeW /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):$/;" function line:627 -spkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^def spkernel(*args,$/;" function line:24 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^ def init_worker(gn_toshare):$/;" function line:115 function:spkernel -spkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):$/;" function line:130 -wrapper_sp_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):$/;" function line:191 -wrapper_getSPGraph /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^def wrapper_getSPGraph(weight, itr_item):$/;" function line:197 -structuralspkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def structuralspkernel(*args,$/;" function line:25 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^ def init_worker(spl_toshare, gs_toshare):$/;" function line:177 function:structuralspkernel -structuralspkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,$/;" function line:265 -wrapper_ssp_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, $/;" function line:417 -get_shortest_paths /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def get_shortest_paths(G, weight, directed):$/;" function line:426 -wrapper_getSP /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def wrapper_getSP(weight, directed, itr_item):$/;" function line:461 -marginalizedkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^def marginalizedkernel(*args,$/;" function line:31 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^ def init_worker(gn_toshare):$/;" function line:114 function:marginalizedkernel -_marginalizedkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):$/;" function line:144 -wrapper_marg_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):$/;" function line:290 -wrapper_untotter /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^def wrapper_untotter(Gn, node_label, edge_label, i):$/;" function line:296 -randomwalkkernel 
/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def randomwalkkernel(*args,$/;" function line:21 -_sylvester_equation /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):$/;" function line:197 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^ def init_worker(Awl_toshare):$/;" function line:232 function:_sylvester_equation -wrapper_se_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_se_do(lmda, itr):$/;" function line:262 -_se_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _se_do(A_wave1, A_wave2, lmda):$/;" function line:268 -_conjugate_gradient /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, $/;" function line:284 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^ def init_worker(gn_toshare):$/;" function line:328 function:_conjugate_gradient -wrapper_cg_unlabled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_cg_unlabled_do(lmda, itr):$/;" function line:350 -_cg_unlabled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _cg_unlabled_do(A_wave1, A_wave2, lmda):$/;" function line:356 -wrapper_cg_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, $/;" function line:368 -_cg_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, $/;" function line:376 -_fixed_point /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, $/;" function line:399 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^ def init_worker(gn_toshare):$/;" function line:456 function:_fixed_point -wrapper_fp_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, $/;" function line:466 -_fp_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label, $/;" function line:474 -func_fp /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def func_fp(x, p_times, lmda, w_times):$/;" function line:496 -_spectral_decomposition /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, verbose=True):$/;" function line:504 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^ def init_worker(q_T_toshare, P_toshare, D_toshare):$/;" function line:541 function:_spectral_decomposition -wrapper_sd_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_sd_do(weight, sub_kernel, itr):$/;" function line:566 
-_sd_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): $/;" function line:573 -_randomwalkkernel_kron /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _randomwalkkernel_kron(G1, G2, node_label, edge_label):$/;" function line:590 -getLabels /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def getLabels(Gn, node_label, edge_label, directed):$/;" function line:611 -filterGramMatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def filterGramMatrix(gmt, label_dict, label, directed):$/;" function line:631 -computeVK /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def computeVK(g1, g2, ds_attrs, node_kernels, node_label):$/;" function line:643 -computeW /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):$/;" function line:677 -spkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^def spkernel(*args,$/;" function line:22 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^ def init_worker(gn_toshare):$/;" function line:157 function:spkernel -spkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):$/;" function line:207 -wrapper_sp_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):$/;" function line:297 -wrapper_getSPGraph /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^def wrapper_getSPGraph(weight, itr_item):$/;" function line:310 -structuralspkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def structuralspkernel(*args,$/;" function line:28 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^ def init_worker(spl_toshare, gs_toshare):$/;" function line:179 function:structuralspkernel -structuralspkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,$/;" function line:258 -wrapper_ssp_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, $/;" function line:346 -ssp_do_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,$/;" function line:355 -wrapper_ssp_do_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels, $/;" function line:463 -getAllNodeKernels /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):$/;" function line:471 -getAllEdgeKernels /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):$/;" function line:505 -traverseBothTriem 
/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:551 -traverseTrie2m /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:568 -traverseBothTriev /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:592 -traverseTrie2v /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:609 -traverseBothTriee /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:631 -traverseTrie2e /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:648 -traverseBothTrieu /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:673 -traverseTrie2u /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:690 -get_shortest_paths /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def get_shortest_paths(G, weight, directed):$/;" function line:748 -wrapper_getSP_naive /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def wrapper_getSP_naive(weight, directed, itr_item):$/;" function line:783 -get_sps_as_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def get_sps_as_trie(G, weight, directed):$/;" function line:789 -wrapper_getSP_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def wrapper_getSP_trie(weight, directed, itr_item):$/;" function line:830 -treeletkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def treeletkernel(*args, $/;" function line:23 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^ def init_worker(canonkeys_toshare):$/;" function line:105 function:treeletkernel -_treeletkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def _treeletkernel_do(canonkey1, canonkey2, sub_kernel):$/;" function line:140 -wrapper_treeletkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def wrapper_treeletkernel_do(sub_kernel, itr):$/;" function line:160 -get_canonkeys /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def get_canonkeys(G, node_label, edge_label, labeled, is_directed):$/;" function line:166 -wrapper_get_canonkeys /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def wrapper_get_canonkeys(node_label, edge_label, labeled, is_directed, itr_item):$/;" function line:418 -find_paths 
/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def find_paths(G, source_node, length):$/;" function line:424 -find_all_paths /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def find_all_paths(G, length, is_directed):$/;" function line:449 -cyclicpatternkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/cyclicPatternKernel.py /^def cyclicpatternkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):$/;" function line:20 -_cyclicpatternkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/cyclicPatternKernel.py /^def _cyclicpatternkernel_do(patterns1, patterns2):$/;" function line:63 -get_patterns /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/cyclicPatternKernel.py /^def get_patterns(G, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):$/;" function line:87 -pathkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def pathkernel(*args, node_label='atom', edge_label='bond_type'):$/;" function line:20 -_pathkernel_do_l /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def _pathkernel_do_l(G1, G2, sp1, sp2, node_label, edge_label):$/;" function line:107 -_pathkernel_do_nl /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def _pathkernel_do_nl(G1, G2, sp1, sp2, node_label):$/;" function line:148 -_pathkernel_do_el /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def _pathkernel_do_el(G1, G2, sp1, sp2, edge_label):$/;" function line:171 -_pathkernel_do_unl /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def _pathkernel_do_unl(G1, G2, sp1, sp2):$/;" function line:196 -get_shortest_paths /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def get_shortest_paths(G, weight):$/;" function line:211 -treepatternkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^def treepatternkernel(*args,$/;" function line:21 -_treepatternkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^def _treepatternkernel_do(G1, G2, node_label, edge_label, labeled, kernel_type,$/;" function line:90 -matchingset /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^ def matchingset(n1, n2):$/;" function line:119 function:_treepatternkernel_do -mset_com /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^ def mset_com(allpairs, length):$/;" function line:123 function:_treepatternkernel_do.matchingset -kernel_h /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^ def kernel_h(h):$/;" function line:165 function:_treepatternkernel_do -weisfeilerlehmankernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def weisfeilerlehmankernel(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):$/;" function line:18 -_wl_subtreekernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def 
_wl_subtreekernel_do(Gn, node_label, edge_label, height):$/;" function line:75 -_wl_spkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def _wl_spkernel_do(Gn, node_label, edge_label, height):$/;" function line:183 -_wl_edgekernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def _wl_edgekernel_do(Gn, node_label, edge_label, height):$/;" function line:264 -_wl_userkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):$/;" function line:340 -untilhpathkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def untilhpathkernel(*args,$/;" function line:25 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def init_worker(trie_toshare):$/;" function line:142 function:untilhpathkernel -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def init_worker(plist_toshare):$/;" function line:149 function:untilhpathkernel -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def init_worker(plist_toshare):$/;" function line:156 function:untilhpathkernel -_untilhpathkernel_do_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def _untilhpathkernel_do_trie(trie1, trie2, k_func):$/;" function line:207 -traverseTrie1t /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseTrie1t(root, trie2, setlist, pcurrent=[]):$/;" function line:226 function:_untilhpathkernel_do_trie -traverseTrie2t /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseTrie2t(root, trie1, setlist, pcurrent=[]):$/;" function line:244 function:_untilhpathkernel_do_trie -traverseTrie1m /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseTrie1m(root, trie2, sumlist, pcurrent=[]):$/;" function line:271 function:_untilhpathkernel_do_trie -traverseTrie2m /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseTrie2m(root, trie1, sumlist, pcurrent=[]):$/;" function line:289 function:_untilhpathkernel_do_trie -wrapper_uhpath_do_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_uhpath_do_trie(k_func, itr):$/;" function line:316 -_untilhpathkernel_do_naive /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def _untilhpathkernel_do_naive(paths1, paths2, k_func):$/;" function line:322 -wrapper_uhpath_do_naive /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_uhpath_do_naive(k_func, itr):$/;" function line:365 -_untilhpathkernel_do_kernelless /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def _untilhpathkernel_do_kernelless(paths1, paths2, k_func):$/;" function line:371 -wrapper_uhpath_do_kernelless /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_uhpath_do_kernelless(k_func, itr):$/;" function line:414 -find_all_paths_until_length 
/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def find_all_paths_until_length(G,$/;" function line:421 -wrapper_find_all_paths_until_length /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_find_all_paths_until_length(length, ds_attrs, node_label, $/;" function line:492 -find_all_path_as_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def find_all_path_as_trie(G,$/;" function line:501 -traverseGraph /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseGraph(root, ptrie, length, G, ds_attrs, node_label, edge_label,$/;" function line:542 function:find_all_path_as_trie -wrapper_find_all_path_as_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_find_all_path_as_trie(length, ds_attrs, node_label, $/;" function line:593 -paths2labelseqs /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def paths2labelseqs(plist, G, ds_attrs, node_label, edge_label):$/;" function line:601 -weisfeilerlehmankernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def weisfeilerlehmankernel(*args, $/;" function line:25 -base_kernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ base_kernel = base_kernel.lower()$/;" variable line:74 -Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list$/;" variable line:75 -Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Gn = [g.copy() for g in Gn]$/;" variable line:76 -ds_attrs /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'], $/;" variable line:77 -node_label /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ node_label=node_label)$/;" variable line:78 -start_time /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ start_time = time.time()$/;" variable line:83 -Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose)$/;" variable line:87 -Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)$/;" variable line:91 -Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)$/;" variable line:95 -Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)$/;" variable line:99 -run_time /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ run_time = time.time() - start_time$/;" variable line:101 -_wl_kernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose):$/;" 
function line:109 -wl_iteration /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def wl_iteration(G, node_label):$/;" function line:256 -wrapper_wl_iteration /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def wrapper_wl_iteration(node_label, itr_item):$/;" function line:293 -compute_kernel_matrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose):$/;" function line:300 -init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ def init_worker(alllabels_toshare):$/;" function line:305 function:compute_kernel_matrix -compute_subtree_kernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):$/;" function line:319 -wrapper_compute_subtree_kernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def wrapper_compute_subtree_kernel(Kmatrix, itr):$/;" function line:333 -_wl_spkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def _wl_spkernel_do(Gn, node_label, edge_label, height):$/;" function line:339 -_wl_edgekernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def _wl_edgekernel_do(Gn, node_label, edge_label, height):$/;" function line:421 -_wl_userkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):$/;" function line:498 diff --git a/gklearn/kernels/else/rwalk_sym.py b/gklearn/kernels/else/rwalk_sym.py deleted file mode 100644 index e9db9fd..0000000 --- a/gklearn/kernels/else/rwalk_sym.py +++ /dev/null @@ -1,842 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Sun Dec 23 16:53:57 2018 - -@author: ljia -@references: S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and -Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, -11(Apr):1201–1242, 2010. -""" - -import sys -sys.path.insert(0, "../") -import time -from functools import partial -from tqdm import tqdm - -import networkx as nx -import numpy as np -from scipy.sparse import identity, kron -from scipy.sparse.linalg import cg -from scipy.optimize import fixed_point - -from gklearn.utils.graphdataset import get_dataset_attributes -from gklearn.utils.parallel import parallel_gm - -def randomwalkkernel(*args, - # params for all method. - compute_method=None, - weight=1, - p=None, - q=None, - edge_weight=None, - # params for conjugate and fp method. - node_kernels=None, - edge_kernels=None, - node_label='atom', - edge_label='bond_type', - # params for spectral method. - sub_kernel=None, - n_jobs=None): - """Calculate random walk graph kernels. - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - / - G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - h : integer - Longest length of walks. - method : string - Method used to compute the random walk kernel. 
Available methods are 'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the path kernel up to d between 2 praphs. - """ - compute_method = compute_method.lower() - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - - eweight = None - if edge_weight == None: - print('\n None edge weight specified. Set all weight to 1.\n') - else: - try: - some_weight = list( - nx.get_edge_attributes(Gn[0], edge_weight).values())[0] - if isinstance(some_weight, float) or isinstance(some_weight, int): - eweight = edge_weight - else: - print( - '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' - % edge_weight) - except: - print( - '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' - % edge_weight) - - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', - 'edge_attr_dim', 'is_directed'], - node_label=node_label, - edge_label=edge_label) - ds_attrs['node_attr_dim'] = 0 - ds_attrs['edge_attr_dim'] = 0 - - # remove graphs with no edges, as no walk can be found in their structures, - # so the weight matrix between such a graph and itself might be zero. - len_gn = len(Gn) - Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] - idx = [G[0] for G in Gn] - Gn = [G[1] for G in Gn] - if len(Gn) != len_gn: - print('\n %d graphs are removed as they don\'t contain edges.\n' % - (len_gn - len(Gn))) - - start_time = time.time() - -# # get vertex and edge concatenated labels for each graph -# label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed']) -# gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed']) - - if compute_method == 'sylvester': - import warnings - warnings.warn('All labels are ignored.') - Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs) - - elif compute_method == 'conjugate': - Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs, - node_kernels, edge_kernels, - node_label, edge_label, eweight, n_jobs) - - elif compute_method == 'fp': - Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels, - edge_kernels, node_label, edge_label, - eweight, n_jobs) - - elif compute_method == 'spectral': - import warnings - warnings.warn('All labels are ignored. Only works for undirected graphs.') - Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs) - - elif compute_method == 'kron': - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j], - node_label, edge_label) - Kmatrix[j][i] = Kmatrix[i][j] - else: - raise Exception( - 'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".' - ) - - run_time = time.time() - start_time - print( - "\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---" - % (len(Gn), run_time)) - - return Kmatrix, run_time, idx - - -############################################################################### -def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs): - """Calculate walk graph kernels up to n between 2 graphs using Sylvester method. - - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - - Return - ------ - kernel : float - Kernel between 2 graphs. 
- """ - Kmatrix = np.zeros((len(Gn), len(Gn))) - - if q == None: - # don't normalize adjacency matrices if q is a uniform vector. Note - # A_wave_list accually contains the transposes of the adjacency matrices. - A_wave_list = [ - nx.adjacency_matrix(G, eweight).todense().transpose() for G in tqdm( - Gn, desc='compute adjacency matrices', file=sys.stdout) - ] -# # normalized adjacency matrices -# A_wave_list = [] -# for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout): -# A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose() -# norm = A_tilde.sum(axis=0) -# norm[norm == 0] = 1 -# A_wave_list.append(A_tilde / norm) - if p == None: # p is uniform distribution as default. - def init_worker(Awl_toshare): - global G_Awl - G_Awl = Awl_toshare - do_partial = partial(wrapper_se_do, lmda) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(A_wave_list,), n_jobs=n_jobs) - -# pbar = tqdm( -# total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', -# file=sys.stdout) -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# S = lmda * A_wave_list[j] -# T_t = A_wave_list[i] -# # use uniform distribution if there is no prior knowledge. -# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j]) -# p_times_uni = 1 / nb_pd -# M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni) -# X = dlyap(S, T_t, M0) -# X = np.reshape(X, (-1, 1), order='F') -# # use uniform distribution if there is no prior knowledge. -# q_times = np.full((1, nb_pd), p_times_uni) -# Kmatrix[i][j] = np.dot(q_times, X) -# Kmatrix[j][i] = Kmatrix[i][j] -# pbar.update(1) - - return Kmatrix - - -def wrapper_se_do(lmda, itr): - i = itr[0] - j = itr[1] - return i, j, _se_do(G_Awl[i], G_Awl[j], lmda) - - -def _se_do(A_wave1, A_wave2, lmda): - from control import dlyap - S = lmda * A_wave2 - T_t = A_wave1 - # use uniform distribution if there is no prior knowledge. - nb_pd = len(A_wave1) * len(A_wave2) - p_times_uni = 1 / nb_pd - M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni) - X = dlyap(S, T_t, M0) - X = np.reshape(X, (-1, 1), order='F') - # use uniform distribution if there is no prior knowledge. - q_times = np.full((1, nb_pd), p_times_uni) - return np.dot(q_times, X) - - -############################################################################### -def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, - node_label, edge_label, eweight, n_jobs): - """Calculate walk graph kernels up to n between 2 graphs using conjugate method. - - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - - Return - ------ - kernel : float - Kernel between 2 graphs. - """ - Kmatrix = np.zeros((len(Gn), len(Gn))) - -# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ -# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1: -# # this is faster from unlabeled graphs. @todo: why? -# if q == None: -# # don't normalize adjacency matrices if q is a uniform vector. Note -# # A_wave_list accually contains the transposes of the adjacency matrices. -# A_wave_list = [ -# nx.adjacency_matrix(G, eweight).todense().transpose() for G in -# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) -# ] -# if p == None: # p is uniform distribution as default. 
-# def init_worker(Awl_toshare): -# global G_Awl -# G_Awl = Awl_toshare -# do_partial = partial(wrapper_cg_unlabled_do, lmda) -# parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, -# glbv=(A_wave_list,), n_jobs=n_jobs) -# else: - # reindex nodes using consecutive integers for convenience of kernel calculation. - Gn = [nx.convert_node_labels_to_integers( - g, first_label=0, label_attribute='label_orignal') for g in tqdm( - Gn, desc='reindex vertices', file=sys.stdout)] - - if p == None and q == None: # p and q are uniform distributions as default. - def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels, - node_label, edge_kernels, edge_label, lmda) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs) - -# pbar = tqdm( -# total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', -# file=sys.stdout) -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels, -# node_label, edge_kernels, edge_label, lmda) -# Kmatrix[i][j] = result -# Kmatrix[j][i] = Kmatrix[i][j] -# pbar.update(1) - return Kmatrix - - -def wrapper_cg_unlabled_do(lmda, itr): - i = itr[0] - j = itr[1] - return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda) - - -def _cg_unlabled_do(A_wave1, A_wave2, lmda): - nb_pd = len(A_wave1) * len(A_wave2) - p_times_uni = 1 / nb_pd - w_times = kron(A_wave1, A_wave2).todense() - A = identity(w_times.shape[0]) - w_times * lmda - b = np.full((nb_pd, 1), p_times_uni) - x, _ = cg(A, b) - # use uniform distribution if there is no prior knowledge. - q_times = np.full((1, nb_pd), p_times_uni) - return np.dot(q_times, x) - - -def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, - edge_label, lmda, itr): - i = itr[0] - j = itr[1] - return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, - node_label, edge_kernels, edge_label, lmda) - - -def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, - edge_kernels, edge_label, lmda): - # Frist, ompute kernels between all pairs of nodes, method borrowed - # from FCSP. It is faster than directly computing all edge kernels - # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the - # graphs compared, which is the most case we went though. For very - # sparse graphs, this would be slow. - vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label) - - # Compute weight matrix of the direct product graph. - w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs, - edge_kernels, edge_label) - # use uniform distribution if there is no prior knowledge. - p_times_uni = 1 / w_dim - A = identity(w_times.shape[0]) - w_times * lmda - b = np.full((w_dim, 1), p_times_uni) - x, _ = cg(A, b) - # use uniform distribution if there is no prior knowledge. - q_times = np.full((1, w_dim), p_times_uni) - return np.dot(q_times, x) - - -############################################################################### -def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, - node_label, edge_label, eweight, n_jobs): - """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method. - - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - - Return - ------ - kernel : float - Kernel between 2 graphs. 
- """ - - - Kmatrix = np.zeros((len(Gn), len(Gn))) - -# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ -# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1: -# # this is faster from unlabeled graphs. @todo: why? -# if q == None: -# # don't normalize adjacency matrices if q is a uniform vector. Note -# # A_wave_list accually contains the transposes of the adjacency matrices. -# A_wave_list = [ -# nx.adjacency_matrix(G, eweight).todense().transpose() for G in -# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) -# ] -# if p == None: # p is uniform distribution as default. -# pbar = tqdm( -# total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', -# file=sys.stdout) -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# # use uniform distribution if there is no prior knowledge. -# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j]) -# p_times_uni = 1 / nb_pd -# w_times = kron(A_wave_list[i], A_wave_list[j]).todense() -# p_times = np.full((nb_pd, 1), p_times_uni) -# x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times)) -# # use uniform distribution if there is no prior knowledge. -# q_times = np.full((1, nb_pd), p_times_uni) -# Kmatrix[i][j] = np.dot(q_times, x) -# Kmatrix[j][i] = Kmatrix[i][j] -# pbar.update(1) -# else: - # reindex nodes using consecutive integers for convenience of kernel calculation. - Gn = [nx.convert_node_labels_to_integers( - g, first_label=0, label_attribute='label_orignal') for g in tqdm( - Gn, desc='reindex vertices', file=sys.stdout)] - - if p == None and q == None: # p and q are uniform distributions as default. - def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels, - node_label, edge_kernels, edge_label, lmda) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs) - return Kmatrix - - -def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, - edge_label, lmda, itr): - i = itr[0] - j = itr[1] - return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, - node_label, edge_kernels, edge_label, lmda) - - -def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label, - edge_kernels, edge_label, lmda): - # Frist, ompute kernels between all pairs of nodes, method borrowed - # from FCSP. It is faster than directly computing all edge kernels - # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the - # graphs compared, which is the most case we went though. For very - # sparse graphs, this would be slow. - vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label) - - # Compute weight matrix of the direct product graph. - w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs, - edge_kernels, edge_label) - # use uniform distribution if there is no prior knowledge. - p_times_uni = 1 / w_dim - p_times = np.full((w_dim, 1), p_times_uni) - x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times), - xtol=1e-06, maxiter=1000) - # use uniform distribution if there is no prior knowledge. 
- q_times = np.full((1, w_dim), p_times_uni) - return np.dot(q_times, x) - - -def func_fp(x, p_times, lmda, w_times): - haha = w_times * x - haha = lmda * haha - haha = p_times + haha - return p_times + lmda * np.dot(w_times, x) - - -############################################################################### -def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs): - """Calculate walk graph kernels up to n between 2 unlabeled graphs using - spectral decomposition method. Labels will be ignored. - - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - - Return - ------ - kernel : float - Kernel between 2 graphs. - """ - Kmatrix = np.zeros((len(Gn), len(Gn))) - - if q == None: - # precompute the spectral decomposition of each graph. - P_list = [] - D_list = [] - for G in tqdm(Gn, desc='spectral decompose', file=sys.stdout): - # don't normalize adjacency matrices if q is a uniform vector. Note - # A accually is the transpose of the adjacency matrix. - A = nx.adjacency_matrix(G, eweight).todense().transpose() - ew, ev = np.linalg.eig(A) - D_list.append(ew) - P_list.append(ev) -# P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs? - - if p == None: # p is uniform distribution as default. - q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn] -# q_T_list = [q.T for q in q_list] - def init_worker(q_T_toshare, P_toshare, D_toshare): - global G_q_T, G_P, G_D - G_q_T = q_T_toshare - G_P = P_toshare - G_D = D_toshare - do_partial = partial(wrapper_sd_do, weight, sub_kernel) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs) - - -# pbar = tqdm( -# total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', -# file=sys.stdout) -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], -# D_list[i], D_list[j], weight, sub_kernel) -# Kmatrix[i][j] = result -# Kmatrix[j][i] = Kmatrix[i][j] -# pbar.update(1) - return Kmatrix - - -def wrapper_sd_do(weight, sub_kernel, itr): - i = itr[0] - j = itr[1] - return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j], - weight, sub_kernel) - - -def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): - # use uniform distribution if there is no prior knowledge. - kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense() - # @todo: this is not be needed when p = q (kr = kl.T) for undirected graphs -# kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense() - if sub_kernel == 'exp': - D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2]) - kmiddle = np.diag(np.exp(weight * D_diag)) - elif sub_kernel == 'geo': - D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2]) - kmiddle = np.diag(weight * D_diag) - kmiddle = np.identity(len(kmiddle)) - weight * kmiddle - kmiddle = np.linalg.inv(kmiddle) - return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0] - - -############################################################################### -def _randomwalkkernel_kron(G1, G2, node_label, edge_label): - """Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method. - - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. 
- edge_label : string - edge attribute used as label. - - Return - ------ - kernel : float - Kernel between 2 graphs. - """ - pass - - -############################################################################### -def getLabels(Gn, node_label, edge_label, directed): - """Get symbolic labels of a graph dataset, where vertex labels are dealt - with by concatenating them to the edge labels of adjacent edges. - """ - label_list = [] - label_set = set() - for g in Gn: - label_g = {} - for e in g.edges(data=True): - nl1 = g.node[e[0]][node_label] - nl2 = g.node[e[1]][node_label] - if not directed and nl1 > nl2: - nl1, nl2 = nl2, nl1 - label = (nl1, e[2][edge_label], nl2) - label_g[(e[0], e[1])] = label - label_list.append(label_g) - label_set = set([l for lg in label_list for l in lg.values()]) - return label_list, len(label_set) - - -def filterGramMatrix(gmt, label_dict, label, directed): - """Compute (the transpose of) the Gram matrix filtered by a label. - """ - gmf = np.zeros(gmt.shape) - for (n1, n2), l in label_dict.items(): - if l == label: - gmf[n2, n1] = gmt[n2, n1] - if not directed: - gmf[n1, n2] = gmt[n1, n2] - return gmf - - -def computeVK(g1, g2, ds_attrs, node_kernels, node_label): - '''Compute vertex kernels between vertices of two graphs. - ''' - vk_dict = {} # shortest path matrices dict - if ds_attrs['node_labeled']: - # node symb and non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['mix'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn( - n1[1][node_label], n2[1][node_label], - n1[1]['attributes'], n2[1]['attributes']) - # node symb labeled - else: - kn = node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], - n2[1][node_label]) - else: - # node non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], - n2[1]['attributes']) - # node unlabeled - else: - pass - return vk_dict - - -def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label): - '''Compute weight matrix of the direct product graph. 
- ''' - w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2) - w_times = np.zeros((w_dim, w_dim)) - if vk_dict: # node labeled - if ds_attrs['is_directed']: - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] - # edge unlabeled - else: - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * vk_dict[(e1[1], e2[1])] - else: # undirected - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] \ - + vk_dict[(e1[0], e2[1])] \ - * ek_temp * vk_dict[(e1[1], e2[0])] - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] \ - + vk_dict[(e1[0], e2[1])] \ - * ek_temp * vk_dict[(e1[1], e2[0])] - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * 
vk_dict[(e1[1], e2[1])] \ - + vk_dict[(e1[0], e2[1])] \ - * ek_temp * vk_dict[(e1[1], e2[0])] - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - # edge unlabeled - else: - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * vk_dict[(e1[1], e2[1])] \ - + vk_dict[(e1[0], e2[1])] \ - * vk_dict[(e1[1], e2[0])] - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - else: # node unlabeled - if ds_attrs['is_directed']: - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - # edge unlabeled - else: - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = 1 - else: # undirected - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = 
w_times[w_idx[0], w_idx[1]] - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - # edge unlabeled - else: - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = 1 - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - return w_times, w_dim \ No newline at end of file diff --git a/gklearn/kernels/else/sp_sym.py b/gklearn/kernels/else/sp_sym.py deleted file mode 100644 index 0da15ee..0000000 --- a/gklearn/kernels/else/sp_sym.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Dec 21 18:02:00 2018 - -@author: ljia -""" - -import sys -import time -from itertools import product -from functools import partial -from multiprocessing import Pool -from tqdm import tqdm - -import networkx as nx -import numpy as np - -from gklearn.utils.utils import getSPGraph -from gklearn.utils.graphdataset import get_dataset_attributes -from gklearn.utils.parallel import parallel_gm -sys.path.insert(0, "../") - -def spkernel(*args, - node_label='atom', - edge_weight=None, - node_kernels=None, - n_jobs=None): - """Calculate shortest-path kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - / - G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - node_label : string - node attribute used as label. The default node label is atom. - edge_weight : string - Edge attribute name corresponding to the edge weight. - node_kernels: dict - A dictionary of kernel functions for nodes, including 3 items: 'symb' - for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' - for both labels. The first 2 functions take two node labels as - parameters, and the 'mix' function takes 4 parameters, a symbolic and a - non-symbolic label for each the two nodes. Each label is in form of 2-D - dimension array (n_samples, n_features). Each function returns an - number as the kernel value. Ignored when nodes are unlabeled. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the sp kernel between 2 praphs. - """ - # pre-process - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - weight = None - if edge_weight is None: - print('\n None edge weight specified. Set all weight to 1.\n') - else: - try: - some_weight = list( - nx.get_edge_attributes(Gn[0], edge_weight).values())[0] - if isinstance(some_weight, (float, int)): - weight = edge_weight - else: - print( - '\n Edge weight with name %s is not float or integer. 
Set all weight to 1.\n' - % edge_weight) - except: - print( - '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' - % edge_weight) - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'node_attr_dim', 'is_directed'], - node_label=node_label) - ds_attrs['node_attr_dim'] = 0 - - # remove graphs with no edges, as no sp can be found in their structures, - # so the kernel between such a graph and itself will be zero. - len_gn = len(Gn) - Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] - idx = [G[0] for G in Gn] - Gn = [G[1] for G in Gn] - if len(Gn) != len_gn: - print('\n %d graphs are removed as they don\'t contain edges.\n' % - (len_gn - len(Gn))) - - start_time = time.time() - - pool = Pool(n_jobs) - # get shortest path graphs of Gn - getsp_partial = partial(wrapper_getSPGraph, weight) - itr = zip(Gn, range(0, len(Gn))) - if len(Gn) < 100 * n_jobs: -# # use default chunksize as pool.map when iterable is less than 100 -# chunksize, extra = divmod(len(Gn), n_jobs * 4) -# if extra: -# chunksize += 1 - chunksize = int(len(Gn) / n_jobs) + 1 - else: - chunksize = 100 - for i, g in tqdm( - pool.imap_unordered(getsp_partial, itr, chunksize), - desc='getting sp graphs', file=sys.stdout): - Gn[i] = g - pool.close() - pool.join() - - Kmatrix = np.zeros((len(Gn), len(Gn))) - - # ---- use pool.imap_unordered to parallel and track progress. ---- - def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs) - - run_time = time.time() - start_time - print( - "\n --- shortest path kernel matrix of size %d built in %s seconds ---" - % (len(Gn), run_time)) - - return Kmatrix, run_time, idx - - -def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels): - - kernel = 0 - - # compute shortest path matrices first, method borrowed from FCSP. - vk_dict = {} # shortest path matrices dict - if ds_attrs['node_labeled']: - # node symb and non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['mix'] - for n1, n2 in product( - g1.nodes(data=True), g2.nodes(data=True)): - vk_dict[(n1[0], n2[0])] = kn( - n1[1][node_label], n2[1][node_label], - n1[1]['attributes'], n2[1]['attributes']) - # node symb labeled - else: - kn = node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], - n2[1][node_label]) - else: - # node non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], - n2[1]['attributes']) - # node unlabeled - else: - for e1, e2 in product( - g1.edges(data=True), g2.edges(data=True)): - if e1[2]['cost'] == e2[2]['cost']: - kernel += 1 - return kernel - - # compute graph kernels - if ds_attrs['is_directed']: - for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): - if e1[2]['cost'] == e2[2]['cost']: - nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], - e2[1])] - kn1 = nk11 * nk22 - kernel += kn1 - else: - for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): - if e1[2]['cost'] == e2[2]['cost']: - # each edge walk is counted twice, starting from both its extreme nodes. 
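# Sketch of the unlabeled shortest-path kernel computed in this branch: every
# pair of shortest-path-graph edges with equal 'cost' contributes 1 to the
# kernel.  A self-contained toy version built from NetworkX distance
# dictionaries (the two path graphs below are hypothetical example inputs,
# not part of any dataset used here):
import itertools
import networkx as nx

def _toy_sp_costs(g):
    # shortest-path lengths between all distinct node pairs, i.e. the edge
    # costs of the shortest-path graph built by getSPGraph
    lengths = dict(nx.all_pairs_shortest_path_length(g))
    return [lengths[u][v] for u, v in itertools.combinations(g.nodes(), 2)
            if v in lengths[u]]

g_a, g_b = nx.path_graph(3), nx.path_graph(4)     # toy unlabeled graphs
costs_a, costs_b = _toy_sp_costs(g_a), _toy_sp_costs(g_b)
toy_kernel = sum(1 for c1 in costs_a for c2 in costs_b if c1 == c2)
# toy_kernel == 8 here: two length-1 matches times three, plus one length-2
# match times two.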
- nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( - e1[0], e2[1])], vk_dict[(e1[1], - e2[0])], vk_dict[(e1[1], - e2[1])] - kn1 = nk11 * nk22 - kn2 = nk12 * nk21 - kernel += kn1 + kn2 - - return kernel - - -def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr): - i = itr[0] - j = itr[1] - return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels) - - -def wrapper_getSPGraph(weight, itr_item): - g = itr_item[0] - i = itr_item[1] - return i, getSPGraph(g, edge_weight=weight) \ No newline at end of file diff --git a/gklearn/kernels/else/ssp_sym.py b/gklearn/kernels/else/ssp_sym.py deleted file mode 100644 index d0cf9ca..0000000 --- a/gklearn/kernels/else/ssp_sym.py +++ /dev/null @@ -1,464 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Sun Dec 23 16:42:48 2018 - -@author: ljia -""" - -import sys -import time -from itertools import combinations, product -from functools import partial -from multiprocessing import Pool -from tqdm import tqdm - -import networkx as nx -import numpy as np - -from gklearn.utils.graphdataset import get_dataset_attributes -from gklearn.utils.parallel import parallel_gm - -sys.path.insert(0, "../") - - -def structuralspkernel(*args, - node_label='atom', - edge_weight=None, - edge_label='bond_type', - node_kernels=None, - edge_kernels=None, - n_jobs=None): - """Calculate mean average structural shortest path kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - / - G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - node_label : string - node attribute used as label. The default node label is atom. - edge_weight : string - Edge attribute name corresponding to the edge weight. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - node_kernels: dict - A dictionary of kernel functions for nodes, including 3 items: 'symb' - for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' - for both labels. The first 2 functions take two node labels as - parameters, and the 'mix' function takes 4 parameters, a symbolic and a - non-symbolic label for each the two nodes. Each label is in form of 2-D - dimension array (n_samples, n_features). Each function returns a number - as the kernel value. Ignored when nodes are unlabeled. - edge_kernels: dict - A dictionary of kernel functions for edges, including 3 items: 'symb' - for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix' - for both labels. The first 2 functions take two edge labels as - parameters, and the 'mix' function takes 4 parameters, a symbolic and a - non-symbolic label for each the two edges. Each label is in form of 2-D - dimension array (n_samples, n_features). Each function returns a number - as the kernel value. Ignored when edges are unlabeled. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the mean average structural - shortest path kernel between 2 praphs. - """ - # pre-process - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - weight = None - if edge_weight is None: - print('\n None edge weight specified. Set all weight to 1.\n') - else: - try: - some_weight = list( - nx.get_edge_attributes(Gn[0], edge_weight).values())[0] - if isinstance(some_weight, (float, int)): - weight = edge_weight - else: - print( - '\n Edge weight with name %s is not float or integer. 
Set all weight to 1.\n' - % edge_weight) - except: - print( - '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' - % edge_weight) - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', - 'edge_attr_dim', 'is_directed'], - node_label=node_label, edge_label=edge_label) - ds_attrs['node_attr_dim'] = 0 - ds_attrs['edge_attr_dim'] = 0 - - start_time = time.time() - - # get shortest paths of each graph in Gn - splist = [None] * len(Gn) - pool = Pool(n_jobs) - # get shortest path graphs of Gn - getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) - itr = zip(Gn, range(0, len(Gn))) - if len(Gn) < 100 * n_jobs: - chunksize = int(len(Gn) / n_jobs) + 1 - else: - chunksize = 100 - # chunksize = 300 # int(len(list(itr)) / n_jobs) - for i, sp in tqdm( - pool.imap_unordered(getsp_partial, itr, chunksize), - desc='getting shortest paths', - file=sys.stdout): - splist[i] = sp -# time.sleep(10) - pool.close() - pool.join() - - -# # get shortest paths of each graph in Gn -# splist = [[] for _ in range(len(Gn))] -# # get shortest path graphs of Gn -# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed']) -# itr = zip(Gn, range(0, len(Gn))) -# if len(Gn) < 1000 * n_jobs: -# chunksize = int(len(Gn) / n_jobs) + 1 -# else: -# chunksize = 1000 -# # chunksize = 300 # int(len(list(itr)) / n_jobs) -# from contextlib import closing -# with closing(Pool(n_jobs)) as pool: -## for i, sp in tqdm( -# res = pool.imap_unordered(getsp_partial, itr, 10) -## desc='getting shortest paths', -## file=sys.stdout): -## splist[i] = sp -## time.sleep(10) -# pool.close() -# pool.join() - -# ss = 0 -# ss += sys.getsizeof(splist) -# for spss in splist: -# ss += sys.getsizeof(spss) -# for spp in spss: -# ss += sys.getsizeof(spp) - - -# time.sleep(20) - -# # ---- direct running, normally use single CPU core. ---- -# splist = [] -# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout): -# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) - - # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) - # sp_ml = [0] * len(Gn) # shortest path matrices - # for i in result_sp: - # sp_ml[i[0]] = i[1] - # edge_x_g = [[] for i in range(len(sp_ml))] - # edge_y_g = [[] for i in range(len(sp_ml))] - # edge_w_g = [[] for i in range(len(sp_ml))] - # for idx, item in enumerate(sp_ml): - # for i1 in range(len(item)): - # for i2 in range(i1 + 1, len(item)): - # if item[i1, i2] != np.inf: - # edge_x_g[idx].append(i1) - # edge_y_g[idx].append(i2) - # edge_w_g[idx].append(item[i1, i2]) - # print(len(edge_x_g[0])) - # print(len(edge_y_g[0])) - # print(len(edge_w_g[0])) - - Kmatrix = np.zeros((len(Gn), len(Gn))) - - # ---- use pool.imap_unordered to parallel and track progress. ---- - def init_worker(spl_toshare, gs_toshare): - global G_spl, G_gs - G_spl = spl_toshare - G_gs = gs_toshare - do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, - node_kernels, edge_kernels) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(splist, Gn), n_jobs=n_jobs) - - -# # ---- use pool.imap_unordered to parallel and track progress. 
---- -# pool = Pool(n_jobs) -# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, -# node_kernels, edge_kernels) -# itr = zip(combinations_with_replacement(Gn, 2), -# combinations_with_replacement(splist, 2), -# combinations_with_replacement(range(0, len(Gn)), 2)) -# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) -# if len_itr < 1000 * n_jobs: -# chunksize = int(len_itr / n_jobs) + 1 -# else: -# chunksize = 1000 -# for i, j, kernel in tqdm( -# pool.imap_unordered(do_partial, itr, chunksize), -# desc='calculating kernels', -# file=sys.stdout): -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel -# pool.close() -# pool.join() - -# # ---- use pool.map to parallel. ---- -# pool = Pool(n_jobs) -# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, -# node_kernels, edge_kernels) -# itr = zip(combinations_with_replacement(Gn, 2), -# combinations_with_replacement(splist, 2), -# combinations_with_replacement(range(0, len(Gn)), 2)) -# for i, j, kernel in tqdm( -# pool.map(do_partial, itr), desc='calculating kernels', -# file=sys.stdout): -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel -# pool.close() -# pool.join() - -# # ---- use pool.imap_unordered to parallel and track progress. ---- -# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, -# node_kernels, edge_kernels) -# itr = zip(combinations_with_replacement(Gn, 2), -# combinations_with_replacement(splist, 2), -# combinations_with_replacement(range(0, len(Gn)), 2)) -# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) -# if len_itr < 1000 * n_jobs: -# chunksize = int(len_itr / n_jobs) + 1 -# else: -# chunksize = 1000 -# from contextlib import closing -# with closing(Pool(n_jobs)) as pool: -# for i, j, kernel in tqdm( -# pool.imap_unordered(do_partial, itr, 1000), -# desc='calculating kernels', -# file=sys.stdout): -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel -# pool.close() -# pool.join() - - -# # ---- direct running, normally use single CPU core. ---- -# from itertools import combinations_with_replacement -# itr = combinations_with_replacement(range(0, len(Gn)), 2) -# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): -# kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j], -# ds_attrs, node_label, edge_label, node_kernels, edge_kernels) -## if(kernel > 1): -## print("error here ") -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel - - run_time = time.time() - start_time - print( - "\n --- shortest path kernel matrix of size %d built in %s seconds ---" - % (len(Gn), run_time)) - - return Kmatrix, run_time - - -def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label, - node_kernels, edge_kernels): - - kernel = 0 - - # First, compute shortest path matrices, method borrowed from FCSP. 
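# The loops below multiply, along every pair of equal-length shortest paths,
# the node kernels of corresponding vertices and the edge kernels of
# corresponding edges, then average over all path pairs.  A hedged toy sketch
# of that per-pair product with hand-made kernel lookups (all values below
# are made-up examples, not taken from any dataset):
toy_p1, toy_p2 = [0, 1, 2], [5, 6, 7]              # two equal-length paths
toy_vk = {(0, 5): 1.0, (1, 6): 0.5, (2, 7): 1.0}   # node-kernel lookups
toy_ek = {((0, 1), (5, 6)): 1.0, ((1, 2), (6, 7)): 0.8}  # edge-kernel lookups

toy_kpath = toy_vk[(toy_p1[0], toy_p2[0])]
for idx in range(1, len(toy_p1)):
    toy_kpath *= (toy_vk[(toy_p1[idx], toy_p2[idx])]
                  * toy_ek[((toy_p1[idx - 1], toy_p1[idx]),
                            (toy_p2[idx - 1], toy_p2[idx]))])
# toy_kpath == 1.0 * (0.5 * 1.0) * (1.0 * 0.8) == 0.4, the contribution of
# this single pair of paths before the final averaging.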
- vk_dict = {} # shortest path matrices dict - if ds_attrs['node_labeled']: - # node symb and non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['mix'] - for n1, n2 in product( - g1.nodes(data=True), g2.nodes(data=True)): - vk_dict[(n1[0], n2[0])] = kn( - n1[1][node_label], n2[1][node_label], - n1[1]['attributes'], n2[1]['attributes']) - # node symb labeled - else: - kn = node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], - n2[1][node_label]) - else: - # node non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], - n2[1]['attributes']) - # node unlabeled - else: - pass - - # Then, compute kernels between all pairs of edges, which idea is an - # extension of FCSP. It suits sparse graphs, which is the most case we - # went though. For dense graphs, this would be slow. - ek_dict = {} # dict of edge kernels - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1, e2 in product( - g1.edges(data=True), g2.edges(data=True)): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = kn(e1[2]['attributes'], e2[2]['attributes']) - ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp - # edge unlabeled - else: - pass - - # compute graph kernels - if vk_dict: - if ek_dict: - for p1, p2 in product(spl1, spl2): - if len(p1) == len(p2): - kpath = vk_dict[(p1[0], p2[0])] - if kpath: - for idx in range(1, len(p1)): - kpath *= vk_dict[(p1[idx], p2[idx])] * \ - ek_dict[((p1[idx-1], p1[idx]), - (p2[idx-1], p2[idx]))] - if not kpath: - break - kernel += kpath # add up kernels of all paths - else: - for p1, p2 in product(spl1, spl2): - if len(p1) == len(p2): - kpath = vk_dict[(p1[0], p2[0])] - if kpath: - for idx in range(1, len(p1)): - kpath *= vk_dict[(p1[idx], p2[idx])] - if not kpath: - break - kernel += kpath # add up kernels of all paths - else: - if ek_dict: - for p1, p2 in product(spl1, spl2): - if len(p1) == len(p2): - if len(p1) == 0: - kernel += 1 - else: - kpath = 1 - for idx in range(0, len(p1) - 1): - kpath *= ek_dict[((p1[idx], p1[idx+1]), - (p2[idx], p2[idx+1]))] - if not kpath: - break - kernel += kpath # add up kernels of all paths - else: - for p1, p2 in product(spl1, spl2): - if len(p1) == len(p2): - kernel += 1 - - kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average - - # # ---- 
exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation - # # compute vertex kernel matrix - # try: - # vk_mat = np.zeros((nx.number_of_nodes(g1), - # nx.number_of_nodes(g2))) - # g1nl = enumerate(g1.nodes(data=True)) - # g2nl = enumerate(g2.nodes(data=True)) - # for i1, n1 in g1nl: - # for i2, n2 in g2nl: - # vk_mat[i1][i2] = kn( - # n1[1][node_label], n2[1][node_label], - # [n1[1]['attributes']], [n2[1]['attributes']]) - - # range1 = range(0, len(edge_w_g[i])) - # range2 = range(0, len(edge_w_g[j])) - # for i1 in range1: - # x1 = edge_x_g[i][i1] - # y1 = edge_y_g[i][i1] - # w1 = edge_w_g[i][i1] - # for i2 in range2: - # x2 = edge_x_g[j][i2] - # y2 = edge_y_g[j][i2] - # w2 = edge_w_g[j][i2] - # ke = (w1 == w2) - # if ke > 0: - # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] - # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] - # Kmatrix += kn1 + kn2 - return kernel - - -def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, - edge_kernels, itr): - i = itr[0] - j = itr[1] - return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j], - ds_attrs, node_label, edge_label, - node_kernels, edge_kernels) - - -def get_shortest_paths(G, weight, directed): - """Get all shortest paths of a graph. - - Parameters - ---------- - G : NetworkX graphs - The graphs whose paths are calculated. - weight : string/None - edge attribute used as weight to calculate the shortest path. - directed: boolean - Whether graph is directed. - - Return - ------ - sp : list of list - List of shortest paths of the graph, where each path is represented by a list of nodes. - """ - sp = [] - for n1, n2 in combinations(G.nodes(), 2): - try: - spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) - except nx.NetworkXNoPath: # nodes not connected - # sp.append([]) - pass - else: - sp += spltemp - # each edge walk is counted twice, starting from both its extreme nodes. - if not directed: - sp += [sptemp[::-1] for sptemp in spltemp] - - # add single nodes as length 0 paths. - sp += [[n] for n in G.nodes()] - return sp - - -def wrapper_getSP(weight, directed, itr_item): - g = itr_item[0] - i = itr_item[1] - return i, get_shortest_paths(g, weight, directed) \ No newline at end of file diff --git a/gklearn/kernels/unfinished/cyclicPatternKernel.py b/gklearn/kernels/unfinished/cyclicPatternKernel.py deleted file mode 100644 index b643c4b..0000000 --- a/gklearn/kernels/unfinished/cyclicPatternKernel.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -@author: linlin -@references: - [1] Tamás Horváth, Thomas Gärtner, and Stefan Wrobel. Cyclic pattern kernels for predictive graph mining. In Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining, pages 158–167. ACM, 2004. - [2] Hopcroft, J.; Tarjan, R. (1973). “Efficient algorithms for graph manipulation”. Communications of the ACM 16: 372–378. doi:10.1145/362248.362272. - [3] Finding all the elementary circuits of a directed graph. D. B. Johnson, SIAM Journal on Computing 4, no. 1, 77-84, 1975. http://dx.doi.org/10.1137/0204007 -""" - -import sys -import pathlib -sys.path.insert(0, "../") -import time - -import networkx as nx -import numpy as np - -from tqdm import tqdm - - -def cyclicpatternkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None): - """Calculate cyclic pattern graph kernels between graphs. 
- Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - / - G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - labeled : boolean - Whether the graphs are labeled. The default is True. - depth : integer - Depth of search. Longest length of paths. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the path kernel up to d between 2 praphs. - """ - Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list - Kmatrix = np.zeros((len(Gn), len(Gn))) - - start_time = time.time() - - # get all cyclic and tree patterns of all graphs before calculating kernels to save time, but this may consume a lot of memory for large dataset. - all_patterns = [ get_patterns(Gn[i], node_label=node_label, edge_label = edge_label, labeled = labeled, cycle_bound = cycle_bound) - for i in tqdm(range(0, len(Gn)), desc='retrieve patterns', file=sys.stdout) ] - - for i in tqdm(range(0, len(Gn)), desc='calculate kernels', file=sys.stdout): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _cyclicpatternkernel_do(all_patterns[i], all_patterns[j]) - Kmatrix[j][i] = Kmatrix[i][j] - - run_time = time.time() - start_time - print("\n --- kernel matrix of cyclic pattern kernel of size %d built in %s seconds ---" % (len(Gn), run_time)) - - return Kmatrix, run_time - - -def _cyclicpatternkernel_do(patterns1, patterns2): - """Calculate path graph kernels up to depth d between 2 graphs. - - Parameters - ---------- - paths1, paths2 : list - List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path. - k_func : function - A kernel function used using different notions of fingerprint similarity. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - labeled : boolean - Whether the graphs are labeled. The default is True. - - Return - ------ - kernel : float - Treelet Kernel between 2 graphs. - """ - return len(set(patterns1) & set(patterns2)) - - -def get_patterns(G, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None): - """Find all cyclic and tree patterns in a graph. - - Parameters - ---------- - G : NetworkX graphs - The graph in which paths are searched. - length : integer - The maximum length of paths. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - labeled : boolean - Whether the graphs are labeled. The default is True. - - Return - ------ - path : list - List of paths retrieved, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path. - """ - number_simplecycles = 0 - bridges = nx.Graph() - patterns = [] - - bicomponents = nx.biconnected_component_subgraphs(G) # all biconnected components of G. 
this function use algorithm in reference [2], which (i guess) is slightly different from the one used in paper [1] - for subgraph in bicomponents: - if nx.number_of_edges(subgraph) > 1: - simple_cycles = list(nx.simple_cycles(G.to_directed())) # all simple cycles in biconnected components. this function use algorithm in reference [3], which has time complexity O((n+e)(N+1)) for n nodes, e edges and N simple cycles. Which might be slower than the algorithm applied in paper [1] - if cycle_bound != None and len(simple_cycles) > cycle_bound - number_simplecycles: # in paper [1], when applying another algorithm (subroutine RT), this becomes len(simple_cycles) == cycle_bound - number_simplecycles + 1, check again. - return [] - else: - - # calculate canonical representation for each simple cycle - all_canonkeys = [] - for cycle in simple_cycles: - canonlist = [ G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label] for node in cycle[:-1] ] - canonkey = ''.join(canonlist) - canonkey = canonkey if canonkey < canonkey[::-1] else canonkey[::-1] - for i in range(1, len(cycle[:-1])): - canonlist = [ G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label] for node in cycle[i:-1] + cycle[:i] ] - canonkey_t = ''.join(canonlist) - canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1] - canonkey = canonkey if canonkey < canonkey_t else canonkey_t - all_canonkeys.append(canonkey) - - patterns = list(set(patterns) | set(all_canonkeys)) - number_simplecycles += len(simple_cycles) - else: - bridges.add_edges_from(subgraph.edges(data=True)) - - # calculate canonical representation for each connected component in bridge set - components = list(nx.connected_component_subgraphs(bridges)) # all connected components in the bridge - tree_patterns = [] - for tree in components: - break - - - - # patterns += pi(bridges) - return patterns diff --git a/gklearn/kernels/unfinished/pathKernel.py b/gklearn/kernels/unfinished/pathKernel.py deleted file mode 100644 index 3511f2c..0000000 --- a/gklearn/kernels/unfinished/pathKernel.py +++ /dev/null @@ -1,234 +0,0 @@ -""" -@author: linlin -@references: Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). -""" - -import sys -import pathlib -sys.path.insert(0, "../") -import time -import itertools -from tqdm import tqdm - -import networkx as nx -import numpy as np - -from gklearn.kernels.deltaKernel import deltakernel -from gklearn.utils.graphdataset import get_dataset_attributes - - -def pathkernel(*args, node_label='atom', edge_label='bond_type'): - """Calculate mean average path kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - / - G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - - Return - ------ - Kmatrix/kernel : Numpy matrix/float - Kernel matrix, each element of which is the path kernel between 2 praphs. / Path kernel between 2 graphs. 
- """ - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - Kmatrix = np.zeros((len(Gn), len(Gn))) - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'edge_labeled', 'is_directed'], - node_label=node_label, - edge_label=edge_label) - try: - some_weight = list(nx.get_edge_attributes(Gn[0], - edge_label).values())[0] - weight = edge_label if isinstance(some_weight, float) or isinstance( - some_weight, int) else None - except: - weight = None - - start_time = time.time() - - splist = [ - get_shortest_paths(Gn[i], weight) for i in tqdm( - range(0, len(Gn)), desc='getting shortest paths', file=sys.stdout) - ] - - pbar = tqdm( - total=((len(Gn) + 1) * len(Gn) / 2), - desc='calculating kernels', - file=sys.stdout) - if ds_attrs['node_labeled']: - if ds_attrs['edge_labeled']: - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _pathkernel_do_l(Gn[i], Gn[j], splist[i], - splist[j], node_label, - edge_label) - Kmatrix[j][i] = Kmatrix[i][j] - pbar.update(1) - else: - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _pathkernel_do_nl(Gn[i], Gn[j], splist[i], - splist[j], node_label) - Kmatrix[j][i] = Kmatrix[i][j] - pbar.update(1) - - else: - if ds_attrs['edge_labeled']: - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _pathkernel_do_el(Gn[i], Gn[j], splist[i], - splist[j], edge_label) - Kmatrix[j][i] = Kmatrix[i][j] - pbar.update(1) - else: - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _pathkernel_do_unl(Gn[i], Gn[j], splist[i], - splist[j]) - Kmatrix[j][i] = Kmatrix[i][j] - pbar.update(1) - - run_time = time.time() - start_time - print( - "\n --- mean average path kernel matrix of size %d built in %s seconds ---" - % (len(Gn), run_time)) - - return Kmatrix, run_time - - -def _pathkernel_do_l(G1, G2, sp1, sp2, node_label, edge_label): - """Calculate mean average path kernel between 2 fully-labeled graphs. - - Parameters - ---------- - G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - sp1, sp2 : list of list - List of shortest paths of 2 graphs, where each path is represented by a list of nodes. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - - Return - ------ - kernel : float - Path Kernel between 2 graphs. - """ - # calculate kernel - kernel = 0 - # if len(sp1) == 0 or len(sp2) == 0: - # return 0 # @todo: should it be zero? - for path1 in sp1: - for path2 in sp2: - if len(path1) == len(path2): - kernel_path = (G1.node[path1[0]][node_label] == G2.node[path2[ - 0]][node_label]) - if kernel_path: - for i in range(1, len(path1)): - # kernel = 1 if all corresponding nodes and edges in the 2 paths have same labels, otherwise 0 - if G1[path1[i - 1]][path1[i]][edge_label] != G2[path2[i - 1]][path2[i]][edge_label] or G1.node[path1[i]][node_label] != G2.node[path2[i]][node_label]: - kernel_path = 0 - break - kernel += kernel_path # add up kernels of all paths - - kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average - - return kernel - - -def _pathkernel_do_nl(G1, G2, sp1, sp2, node_label): - """Calculate mean average path kernel between 2 node-labeled graphs. - """ - # calculate kernel - kernel = 0 - # if len(sp1) == 0 or len(sp2) == 0: - # return 0 # @todo: should it be zero? 
- for path1 in sp1: - for path2 in sp2: - if len(path1) == len(path2): - kernel_path = 1 - for i in range(0, len(path1)): - # kernel = 1 if all corresponding nodes in the 2 paths have same labels, otherwise 0 - if G1.node[path1[i]][node_label] != G2.node[path2[i]][node_label]: - kernel_path = 0 - break - kernel += kernel_path - - kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average - - return kernel - - -def _pathkernel_do_el(G1, G2, sp1, sp2, edge_label): - """Calculate mean average path kernel between 2 edge-labeled graphs. - """ - # calculate kernel - kernel = 0 - for path1 in sp1: - for path2 in sp2: - if len(path1) == len(path2): - if len(path1) == 0: - kernel += 1 - else: - kernel_path = 1 - for i in range(0, len(path1) - 1): - # kernel = 1 if all corresponding edges in the 2 paths have same labels, otherwise 0 - if G1[path1[i]][path1[i + 1]][edge_label] != G2[path2[ - i]][path2[i + 1]][edge_label]: - kernel_path = 0 - break - kernel += kernel_path - - kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average - - return kernel - - -def _pathkernel_do_unl(G1, G2, sp1, sp2): - """Calculate mean average path kernel between 2 unlabeled graphs. - """ - # calculate kernel - kernel = 0 - for path1 in sp1: - for path2 in sp2: - if len(path1) == len(path2): - kernel += 1 - - kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average - - return kernel - - -def get_shortest_paths(G, weight): - """Get all shortest paths of a graph. - - Parameters - ---------- - G : NetworkX graphs - The graphs whose paths are calculated. - weight : string/None - edge attribute used as weight to calculate the shortest path. - - Return - ------ - sp : list of list - List of shortest paths of the graph, where each path is represented by a list of nodes. - """ - sp = [] - for n1, n2 in itertools.combinations(G.nodes(), 2): - try: - sp.append(nx.shortest_path(G, n1, n2, weight=weight)) - except nx.NetworkXNoPath: # nodes not connected - sp.append([]) - # add single nodes as length 0 paths. - sp += [[n] for n in G.nodes()] - return sp \ No newline at end of file diff --git a/gklearn/kernels/unfinished/treePatternKernel.py b/gklearn/kernels/unfinished/treePatternKernel.py deleted file mode 100644 index 5e90a1d..0000000 --- a/gklearn/kernels/unfinished/treePatternKernel.py +++ /dev/null @@ -1,241 +0,0 @@ -""" -@author: linlin -@references: Pierre Mahé and Jean-Philippe Vert. Graph kernels based on tree patterns for molecules. Machine learning, 75(1):3–35, 2009. -""" - -import sys -import pathlib -sys.path.insert(0, "../") -import time - -import networkx as nx -import numpy as np - -from collections import Counter -from tqdm import tqdm -tqdm.monitor_interval = 0 - -from gklearn.utils.utils import untotterTransformation - - -def treepatternkernel(*args, - node_label='atom', - edge_label='bond_type', - labeled=True, - kernel_type='untiln', - lmda=1, - h=1, - remove_totters=True): - """Calculate tree pattern graph kernels between graphs. - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - / - G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - labeled : boolean - Whether the graphs are labeled. The default is True. - kernel_type : string - Type of tree pattern kernel, could be 'untiln', 'size' or 'branching'. 
- lmda : float - Weight to decide whether linear patterns or trees pattern of increasing complexity are favored. - h : integer - The upper bound of the height of tree patterns. - remove_totters : boolean - whether to remove totters. The default value is True. - - Return - ------ - Kmatrix: Numpy matrix - Kernel matrix, each element of which is the tree pattern graph kernel between 2 praphs. - """ - if h < 1: - raise Exception('h > 0 is requested.') - kernel_type = kernel_type.lower() - # arrange all graphs in a list - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - Kmatrix = np.zeros((len(Gn), len(Gn))) - h = int(h) - - start_time = time.time() - - if remove_totters: - Gn = [untotterTransformation(G, node_label, edge_label) for G in Gn] - - pbar = tqdm( - total=(1 + len(Gn)) * len(Gn) / 2, - desc='calculate kernels', - file=sys.stdout) - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _treepatternkernel_do(Gn[i], Gn[j], node_label, - edge_label, labeled, - kernel_type, lmda, h) - Kmatrix[j][i] = Kmatrix[i][j] - pbar.update(1) - - run_time = time.time() - start_time - print( - "\n --- kernel matrix of tree pattern kernel of size %d built in %s seconds ---" - % (len(Gn), run_time)) - - return Kmatrix, run_time - - -def _treepatternkernel_do(G1, G2, node_label, edge_label, labeled, kernel_type, - lmda, h): - """Calculate tree pattern graph kernels between 2 graphs. - - Parameters - ---------- - paths1, paths2 : list - List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path. - k_func : function - A kernel function used using different notions of fingerprint similarity. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - labeled : boolean - Whether the graphs are labeled. The default is True. - kernel_type : string - Type of tree pattern kernel, could be 'untiln', 'size' or 'branching'. - lmda : float - Weight to decide whether linear patterns or trees pattern of increasing complexity are favored. - h : integer - The upper bound of the height of tree patterns. - - Return - ------ - kernel : float - Treelet Kernel between 2 graphs. - """ - - def matchingset(n1, n2): - """Get neiborhood matching set of two nodes in two graphs. - """ - - def mset_com(allpairs, length): - """Find all sets R of pairs by combination. 
- """ - if length == 1: - mset = [[pair] for pair in allpairs] - return mset, mset - else: - mset, mset_l = mset_com(allpairs, length - 1) - mset_tmp = [] - for pairset in mset_l: # for each pair set of length l-1 - nodeset1 = [pair[0] for pair in pairset - ] # nodes already in the set - nodeset2 = [pair[1] for pair in pairset] - for pair in allpairs: - if (pair[0] not in nodeset1) and ( - pair[1] not in nodeset2 - ): # nodes in R should be unique - mset_tmp.append( - pairset + [pair] - ) # add this pair to the pair set of length l-1, constructing a new set of length l - nodeset1.append(pair[0]) - nodeset2.append(pair[1]) - - mset.extend(mset_tmp) - - return mset, mset_tmp - - allpairs = [ - ] # all pairs those have the same node labels and edge labels - for neighbor1 in G1[n1]: - for neighbor2 in G2[n2]: - if G1.node[neighbor1][node_label] == G2.node[neighbor2][node_label] \ - and G1[n1][neighbor1][edge_label] == G2[n2][neighbor2][edge_label]: - allpairs.append([neighbor1, neighbor2]) - - if allpairs != []: - mset, _ = mset_com(allpairs, len(allpairs)) - else: - mset = [] - - return mset - - def kernel_h(h): - """Calculate kernel of h-th iteration. - """ - - if kernel_type == 'untiln': - all_kh = { str(n1) + '.' + str(n2) : (G1.node[n1][node_label] == G2.node[n2][node_label]) \ - for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ] - all_kh_tmp = all_kh.copy() - for i in range(2, h + 1): - for n1 in G1.nodes(): - for n2 in G2.nodes(): - kh = 0 - mset = all_msets[str(n1) + '.' + str(n2)] - for R in mset: - kh_tmp = 1 - for pair in R: - kh_tmp *= lmda * all_kh[str(pair[0]) - + '.' + str(pair[1])] - kh += 1 / lmda * kh_tmp - kh = (G1.node[n1][node_label] == G2.node[n2][ - node_label]) * (1 + kh) - all_kh_tmp[str(n1) + '.' + str(n2)] = kh - all_kh = all_kh_tmp.copy() - - elif kernel_type == 'size': - all_kh = { str(n1) + '.' + str(n2) : lmda * (G1.node[n1][node_label] == G2.node[n2][node_label]) \ - for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ] - all_kh_tmp = all_kh.copy() - for i in range(2, h + 1): - for n1 in G1.nodes(): - for n2 in G2.nodes(): - kh = 0 - mset = all_msets[str(n1) + '.' + str(n2)] - for R in mset: - kh_tmp = 1 - for pair in R: - kh_tmp *= lmda * all_kh[str(pair[0]) - + '.' + str(pair[1])] - kh += kh_tmp - kh *= lmda * ( - G1.node[n1][node_label] == G2.node[n2][node_label]) - all_kh_tmp[str(n1) + '.' + str(n2)] = kh - all_kh = all_kh_tmp.copy() - - elif kernel_type == 'branching': - all_kh = { str(n1) + '.' + str(n2) : (G1.node[n1][node_label] == G2.node[n2][node_label]) \ - for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ] - all_kh_tmp = all_kh.copy() - for i in range(2, h + 1): - for n1 in G1.nodes(): - for n2 in G2.nodes(): - kh = 0 - mset = all_msets[str(n1) + '.' + str(n2)] - for R in mset: - kh_tmp = 1 - for pair in R: - kh_tmp *= lmda * all_kh[str(pair[0]) - + '.' + str(pair[1])] - kh += 1 / lmda * kh_tmp - kh *= ( - G1.node[n1][node_label] == G2.node[n2][node_label]) - all_kh_tmp[str(n1) + '.' + str(n2)] = kh - all_kh = all_kh_tmp.copy() - - return all_kh - - # calculate matching sets for every pair of nodes at first to avoid calculating in every iteration. - all_msets = ({ str(node1) + '.' 
+ str(node2) : matchingset(node1, node2) for node1 in G1.nodes() \ - for node2 in G2.nodes() } if h > 1 else {}) - - all_kh = kernel_h(h) - kernel = sum(all_kh.values()) - - if kernel_type == 'size': - kernel = kernel / (lmda**h) - - return kernel diff --git a/gklearn/kernels/unfinished/weisfeilerLehmanKernel.py b/gklearn/kernels/unfinished/weisfeilerLehmanKernel.py deleted file mode 100644 index f5b903c..0000000 --- a/gklearn/kernels/unfinished/weisfeilerLehmanKernel.py +++ /dev/null @@ -1,403 +0,0 @@ -""" -@author: linlin -@references: - [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61. -""" - -import sys -import pathlib -from collections import Counter -sys.path.insert(0, "../") - -import networkx as nx -import numpy as np -import time - -from gklearn.kernels.pathKernel import pathkernel - -def weisfeilerlehmankernel(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'): - """Calculate Weisfeiler-Lehman kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - / - G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - height : int - subtree height - base_kernel : string - base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel. For user-defined kernel, base_kernel is the name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. - - Notes - ----- - This function now supports WL subtree kernel, WL shortest path kernel and WL edge kernel. - """ - base_kernel = base_kernel.lower() - Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list - Kmatrix = np.zeros((len(Gn), len(Gn))) - - start_time = time.time() - - # for WL subtree kernel - if base_kernel == 'subtree': - Kmatrix = _wl_subtreekernel_do(args[0], node_label, edge_label, height) - - # for WL shortest path kernel - elif base_kernel == 'sp': - Kmatrix = _wl_spkernel_do(args[0], node_label, edge_label, height) - - # for WL edge kernel - elif base_kernel == 'edge': - Kmatrix = _wl_edgekernel_do(args[0], node_label, edge_label, height) - - # for user defined base kernel - else: - Kmatrix = _wl_userkernel_do(args[0], node_label, edge_label, height, base_kernel) - - run_time = time.time() - start_time - print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), run_time)) - - return Kmatrix, run_time - - - -def _wl_subtreekernel_do(Gn, node_label, edge_label, height): - """Calculate Weisfeiler-Lehman subtree kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - height : int - subtree height. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. 
- """ - height = int(height) - Kmatrix = np.zeros((len(Gn), len(Gn))) - all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs - - # initial for height = 0 - all_labels_ori = set() # all unique orignal labels in all graphs in this iteration - all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration - all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration - num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs - - # for each graph - for G in Gn: - # get the set of original labels - labels_ori = list(nx.get_node_attributes(G, node_label).values()) - all_labels_ori.update(labels_ori) - num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph - all_num_of_each_label.append(num_of_each_label) - num_of_labels = len(num_of_each_label) # number of all unique labels - - all_labels_ori.update(labels_ori) - - all_num_of_labels_occured += len(all_labels_ori) - - # calculate subtree kernel with the 0th iteration and add it to the final kernel - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys())) - vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ]) - vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ]) - Kmatrix[i][j] += np.dot(vector1, vector2.transpose()) - Kmatrix[j][i] = Kmatrix[i][j] - - # iterate each height - for h in range(1, height + 1): - all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration - num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs - all_labels_ori = set() - all_num_of_each_label = [] - - # for each graph - for idx, G in enumerate(Gn): - - set_multisets = [] - for node in G.nodes(data = True): - # Multiset-label determination. - multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ] - # sorting each multiset - multiset.sort() - multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix - set_multisets.append(multiset) - - # label compression - set_unique = list(set(set_multisets)) # set of unique multiset labels - # a dictionary mapping original labels to new ones. 
- set_compressed = {} - # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label - for value in set_unique: - if value in all_set_compressed.keys(): - set_compressed.update({ value : all_set_compressed[value] }) - else: - set_compressed.update({ value : str(num_of_labels_occured + 1) }) - num_of_labels_occured += 1 - - all_set_compressed.update(set_compressed) - - # relabel nodes - for node in G.nodes(data = True): - node[1][node_label] = set_compressed[set_multisets[node[0]]] - - # get the set of compressed labels - labels_comp = list(nx.get_node_attributes(G, node_label).values()) - all_labels_ori.update(labels_comp) - num_of_each_label = dict(Counter(labels_comp)) - all_num_of_each_label.append(num_of_each_label) - - all_num_of_labels_occured += len(all_labels_ori) - - # calculate subtree kernel with h iterations and add it to the final kernel - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys())) - vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ]) - vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ]) - Kmatrix[i][j] += np.dot(vector1, vector2.transpose()) - Kmatrix[j][i] = Kmatrix[i][j] - - return Kmatrix - - -def _wl_spkernel_do(Gn, node_label, edge_label, height): - """Calculate Weisfeiler-Lehman shortest path kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - height : int - subtree height. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. - """ - from gklearn.utils.utils import getSPGraph - - # init. - height = int(height) - Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel - - Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn - - # initial for height = 0 - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - for e1 in Gn[i].edges(data = True): - for e2 in Gn[j].edges(data = True): - if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): - Kmatrix[i][j] += 1 - Kmatrix[j][i] = Kmatrix[i][j] - - # iterate each height - for h in range(1, height + 1): - all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration - num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs - for G in Gn: # for each graph - set_multisets = [] - for node in G.nodes(data = True): - # Multiset-label determination. - multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ] - # sorting each multiset - multiset.sort() - multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix - set_multisets.append(multiset) - - # label compression - set_unique = list(set(set_multisets)) # set of unique multiset labels - # a dictionary mapping original labels to new ones. 
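# For reference, getSPGraph() used above comes from gklearn.utils.utils; a rough
# stand-in, assuming it returns a graph on the same nodes whose edge attribute
# 'cost' holds the shortest-path length in the original graph, could look like:
import networkx as nx

def sp_graph(G, edge_weight='weight'):
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    lengths = dict(nx.all_pairs_dijkstra_path_length(G, weight=edge_weight))
    for u, dists in lengths.items():
        for v, cost in dists.items():
            if u != v:
                S.add_edge(u, v, cost=cost)
    return S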
- set_compressed = {} - # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label - for value in set_unique: - if value in all_set_compressed.keys(): - set_compressed.update({ value : all_set_compressed[value] }) - else: - set_compressed.update({ value : str(num_of_labels_occured + 1) }) - num_of_labels_occured += 1 - - all_set_compressed.update(set_compressed) - - # relabel nodes - for node in G.nodes(data = True): - node[1][node_label] = set_compressed[set_multisets[node[0]]] - - # calculate subtree kernel with h iterations and add it to the final kernel - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - for e1 in Gn[i].edges(data = True): - for e2 in Gn[j].edges(data = True): - if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): - Kmatrix[i][j] += 1 - Kmatrix[j][i] = Kmatrix[i][j] - - return Kmatrix - - - -def _wl_edgekernel_do(Gn, node_label, edge_label, height): - """Calculate Weisfeiler-Lehman edge kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - height : int - subtree height. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. - """ - # init. - height = int(height) - Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel - - # initial for height = 0 - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - for e1 in Gn[i].edges(data = True): - for e2 in Gn[j].edges(data = True): - if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): - Kmatrix[i][j] += 1 - Kmatrix[j][i] = Kmatrix[i][j] - - # iterate each height - for h in range(1, height + 1): - all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration - num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs - for G in Gn: # for each graph - set_multisets = [] - for node in G.nodes(data = True): - # Multiset-label determination. - multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ] - # sorting each multiset - multiset.sort() - multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix - set_multisets.append(multiset) - - # label compression - set_unique = list(set(set_multisets)) # set of unique multiset labels - # a dictionary mapping original labels to new ones. 
- set_compressed = {} - # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label - for value in set_unique: - if value in all_set_compressed.keys(): - set_compressed.update({ value : all_set_compressed[value] }) - else: - set_compressed.update({ value : str(num_of_labels_occured + 1) }) - num_of_labels_occured += 1 - - all_set_compressed.update(set_compressed) - - # relabel nodes - for node in G.nodes(data = True): - node[1][node_label] = set_compressed[set_multisets[node[0]]] - - # calculate subtree kernel with h iterations and add it to the final kernel - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - for e1 in Gn[i].edges(data = True): - for e2 in Gn[j].edges(data = True): - if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])): - Kmatrix[i][j] += 1 - Kmatrix[j][i] = Kmatrix[i][j] - - return Kmatrix - - -def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel): - """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - height : int - subtree height. - base_kernel : string - Name of the base kernel function used in each iteration of WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 praphs. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. - """ - # init. - height = int(height) - Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel - - # initial for height = 0 - Kmatrix = base_kernel(Gn, node_label, edge_label) - - # iterate each height - for h in range(1, height + 1): - all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration - num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs - for G in Gn: # for each graph - set_multisets = [] - for node in G.nodes(data = True): - # Multiset-label determination. - multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ] - # sorting each multiset - multiset.sort() - multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix - set_multisets.append(multiset) - - # label compression - set_unique = list(set(set_multisets)) # set of unique multiset labels - # a dictionary mapping original labels to new ones. 
- set_compressed = {} - # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label - for value in set_unique: - if value in all_set_compressed.keys(): - set_compressed.update({ value : all_set_compressed[value] }) - else: - set_compressed.update({ value : str(num_of_labels_occured + 1) }) - num_of_labels_occured += 1 - - all_set_compressed.update(set_compressed) - - # relabel nodes - for node in G.nodes(data = True): - node[1][node_label] = set_compressed[set_multisets[node[0]]] - - # calculate kernel with h iterations and add it to the final kernel - Kmatrix += base_kernel(Gn, node_label, edge_label) - - return Kmatrix diff --git a/gklearn/preimage/common_types.py b/gklearn/preimage/common_types.py deleted file mode 100644 index 2face25..0000000 --- a/gklearn/preimage/common_types.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Mar 19 18:17:38 2020 - -@author: ljia -""" - -from enum import Enum, auto - -class AlgorithmState(Enum): - """can be used to specify the state of an algorithm. - """ - CALLED = auto # The algorithm has been called. - INITIALIZED = auto # The algorithm has been initialized. - CONVERGED = auto # The algorithm has converged. - TERMINATED = auto # The algorithm has terminated. \ No newline at end of file diff --git a/gklearn/preimage/cpp2python.py b/gklearn/preimage/cpp2python.py deleted file mode 100644 index 9d63026..0000000 --- a/gklearn/preimage/cpp2python.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Mar 20 11:09:04 2020 - -@author: ljia -""" -import re - -def convert_function(cpp_code): -# f_cpp = open('cpp_code.cpp', 'r') -# # f_cpp = open('cpp_ext/src/median_graph_estimator.ipp', 'r') -# cpp_code = f_cpp.read() - python_code = cpp_code.replace('else if (', 'elif ') - python_code = python_code.replace('if (', 'if ') - python_code = python_code.replace('else {', 'else:') - python_code = python_code.replace(') {', ':') - python_code = python_code.replace(';\n', '\n') - python_code = re.sub('\n(.*)}\n', '\n\n', python_code) - # python_code = python_code.replace('}\n', '') - python_code = python_code.replace('throw', 'raise') - python_code = python_code.replace('error', 'Exception') - python_code = python_code.replace('"', '\'') - python_code = python_code.replace('\\\'', '"') - python_code = python_code.replace('try {', 'try:') - python_code = python_code.replace('true', 'True') - python_code = python_code.replace('false', 'False') - python_code = python_code.replace('catch (...', 'except') - # python_code = re.sub('std::string\(\'(.*)\'\)', '$1', python_code) - - return python_code - - - -# # python_code = python_code.replace('}\n', '') - - - - -# python_code = python_code.replace('option.first', 'opt_name') -# python_code = python_code.replace('option.second', 'opt_val') -# python_code = python_code.replace('ged::Error', 'Exception') -# python_code = python_code.replace('std::string(\'Invalid argument "\')', '\'Invalid argument "\'') - - -# f_cpp.close() -# f_python = open('python_code.py', 'w') -# f_python.write(python_code) -# f_python.close() - - -def convert_function_comment(cpp_fun_cmt, param_types): - cpp_fun_cmt = cpp_fun_cmt.replace('\t', '') - cpp_fun_cmt = cpp_fun_cmt.replace('\n * ', ' ') - # split the input comment according to key words. 
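# A small illustration (hypothetical input) of the keyword split performed
# below on a Doxygen-style comment:
cmt = '@brief Returns the sum of distances. @param[in] state The state. @note Only valid after run().'
brief, rest = cmt.split('@brief')[1].split('@param', 1)
params, note = rest.split('@note')
# brief  -> ' Returns the sum of distances. '
# params -> '[in] state The state. '
# note   -> ' Only valid after run().'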
- param_split = None - note = None - cmt_split = cpp_fun_cmt.split('@brief')[1] - brief = cmt_split - if '@param' in cmt_split: - cmt_split = cmt_split.split('@param') - brief = cmt_split[0] - param_split = cmt_split[1:] - if '@note' in cmt_split[-1]: - note_split = cmt_split[-1].split('@note') - if param_split is not None: - param_split.pop() - param_split.append(note_split[0]) - else: - brief = note_split[0] - note = note_split[1] - - # get parameters. - if param_split is not None: - for idx, param in enumerate(param_split): - _, param_name, param_desc = param.split(' ', 2) - param_name = function_comment_strip(param_name, ' *\n\t/') - param_desc = function_comment_strip(param_desc, ' *\n\t/') - param_split[idx] = (param_name, param_desc) - - # strip comments. - brief = function_comment_strip(brief, ' *\n\t/') - if note is not None: - note = function_comment_strip(note, ' *\n\t/') - - # construct the Python function comment. - python_fun_cmt = '"""' - python_fun_cmt += brief + '\n' - if param_split is not None and len(param_split) > 0: - python_fun_cmt += '\nParameters\n----------' - for idx, param in enumerate(param_split): - python_fun_cmt += '\n' + param[0] + ' : ' + param_types[idx] - python_fun_cmt += '\n\t' + param[1] + '\n' - if note is not None: - python_fun_cmt += '\nNote\n----\n' + note + '\n' - python_fun_cmt += '"""' - - return python_fun_cmt - - -def function_comment_strip(comment, bad_chars): - head_removed, tail_removed = False, False - while not head_removed or not tail_removed: - if comment[0] in bad_chars: - comment = comment[1:] - head_removed = False - else: - head_removed = True - if comment[-1] in bad_chars: - comment = comment[:-1] - tail_removed = False - else: - tail_removed = True - - return comment - - -if __name__ == '__main__': -# python_code = convert_function(""" -# if (print_to_stdout_ == 2) { -# std::cout << "\n===========================================================\n"; -# std::cout << "Block gradient descent for initial median " << median_pos + 1 << " of " << medians.size() << ".\n"; -# std::cout << "-----------------------------------------------------------\n"; -# } -# """) - - - python_fun_cmt = convert_function_comment(""" - /*! - * @brief Returns the sum of distances. - * @param[in] state The state of the estimator. - * @return The sum of distances of the median when the estimator was in the state @p state during the last call to run(). 
- */ - """, ['string', 'string']) \ No newline at end of file diff --git a/gklearn/preimage/find_best_k.py b/gklearn/preimage/find_best_k.py deleted file mode 100644 index df38d32..0000000 --- a/gklearn/preimage/find_best_k.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Jan 9 11:54:32 2020 - -@author: ljia -""" -import numpy as np -import random -import csv - -from gklearn.utils.graphfiles import loadDataset -from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs - -def find_best_k(): - ds = {'name': 'monoterpenoides', - 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:50] - gkernel = 'treeletkernel' - node_label = 'atom' - edge_label = 'bond_type' - ds_name = 'mono' - dir_output = 'results/test_find_best_k/' - - repeats = 50 - k_list = range(2, 11) - fit_method = 'k-graphs' - # fitted on the whole dataset - treelet - mono - edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986, 0.15328856114451297, 0.3109956881625734, 0.0] - - # create result files. - fn_output_detail = 'results_detail.' + fit_method + '.csv' - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k', - 'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM']) - f_detail.close() - fn_output_summary = 'results_summary.csv' - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k', - 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM', - 'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM', - 'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM', - '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM', - 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM', - 'repeats better dis_k gi -> GM']) - f_summary.close() - - random.seed(1) - rdn_seed_list = random.sample(range(0, repeats * 100), repeats) - - for k in k_list: - print('\n--------- k =', k, '----------') - - sod_sm_list = [] - sod_gm_list = [] - dis_k_sm_list = [] - dis_k_gm_list = [] - dis_k_gi_min_list = [] - nb_sod_sm2gm = [0, 0, 0] - nb_dis_k_sm2gm = [0, 0, 0] - nb_dis_k_gi2sm = [0, 0, 0] - nb_dis_k_gi2gm = [0, 0, 0] - repeats_better_sod_sm2gm = [] - repeats_better_dis_k_sm2gm = [] - repeats_better_dis_k_gi2sm = [] - repeats_better_dis_k_gi2gm = [] - - - for repeat in range(repeats): - print('\nrepeat =', repeat) - random.seed(rdn_seed_list[repeat]) - median_set_idx = random.sample(range(0, len(Gn)), k) - print('median set: ', median_set_idx) - - sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \ - = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, - fit_method='k-graphs', - edit_costs=edit_costs, - group_min=median_set_idx, - parallel=False) - - # write result detail. 
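# The relation columns written below use the getRelations() helper defined at
# the end of this file: np.sign of the difference is mapped to a verdict, so a
# negative sign (the generalised median has a smaller SOD than the set median)
# becomes 'better'. A minimal sketch with made-up numbers:
import numpy as np

relation = {-1: 'better', 0: 'same', 1: 'worse'}
sod_sm_example, sod_gm_example = 12.4, 10.1
print(relation[int(np.sign(sod_gm_example - sod_sm_example))])   # 'better'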
- sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm)) - dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm)) - dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min)) - dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min)) - f_detail = open(dir_output + fn_output_detail, 'a') - csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, repeat, - median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm, - dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm, - dis_k_gi2gm]) - f_detail.close() - - # compute result summary. - sod_sm_list.append(sod_sm) - sod_gm_list.append(sod_gm) - dis_k_sm_list.append(dis_k_sm) - dis_k_gm_list.append(dis_k_gm) - dis_k_gi_min_list.append(dis_k_gi_min) - # # SOD SM -> GM - if sod_sm > sod_gm: - nb_sod_sm2gm[0] += 1 - repeats_better_sod_sm2gm.append(repeat) - elif sod_sm == sod_gm: - nb_sod_sm2gm[1] += 1 - elif sod_sm < sod_gm: - nb_sod_sm2gm[2] += 1 - # # dis_k SM -> GM - if dis_k_sm > dis_k_gm: - nb_dis_k_sm2gm[0] += 1 - repeats_better_dis_k_sm2gm.append(repeat) - elif dis_k_sm == dis_k_gm: - nb_dis_k_sm2gm[1] += 1 - elif dis_k_sm < dis_k_gm: - nb_dis_k_sm2gm[2] += 1 - # # dis_k gi -> SM - if dis_k_gi_min > dis_k_sm: - nb_dis_k_gi2sm[0] += 1 - repeats_better_dis_k_gi2sm.append(repeat) - elif dis_k_gi_min == dis_k_sm: - nb_dis_k_gi2sm[1] += 1 - elif dis_k_gi_min < dis_k_sm: - nb_dis_k_gi2sm[2] += 1 - # # dis_k gi -> GM - if dis_k_gi_min > dis_k_gm: - nb_dis_k_gi2gm[0] += 1 - repeats_better_dis_k_gi2gm.append(repeat) - elif dis_k_gi_min == dis_k_gm: - nb_dis_k_gi2gm[1] += 1 - elif dis_k_gi_min < dis_k_gm: - nb_dis_k_gi2gm[2] += 1 - - # write result summary. - sod_sm_mean = np.mean(sod_sm_list) - sod_gm_mean = np.mean(sod_gm_list) - dis_k_sm_mean = np.mean(dis_k_sm_list) - dis_k_gm_mean = np.mean(dis_k_gm_list) - dis_k_gi_min_mean = np.mean(dis_k_gi_min_list) - sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean)) - dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean)) - dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean)) - dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean)) - f_summary = open(dir_output + fn_output_summary, 'a') - csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, - sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, - dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean, - dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm, - nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm, - repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm, - repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm]) - f_summary.close() - - print('\ncomplete.') - return - - -def getRelations(sign): - if sign == -1: - return 'better' - elif sign == 0: - return 'same' - elif sign == 1: - return 'worse' - - -if __name__ == '__main__': - find_best_k() \ No newline at end of file diff --git a/gklearn/preimage/fitDistance.py b/gklearn/preimage/fitDistance.py deleted file mode 100644 index 234f7fc..0000000 --- a/gklearn/preimage/fitDistance.py +++ /dev/null @@ -1,430 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Oct 16 14:20:06 2019 - -@author: ljia -""" -import numpy as np -from tqdm import tqdm -from itertools import combinations_with_replacement, combinations -import multiprocessing -from multiprocessing import Pool -from functools import partial -import time -import random -import sys - -from scipy import optimize -from scipy.optimize import minimize -import cvxpy as cp - -from gklearn.preimage.ged import GED, get_nb_edit_operations, 
get_nb_edit_operations_letter, get_nb_edit_operations_nonsymbolic -from gklearn.preimage.utils import kernel_distance_matrix - -def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max, - params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT', - 'method': 'IPFP', 'stabilizer': None}, - init_costs=[3, 3, 1, 3, 3, 1], - dataset='monoterpenoides', Kmatrix=None, - parallel=True): -# dataset = dataset.lower() - - # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them. -# random.seed(1) -# cost_rdm = random.sample(range(1, 10), 6) -# init_costs = cost_rdm + [0] -# init_costs = cost_rdm -# init_costs = [3, 3, 1, 3, 3, 1] -# init_costs = [i * 0.01 for i in cost_rdm] + [0] -# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0] -# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0] -# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0] -# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0] - - # compute distances in feature space. - dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, - Kmatrix=Kmatrix, gkernel=gkernel) - dis_k_vec = [] - for i in range(len(dis_k_mat)): -# for j in range(i, len(dis_k_mat)): - for j in range(i + 1, len(dis_k_mat)): - dis_k_vec.append(dis_k_mat[i, j]) - dis_k_vec = np.array(dis_k_vec) - - # init ged. - print('\ninitial:') - time0 = time.time() - params_ged['dataset'] = dataset - params_ged['edit_cost_constant'] = init_costs - ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, - parallel=parallel) - residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] - time_list = [time.time() - time0] - edit_cost_list = [init_costs] - nb_cost_mat = np.array(n_edit_operations) - nb_cost_mat_list = [nb_cost_mat] - print('edit_costs:', init_costs) - print('residual_list:', residual_list) - - for itr in range(itr_max): - print('\niteration', itr) - time0 = time.time() - # "fit" geds to distances in feature space by tuning edit costs using the - # Least Squares Method. - np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm', - nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec, - n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init, - ged_mat=ged_mat) - edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec, - dataset=dataset, cost=params_ged['cost']) - for i in range(len(edit_costs_new)): - if -1e-9 <= edit_costs_new[i] <= 1e-9: - edit_costs_new[i] = 0 - if edit_costs_new[i] < 0: - raise ValueError('The edit cost is negative.') -# for i in range(len(edit_costs_new)): -# if edit_costs_new[i] < 0: -# edit_costs_new[i] = 0 - - # compute new GEDs and numbers of edit operations. 
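# The call to update_costs() above boils down to fitting non-negative edit
# costs c so that N @ c approximates the kernel distances d, where row p of N
# counts the edit operations used on graph pair p. A minimal sketch of that
# fit with made-up data, using scipy's non-negative least squares:
import numpy as np
from scipy.optimize import nnls

rng = np.random.default_rng(0)
N = rng.random((45, 6))        # e.g. 45 graph pairs x 6 edit-operation counts
d = rng.random(45)             # target distances in the kernel feature space
costs, residual_norm = nnls(N, d)
print(costs, residual_norm)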
- params_ged['edit_cost_constant'] = edit_costs_new # np.array([edit_costs_new[0], edit_costs_new[1], 0.75]) - ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged, - parallel=parallel) - residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) - time_list.append(time.time() - time0) - edit_cost_list.append(edit_costs_new) - nb_cost_mat = np.array(n_edit_operations) - nb_cost_mat_list.append(nb_cost_mat) - print('edit_costs:', edit_costs_new) - print('residual_list:', residual_list) - - return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \ - time_list, nb_cost_mat_list - - -def compute_geds(Gn, params_ged, parallel=False): - edit_cost_name = params_ged['cost'] - if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2': - get_nb_eo = get_nb_edit_operations_letter - elif edit_cost_name == 'NON_SYMBOLIC': - get_nb_eo = get_nb_edit_operations_nonsymbolic - else: - get_nb_eo = get_nb_edit_operations - ged_mat = np.zeros((len(Gn), len(Gn))) - if parallel: -# print('parallel') -# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) - len_itr = int(len(Gn) * (len(Gn) - 1) / 2) - ged_vec = [0 for i in range(len_itr)] - n_edit_operations = [0 for i in range(len_itr)] -# itr = combinations_with_replacement(range(0, len(Gn)), 2) - itr = combinations(range(0, len(Gn)), 2) - n_jobs = multiprocessing.cpu_count() - if len_itr < 100 * n_jobs: - chunksize = int(len_itr / n_jobs) + 1 - else: - chunksize = 100 - def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) - iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), - desc='computing GEDs', file=sys.stdout) -# iterator = pool.imap_unordered(do_partial, itr, chunksize) - for i, j, dis, n_eo_tmp in iterator: - idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2) - ged_vec[idx_itr] = dis - ged_mat[i][j] = dis - ged_mat[j][i] = dis - n_edit_operations[idx_itr] = n_eo_tmp -# print('\n-------------------------------------------') -# print(i, j, idx_itr, dis) - pool.close() - pool.join() - - else: - ged_vec = [] - n_edit_operations = [] - for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): -# for i in range(len(Gn)): - for j in range(i + 1, len(Gn)): - dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged) - ged_vec.append(dis) - ged_mat[i][j] = dis - ged_mat[j][i] = dis - n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward) - n_edit_operations.append(n_eo_tmp) - - return ged_vec, ged_mat, n_edit_operations - - -def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr): - i = itr[0] - j = itr[1] - dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged, get_nb_eo) - return i, j, dis, n_eo_tmp - - -def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo): - dis, pi_forward, pi_backward = GED(g1, g2, **params_ged) - n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward) # [0,0,0,0,0,0] - return dis, n_eo_tmp - - -def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides', - cost='CONSTANT', rw_constraints='inequality'): -# if dataset == 'Letter-high': - if cost == 'LETTER': - pass -# # method 1: set alpha automatically, just tune c_vir and c_eir by -# # LMS using cvxpy. 
-# alpha = 0.5 -# coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec) -## if np.count_nonzero(nb_cost_mat[:,4]) == 0: -## alpha = 0.75 -## else: -## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0]) -## alpha = alpha * 0.99 -# param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1]) -# param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5]) -# nb_cost_mat_new = np.column_stack((param_vir, param_eir)) -# dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3] -# -# x = cp.Variable(nb_cost_mat_new.shape[1]) -# cost = cp.sum_squares(nb_cost_mat_new * x - dis_new) -# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] -# prob = cp.Problem(cp.Minimize(cost), constraints) -# prob.solve() -# edit_costs_new = x.value -# edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha]) -# residual = np.sqrt(prob.value) - -# # method 2: tune c_vir, c_eir and alpha by nonlinear programming by -# # scipy.optimize.minimize. -# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] -# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] -# w2 = nb_cost_mat[:,3] -# w3 = dis_k_vec -# func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ -# + w2 * x[2] - w3 * x[3]) ** 2) -# bounds = ((0, None), (0., None), (0.5, 0.5), (0, None)) -# res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds) -# edit_costs_new = res.x[0:3] -# residual = res.fun - - # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy. - - -# # method 4: tune c_vir, c_eir and alpha by QP function -# # scipy.optimize.least_squares. An initial guess is required. -# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] -# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] -# w2 = nb_cost_mat[:,3] -# w3 = dis_k_vec -# func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ -# + w2 * x[2] - w3 * x[3]) ** 2 -# res = optimize.root(func, [0.9, 1.7, 0.75, 100]) -# edit_costs_new = res.x -# residual = None - elif cost == 'LETTER2': -# # 1. if c_vi != c_vr, c_ei != c_er. -# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] -# x = cp.Variable(nb_cost_mat_new.shape[1]) -# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) -## # 1.1 no constraints. -## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] -# # 1.2 c_vs <= c_vi + c_vr. -# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], -# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] -## # 2. if c_vi == c_vr, c_ei == c_er. -## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]] -## nb_cost_mat_new[:,0] += nb_cost_mat[:,1] -## nb_cost_mat_new[:,2] += nb_cost_mat[:,5] -## x = cp.Variable(nb_cost_mat_new.shape[1]) -## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) -## # 2.1 no constraints. -## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] -### # 2.2 c_vs <= c_vi + c_vr. -### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], -### np.array([2.0, -1.0, 0.0]).T@x >= 0.0] -# -# prob = cp.Problem(cp.Minimize(cost_fun), constraints) -# prob.solve() -# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] -# edit_costs_new = np.array(edit_costs_new) -# residual = np.sqrt(prob.value) - if rw_constraints == 'inequality': - # c_vs <= c_vi + c_vr. 
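# In this branch the fitted cost vector x is ordered as
# [c_vi, c_vr, c_vs, c_ei, c_er], so the vector [1, 1, -1, 0, 0] used below
# encodes c_vs <= c_vi + c_vr as the linear inequality a @ x >= 0. A quick
# check with made-up cost vectors:
import numpy as np

a = np.array([1.0, 1.0, -1.0, 0.0, 0.0])
x_ok = np.array([0.3, 0.3, 0.5, 0.1, 0.1])    # 0.5 <= 0.3 + 0.3: satisfied
x_bad = np.array([0.1, 0.1, 0.5, 0.1, 0.1])   # 0.5 >  0.1 + 0.1: violated
print(a @ x_ok >= 0, a @ x_bad >= 0)          # True False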
- nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - try: - prob.solve(verbose=True) - except MemoryError as error0: - print('\nUsing solver "OSQP" caused a memory error.') - print('the original error message is\n', error0) - print('solver status: ', prob.status) - print('trying solver "CVXOPT" instead...\n') - try: - prob.solve(solver=cp.CVXOPT, verbose=True) - except Exception as error1: - print('\nAn error occured when using solver "CVXOPT".') - print('the original error message is\n', error1) - print('solver status: ', prob.status) - print('trying solver "MOSEK" instead. Notice this solver is commercial and a lisence is required.\n') - prob.solve(solver=cp.MOSEK, verbose=True) - else: - print('solver status: ', prob.status) - else: - print('solver status: ', prob.status) - print() - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif rw_constraints == '2constraints': - # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er. - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0, - np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0, - np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif rw_constraints == 'no-constraint': - # no constraint. - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = x.value - residual = np.sqrt(prob.value) -# elif method == 'inequality_modified': -# # c_vs <= c_vi + c_vr. -# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] -# x = cp.Variable(nb_cost_mat_new.shape[1]) -# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) -# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], -# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] -# prob = cp.Problem(cp.Minimize(cost_fun), constraints) -# prob.solve() -# # use same costs for insertion and removal rather than the fitted costs. 
-# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] -# edit_costs_new = np.array(edit_costs_new) -# residual = np.sqrt(prob.value) - elif cost == 'NON_SYMBOLIC': - is_n_attr = np.count_nonzero(nb_cost_mat[:,2]) - is_e_attr = np.count_nonzero(nb_cost_mat[:,5]) - - if dataset == 'SYNTHETICnew': -# nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] - nb_cost_mat_new = nb_cost_mat[:,[2,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) -# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], -# np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] -# constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]] - constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])], - np.array([0.0, 1.0, -1.0]).T@x == 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() -# print(x.value) - edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value, - np.array([0.0]))) - residual = np.sqrt(prob.value) - - elif rw_constraints == 'inequality': - # c_vs <= c_vi + c_vr. - if is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, - np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif is_n_attr and not is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - print(x.value) - edit_costs_new = np.concatenate((x.value, np.array([0.0]))) - residual = np.sqrt(prob.value) - elif not is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) - residual = np.sqrt(prob.value) - else: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), - x.value[2:], np.array([0.0]))) - residual = np.sqrt(prob.value) - else: -# # method 1: simple least square method. -# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, -# rcond=None) - -# # method 2: least square method with x_i >= 0. -# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) - - # method 3: solve as a quadratic program with constraints. 
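# The commented-out quadratic program below and the active cp.sum_squares fit
# that follows it minimise the same objective, since
# ||N x - d||^2 = x^T (N^T N) x - 2 d^T N x + d^T d and the constant term does
# not move the minimiser. A quick numerical check with made-up data:
import numpy as np
import cvxpy as cp

rng = np.random.default_rng(1)
N = rng.random((30, 6))
d = rng.random(30)

x = cp.Variable(6)
ls = cp.Problem(cp.Minimize(cp.sum_squares(N @ x - d)), [x >= 0])
ls.solve()
x_ls = x.value.copy()

q = -2.0 * (N.T @ d)
qp = cp.Problem(cp.Minimize(cp.quad_form(x, N.T @ N) + q @ x), [x >= 0])
qp.solve()
print(np.abs(x_ls - x.value).max())   # ~0 up to solver tolerance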
-# P = np.dot(nb_cost_mat.T, nb_cost_mat) -# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) -# G = -1 * np.identity(nb_cost_mat.shape[1]) -# h = np.array([0 for i in range(nb_cost_mat.shape[1])]) -# A = np.array([1 for i in range(nb_cost_mat.shape[1])]) -# b = 1 -# x = cp.Variable(nb_cost_mat.shape[1]) -# prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x), -# [G@x <= h]) -# prob.solve() -# edit_costs_new = x.value -# residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec) - -# G = -1 * np.identity(nb_cost_mat.shape[1]) -# h = np.array([0 for i in range(nb_cost_mat.shape[1])]) - x = cp.Variable(nb_cost_mat.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], - # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, - np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = x.value - residual = np.sqrt(prob.value) - - # method 4: - - return edit_costs_new, residual - - -if __name__ == '__main__': - print('check test_fitDistance.py') \ No newline at end of file diff --git a/gklearn/preimage/ged.py b/gklearn/preimage/ged.py deleted file mode 100644 index a66baaf..0000000 --- a/gklearn/preimage/ged.py +++ /dev/null @@ -1,467 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Thu Oct 17 18:44:59 2019 - -@author: ljia -""" -import numpy as np -import networkx as nx -from tqdm import tqdm -import sys -import multiprocessing -from multiprocessing import Pool -from functools import partial - -#from gedlibpy_linlin import librariesImport, gedlibpy -from gklearn.gedlib import librariesImport, gedlibpy - -def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP', - edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50): - """ - Compute GED for 2 graphs. - """ - -# dataset = dataset.lower() - - if lib == 'gedlibpy': - gedlibpy.restart_env() - gedlibpy.add_nx_graph(convertGraph(g1, cost), "") - gedlibpy.add_nx_graph(convertGraph(g2, cost), "") - - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) - gedlibpy.init() - gedlibpy.set_method(method, algo_options) - gedlibpy.init_method() - - g = listID[0] - h = listID[1] - if stabilizer is None: - gedlibpy.run_method(g, h) - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - upper = gedlibpy.get_upper_bound(g, h) - lower = gedlibpy.get_lower_bound(g, h) - elif stabilizer == 'mean': - # @todo: to be finished... 
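# The stabilizers in this function re-run a randomised GED solver and aggregate
# the upper bounds it returns; the 'min' variant, for instance, amounts to the
# following sketch (run_once is a hypothetical callable returning
# (upper_bound, forward_map, backward_map)):
import numpy as np

def stabilised_min(run_once, repeat=50):
    best = (np.inf, None, None)
    for _ in range(repeat):
        result = run_once()
        if result[0] < best[0]:
            best = result
        if best[0] == 0:          # an exact match cannot be improved
            break
    return best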
- upper_list = [np.inf] * repeat - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_list[itr] = gedlibpy.get_upper_bound(g, h) - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - upper = np.mean(upper_list) - elif stabilizer == 'median': - if repeat % 2 == 0: - repeat += 1 - upper_list = [np.inf] * repeat - pi_forward_list = [0] * repeat - pi_backward_list = [0] * repeat - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_list[itr] = gedlibpy.get_upper_bound(g, h) - pi_forward_list[itr] = gedlibpy.get_forward_map(g, h) - pi_backward_list[itr] = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - upper = np.median(upper_list) - idx_median = upper_list.index(upper) - pi_forward = pi_forward_list[idx_median] - pi_backward = pi_backward_list[idx_median] - elif stabilizer == 'min': - upper = np.inf - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_tmp = gedlibpy.get_upper_bound(g, h) - if upper_tmp < upper: - upper = upper_tmp - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - if upper == 0: - break - elif stabilizer == 'max': - upper = 0 - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_tmp = gedlibpy.get_upper_bound(g, h) - if upper_tmp > upper: - upper = upper_tmp - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - elif stabilizer == 'gaussian': - pass - - dis = upper - - elif lib == 'gedlib-bash': - import time - import random - import os - from gklearn.utils.graphfiles import saveDataset - - tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/' - if not os.path.exists(tmp_dir): - os.makedirs(tmp_dir) - fn_collection = tmp_dir + 'collection.' 
+ str(time.time()) + str(random.randint(0, 1e9)) - xparams = {'method': 'gedlib', 'graph_dir': fn_collection} - saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml', - filename=fn_collection, xparams=xparams) - - command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n' - command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' - command += 'export LD_LIBRARY_PATH\n' - command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n' - command += './ged_for_python_bash monoterpenoides ' + fn_collection \ - + ' \'' + algo_options + '\' ' - for ec in edit_cost_constant: - command += str(ec) + ' ' -# output = os.system(command) - stream = os.popen(command) - output = stream.readlines() -# print(output) - - dis = float(output[0].strip()) - runtime = float(output[1].strip()) - size_forward = int(output[2].strip()) - pi_forward = [int(item.strip()) for item in output[3:3+size_forward]] - pi_backward = [int(item.strip()) for item in output[3+size_forward:]] - -# print(dis) -# print(runtime) -# print(size_forward) -# print(pi_forward) -# print(pi_backward) - - - # make the map label correct (label remove map as np.inf) - nodes1 = [n for n in g1.nodes()] - nodes2 = [n for n in g2.nodes()] - nb1 = nx.number_of_nodes(g1) - nb2 = nx.number_of_nodes(g2) - pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] - pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] -# print(pi_forward) - - - return dis, pi_forward, pi_backward - - -def convertGraph(G, cost): - """Convert a graph to the proper NetworkX format that can be - recognized by library gedlibpy. - """ - G_new = nx.Graph() - if cost == 'LETTER' or cost == 'LETTER2': - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), x=str(attrs['attributes'][0]), - y=str(attrs['attributes'][1])) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2)) - elif cost == 'NON_SYMBOLIC': - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd)) - for a_name in G.graph['node_attrs']: - G_new.nodes[str(nd)][a_name] = str(attrs[a_name]) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2)) - for a_name in G.graph['edge_attrs']: - G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name]) - else: - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), chem=attrs['atom']) - for nd1, nd2, attrs in G.edges(data=True): - G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) -# G_new.add_edge(str(nd1), str(nd2)) - - return G_new - - -def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP', - edit_cost_constant=[], stabilizer='min', repeat=50): - """ - Compute GEDs for a group of graphs. - """ - if lib == 'gedlibpy': - def convertGraph(G): - """Convert a graph to the proper NetworkX format that can be - recognized by library gedlibpy. 
- """ - G_new = nx.Graph() - for nd, attrs in G.nodes(data=True): - G_new.add_node(str(nd), chem=attrs['atom']) - for nd1, nd2, attrs in G.edges(data=True): -# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type']) - G_new.add_edge(str(nd1), str(nd2)) - - return G_new - - gedlibpy.restart_env() - gedlibpy.add_nx_graph(convertGraph(g1), "") - gedlibpy.add_nx_graph(convertGraph(g2), "") - - listID = gedlibpy.get_all_graph_ids() - gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant) - gedlibpy.init() - gedlibpy.set_method(method, "") - gedlibpy.init_method() - - g = listID[0] - h = listID[1] - if stabilizer is None: - gedlibpy.run_method(g, h) - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - upper = gedlibpy.get_upper_bound(g, h) - lower = gedlibpy.get_lower_bound(g, h) - elif stabilizer == 'min': - upper = np.inf - for itr in range(repeat): - gedlibpy.run_method(g, h) - upper_tmp = gedlibpy.get_upper_bound(g, h) - if upper_tmp < upper: - upper = upper_tmp - pi_forward = gedlibpy.get_forward_map(g, h) - pi_backward = gedlibpy.get_backward_map(g, h) - lower = gedlibpy.get_lower_bound(g, h) - if upper == 0: - break - - dis = upper - - # make the map label correct (label remove map as np.inf) - nodes1 = [n for n in g1.nodes()] - nodes2 = [n for n in g2.nodes()] - nb1 = nx.number_of_nodes(g1) - nb2 = nx.number_of_nodes(g2) - pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] - pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] - - return dis, pi_forward, pi_backward - - -def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy', - 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], - 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1', - 'stabilizer': None}, parallel=False): - if parallel: - len_itr = int(len(Gn)) - pi_forward_list = [[] for i in range(len_itr)] - dis_list = [0 for i in range(len_itr)] - - itr = range(0, len_itr) - n_jobs = multiprocessing.cpu_count() - if len_itr < 100 * n_jobs: - chunksize = int(len_itr / n_jobs) + 1 - else: - chunksize = 100 - def init_worker(gn_toshare, gn_median_toshare): - global G_gn, G_gn_median - G_gn = gn_toshare - G_gn_median = gn_median_toshare - do_partial = partial(_compute_ged_median, params_ged) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median)) - if verbose: - iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), - desc='computing GEDs', file=sys.stdout) - else: - iterator = pool.imap_unordered(do_partial, itr, chunksize) - for i, dis_sum, pi_forward in iterator: - pi_forward_list[i] = pi_forward - dis_list[i] = dis_sum -# print('\n-------------------------------------------') -# print(i, j, idx_itr, dis) - pool.close() - pool.join() - - else: - dis_list = [] - pi_forward_list = [] - for idx, G in tqdm(enumerate(Gn), desc='computing median distances', - file=sys.stdout) if verbose else enumerate(Gn): - dis_sum = 0 - pi_forward_list.append([]) - for G_p in Gn_median: - dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p, - **params_ged) - pi_forward_list[idx].append(pi_tmp_forward) - dis_sum += dis_tmp - dis_list.append(dis_sum) - - return dis_list, pi_forward_list - - -def _compute_ged_median(params_ged, itr): -# print(itr) - dis_sum = 0 - pi_forward = [] - for G_p in G_gn_median: - dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p, - **params_ged) - pi_forward.append(pi_tmp_forward) - dis_sum += dis_tmp - - return 
itr, dis_sum, pi_forward - - -def get_nb_edit_operations(g1, g2, forward_map, backward_map): - """Compute the number of each edit operations. - """ - n_vi = 0 - n_vr = 0 - n_vs = 0 - n_ei = 0 - n_er = 0 - n_es = 0 - - nodes1 = [n for n in g1.nodes()] - for i, map_i in enumerate(forward_map): - if map_i == np.inf: - n_vr += 1 - elif g1.node[nodes1[i]]['atom'] != g2.node[map_i]['atom']: - n_vs += 1 - for map_i in backward_map: - if map_i == np.inf: - n_vi += 1 - -# idx_nodes1 = range(0, len(node1)) - - edges1 = [e for e in g1.edges()] - nb_edges2_cnted = 0 - for n1, n2 in edges1: - idx1 = nodes1.index(n1) - idx2 = nodes1.index(n2) - # one of the nodes is removed, thus the edge is removed. - if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: - n_er += 1 - # corresponding edge is in g2. - elif (forward_map[idx1], forward_map[idx2]) in g2.edges(): - nb_edges2_cnted += 1 - # edge labels are different. - if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \ - != g1.edges[(n1, n2)]['bond_type']: - n_es += 1 - elif (forward_map[idx2], forward_map[idx1]) in g2.edges(): - nb_edges2_cnted += 1 - # edge labels are different. - if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \ - != g1.edges[(n1, n2)]['bond_type']: - n_es += 1 - # corresponding nodes are in g2, however the edge is removed. - else: - n_er += 1 - n_ei = nx.number_of_edges(g2) - nb_edges2_cnted - - return n_vi, n_vr, n_vs, n_ei, n_er, n_es - - -def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map): - """Compute the number of each edit operations. - """ - n_vi = 0 - n_vr = 0 - n_vs = 0 - sod_vs = 0 - n_ei = 0 - n_er = 0 - - nodes1 = [n for n in g1.nodes()] - for i, map_i in enumerate(forward_map): - if map_i == np.inf: - n_vr += 1 - else: - n_vs += 1 - diff_x = float(g1.nodes[nodes1[i]]['x']) - float(g2.nodes[map_i]['x']) - diff_y = float(g1.nodes[nodes1[i]]['y']) - float(g2.nodes[map_i]['y']) - sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y)) - for map_i in backward_map: - if map_i == np.inf: - n_vi += 1 - -# idx_nodes1 = range(0, len(node1)) - - edges1 = [e for e in g1.edges()] - nb_edges2_cnted = 0 - for n1, n2 in edges1: - idx1 = nodes1.index(n1) - idx2 = nodes1.index(n2) - # one of the nodes is removed, thus the edge is removed. - if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf: - n_er += 1 - # corresponding edge is in g2. Edge label is not considered. - elif (forward_map[idx1], forward_map[idx2]) in g2.edges() or \ - (forward_map[idx2], forward_map[idx1]) in g2.edges(): - nb_edges2_cnted += 1 - # corresponding nodes are in g2, however the edge is removed. - else: - n_er += 1 - n_ei = nx.number_of_edges(g2) - nb_edges2_cnted - - return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er - - -def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map): - """Compute the number of each edit operations. 
- """ - n_vi = 0 - n_vr = 0 - n_vs = 0 - sod_vs = 0 - n_ei = 0 - n_er = 0 - n_es = 0 - sod_es = 0 - - nodes1 = [n for n in g1.nodes()] - for i, map_i in enumerate(forward_map): - if map_i == np.inf: - n_vr += 1 - else: - n_vs += 1 - sum_squares = 0 - for a_name in g1.graph['node_attrs']: - diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name]) - sum_squares += np.square(diff) - sod_vs += np.sqrt(sum_squares) - for map_i in backward_map: - if map_i == np.inf: - n_vi += 1 - -# idx_nodes1 = range(0, len(node1)) - - edges1 = [e for e in g1.edges()] - for n1, n2 in edges1: - idx1 = nodes1.index(n1) - idx2 = nodes1.index(n2) - n1_g2 = forward_map[idx1] - n2_g2 = forward_map[idx2] - # one of the nodes is removed, thus the edge is removed. - if n1_g2 == np.inf or n2_g2 == np.inf: - n_er += 1 - # corresponding edge is in g2. - elif (n1_g2, n2_g2) in g2.edges(): - n_es += 1 - sum_squares = 0 - for a_name in g1.graph['edge_attrs']: - diff = float(g1.edges[n1, n2][a_name]) - float(g2.nodes[n1_g2, n2_g2][a_name]) - sum_squares += np.square(diff) - sod_es += np.sqrt(sum_squares) - elif (n2_g2, n1_g2) in g2.edges(): - n_es += 1 - sum_squares = 0 - for a_name in g1.graph['edge_attrs']: - diff = float(g1.edges[n2, n1][a_name]) - float(g2.nodes[n2_g2, n1_g2][a_name]) - sum_squares += np.square(diff) - sod_es += np.sqrt(sum_squares) - # corresponding nodes are in g2, however the edge is removed. - else: - n_er += 1 - n_ei = nx.number_of_edges(g2) - n_es - - return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es - - -if __name__ == '__main__': - print('check test_ged.py') \ No newline at end of file diff --git a/gklearn/preimage/iam.py b/gklearn/preimage/iam.py deleted file mode 100644 index f3e2165..0000000 --- a/gklearn/preimage/iam.py +++ /dev/null @@ -1,775 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Apr 26 11:49:12 2019 - -Iterative alternate minimizations using GED. -@author: ljia -""" -import numpy as np -import random -import networkx as nx -from tqdm import tqdm - -from gklearn.utils.graphdataset import get_dataset_attributes -from gklearn.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels -from gklearn.preimage.ged import GED, ged_median - - -def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, - epsilon=0.001, node_label='atom', edge_label='bond_type', - connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, - allBestEdges=False, allBestOutput=False, - params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', - 'edit_cost_constant': [], 'stabilizer': None, - 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}): - """See my name, then you know what I do. - """ -# Gn_median = Gn_median[0:10] -# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median] - node_ir = np.inf # corresponding to the node remove and insertion. - label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable. - ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate, - attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'], - edge_label=edge_label) - node_label_set = get_node_labels(Gn_median, node_label) - edge_label_set = get_edge_labels(Gn_median, edge_label) - - - def generate_graph(G, pi_p_forward): - G_new_list = [G.copy()] # all "best" graphs generated in this iteration. -# nx.draw_networkx(G) -# import matplotlib.pyplot as plt -# plt.show() -# print(pi_p_forward) - - # update vertex labels. 
- # pre-compute h_i0 for each label. -# for label in get_node_labels(Gn, node_label): -# print(label) -# for nd in G.nodes(data=True): -# pass - if not ds_attrs['node_attr_dim']: # labels are symbolic - for ndi, (nd, _) in enumerate(G.nodes(data=True)): - h_i0_list = [] - label_list = [] - for label in node_label_set: - h_i0 = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][ndi] - if pi_i != node_ir and g.nodes[pi_i][node_label] == label: - h_i0 += 1 - h_i0_list.append(h_i0) - label_list.append(label) - # case when the node is to be removed. - if removeNodes: - h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above. - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][ndi] - if pi_i == node_ir: - h_i0_remove += 1 - h_i0_list.append(h_i0_remove) - label_list.append(label_r) - # get the best labels. - idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() - if allBestNodes: # choose all best graphs. - nlabel_best = [label_list[idx] for idx in idx_max] - # generate "best" graphs with regard to "best" node labels. - G_new_list_nd = [] - for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. - for nl in nlabel_best: - g_tmp = g.copy() - if nl == label_r: - g_tmp.remove_node(nd) - else: - g_tmp.nodes[nd][node_label] = nl - G_new_list_nd.append(g_tmp) - # nx.draw_networkx(g_tmp) - # import matplotlib.pyplot as plt - # plt.show() - # print(g_tmp.nodes(data=True)) - # print(g_tmp.edges(data=True)) - G_new_list = [ggg.copy() for ggg in G_new_list_nd] - else: - # choose one of the best randomly. - idx_rdm = random.randint(0, len(idx_max) - 1) - best_label = label_list[idx_max[idx_rdm]] - h_i0_max = h_i0_list[idx_max[idx_rdm]] - - g_new = G_new_list[0] - if best_label == label_r: - g_new.remove_node(nd) - else: - g_new.nodes[nd][node_label] = best_label - G_new_list = [g_new] - else: # labels are non-symbolic - for ndi, (nd, _) in enumerate(G.nodes(data=True)): - Si_norm = 0 - phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][ndi] - if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? - Si_norm += 1 - phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) - phi_i_bar /= Si_norm - G_new_list[0].nodes[nd]['attributes'] = phi_i_bar - -# for g in G_new_list: -# import matplotlib.pyplot as plt -# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - # update edge labels and adjacency matrix. - if ds_attrs['edge_labeled']: - G_new_list_edge = [] - for g_new in G_new_list: - nd_list = [n for n in g_new.nodes()] - g_tmp_list = [g_new.copy()] - for nd1i in range(nx.number_of_nodes(g_new)): - nd1 = nd_list[nd1i]# @todo: not just edges, but all pairs of nodes - for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)): - nd2 = nd_list[nd2i] -# for nd1, nd2, _ in g_new.edges(data=True): - h_ij0_list = [] - label_list = [] - for label in edge_label_set: - h_ij0 = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][nd1i] - pi_j = pi_p_forward[idx][nd2i] - h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and - g.has_edge(pi_i, pi_j) and - g.edges[pi_i, pi_j][edge_label] == label) - h_ij0 += h_ij0_p - h_ij0_list.append(h_ij0) - label_list.append(label) - - # get the best labels. 
- idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() - if allBestEdges: # choose all best graphs. - elabel_best = [label_list[idx] for idx in idx_max] - h_ij0_max = [h_ij0_list[idx] for idx in idx_max] - # generate "best" graphs with regard to "best" node labels. - G_new_list_ed = [] - for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. - for idxl, el in enumerate(elabel_best): - g_tmp_copy = g_tmp.copy() - # check whether a_ij is 0 or 1. - sij_norm = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][nd1i] - pi_j = pi_p_forward[idx][nd2i] - if g.has_node(pi_i) and g.has_node(pi_j) and \ - g.has_edge(pi_i, pi_j): - sij_norm += 1 - if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \ - sij_norm * (1 - (c_er + c_ei) / c_es): - if not g_tmp_copy.has_edge(nd1, nd2): - g_tmp_copy.add_edge(nd1, nd2) - g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl] - else: - if g_tmp_copy.has_edge(nd1, nd2): - g_tmp_copy.remove_edge(nd1, nd2) - G_new_list_ed.append(g_tmp_copy) - g_tmp_list = [ggg.copy() for ggg in G_new_list_ed] - else: # choose one of the best randomly. - idx_rdm = random.randint(0, len(idx_max) - 1) - best_label = label_list[idx_max[idx_rdm]] - h_ij0_max = h_ij0_list[idx_max[idx_rdm]] - - # check whether a_ij is 0 or 1. - sij_norm = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][nd1i] - pi_j = pi_p_forward[idx][nd2i] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): - if not g_new.has_edge(nd1, nd2): - g_new.add_edge(nd1, nd2) - g_new.edges[nd1, nd2][edge_label] = best_label - else: -# elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): - if g_new.has_edge(nd1, nd2): - g_new.remove_edge(nd1, nd2) - g_tmp_list = [g_new] - G_new_list_edge += g_tmp_list - G_new_list = [ggg.copy() for ggg in G_new_list_edge] - - - else: # if edges are unlabeled - # @todo: is this even right? G or g_tmp? check if the new one is right - # @todo: works only for undirected graphs. - - for g_tmp in G_new_list: - nd_list = [n for n in g_tmp.nodes()] - for nd1i in range(nx.number_of_nodes(g_tmp)): - nd1 = nd_list[nd1i] - for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)): - nd2 = nd_list[nd2i] - sij_norm = 0 - for idx, g in enumerate(Gn_median): - pi_i = pi_p_forward[idx][nd1i] - pi_j = pi_p_forward[idx][nd2i] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if sij_norm > len(Gn_median) * c_er / (c_er + c_ei): - # @todo: should we consider if nd1 and nd2 in g_tmp? - # or just add the edge anyway? - if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \ - and not g_tmp.has_edge(nd1, nd2): - g_tmp.add_edge(nd1, nd2) - else: # @todo: which to use? -# elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): - if g_tmp.has_edge(nd1, nd2): - g_tmp.remove_edge(nd1, nd2) - # do not change anything when equal. - -# for i, g in enumerate(G_new_list): -# import matplotlib.pyplot as plt -# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) -## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - -# # find the best graph generated in this iteration and update pi_p. - # @todo: should we update all graphs generated or just the best ones? 
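- # ged_median() below recomputes GEDs between each candidate in G_new_list and all
- # graphs in Gn_median; the resulting distances and forward node maps are what the
- # next iteration starts from.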
- dis_list, pi_forward_list = ged_median(G_new_list, Gn_median, - params_ged=params_ged) - # @todo: should we remove the identical and connectivity check? - # Don't know which is faster. - if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: - G_new_list, idx_list = remove_duplicates(G_new_list) - pi_forward_list = [pi_forward_list[idx] for idx in idx_list] - dis_list = [dis_list[idx] for idx in idx_list] -# if connected == True: -# G_new_list, idx_list = remove_disconnected(G_new_list) -# pi_forward_list = [pi_forward_list[idx] for idx in idx_list] -# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() -# dis_min = dis_list[idx_min_tmp_list[0]] -# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list] -# G_new_list = [G_new_list[idx] for idx in idx_min_list] - -# for g in G_new_list: -# import matplotlib.pyplot as plt -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - return G_new_list, pi_forward_list, dis_list - - - def best_median_graphs(Gn_candidate, pi_all_forward, dis_all): - idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist() - dis_min = dis_all[idx_min_list[0]] - pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list] - G_min_list = [Gn_candidate[idx] for idx in idx_min_list] - return G_min_list, pi_forward_min_list, dis_min - - - def iteration_proc(G, pi_p_forward, cur_sod): - G_list = [G] - pi_forward_list = [pi_p_forward] - old_sod = cur_sod * 2 - sod_list = [cur_sod] - dis_list = [cur_sod] - # iterations. - itr = 0 - # @todo: what if difference == 0? -# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or -# np.abs(old_sod - cur_sod) == 0): - while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon: -# while itr < ite_max: -# for itr in range(0, 5): # the convergence condition? - print('itr_iam is', itr) - G_new_list = [] - pi_forward_new_list = [] - dis_new_list = [] - for idx, g in enumerate(G_list): -# label_set = get_node_labels(Gn_median + [g], node_label) - G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph( - g, pi_forward_list[idx]) - G_new_list += G_tmp_list - pi_forward_new_list += pi_forward_tmp_list - dis_new_list += dis_tmp_list - # @todo: need to remove duplicates here? - G_list = [ggg.copy() for ggg in G_new_list] - pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list] - dis_list = dis_new_list[:] - - old_sod = cur_sod - cur_sod = np.min(dis_list) - sod_list.append(cur_sod) - - itr += 1 - - # @todo: do we return all graphs or the best ones? - # get the best ones of the generated graphs. - G_list, pi_forward_list, dis_min = best_median_graphs( - G_list, pi_forward_list, dis_list) - - if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: - G_list, idx_list = remove_duplicates(G_list) - pi_forward_list = [pi_forward_list[idx] for idx in idx_list] -# dis_list = [dis_list[idx] for idx in idx_list] - -# import matplotlib.pyplot as plt -# for g in G_list: -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - print('\nsods:', sod_list, '\n') - - return G_list, pi_forward_list, dis_min, sod_list - - - def remove_duplicates(Gn): - """Remove duplicate graphs from list. 
- """ - Gn_new = [] - idx_list = [] - for idx, g in enumerate(Gn): - dupl = False - for g_new in Gn_new: - if graph_isIdentical(g_new, g): - dupl = True - break - if not dupl: - Gn_new.append(g) - idx_list.append(idx) - return Gn_new, idx_list - - - def remove_disconnected(Gn): - """Remove disconnected graphs from list. - """ - Gn_new = [] - idx_list = [] - for idx, g in enumerate(Gn): - if nx.is_connected(g): - Gn_new.append(g) - idx_list.append(idx) - return Gn_new, idx_list - - - ########################################################################### - - # phase 1: initilize. - # compute set-median. - dis_min = np.inf - dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median, - params_ged=params_ged, parallel=True) - print('finish computing GEDs.') - # find all smallest distances. - if allBestInit: # try all best init graphs. - idx_min_list = range(len(dis_list)) - dis_min = dis_list - else: - idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() - dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list) - idx_min_rdm = random.randint(0, len(idx_min_list) - 1) - idx_min_list = [idx_min_list[idx_min_rdm]] - sod_set_median = np.min(dis_min) - - - # phase 2: iteration. - G_list = [] - dis_list = [] - pi_forward_list = [] - G_set_median_list = [] -# sod_list = [] - for idx_tmp, idx_min in enumerate(idx_min_list): -# print('idx_min is', idx_min) - G = Gn_candidate[idx_min].copy() - G_set_median_list.append(G.copy()) - # list of edit operations. - pi_p_forward = pi_forward_all[idx_min] -# pi_p_backward = pi_all_backward[idx_min] - Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G, - pi_p_forward, dis_min[idx_tmp]) - G_list += Gi_list - dis_list += [dis_i_min] * len(Gi_list) - pi_forward_list += pi_i_forward_list - - - if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: - G_list, idx_list = remove_duplicates(G_list) - dis_list = [dis_list[idx] for idx in idx_list] - pi_forward_list = [pi_forward_list[idx] for idx in idx_list] - if connected == True: - G_list_con, idx_list = remove_disconnected(G_list) - # if there is no connected graphs at all, then remain the disconnected ones. - if len(G_list_con) > 0: # @todo: ?????????????????????????? - G_list = G_list_con - dis_list = [dis_list[idx] for idx in idx_list] - pi_forward_list = [pi_forward_list[idx] for idx in idx_list] - -# import matplotlib.pyplot as plt -# for g in G_list: -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - # get the best median graphs - G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs( - G_list, pi_forward_list, dis_list) -# for g in G_gen_median_list: -# nx.draw_networkx(g) -# plt.show() -# print(g.nodes(data=True)) -# print(g.edges(data=True)) - - if not allBestOutput: - # randomly choose one graph. - idx_rdm = random.randint(0, len(G_gen_median_list) - 1) - G_gen_median_list = [G_gen_median_list[idx_rdm]] - - return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median - - -def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT', initial_solutions=1, - dataset='monoterpenoides', - graph_dir=''): - """Compute the iam by c++ implementation (gedlib) through bash. - """ - import os - import time - - def createCollectionFile(Gn_names, y, filename): - """Create collection file. 
- """ - dirname_ds = os.path.dirname(filename) - if dirname_ds != '': - dirname_ds += '/' - if not os.path.exists(dirname_ds) : - os.makedirs(dirname_ds) - - with open(filename + '.xml', 'w') as fgroup: - fgroup.write("") - fgroup.write("\n") - fgroup.write("\n") - for idx, fname in enumerate(Gn_names): - fgroup.write("\n\t") - fgroup.write("\n") - fgroup.close() - - tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/' - fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, 1e9)) - createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection) -# fn_collection = tmp_dir + 'collection_for_debug' -# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/gxl' - -# if dataset == 'Letter-high' or dataset == 'Fingerprint': -# dataset = 'letter' - command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n' - command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n' - command += 'export LD_LIBRARY_PATH\n' - command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n' - command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \ - + ' \'' + graph_dir + '\' ' + ' ' + cost + ' ' + str(initial_solutions) + ' ' - if edit_cost_constant is None: - command += 'None' - else: - for ec in edit_cost_constant: - command += str(ec) + ' ' -# output = os.system(command) - stream = os.popen(command) - - output = stream.readlines() -# print(output) - sod_sm = float(output[0].strip()) - sod_gm = float(output[1].strip()) - - fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl' - fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl' - - return sod_sm, sod_gm, fname_sm, fname_gm - - - -############################################################################### -# Old implementations. - -def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type', - connected=True): - """See my name, then you know what I do. - """ -# Gn = Gn[0:10] - Gn = [nx.convert_node_labels_to_integers(g) for g in Gn] - - # phase 1: initilize. - # compute set-median. - dis_min = np.inf - pi_p = [] - pi_all = [] - for idx1, G_p in enumerate(Gn): - dist_sum = 0 - pi_all.append([]) - for idx2, G_p_prime in enumerate(Gn): - dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime) - pi_all[idx1].append(pi_tmp) - dist_sum += dist_tmp - if dist_sum < dis_min: - dis_min = dist_sum - G = G_p.copy() - idx_min = idx1 - # list of edit operations. - pi_p = pi_all[idx_min] - - # phase 2: iteration. - ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'], - edge_label=edge_label) - for itr in range(0, 10): # @todo: the convergence condition? - G_new = G.copy() - # update vertex labels. - # pre-compute h_i0 for each label. -# for label in get_node_labels(Gn, node_label): -# print(label) -# for nd in G.nodes(data=True): -# pass - if not ds_attrs['node_attr_dim']: # labels are symbolic - for nd, _ in G.nodes(data=True): - h_i0_list = [] - label_list = [] - for label in get_node_labels(Gn, node_label): - h_i0 = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd] - if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: - h_i0 += 1 - h_i0_list.append(h_i0) - label_list.append(label) - # choose one of the best randomly. 
- idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() - idx_rdm = random.randint(0, len(idx_max) - 1) - G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]] - else: # labels are non-symbolic - for nd, _ in G.nodes(data=True): - Si_norm = 0 - phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd] - if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? - Si_norm += 1 - phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) - phi_i_bar /= Si_norm - G_new.nodes[nd]['attributes'] = phi_i_bar - - # update edge labels and adjacency matrix. - if ds_attrs['edge_labeled']: - for nd1, nd2, _ in G.edges(data=True): - h_ij0_list = [] - label_list = [] - for label in get_edge_labels(Gn, edge_label): - h_ij0 = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd1] - pi_j = pi_p[idx][nd2] - h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and - g.has_edge(pi_i, pi_j) and - g.edges[pi_i, pi_j][edge_label] == label) - h_ij0 += h_ij0_p - h_ij0_list.append(h_ij0) - label_list.append(label) - # choose one of the best randomly. - idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() - h_ij0_max = h_ij0_list[idx_max[0]] - idx_rdm = random.randint(0, len(idx_max) - 1) - best_label = label_list[idx_max[idx_rdm]] - - # check whether a_ij is 0 or 1. - sij_norm = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd1] - pi_j = pi_p[idx][nd2] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): - if not G_new.has_edge(nd1, nd2): - G_new.add_edge(nd1, nd2) - G_new.edges[nd1, nd2][edge_label] = best_label - else: - if G_new.has_edge(nd1, nd2): - G_new.remove_edge(nd1, nd2) - else: # if edges are unlabeled - for nd1, nd2, _ in G.edges(data=True): - sij_norm = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p[idx][nd1] - pi_j = pi_p[idx][nd2] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if sij_norm > len(Gn) * c_er / (c_er + c_ei): - if not G_new.has_edge(nd1, nd2): - G_new.add_edge(nd1, nd2) - else: - if G_new.has_edge(nd1, nd2): - G_new.remove_edge(nd1, nd2) - - G = G_new.copy() - - # update pi_p - pi_p = [] - for idx1, G_p in enumerate(Gn): - dist_tmp, pi_tmp, _ = GED(G, G_p) - pi_p.append(pi_tmp) - - return G - -# --------------------------- These are tests --------------------------------# - -def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1, - node_label='atom', edge_label='bond_type'): - """See my name, then you know what I do. - """ -# Gn = Gn[0:10] - Gn = [nx.convert_node_labels_to_integers(g) for g in Gn] - - # phase 1: initilize. - # compute set-median. - dis_min = np.inf -# pi_p = [] - pi_all_forward = [] - pi_all_backward = [] - for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout): - dist_sum = 0 - pi_all_forward.append([]) - pi_all_backward.append([]) - for idx2, G_p_prime in enumerate(Gn): - dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime) - pi_all_forward[idx1].append(pi_tmp_forward) - pi_all_backward[idx1].append(pi_tmp_backward) - dist_sum += dist_tmp - if dist_sum <= dis_min: - dis_min = dist_sum - G = G_p.copy() - idx_min = idx1 - # list of edit operations. - pi_p_forward = pi_all_forward[idx_min] - pi_p_backward = pi_all_backward[idx_min] - - # phase 2: iteration. 
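- # The fixed 10-iteration loop below is a placeholder for a real convergence test;
- # iam_upgraded() above iterates until the change in SOD falls below epsilon or
- # ite_max is reached.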
- ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'], - edge_label=edge_label) - label_set = get_node_labels(Gn + [G], node_label) - for itr in range(0, 10): # @todo: the convergence condition? - G_new = G.copy() - # update vertex labels. - # pre-compute h_i0 for each label. -# for label in get_node_labels(Gn, node_label): -# print(label) -# for nd in G.nodes(data=True): -# pass - if not ds_attrs['node_attr_dim']: # labels are symbolic - for nd in G.nodes(): - h_i0_list = [] - label_list = [] - for label in label_set: - h_i0 = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd] - if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: - h_i0 += 1 - h_i0_list.append(h_i0) - label_list.append(label) - # choose one of the best randomly. - idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() - idx_rdm = random.randint(0, len(idx_max) - 1) - G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]] - else: # labels are non-symbolic - for nd in G.nodes(): - Si_norm = 0 - phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd] - if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? - Si_norm += 1 - phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) - phi_i_bar /= Si_norm - G_new.nodes[nd]['attributes'] = phi_i_bar - - # update edge labels and adjacency matrix. - if ds_attrs['edge_labeled']: - for nd1, nd2, _ in G.edges(data=True): - h_ij0_list = [] - label_list = [] - for label in get_edge_labels(Gn, edge_label): - h_ij0 = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd1] - pi_j = pi_p_forward[idx][nd2] - h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and - g.has_edge(pi_i, pi_j) and - g.edges[pi_i, pi_j][edge_label] == label) - h_ij0 += h_ij0_p - h_ij0_list.append(h_ij0) - label_list.append(label) - # choose one of the best randomly. - idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() - h_ij0_max = h_ij0_list[idx_max[0]] - idx_rdm = random.randint(0, len(idx_max) - 1) - best_label = label_list[idx_max[idx_rdm]] - - # check whether a_ij is 0 or 1. - sij_norm = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd1] - pi_j = pi_p_forward[idx][nd2] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): - if not G_new.has_edge(nd1, nd2): - G_new.add_edge(nd1, nd2) - G_new.edges[nd1, nd2][edge_label] = best_label - else: - if G_new.has_edge(nd1, nd2): - G_new.remove_edge(nd1, nd2) - else: # if edges are unlabeled - # @todo: works only for undirected graphs. - for nd1 in range(nx.number_of_nodes(G)): - for nd2 in range(nd1 + 1, nx.number_of_nodes(G)): - sij_norm = 0 - for idx, g in enumerate(Gn): - pi_i = pi_p_forward[idx][nd1] - pi_j = pi_p_forward[idx][nd2] - if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): - sij_norm += 1 - if sij_norm > len(Gn) * c_er / (c_er + c_ei): - if not G_new.has_edge(nd1, nd2): - G_new.add_edge(nd1, nd2) - elif sij_norm < len(Gn) * c_er / (c_er + c_ei): - if G_new.has_edge(nd1, nd2): - G_new.remove_edge(nd1, nd2) - # do not change anything when equal. 
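- # The test above keeps the edge exactly when sij_norm > len(Gn) * c_er / (c_er + c_ei),
- # equivalently when sij_norm * c_ei > (len(Gn) - sij_norm) * c_er: omitting the edge
- # would cost c_ei in every graph that has it, while keeping it costs c_er in every
- # graph that lacks it, and the cheaper option wins (ties leave the median unchanged).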
- - G = G_new.copy() - - # update pi_p - pi_p_forward = [] - for G_p in Gn: - dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p) - pi_p_forward.append(pi_tmp_forward) - - return G - - -############################################################################### - -if __name__ == '__main__': - from gklearn.utils.graphfiles import loadDataset - ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', - 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb -# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', -# 'extra_params': {}} # node nsymb -# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds', -# 'extra_params': {}} - Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) - - iam(Gn) \ No newline at end of file diff --git a/gklearn/preimage/knn.py b/gklearn/preimage/knn.py deleted file mode 100644 index c179287..0000000 --- a/gklearn/preimage/knn.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Jan 10 13:22:04 2020 - -@author: ljia -""" -import numpy as np -#import matplotlib.pyplot as plt -from tqdm import tqdm -import random -#import csv -from shutil import copyfile -import os - -from gklearn.preimage.iam import iam_bash -from gklearn.utils.graphfiles import loadDataset, loadGXL -from gklearn.preimage.ged import GED -from gklearn.preimage.utils import get_same_item_indices - -def test_knn(): - ds = {'name': 'monoterpenoides', - 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb - Gn, y_all = loadDataset(ds['dataset']) -# Gn = Gn[0:50] -# gkernel = 'treeletkernel' -# node_label = 'atom' -# edge_label = 'bond_type' -# ds_name = 'mono' - dir_output = 'results/knn/' - graph_dir = os.path.dirname(os.path.realpath(__file__)) + '../../datasets/monoterpenoides/' - - k_nn = 1 - percent = 0.1 - repeats = 50 - edit_cost_constant = [3, 3, 1, 3, 3, 1] - - # get indices by classes. - y_idx = get_same_item_indices(y_all) - sod_sm_list_list - for repeat in range(0, repeats): - print('\n---------------------------------') - print('repeat =', repeat) - accuracy_sm_list = [] - accuracy_gm_list = [] - sod_sm_list = [] - sod_gm_list = [] - - random.seed(repeat) - set_median_list = [] - gen_median_list = [] - train_y_set = [] - for y, values in y_idx.items(): - print('\ny =', y) - size_median_set = int(len(values) * percent) - median_set_idx = random.sample(values, size_median_set) - print('median set: ', median_set_idx) - - # compute set median and gen median using IAM (C++ through bash). - # Gn_median = [Gn[idx] for idx in median_set_idx] - group_fnames = [Gn[g].graph['filename'] for g in median_set_idx] - sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant, - graph_dir=graph_dir) - print('sod_sm, sod_gm:', sod_sm, sod_gm) - sod_sm_list.append(sod_sm) - sod_gm_list.append(sod_gm) - fname_sm_new = dir_output + 'medians/set_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl' - copyfile(fname_sm, fname_sm_new) - fname_gm_new = dir_output + 'medians/gen_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl' - copyfile(fname_gm, fname_gm_new) - set_median_list.append(loadGXL(fname_sm_new)) - gen_median_list.append(loadGXL(fname_gm_new)) - train_y_set.append(int(y)) - - print(sod_sm, sod_gm) - - # do 1-nn. 
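- # knn() labels every graph in Gn with the class of its GED-nearest graph among the
- # medians computed above.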
- test_y_set = [int(y) for y in y_all] - accuracy_sm = knn(set_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged') - accuracy_gm = knn(set_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged') - accuracy_sm_list.append(accuracy_sm) - accuracy_gm_list.append(accuracy_gm) - print('current accuracy sm and gm:', accuracy_sm, accuracy_gm) - - # output - accuracy_sm_mean = np.mean(accuracy_sm_list) - accuracy_gm_mean = np.mean(accuracy_gm_list) - print('\ntotal average accuracy sm and gm:', accuracy_sm_mean, accuracy_gm_mean) - - -def knn(train_set, train_y_set, test_set, test_y_set, k=1, distance='ged'): - if k == 1 and distance == 'ged': - algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' - params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP', - 'algo_options': algo_options, 'stabilizer': None} - accuracy = 0 - for idx_test, g_test in tqdm(enumerate(test_set), desc='computing 1-nn', - file=sys.stdout): - dis = np.inf - for idx_train, g_train in enumerate(train_set): - dis_cur, _, _ = GED(g_test, g_train, **params_ged) - if dis_cur < dis: - dis = dis_cur - test_y_cur = train_y_set[idx_train] - if test_y_cur == test_y_set[idx_test]: - accuracy += 1 - accuracy = accuracy / len(test_set) - - return accuracy - - - -if __name__ == '__main__': - test_knn() \ No newline at end of file diff --git a/gklearn/preimage/libs.py b/gklearn/preimage/libs.py deleted file mode 100644 index 76005c6..0000000 --- a/gklearn/preimage/libs.py +++ /dev/null @@ -1,6 +0,0 @@ -import sys -import pathlib - -# insert gedlibpy library. -sys.path.insert(0, "../../../") -from gedlibpy import librariesImport, gedlibpy diff --git a/gklearn/preimage/median.py b/gklearn/preimage/median.py deleted file mode 100644 index 1c5bb0f..0000000 --- a/gklearn/preimage/median.py +++ /dev/null @@ -1,218 +0,0 @@ -import sys -sys.path.insert(0, "../") -#import pathlib -import numpy as np -import networkx as nx -import time - -from gedlibpy import librariesImport, gedlibpy -#import script -sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/") -import gklearn -from gklearn.utils.graphfiles import loadDataset - -def replace_graph_in_env(script, graph, old_id, label='median'): - """ - Replace a graph in script - - If old_id is -1, add a new graph to the environnemt - - """ - if(old_id > -1): - script.PyClearGraph(old_id) - new_id = script.PyAddGraph(label) - for i in graph.nodes(): - script.PyAddNode(new_id,str(i),graph.node[i]) # !! 
strings are required bt gedlib - for e in graph.edges: - script.PyAddEdge(new_id, str(e[0]),str(e[1]), {}) - script.PyInitEnv() - script.PySetMethod("IPFP", "") - script.PyInitMethod() - - return new_id - -#Dessin median courrant -def draw_Letter_graph(graph, savepath=''): - import numpy as np - import networkx as nx - import matplotlib.pyplot as plt - plt.figure() - pos = {} - for n in graph.nodes: - pos[n] = np.array([float(graph.node[n]['attributes'][0]), - float(graph.node[n]['attributes'][1])]) - nx.draw_networkx(graph, pos) - if savepath != '': - plt.savefig(savepath + str(time.time()) + '.eps', format='eps', dpi=300) - plt.show() - plt.clf() - -#compute new mappings -def update_mappings(script,median_id,listID): - med_distances = {} - med_mappings = {} - sod = 0 - for i in range(0,len(listID)): - script.PyRunMethod(median_id,listID[i]) - med_distances[i] = script.PyGetUpperBound(median_id,listID[i]) - med_mappings[i] = script.PyGetForwardMap(median_id,listID[i]) - sod += med_distances[i] - return med_distances, med_mappings, sod - -def calcul_Sij(all_mappings, all_graphs,i,j): - s_ij = 0 - for k in range(0,len(all_mappings)): - cur_graph = all_graphs[k] - cur_mapping = all_mappings[k] - size_graph = cur_graph.order() - if ((cur_mapping[i] < size_graph) and - (cur_mapping[j] < size_graph) and - (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)): - s_ij += 1 - - return s_ij - -# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings): -# from scipy.stats.mstats import gmean - -# for i in median.nodes(): -# for k in listIdSet: -# vectors = [] #np.zeros((len(listIdSet),2)) -# if(k != median_id): -# phi_i = mappings[k][i] -# if(phi_i < dataset[k].order()): -# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])]) - -# new_labels = gmean(vectors) -# median.node[i]['x'] = str(new_labels[0]) -# median.node[i]['y'] = str(new_labels[1]) -# return median - -def update_median_nodes(median,dataset,mappings): - #update node attributes - for i in median.nodes(): - nb_sub=0 - mean_label = {'x' : 0, 'y' : 0} - for k in range(0,len(mappings)): - phi_i = mappings[k][i] - if ( phi_i < dataset[k].order() ): - nb_sub += 1 - mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x']) - mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y']) - median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub)) - median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub)) - return median - -def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425): -#for letter high, ceir = 1.7, alpha = 0.75 - size_dataset = len(dataset) - ratio_cei_cer = cer/(cei + cer) - threshold = size_dataset*ratio_cei_cer - order_graph_median = median.order() - for i in range(0,order_graph_median): - for j in range(i+1,order_graph_median): - s_ij = calcul_Sij(mappings,dataset,i,j) - if(s_ij > threshold): - median.add_edge(i,j) - else: - if(median.has_edge(i,j)): - median.remove_edge(i,j) - return median - - - -def compute_median(script, listID, dataset,verbose=False): - """Compute a graph median of a dataset according to an environment - - Parameters - - script : An gedlib initialized environnement - listID (list): a list of ID in script: encodes the dataset - dataset (list): corresponding graphs in networkX format. 
We assume that graph - listID[i] corresponds to dataset[i] - - Returns: - A networkX graph, which is the median, with corresponding sod - """ - print(len(listID)) - median_set_index, median_set_sod = compute_median_set(script, listID) - print(median_set_index) - print(median_set_sod) - sods = [] - #Ajout median dans environnement - set_median = dataset[median_set_index].copy() - median = dataset[median_set_index].copy() - cur_med_id = replace_graph_in_env(script,median,-1) - med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) - sods.append(cur_sod) - if(verbose): - print(cur_sod) - ite_max = 50 - old_sod = cur_sod * 2 - ite = 0 - epsilon = 0.001 - - best_median - while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )): - median = update_median_nodes(median,dataset, med_mappings) - median = update_median_edges(dataset,med_mappings,median) - - cur_med_id = replace_graph_in_env(script,median,cur_med_id) - med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) - - - sods.append(cur_sod) - if(verbose): - print(cur_sod) - ite += 1 - return median, cur_sod, sods, set_median - - draw_Letter_graph(median) - - -def compute_median_set(script,listID): - 'Returns the id in listID corresponding to median set' - #Calcul median set - N=len(listID) - map_id_to_index = {} - map_index_to_id = {} - for i in range(0,len(listID)): - map_id_to_index[listID[i]] = i - map_index_to_id[i] = listID[i] - - distances = np.zeros((N,N)) - for i in listID: - for j in listID: - script.PyRunMethod(i,j) - distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j) - - median_set_index = np.argmin(np.sum(distances,0)) - sod = np.min(np.sum(distances,0)) - - return median_set_index, sod - -if __name__ == "__main__": - #Chargement du dataset - script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') - script.PySetEditCost("LETTER") - script.PyInitEnv() - script.PySetMethod("IPFP", "") - script.PyInitMethod() - - dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") - - listID = script.PyGetAllGraphIds() - median, sod = compute_median(script,listID,dataset,verbose=True) - - print(sod) - draw_Letter_graph(median) - - -#if __name__ == '__main__': -# # test draw_Letter_graph -# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', -# 'extra_params': {}} # node nsymb -# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params']) -# print(y_all) -# for g in Gn: -# draw_Letter_graph(g) \ No newline at end of file diff --git a/gklearn/preimage/median_benoit.py b/gklearn/preimage/median_benoit.py deleted file mode 100644 index 6712196..0000000 --- a/gklearn/preimage/median_benoit.py +++ /dev/null @@ -1,201 +0,0 @@ -import sys -import pathlib -import numpy as np -import networkx as nx - -import librariesImport -import script -sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/") -import gklearn - -def replace_graph_in_env(script, graph, old_id, label='median'): - """ - Replace a graph in script - - If old_id is -1, add a new graph to the environnemt - - """ - if(old_id > -1): - script.PyClearGraph(old_id) - new_id = script.PyAddGraph(label) - for i in graph.nodes(): - script.PyAddNode(new_id,str(i),graph.node[i]) # !! 
strings are required bt gedlib - for e in graph.edges: - script.PyAddEdge(new_id, str(e[0]),str(e[1]), {}) - script.PyInitEnv() - script.PySetMethod("IPFP", "") - script.PyInitMethod() - - return new_id - -#Dessin median courrant -def draw_Letter_graph(graph): - import numpy as np - import networkx as nx - import matplotlib.pyplot as plt - plt.figure() - pos = {} - for n in graph.nodes: - pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])]) - nx.draw_networkx(graph,pos) - plt.show() - -#compute new mappings -def update_mappings(script,median_id,listID): - med_distances = {} - med_mappings = {} - sod = 0 - for i in range(0,len(listID)): - script.PyRunMethod(median_id,listID[i]) - med_distances[i] = script.PyGetUpperBound(median_id,listID[i]) - med_mappings[i] = script.PyGetForwardMap(median_id,listID[i]) - sod += med_distances[i] - return med_distances, med_mappings, sod - -def calcul_Sij(all_mappings, all_graphs,i,j): - s_ij = 0 - for k in range(0,len(all_mappings)): - cur_graph = all_graphs[k] - cur_mapping = all_mappings[k] - size_graph = cur_graph.order() - if ((cur_mapping[i] < size_graph) and - (cur_mapping[j] < size_graph) and - (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)): - s_ij += 1 - - return s_ij - -# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings): -# from scipy.stats.mstats import gmean - -# for i in median.nodes(): -# for k in listIdSet: -# vectors = [] #np.zeros((len(listIdSet),2)) -# if(k != median_id): -# phi_i = mappings[k][i] -# if(phi_i < dataset[k].order()): -# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])]) - -# new_labels = gmean(vectors) -# median.node[i]['x'] = str(new_labels[0]) -# median.node[i]['y'] = str(new_labels[1]) -# return median - -def update_median_nodes(median,dataset,mappings): - #update node attributes - for i in median.nodes(): - nb_sub=0 - mean_label = {'x' : 0, 'y' : 0} - for k in range(0,len(mappings)): - phi_i = mappings[k][i] - if ( phi_i < dataset[k].order() ): - nb_sub += 1 - mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x']) - mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y']) - median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub)) - median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub)) - return median - -def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425): -#for letter high, ceir = 1.7, alpha = 0.75 - size_dataset = len(dataset) - ratio_cei_cer = cer/(cei + cer) - threshold = size_dataset*ratio_cei_cer - order_graph_median = median.order() - for i in range(0,order_graph_median): - for j in range(i+1,order_graph_median): - s_ij = calcul_Sij(mappings,dataset,i,j) - if(s_ij > threshold): - median.add_edge(i,j) - else: - if(median.has_edge(i,j)): - median.remove_edge(i,j) - return median - - - -def compute_median(script, listID, dataset,verbose=False): - """Compute a graph median of a dataset according to an environment - - Parameters - - script : An gedlib initialized environnement - listID (list): a list of ID in script: encodes the dataset - dataset (list): corresponding graphs in networkX format. 
We assume that graph - listID[i] corresponds to dataset[i] - - Returns: - A networkX graph, which is the median, with corresponding sod - """ - print(len(listID)) - median_set_index, median_set_sod = compute_median_set(script, listID) - print(median_set_index) - print(median_set_sod) - sods = [] - #Ajout median dans environnement - set_median = dataset[median_set_index].copy() - median = dataset[median_set_index].copy() - cur_med_id = replace_graph_in_env(script,median,-1) - med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) - sods.append(cur_sod) - if(verbose): - print(cur_sod) - ite_max = 50 - old_sod = cur_sod * 2 - ite = 0 - epsilon = 0.001 - - best_median - while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )): - median = update_median_nodes(median,dataset, med_mappings) - median = update_median_edges(dataset,med_mappings,median) - - cur_med_id = replace_graph_in_env(script,median,cur_med_id) - med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID) - - - sods.append(cur_sod) - if(verbose): - print(cur_sod) - ite += 1 - return median, cur_sod, sods, set_median - - draw_Letter_graph(median) - - -def compute_median_set(script,listID): - 'Returns the id in listID corresponding to median set' - #Calcul median set - N=len(listID) - map_id_to_index = {} - map_index_to_id = {} - for i in range(0,len(listID)): - map_id_to_index[listID[i]] = i - map_index_to_id[i] = listID[i] - - distances = np.zeros((N,N)) - for i in listID: - for j in listID: - script.PyRunMethod(i,j) - distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j) - - median_set_index = np.argmin(np.sum(distances,0)) - sod = np.min(np.sum(distances,0)) - - return median_set_index, sod - -if __name__ == "__main__": - #Chargement du dataset - script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml') - script.PySetEditCost("LETTER") - script.PyInitEnv() - script.PySetMethod("IPFP", "") - script.PyInitMethod() - - dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl") - - listID = script.PyGetAllGraphIds() - median, sod = compute_median(script,listID,dataset,verbose=True) - - print(sod) - draw_Letter_graph(median) diff --git a/gklearn/preimage/median_graph_estimator.py b/gklearn/preimage/median_graph_estimator.py deleted file mode 100644 index b70cc61..0000000 --- a/gklearn/preimage/median_graph_estimator.py +++ /dev/null @@ -1,826 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Mon Mar 16 18:04:55 2020 - -@author: ljia -""" -import numpy as np -from gklearn.preimage.common_types import AlgorithmState -from gklearn.preimage import misc -from gklearn.preimage.timer import Timer -from gklearn.utils.utils import graph_isIdentical -import time -from tqdm import tqdm -import sys -import networkx as nx - - -class MedianGraphEstimator(object): - - def __init__(self, ged_env, constant_node_costs): - """Constructor. - - Parameters - ---------- - ged_env : gklearn.gedlib.gedlibpy.GEDEnv - Initialized GED environment. The edit costs must be set by the user. - - constant_node_costs : Boolean - Set to True if the node relabeling costs are constant. 
- """ - self.__ged_env = ged_env - self.__init_method = 'BRANCH_FAST' - self.__init_options = '' - self.__descent_method = 'BRANCH_FAST' - self.__descent_options = '' - self.__refine_method = 'IPFP' - self.__refine_options = '' - self.__constant_node_costs = constant_node_costs - self.__labeled_nodes = (ged_env.get_num_node_labels() > 1) - self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1)) - self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1)) - self.__labeled_edges = (ged_env.get_num_edge_labels() > 1) - self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1)) - self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1)) - self.__init_type = 'RANDOM' - self.__num_random_inits = 10 - self.__desired_num_random_inits = 10 - self.__use_real_randomness = True - self.__seed = 0 - self.__refine = True - self.__time_limit_in_sec = 0 - self.__epsilon = 0.0001 - self.__max_itrs = 100 - self.__max_itrs_without_update = 3 - self.__num_inits_increase_order = 10 - self.__init_type_increase_order = 'K-MEANS++' - self.__max_itrs_increase_order = 10 - self.__print_to_stdout = 2 - self.__median_id = np.inf # @todo: check - self.__median_node_id_prefix = '' # @todo: check - self.__node_maps_from_median = {} - self.__sum_of_distances = 0 - self.__best_init_sum_of_distances = np.inf - self.__converged_sum_of_distances = np.inf - self.__runtime = None - self.__runtime_initialized = None - self.__runtime_converged = None - self.__itrs = [] # @todo: check: {} ? - self.__num_decrease_order = 0 - self.__num_increase_order = 0 - self.__num_converged_descents = 0 - self.__state = AlgorithmState.TERMINATED - - if ged_env is None: - raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') - elif not ged_env.is_initialized(): - raise Exception('The GED environment is uninitialized. Call gedlibpy.GEDEnv.init() before passing it to the constructor of MedianGraphEstimator.') - - - def set_options(self, options): - """Sets the options of the estimator. - - Parameters - ---------- - options : string - String that specifies with which options to run the estimator. - """ - self.__set_default_options() - options_map = misc.options_string_to_options_map(options) - for opt_name, opt_val in options_map.items(): - if opt_name == 'init-type': - self.__init_type = opt_val - if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN': - raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"') - elif opt_name == 'random-inits': - try: - self.__num_random_inits = int(opt_val) - self.__desired_num_random_inits = self.__num_random_inits - except: - raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') - - if self.__num_random_inits <= 0: - raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') - - elif opt_name == 'randomness': - if opt_val == 'PSEUDO': - self.__use_real_randomness = False - - elif opt_val == 'REAL': - self.__use_real_randomness = True - - else: - raise Exception('Invalid argument "' + opt_val + '" for option randomness. 
Usage: options = "[--randomness REAL|PSEUDO] [...]"') - - elif opt_name == 'stdout': - if opt_val == '0': - self.__print_to_stdout = 0 - - elif opt_val == '1': - self.__print_to_stdout = 1 - - elif opt_val == '2': - self.__print_to_stdout = 2 - - else: - raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') - - elif opt_name == 'refine': - if opt_val == 'TRUE': - self.__refine = True - - elif opt_val == 'FALSE': - self.__refine = False - - else: - raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"') - - elif opt_name == 'time-limit': - try: - self.__time_limit_in_sec = float(opt_val) - - except: - raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit ] [...]') - - elif opt_name == 'max-itrs': - try: - self.__max_itrs = int(opt_val) - - except: - raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs ] [...]') - - elif opt_name == 'max-itrs-without-update': - try: - self.__max_itrs_without_update = int(opt_val) - - except: - raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update ] [...]') - - elif opt_name == 'seed': - try: - self.__seed = int(opt_val) - - except: - raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed ] [...]') - - elif opt_name == 'epsilon': - try: - self.__epsilon = float(opt_val) - - except: - raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') - - if self.__epsilon <= 0: - raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') - - elif opt_name == 'inits-increase-order': - try: - self.__num_inits_increase_order = int(opt_val) - - except: - raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') - - if self.__num_inits_increase_order <= 0: - raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') - - elif opt_name == 'init-type-increase-order': - self.__init_type_increase_order = opt_val - if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': - raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') - - elif opt_name == 'max-itrs-increase-order': - try: - self.__max_itrs_increase_order = int(opt_val) - - except: - raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order ] [...]') - - else: - valid_options = '[--init-type ] [--random-inits ] [--randomness ] [--seed ] [--stdout ] ' - valid_options += '[--time-limit ] [--max-itrs ] [--epsilon ] ' - valid_options += '[--inits-increase-order ] [--init-type-increase-order ] [--max-itrs-increase-order ]' - raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"') - - - def set_init_method(self, init_method, init_options=''): - """Selects method to be used for computing the initial medoid graph. - - Parameters - ---------- - init_method : string - The selected method. Default: ged::Options::GEDMethod::BRANCH_UNIFORM. - - init_options : string - The options for the selected method. Default: "". 
- - Notes - ----- - Has no effect unless "--init-type MEDOID" is passed to set_options(). - """ - self.__init_method = init_method; - self.__init_options = init_options; - - - def set_descent_method(self, descent_method, descent_options=''): - """Selects method to be used for block gradient descent.. - - Parameters - ---------- - descent_method : string - The selected method. Default: ged::Options::GEDMethod::BRANCH_FAST. - - descent_options : string - The options for the selected method. Default: "". - - Notes - ----- - Has no effect unless "--init-type MEDOID" is passed to set_options(). - """ - self.__descent_method = descent_method; - self.__descent_options = descent_options; - - - def set_refine_method(self, refine_method, refine_options): - """Selects method to be used for improving the sum of distances and the node maps for the converged median. - - Parameters - ---------- - refine_method : string - The selected method. Default: "IPFP". - - refine_options : string - The options for the selected method. Default: "". - - Notes - ----- - Has no effect if "--refine FALSE" is passed to set_options(). - """ - self.__refine_method = refine_method - self.__refine_options = refine_options - - - def run(self, graph_ids, set_median_id, gen_median_id): - """Computes a generalized median graph. - - Parameters - ---------- - graph_ids : list[integer] - The IDs of the graphs for which the median should be computed. Must have been added to the environment passed to the constructor. - - set_median_id : integer - The ID of the computed set-median. A dummy graph with this ID must have been added to the environment passed to the constructor. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). - - - gen_median_id : integer - The ID of the computed generalized median. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). - """ - # Sanity checks. - if len(graph_ids) == 0: - raise Exception('Empty vector of graph IDs, unable to compute median.') - all_graphs_empty = True - for graph_id in graph_ids: - if self.__ged_env.get_graph_num_nodes(graph_id) > 0: - self.__median_node_id_prefix = self.__ged_env.get_original_node_ids(graph_id)[0] - all_graphs_empty = False - break - if all_graphs_empty: - raise Exception('All graphs in the collection are empty.') - - # Start timer and record start time. - start = time.time() - timer = Timer(self.__time_limit_in_sec) - self.__median_id = gen_median_id - self.__state = AlgorithmState.TERMINATED - - # Get ExchangeGraph representations of the input graphs. - graphs = {} - for graph_id in graph_ids: - # @todo: get_nx_graph() function may need to be modified according to the coming code. - graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id, True, True, False) -# print(self.__ged_env.get_graph_internal_id(0)) -# print(graphs[0].graph) -# print(graphs[0].nodes(data=True)) -# print(graphs[0].edges(data=True)) -# print(nx.adjacency_matrix(graphs[0])) - - - # Construct initial medians. - medians = [] - self.__construct_initial_medians(graph_ids, timer, medians) - end_init = time.time() - self.__runtime_initialized = end_init - start -# print(medians[0].graph) -# print(medians[0].nodes(data=True)) -# print(medians[0].edges(data=True)) -# print(nx.adjacency_matrix(medians[0])) - - # Reset information about iterations and number of times the median decreases and increases. 
- self.__itrs = [0] * len(medians) - self.__num_decrease_order = 0 - self.__num_increase_order = 0 - self.__num_converged_descents = 0 - - # Initialize the best median. - best_sum_of_distances = np.inf - self.__best_init_sum_of_distances = np.inf - node_maps_from_best_median = {} - - # Run block gradient descent from all initial medians. - self.__ged_env.set_method(self.__descent_method, self.__descent_options) - for median_pos in range(0, len(medians)): - - # Terminate if the timer has expired and at least one SOD has been computed. - if timer.expired() and median_pos > 0: - break - - # Print information about current iteration. - if self.__print_to_stdout == 2: - print('\n===========================================================') - print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') - print('-----------------------------------------------------------') - - # Get reference to the median. - median = medians[median_pos] - - # Load initial median into the environment. - self.__ged_env.load_nx_graph(median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) - - # Print information about current iteration. - if self.__print_to_stdout == 2: - progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout) - - # Compute node maps and sum of distances for initial median. - self.__sum_of_distances = 0 - self.__node_maps_from_median.clear() # @todo - for graph_id in graph_ids: - self.__ged_env.run_method(gen_median_id, graph_id) - self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) -# print(self.__node_maps_from_median[graph_id]) - self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: the C++ implementation for this function in GedLibBind.ipp re-call get_node_map() once more, this is not neccessary. -# print(self.__sum_of_distances) - # Print information about current iteration. - if self.__print_to_stdout == 2: - progress.update(1) - - self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) - self.__ged_env.load_nx_graph(median, set_median_id) -# print(self.__best_init_sum_of_distances) - - # Print information about current iteration. - if self.__print_to_stdout == 2: - print('\n') - - # Run block gradient descent from initial median. - converged = False - itrs_without_update = 0 - while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update): - - # Print information about current iteration. - if self.__print_to_stdout == 2: - print('\n===========================================================') - print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') - print('-----------------------------------------------------------') - - # Initialize flags that tell us what happened in the iteration. - median_modified = False - node_maps_modified = False - decreased_order = False - increased_order = False - - # Update the median. # @todo!!!!!!!!!!!!!!!!!!!!!! - median_modified = self.__update_median(graphs, median) - if not median_modified or self.__itrs[median_pos] == 0: - decreased_order = False - if not decreased_order or self.__itrs[median_pos] == 0: - increased_order = False - - # Update the number of iterations without update of the median. 
- if median_modified or decreased_order or increased_order: - itrs_without_update = 0 - else: - itrs_without_update += 1 - - # Print information about current iteration. - if self.__print_to_stdout == 2: - print('Loading median to environment: ... ', end='') - - # Load the median into the environment. - # @todo: should this function use the original node label? - self.__ged_env.load_nx_graph(median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) - - # Print information about current iteration. - if self.__print_to_stdout == 2: - print('done.') - - # Print information about current iteration. - if self.__print_to_stdout == 2: - print('Updating induced costs: ... ', end='') - - # Compute induced costs of the old node maps w.r.t. the updated median. - for graph_id in graph_ids: -# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id)) - # @todo: watch out if compute_induced_cost is correct, this may influence: increase/decrease order, induced_cost() in the following code.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - self.__ged_env.compute_induced_cost(gen_median_id, graph_id) -# print('---------------------------------------') -# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id)) - - # Print information about current iteration. - if self.__print_to_stdout == 2: - print('done.') - - # Update the node maps. - node_maps_modified = self.__update_node_maps() # @todo - - # Update the order of the median if no improvement can be found with the current order. - - # Update the sum of distances. - old_sum_of_distances = self.__sum_of_distances - self.__sum_of_distances = 0 - for graph_id in self.__node_maps_from_median: - self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: see above. - - # Print information about current iteration. - if self.__print_to_stdout == 2: - print('Old local SOD: ', old_sum_of_distances) - print('New local SOD: ', self.__sum_of_distances) - print('Best converged SOD: ', best_sum_of_distances) - print('Modified median: ', median_modified) - print('Modified node maps: ', node_maps_modified) - print('Decreased order: ', decreased_order) - print('Increased order: ', increased_order) - print('===========================================================\n') - - converged = not (median_modified or node_maps_modified or decreased_order or increased_order) - - self.__itrs[median_pos] += 1 - - # Update the best median. - if self.__sum_of_distances < self.__best_init_sum_of_distances: - best_sum_of_distances = self.__sum_of_distances - node_maps_from_best_median = self.__node_maps_from_median - best_median = median - - # Update the number of converged descents. - if converged: - self.__num_converged_descents += 1 - - # Store the best encountered median. - self.__sum_of_distances = best_sum_of_distances - self.__node_maps_from_median = node_maps_from_best_median - self.__ged_env.load_nx_graph(best_median, gen_median_id) - self.__ged_env.init(self.__ged_env.get_init_type()) - end_descent = time.time() - self.__runtime_converged = end_descent - start - - # Refine the sum of distances and the node maps for the converged median. - self.__converged_sum_of_distances = self.__sum_of_distances - if self.__refine: - self.__improve_sum_of_distances(timer) # @todo - - # Record end time, set runtime and reset the number of initial medians. - end = time.time() - self.__runtime = end - start - self.__num_random_inits = self.__desired_num_random_inits - - # Print global information. 
- if self.__print_to_stdout != 0: - print('\n===========================================================') - print('Finished computation of generalized median graph.') - print('-----------------------------------------------------------') - print('Best SOD after initialization: ', self.__best_init_sum_of_distances) - print('Converged SOD: ', self.__converged_sum_of_distances) - if self.__refine: - print('Refined SOD: ', self.__sum_of_distances) - print('Overall runtime: ', self.__runtime) - print('Runtime of initialization: ', self.__runtime_initialized) - print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized) - if self.__refine: - print('Runtime of refinement: ', self.__runtime - self.__runtime_converged) - print('Number of initial medians: ', len(medians)) - total_itr = 0 - num_started_descents = 0 - for itr in self.__itrs: - total_itr += itr - if itr > 0: - num_started_descents += 1 - print('Size of graph collection: ', len(graph_ids)) - print('Number of started descents: ', num_started_descents) - print('Number of converged descents: ', self.__num_converged_descents) - print('Overall number of iterations: ', total_itr) - print('Overall number of times the order decreased: ', self.__num_decrease_order) - print('Overall number of times the order increased: ', self.__num_increase_order) - print('===========================================================\n') - - - def get_sum_of_distances(self, state=''): - """Returns the sum of distances. - - Parameters - ---------- - state : string - The state of the estimator. Can be 'initialized' or 'converged'. Default: "" - - Returns - ------- - float - The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned. - """ - if not self.__median_available(): - raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().') - if state == 'initialized': - return self.__best_init_sum_of_distances - if state == 'converged': - return self.__converged_sum_of_distances - return self.__sum_of_distances - - - def __set_default_options(self): - self.__init_type = 'RANDOM' - self.__num_random_inits = 10 - self.__desired_num_random_inits = 10 - self.__use_real_randomness = True - self.__seed = 0 - self.__refine = True - self.__time_limit_in_sec = 0 - self.__epsilon = 0.0001 - self.__max_itrs = 100 - self.__max_itrs_without_update = 3 - self.__num_inits_increase_order = 10 - self.__init_type_increase_order = 'K-MEANS++' - self.__max_itrs_increase_order = 10 - self.__print_to_stdout = 2 - - - def __construct_initial_medians(self, graph_ids, timer, initial_medians): - # Print information about current iteration. - if self.__print_to_stdout == 2: - print('\n===========================================================') - print('Constructing initial median(s).') - print('-----------------------------------------------------------') - - # Compute or sample the initial median(s). 
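- # Of the initialization strategies dispatched below, only MEDOID is implemented;
- # the MAX, MIN, MEAN and random-sampling branches are still placeholders.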
-
-
-    def __set_default_options(self):
-        self.__init_type = 'RANDOM'
-        self.__num_random_inits = 10
-        self.__desired_num_random_inits = 10
-        self.__use_real_randomness = True
-        self.__seed = 0
-        self.__refine = True
-        self.__time_limit_in_sec = 0
-        self.__epsilon = 0.0001
-        self.__max_itrs = 100
-        self.__max_itrs_without_update = 3
-        self.__num_inits_increase_order = 10
-        self.__init_type_increase_order = 'K-MEANS++'
-        self.__max_itrs_increase_order = 10
-        self.__print_to_stdout = 2
-
-
-    def __construct_initial_medians(self, graph_ids, timer, initial_medians):
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            print('\n===========================================================')
-            print('Constructing initial median(s).')
-            print('-----------------------------------------------------------')
-
-        # Compute or sample the initial median(s).
-        initial_medians.clear()
-        if self.__init_type == 'MEDOID':
-            self.__compute_medoid(graph_ids, timer, initial_medians)
-        elif self.__init_type == 'MAX':
-            pass # @todo
-# compute_max_order_graph_(graph_ids, initial_medians)
-        elif self.__init_type == 'MIN':
-            pass # @todo
-# compute_min_order_graph_(graph_ids, initial_medians)
-        elif self.__init_type == 'MEAN':
-            pass # @todo
-# compute_mean_order_graph_(graph_ids, initial_medians)
-        else:
-            pass # @todo
-# sample_initial_medians_(graph_ids, initial_medians)
-
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            print('===========================================================')
-
-
-    def __compute_medoid(self, graph_ids, timer, initial_medians):
-        # Use method selected for initialization phase.
-        self.__ged_env.set_method(self.__init_method, self.__init_options)
-
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout)
-
-        # Compute the medoid.
-        medoid_id = graph_ids[0]
-        best_sum_of_distances = np.inf
-        for g_id in graph_ids:
-            if timer.expired():
-                self.__state = AlgorithmState.CALLED
-                break
-            sum_of_distances = 0
-            for h_id in graph_ids:
-                self.__ged_env.run_method(g_id, h_id)
-                sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id)
-            if sum_of_distances < best_sum_of_distances:
-                best_sum_of_distances = sum_of_distances
-                medoid_id = g_id
-
-            # Print information about current iteration.
-            if self.__print_to_stdout == 2:
-                progress.update(1)
-        initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo
-
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            print('\n')
-
-
-    def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
-        if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
-            if self.__state == AlgorithmState.TERMINATED:
-                self.__state = AlgorithmState.INITIALIZED
-            return True
-        return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
-
-
-    def __update_median(self, graphs, median):
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            print('Updating median: ', end='')
-
-        # Store copy of the old median.
-        old_median = median.copy() # @todo: this is just a shallow copy.
-
-        # Update the node labels.
-        if self.__labeled_nodes:
-            self.__update_node_labels(graphs, median)
-
-        # Update the edges and their labels.
-        self.__update_edges(graphs, median)
-
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            print('done.')
-
-        return not self.__are_graphs_equal(median, old_median)
-
-
-    def __update_node_labels(self, graphs, median):
-
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            print('nodes ... ', end='')
-
-        # Iterate through all nodes of the median.
-        for i in range(0, nx.number_of_nodes(median)):
-# print('i: ', i)
-            # Collect the labels of the substituted nodes.
-            node_labels = []
-            for graph_id, graph in graphs.items():
-# print('graph_id: ', graph_id)
-# print(self.__node_maps_from_median[graph_id])
-                k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i)
-# print('k: ', k)
-                if k != np.inf:
-                    node_labels.append(graph.nodes[k])
-
-            # Compute the median label and update the median.
-            if len(node_labels) > 0:
-                median_label = self.__ged_env.get_median_node_label(node_labels)
-                if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon:
-                    nx.set_node_attributes(median, {i: median_label})
-
-
-    def __update_edges(self, graphs, median):
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            print('edges ... ', end='')
-
-        # Clear the adjacency lists of the median and reset number of edges to 0.
-        median_edges = list(median.edges)
-        for (head, tail) in median_edges:
-            median.remove_edge(head, tail)
-
-        # @todo: what if edge is not labeled?
-        # Iterate through all possible edges (i,j) of the median.
-        for i in range(0, nx.number_of_nodes(median)):
-            for j in range(i + 1, nx.number_of_nodes(median)):
-
-                # Collect the labels of the edges to which (i,j) is mapped by the node maps.
-                edge_labels = []
-                for graph_id, graph in graphs.items():
-                    k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i)
-                    l = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], j)
-                    if k != np.inf and l != np.inf:
-                        if graph.has_edge(k, l):
-                            edge_labels.append(graph.edges[(k, l)])
-
-                # Compute the median edge label and the overall edge relabeling cost.
-                rel_cost = 0
-                median_label = self.__ged_env.get_edge_label(1)
-                if median.has_edge(i, j):
-                    median_label = median.edges[(i, j)]
-                if self.__labeled_edges and len(edge_labels) > 0:
-                    new_median_label = self.__ged_env.median_edge_label(edge_labels)
-                    if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon:
-                        median_label = new_median_label
-                    for edge_label in edge_labels:
-                        rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label)
-
-                # Update the median.
-                if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs):
-                    median.add_edge(i, j, **median_label)
-                else:
-                    if median.has_edge(i, j):
-                        median.remove_edge(i, j)
-
-
-    def __update_node_maps(self):
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
-
-        # Update the node maps.
-        node_maps_were_modified = False
-        for graph_id in self.__node_maps_from_median:
-            self.__ged_env.run_method(self.__median_id, graph_id)
-            if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < self.__ged_env.get_induced_cost(self.__median_id, graph_id) - self.__epsilon: # @todo: see above.
-                self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) # @todo: node_map may not be assigned.
-                node_maps_were_modified = True
-            # Print information about current iteration.
-            if self.__print_to_stdout == 2:
-                progress.update(1)
-
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            print('\n')
-
-        # Return true if the node maps were modified.
-        return node_maps_were_modified
-
-
-    def __improve_sum_of_distances(self, timer):
-        pass
-
-
-    def __median_available(self):
-        return self.__median_id != np.inf
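# A hedged numeric illustration (not part of the deleted file) of the decision rule in
# __update_edges above: the edge (i, j) is kept iff keeping it (relabeling it in the graphs
# whose node maps place it onto an existing edge, deleting it in the others) is cheaper than
# dropping it (inserting it in every graph where it is mapped onto an edge), which reduces to
#     rel_cost < (c_ins + c_del) * len(edge_labels) - c_del * len(graphs).
# With assumed costs c_ins = c_del = 1, six graphs of which four map (i, j) onto an edge,
# and a summed relabeling cost of 1.5:
#     1.5 < (1 + 1) * 4 - 1 * 6 = 2   ->  keep the edge in the median.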
-
-
-    def __get_node_image_from_map(self, node_map, node):
-        """
-        Return the ID of the image of `node` under the node map `node_map`.
-
-        Parameters
-        ----------
-        node_map : list[tuple(int, int)]
-            The node map, given as a list of (source node, image node) pairs.
-
-        node : int
-            The source node whose image is returned.
-
-        Raises
-        ------
-        Exception
-            If the node with ID `node` is not contained in the source nodes of the node map.
-
-        Returns
-        -------
-        int
-            ID of the image of `node`.
-
-        Notes
-        -----
-        This function is not implemented in the `ged::MedianGraphEstimator` class of the `GEDLIB` library. Instead it is a Python implementation of the `ged::NodeMap::image` function.
-        """
-        if node < len(node_map):
-            return node_map[node][1] if node_map[node][1] < len(node_map) else np.inf
-        else:
-            raise Exception('The node with ID ', str(node), ' is not contained in the source nodes of the node map.')
-        return np.inf
-
-
-    def __are_graphs_equal(self, g1, g2):
-        """
-        Check if the two graphs are equal.
-
-        Parameters
-        ----------
-        g1 : NetworkX graph object
-            Graph 1 to be compared.
-
-        g2 : NetworkX graph object
-            Graph 2 to be compared.
-
-        Returns
-        -------
-        bool
-            True if the two graphs are equal.
-
-        Notes
-        -----
-        This is not an identical check. Here the two graphs are equal if and only if their original_node_ids, nodes, all node labels, edges and all edge labels are equal. This function is specifically designed for class `MedianGraphEstimator` and should not be used elsewhere.
-        """
-        # check original node ids.
-        if not g1.graph['original_node_ids'] == g2.graph['original_node_ids']:
-            return False
-        # check nodes.
-        nlist1 = [n for n in g1.nodes(data=True)]
-        nlist2 = [n for n in g2.nodes(data=True)]
-        if not nlist1 == nlist2:
-            return False
-        # check edges.
-        elist1 = [n for n in g1.edges(data=True)]
-        elist2 = [n for n in g2.edges(data=True)]
-        if not elist1 == elist2:
-            return False
-
-        return True
-
-
-    def compute_my_cost(g, h, node_map):
-        cost = 0.0
-        for node in g.nodes:
-            cost += 0
-
\ No newline at end of file
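The medoid-style initialization above (and compute_median_set in the file that follows) both reduce to the same idea: pick the graph whose summed distance to all other graphs is smallest. Below is a minimal, self-contained sketch of that idea, using a placeholder distance in place of the gedlib GED upper bound; the function and variable names are illustrative and do not come from the deleted files.

import networkx as nx
import numpy as np

def select_medoid(graphs, dist):
    """Return (index, sum_of_distances) of the graph minimizing the summed distance to all graphs."""
    n = len(graphs)
    distances = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            distances[i, j] = dist(graphs[i], graphs[j])
    sums = distances.sum(axis=0)
    idx = int(np.argmin(sums))
    return idx, float(sums[idx])

if __name__ == '__main__':
    toy = [nx.path_graph(k) for k in (2, 3, 4)]
    # Placeholder distance: absolute difference in node counts (stands in for a GED upper bound).
    idx, sod = select_medoid(toy, lambda g, h: abs(g.number_of_nodes() - h.number_of_nodes()))
    print(idx, sod)  # 1 2.0 -> the 3-node path graph is the medoid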
diff --git a/gklearn/preimage/median_linlin.py b/gklearn/preimage/median_linlin.py
deleted file mode 100644
index 6139558..0000000
--- a/gklearn/preimage/median_linlin.py
+++ /dev/null
@@ -1,215 +0,0 @@
-import sys
-import pathlib
-import numpy as np
-import networkx as nx
-
-from gedlibpy import librariesImport, gedlibpy
-sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
-import gklearn
-
-def replace_graph_in_env(script, graph, old_id, label='median'):
-    """
-    Replace a graph in script
-
-    If old_id is -1, add a new graph to the environment
-
-    """
-    if(old_id > -1):
-        script.PyClearGraph(old_id)
-    new_id = script.PyAddGraph(label)
-    for i in graph.nodes():
-        script.PyAddNode(new_id,str(i),graph.node[i]) # !! strings are required by gedlib
-    for e in graph.edges:
-        script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
-    script.PyInitEnv()
-    script.PySetMethod("IPFP", "")
-    script.PyInitMethod()
-
-    return new_id
-
-# Draw the current median
-def draw_Letter_graph(graph):
-    import numpy as np
-    import networkx as nx
-    import matplotlib.pyplot as plt
-    plt.figure()
-    pos = {}
-    for n in graph.nodes:
-        pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
-    nx.draw_networkx(graph,pos)
-    plt.show()
-
-# Compute new mappings
-def update_mappings(script,median_id,listID):
-    med_distances = {}
-    med_mappings = {}
-    sod = 0
-    for i in range(0,len(listID)):
-        script.PyRunMethod(median_id,listID[i])
-        med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
-        med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
-        sod += med_distances[i]
-    return med_distances, med_mappings, sod
-
-def calcul_Sij(all_mappings, all_graphs,i,j):
-    s_ij = 0
-    for k in range(0,len(all_mappings)):
-        cur_graph = all_graphs[k]
-        cur_mapping = all_mappings[k]
-        size_graph = cur_graph.order()
-        if ((cur_mapping[i] < size_graph) and
-            (cur_mapping[j] < size_graph) and
-            (cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
-            s_ij += 1
-
-    return s_ij
-
-# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
-#     from scipy.stats.mstats import gmean
-
-#     for i in median.nodes():
-#         for k in listIdSet:
-#             vectors = [] #np.zeros((len(listIdSet),2))
-#             if(k != median_id):
-#                 phi_i = mappings[k][i]
-#                 if(phi_i < dataset[k].order()):
-#                     vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
-
-#     new_labels = gmean(vectors)
-#     median.node[i]['x'] = str(new_labels[0])
-#     median.node[i]['y'] = str(new_labels[1])
-#     return median
-
-def update_median_nodes(median,dataset,mappings):
-    # Update node attributes
-    for i in median.nodes():
-        nb_sub=0
-        mean_label = {'x' : 0, 'y' : 0}
-        for k in range(0,len(mappings)):
-            phi_i = mappings[k][i]
-            if ( phi_i < dataset[k].order() ):
-                nb_sub += 1
-                mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
-                mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
-        median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
-        median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
-    return median
-
-def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
-    # for Letter HIGH, ceir = 1.7, alpha = 0.75
-    size_dataset = len(dataset)
-    ratio_cei_cer = cer/(cei + cer)
-    threshold = size_dataset*ratio_cei_cer
-    order_graph_median = median.order()
-    for i in range(0,order_graph_median):
-        for j in range(i+1,order_graph_median):
-            s_ij = calcul_Sij(mappings,dataset,i,j)
-            if(s_ij > threshold):
-                median.add_edge(i,j)
-            else:
-                if(median.has_edge(i,j)):
-                    median.remove_edge(i,j)
-    return median
-
-
-
-def compute_median(script, listID, dataset,verbose=False):
-    """Compute a graph median of a dataset according to an environment
-
-    Parameters
-
-    script : a gedlib-initialized environment
-    listID (list): a list of ID in script: encodes the dataset
-    dataset (list): corresponding graphs in networkX format. We assume that graph
-    listID[i] corresponds to dataset[i]
-
-    Returns:
-    A networkX graph, which is the median, with corresponding sod
-    """
-    print(len(listID))
-    median_set_index, median_set_sod = compute_median_set(script, listID)
-    print(median_set_index)
-    print(median_set_sod)
-    sods = []
-    # Add the median to the environment
-    set_median = dataset[median_set_index].copy()
-    median = dataset[median_set_index].copy()
-    cur_med_id = replace_graph_in_env(script,median,-1)
-    med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
-    sods.append(cur_sod)
-    if(verbose):
-        print(cur_sod)
-    ite_max = 50
-    old_sod = cur_sod * 2
-    ite = 0
-    epsilon = 0.001
-
-    # best_median  # note: bare, never-assigned name in the original; commented out to avoid a NameError
-    while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )):
-        median = update_median_nodes(median,dataset, med_mappings)
-        median = update_median_edges(dataset,med_mappings,median)
-
-        cur_med_id = replace_graph_in_env(script,median,cur_med_id)
-        med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
-
-
-        sods.append(cur_sod)
-        if(verbose):
-            print(cur_sod)
-        ite += 1
-    return median, cur_sod, sods, set_median
-
-    draw_Letter_graph(median)  # unreachable: placed after the return above
-
-
-def compute_median_set(script,listID):
-    'Return the index (in listID order) of the set median and its sum of distances.'
-    # Compute the median set
-    N=len(listID)
-    map_id_to_index = {}
-    map_index_to_id = {}
-    for i in range(0,len(listID)):
-        map_id_to_index[listID[i]] = i
-        map_index_to_id[i] = listID[i]
-
-    distances = np.zeros((N,N))
-    for i in listID:
-        for j in listID:
-            script.PyRunMethod(i,j)
-            distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
-
-    median_set_index = np.argmin(np.sum(distances,0))
-    sod = np.min(np.sum(distances,0))
-
-    return median_set_index, sod
-
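# A hedged numeric illustration (not part of the deleted file) of the node update in
# update_median_nodes above, where alpha = 0.75 is folded into the sum and then divided
# back out. With mapped x-values [1.0, 2.0, 3.0]:
#     mean_label['x'] = 0.75 * (1.0 + 2.0 + 3.0) = 4.5,  nb_sub = 3
#     new x = (1 / 0.75) * (4.5 / 3) = 2.0
# so the update reduces to the plain arithmetic mean of the mapped coordinates.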
-def _convertGraph(G):
-    """Convert a graph to the proper NetworkX format that can be
-    recognized by library gedlibpy.
-
-    """
-    G_new = nx.Graph()
-    for nd, attrs in G.nodes(data=True):
-        G_new.add_node(str(nd), chem=attrs['atom'])
-#        G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
-#                       y=str(attrs['attributes'][1]))
-    for nd1, nd2, attrs in G.edges(data=True):
-        G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
-#        G_new.add_edge(str(nd1), str(nd2))
-
-    return G_new
-
-if __name__ == "__main__":
-    # Load the dataset
-    gedlibpy.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
-    gedlibpy.PySetEditCost("LETTER")
-    gedlibpy.PyInitEnv()
-    gedlibpy.PySetMethod("IPFP", "")
-    gedlibpy.PyInitMethod()
-
-    dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
-
-    listID = gedlibpy.PyGetAllGraphIds()
-    median, sod, sods, set_median = compute_median(gedlibpy,listID,dataset,verbose=True)  # compute_median returns four values
-
-    print(sod)
-    draw_Letter_graph(median)
diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py
deleted file mode 100644
index dfbaef2..0000000
--- a/gklearn/preimage/median_preimage_generator.py
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Mar 26 18:27:22 2020
-
-@author: ljia
-"""
-from gklearn.preimage.preimage_generator import PreimageGenerator
-# from gklearn.utils.dataset import Dataset
-
-class MedianPreimageGenerator(PreimageGenerator):
-
-    def __init__(self, mge, dataset):
-        self.__mge = mge
-        self.__dataset = dataset
\ No newline at end of file
diff --git a/gklearn/preimage/misc.py b/gklearn/preimage/misc.py
deleted file mode 100644
index 18682c8..0000000
--- a/gklearn/preimage/misc.py
+++ /dev/null
@@ -1,108 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Mar 19 18:13:56 2020
-
-@author: ljia
-"""
-
-def options_string_to_options_map(options_string):
-    """Transforms an options string into an options map.
-
-    Parameters
-    ----------
-    options_string : string
-        Options string of the form "[--