
Revert "clear repo: remove useless files."

This reverts commit a8948a35b2.
v0.1
jajupmochi 5 years ago
commit b5dc9aa18f
66 changed files with 35036 additions and 0 deletions
  1. gklearn/kernels/.tags (+188, -0)
  2. gklearn/kernels/else/rwalk_sym.py (+842, -0)
  3. gklearn/kernels/else/sp_sym.py (+200, -0)
  4. gklearn/kernels/else/ssp_sym.py (+464, -0)
  5. gklearn/kernels/unfinished/cyclicPatternKernel.py (+147, -0)
  6. gklearn/kernels/unfinished/pathKernel.py (+234, -0)
  7. gklearn/kernels/unfinished/treePatternKernel.py (+241, -0)
  8. gklearn/kernels/unfinished/weisfeilerLehmanKernel.py (+403, -0)
  9. gklearn/preimage/common_types.py (+17, -0)
  10. gklearn/preimage/cpp2python.py (+134, -0)
  11. gklearn/preimage/find_best_k.py (+170, -0)
  12. gklearn/preimage/fitDistance.py (+430, -0)
  13. gklearn/preimage/ged.py (+467, -0)
  14. gklearn/preimage/iam.py (+775, -0)
  15. gklearn/preimage/knn.py (+114, -0)
  16. gklearn/preimage/libs.py (+6, -0)
  17. gklearn/preimage/median.py (+218, -0)
  18. gklearn/preimage/median_benoit.py (+201, -0)
  19. gklearn/preimage/median_graph_estimator.py (+826, -0)
  20. gklearn/preimage/median_linlin.py (+215, -0)
  21. gklearn/preimage/median_preimage_generator.py (+15, -0)
  22. gklearn/preimage/misc.py (+108, -0)
  23. gklearn/preimage/pathfrequency.py (+201, -0)
  24. gklearn/preimage/preimage_generator.py (+12, -0)
  25. gklearn/preimage/preimage_iam.py (+705, -0)
  26. gklearn/preimage/preimage_random.py (+309, -0)
  27. gklearn/preimage/python_code.py (+122, -0)
  28. gklearn/preimage/test.py (+83, -0)
  29. gklearn/preimage/test_fitDistance.py (+648, -0)
  30. gklearn/preimage/test_ged.py (+520, -0)
  31. gklearn/preimage/test_iam.py (+964, -0)
  32. gklearn/preimage/test_k_closest_graphs.py (+462, -0)
  33. gklearn/preimage/test_median_graph_estimator.py (+91, -0)
  34. gklearn/preimage/test_others.py (+686, -0)
  35. gklearn/preimage/test_preimage_iam.py (+620, -0)
  36. gklearn/preimage/test_preimage_mix.py (+539, -0)
  37. gklearn/preimage/test_preimage_random.py (+398, -0)
  38. gklearn/preimage/timer.py (+40, -0)
  39. gklearn/preimage/utils.py (+151, -0)
  40. gklearn/preimage/visualization.py (+585, -0)
  41. gklearn/preimage/xp_fit_method.py (+935, -0)
  42. gklearn/preimage/xp_letter_h.py (+476, -0)
  43. gklearn/preimage/xp_monoterpenoides.py (+249, -0)
  44. gklearn/utils/isNotebook.py (+16, -0)
  45. gklearn/utils/logger2file.py (+27, -0)
  46. notebooks/else/compute_spkernel_for_syntheticnew.py (+52, -0)
  47. notebooks/else/compute_sspkernel_for_syntheticnew.py (+54, -0)
  48. notebooks/else/job_graphkernels.sl (+19, -0)
  49. notebooks/else/job_test.sl (+12, -0)
  50. notebooks/else/run_rwalk_symonly.py (+70, -0)
  51. notebooks/else/run_sp_symonly.py (+61, -0)
  52. notebooks/else/run_ssp_symonly.py (+47, -0)
  53. notebooks/tests/memory_profile.ipynb (+821, -0)
  54. notebooks/tests/test_lib.ipynb (+200, -0)
  55. notebooks/tests/test_modelselection.ipynb (+1574, -0)
  56. notebooks/tests/test_networkx.ipynb (+363, -0)
  57. notebooks/tests/test_parallel_chunksize.py (+689, -0)
  58. notebooks/tests/test_parallel_chunksize_2.py (+690, -0)
  59. notebooks/tests/test_scikit_ksvm.ipynb (+389, -0)
  60. notebooks/tests/test_sp_methods.py (+22, -0)
  61. notebooks/tests/test_spkernel.ipynb (+779, -0)
  62. notebooks/unfinished/run_cyclicpatternkernel.ipynb (+1329, -0)
  63. notebooks/unfinished/run_treeletkernel_acyclic.ipynb (+786, -0)
  64. notebooks/unfinished/run_treepatternkernel.ipynb (+7966, -0)
  65. notebooks/unfinished/run_weisfeilerLehmankernel.ipynb (+3812, -0)
  66. notebooks/unfinished/test_mpi.py (+47, -0)

gklearn/kernels/.tags (+188, -0)

@@ -0,0 +1,188 @@
!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/
!_TAG_FILE_SORTED 0 /0=unsorted, 1=sorted, 2=foldcase/
!_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/
!_TAG_PROGRAM_NAME Exuberant Ctags //
!_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/
!_TAG_PROGRAM_VERSION 5.9~svn20110310 //
commonwalkkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def commonwalkkernel(*args,$/;" function line:23
compute_method /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ compute_method = compute_method.lower()$/;" variable line:67
Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Gn = args[0] if len(args) == 1 else [args[0], args[1]]$/;" variable line:69
len_gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ len_gn = len(Gn)$/;" variable line:72
Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]$/;" variable line:73
idx /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ idx = [G[0] for G in Gn]$/;" variable line:74
Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Gn = [G[1] for G in Gn]$/;" variable line:75
ds_attrs /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ ds_attrs = get_dataset_attributes($/;" variable line:81
attr_names /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ attr_names=['node_labeled', 'edge_labeled', 'is_directed'],$/;" variable line:83
Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Gn = [G.to_directed() for G in Gn]$/;" variable line:92
start_time /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ start_time = time.time()$/;" variable line:94
Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ Kmatrix = np.zeros((len(Gn), len(Gn)))$/;" variable line:96
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ def init_worker(gn_toshare):$/;" function line:99
run_time /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^ run_time = time.time() - start_time$/;" variable line:173
_commonwalkkernel_exp /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta):$/;" function line:181
wrapper_cw_exp /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def wrapper_cw_exp(node_label, edge_label, beta, itr):$/;" function line:249
_commonwalkkernel_geo /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma):$/;" function line:255
wrapper_cw_geo /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def wrapper_cw_geo(node_label, edge_label, gama, itr):$/;" function line:290
_commonwalkkernel_brute /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def _commonwalkkernel_brute(walks1,$/;" function line:296
find_all_walks_until_length /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def find_all_walks_until_length(G,$/;" function line:336
find_walks /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def find_walks(G, source_node, length):$/;" function line:388
find_all_walks /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/commonWalkKernel.py /^def find_all_walks(G, length):$/;" function line:412
randomwalkkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def randomwalkkernel(*args,$/;" function line:27
_sylvester_equation /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs):$/;" function line:150
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^ def init_worker(Awl_toshare):$/;" function line:184 function:_sylvester_equation
wrapper_se_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_se_do(lmda, itr):$/;" function line:214
_se_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _se_do(A_wave1, A_wave2, lmda):$/;" function line:220
_conjugate_gradient /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, $/;" function line:236
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^ def init_worker(gn_toshare):$/;" function line:280 function:_conjugate_gradient
wrapper_cg_unlabled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_cg_unlabled_do(lmda, itr):$/;" function line:302
_cg_unlabled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _cg_unlabled_do(A_wave1, A_wave2, lmda):$/;" function line:308
wrapper_cg_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, $/;" function line:320
_cg_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, $/;" function line:328
_fixed_point /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, $/;" function line:351
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^ def init_worker(gn_toshare):$/;" function line:408 function:_fixed_point
wrapper_fp_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, $/;" function line:418
_fp_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label, $/;" function line:426
func_fp /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def func_fp(x, p_times, lmda, w_times):$/;" function line:448
_spectral_decomposition /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs):$/;" function line:456
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^ def init_worker(q_T_toshare, P_toshare, D_toshare):$/;" function line:492 function:_spectral_decomposition
wrapper_sd_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def wrapper_sd_do(weight, sub_kernel, itr):$/;" function line:516
_sd_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): $/;" function line:523
_randomwalkkernel_kron /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def _randomwalkkernel_kron(G1, G2, node_label, edge_label):$/;" function line:540
getLabels /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def getLabels(Gn, node_label, edge_label, directed):$/;" function line:561
filterGramMatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def filterGramMatrix(gmt, label_dict, label, directed):$/;" function line:581
computeVK /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def computeVK(g1, g2, ds_attrs, node_kernels, node_label):$/;" function line:593
computeW /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/rwalk_sym.py /^def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):$/;" function line:627
spkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^def spkernel(*args,$/;" function line:24
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^ def init_worker(gn_toshare):$/;" function line:115 function:spkernel
spkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):$/;" function line:130
wrapper_sp_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):$/;" function line:191
wrapper_getSPGraph /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/sp_sym.py /^def wrapper_getSPGraph(weight, itr_item):$/;" function line:197
structuralspkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def structuralspkernel(*args,$/;" function line:25
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^ def init_worker(spl_toshare, gs_toshare):$/;" function line:177 function:structuralspkernel
structuralspkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,$/;" function line:265
wrapper_ssp_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, $/;" function line:417
get_shortest_paths /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def get_shortest_paths(G, weight, directed):$/;" function line:426
wrapper_getSP /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/else/ssp_sym.py /^def wrapper_getSP(weight, directed, itr_item):$/;" function line:461
marginalizedkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^def marginalizedkernel(*args,$/;" function line:31
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^ def init_worker(gn_toshare):$/;" function line:114 function:marginalizedkernel
_marginalizedkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):$/;" function line:144
wrapper_marg_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):$/;" function line:290
wrapper_untotter /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/marginalizedKernel.py /^def wrapper_untotter(Gn, node_label, edge_label, i):$/;" function line:296
randomwalkkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def randomwalkkernel(*args,$/;" function line:21
_sylvester_equation /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):$/;" function line:197
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^ def init_worker(Awl_toshare):$/;" function line:232 function:_sylvester_equation
wrapper_se_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_se_do(lmda, itr):$/;" function line:262
_se_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _se_do(A_wave1, A_wave2, lmda):$/;" function line:268
_conjugate_gradient /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, $/;" function line:284
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^ def init_worker(gn_toshare):$/;" function line:328 function:_conjugate_gradient
wrapper_cg_unlabled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_cg_unlabled_do(lmda, itr):$/;" function line:350
_cg_unlabled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _cg_unlabled_do(A_wave1, A_wave2, lmda):$/;" function line:356
wrapper_cg_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, $/;" function line:368
_cg_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, $/;" function line:376
_fixed_point /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, $/;" function line:399
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^ def init_worker(gn_toshare):$/;" function line:456 function:_fixed_point
wrapper_fp_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, $/;" function line:466
_fp_labled_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label, $/;" function line:474
func_fp /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def func_fp(x, p_times, lmda, w_times):$/;" function line:496
_spectral_decomposition /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, verbose=True):$/;" function line:504
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^ def init_worker(q_T_toshare, P_toshare, D_toshare):$/;" function line:541 function:_spectral_decomposition
wrapper_sd_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def wrapper_sd_do(weight, sub_kernel, itr):$/;" function line:566
_sd_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): $/;" function line:573
_randomwalkkernel_kron /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def _randomwalkkernel_kron(G1, G2, node_label, edge_label):$/;" function line:590
getLabels /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def getLabels(Gn, node_label, edge_label, directed):$/;" function line:611
filterGramMatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def filterGramMatrix(gmt, label_dict, label, directed):$/;" function line:631
computeVK /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def computeVK(g1, g2, ds_attrs, node_kernels, node_label):$/;" function line:643
computeW /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/randomWalkKernel.py /^def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):$/;" function line:677
spkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^def spkernel(*args,$/;" function line:22
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^ def init_worker(gn_toshare):$/;" function line:157 function:spkernel
spkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):$/;" function line:207
wrapper_sp_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):$/;" function line:297
wrapper_getSPGraph /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py /^def wrapper_getSPGraph(weight, itr_item):$/;" function line:310
structuralspkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def structuralspkernel(*args,$/;" function line:28
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^ def init_worker(spl_toshare, gs_toshare):$/;" function line:179 function:structuralspkernel
structuralspkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,$/;" function line:258
wrapper_ssp_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, $/;" function line:346
ssp_do_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,$/;" function line:355
wrapper_ssp_do_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels, $/;" function line:463
getAllNodeKernels /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):$/;" function line:471
getAllEdgeKernels /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):$/;" function line:505
traverseBothTriem /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:551
traverseTrie2m /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:568
traverseBothTriev /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:592
traverseTrie2v /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:609
traverseBothTriee /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:631
traverseTrie2e /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:648
traverseBothTrieu /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:673
traverseTrie2u /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):$/;" function line:690
get_shortest_paths /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def get_shortest_paths(G, weight, directed):$/;" function line:748
wrapper_getSP_naive /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def wrapper_getSP_naive(weight, directed, itr_item):$/;" function line:783
get_sps_as_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def get_sps_as_trie(G, weight, directed):$/;" function line:789
wrapper_getSP_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/structuralspKernel.py /^def wrapper_getSP_trie(weight, directed, itr_item):$/;" function line:830
treeletkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def treeletkernel(*args, $/;" function line:23
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^ def init_worker(canonkeys_toshare):$/;" function line:105 function:treeletkernel
_treeletkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def _treeletkernel_do(canonkey1, canonkey2, sub_kernel):$/;" function line:140
wrapper_treeletkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def wrapper_treeletkernel_do(sub_kernel, itr):$/;" function line:160
get_canonkeys /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def get_canonkeys(G, node_label, edge_label, labeled, is_directed):$/;" function line:166
wrapper_get_canonkeys /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def wrapper_get_canonkeys(node_label, edge_label, labeled, is_directed, itr_item):$/;" function line:418
find_paths /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def find_paths(G, source_node, length):$/;" function line:424
find_all_paths /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/treeletKernel.py /^def find_all_paths(G, length, is_directed):$/;" function line:449
cyclicpatternkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/cyclicPatternKernel.py /^def cyclicpatternkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):$/;" function line:20
_cyclicpatternkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/cyclicPatternKernel.py /^def _cyclicpatternkernel_do(patterns1, patterns2):$/;" function line:63
get_patterns /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/cyclicPatternKernel.py /^def get_patterns(G, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):$/;" function line:87
pathkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def pathkernel(*args, node_label='atom', edge_label='bond_type'):$/;" function line:20
_pathkernel_do_l /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def _pathkernel_do_l(G1, G2, sp1, sp2, node_label, edge_label):$/;" function line:107
_pathkernel_do_nl /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def _pathkernel_do_nl(G1, G2, sp1, sp2, node_label):$/;" function line:148
_pathkernel_do_el /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def _pathkernel_do_el(G1, G2, sp1, sp2, edge_label):$/;" function line:171
_pathkernel_do_unl /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def _pathkernel_do_unl(G1, G2, sp1, sp2):$/;" function line:196
get_shortest_paths /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/pathKernel.py /^def get_shortest_paths(G, weight):$/;" function line:211
treepatternkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^def treepatternkernel(*args,$/;" function line:21
_treepatternkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^def _treepatternkernel_do(G1, G2, node_label, edge_label, labeled, kernel_type,$/;" function line:90
matchingset /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^ def matchingset(n1, n2):$/;" function line:119 function:_treepatternkernel_do
mset_com /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^ def mset_com(allpairs, length):$/;" function line:123 function:_treepatternkernel_do.matchingset
kernel_h /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/treePatternKernel.py /^ def kernel_h(h):$/;" function line:165 function:_treepatternkernel_do
weisfeilerlehmankernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def weisfeilerlehmankernel(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):$/;" function line:18
_wl_subtreekernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def _wl_subtreekernel_do(Gn, node_label, edge_label, height):$/;" function line:75
_wl_spkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def _wl_spkernel_do(Gn, node_label, edge_label, height):$/;" function line:183
_wl_edgekernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def _wl_edgekernel_do(Gn, node_label, edge_label, height):$/;" function line:264
_wl_userkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/unfinished/weisfeilerLehmanKernel.py /^def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):$/;" function line:340
untilhpathkernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def untilhpathkernel(*args,$/;" function line:25
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def init_worker(trie_toshare):$/;" function line:142 function:untilhpathkernel
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def init_worker(plist_toshare):$/;" function line:149 function:untilhpathkernel
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def init_worker(plist_toshare):$/;" function line:156 function:untilhpathkernel
_untilhpathkernel_do_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def _untilhpathkernel_do_trie(trie1, trie2, k_func):$/;" function line:207
traverseTrie1t /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseTrie1t(root, trie2, setlist, pcurrent=[]):$/;" function line:226 function:_untilhpathkernel_do_trie
traverseTrie2t /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseTrie2t(root, trie1, setlist, pcurrent=[]):$/;" function line:244 function:_untilhpathkernel_do_trie
traverseTrie1m /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseTrie1m(root, trie2, sumlist, pcurrent=[]):$/;" function line:271 function:_untilhpathkernel_do_trie
traverseTrie2m /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseTrie2m(root, trie1, sumlist, pcurrent=[]):$/;" function line:289 function:_untilhpathkernel_do_trie
wrapper_uhpath_do_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_uhpath_do_trie(k_func, itr):$/;" function line:316
_untilhpathkernel_do_naive /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def _untilhpathkernel_do_naive(paths1, paths2, k_func):$/;" function line:322
wrapper_uhpath_do_naive /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_uhpath_do_naive(k_func, itr):$/;" function line:365
_untilhpathkernel_do_kernelless /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def _untilhpathkernel_do_kernelless(paths1, paths2, k_func):$/;" function line:371
wrapper_uhpath_do_kernelless /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_uhpath_do_kernelless(k_func, itr):$/;" function line:414
find_all_paths_until_length /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def find_all_paths_until_length(G,$/;" function line:421
wrapper_find_all_paths_until_length /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_find_all_paths_until_length(length, ds_attrs, node_label, $/;" function line:492
find_all_path_as_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def find_all_path_as_trie(G,$/;" function line:501
traverseGraph /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^ def traverseGraph(root, ptrie, length, G, ds_attrs, node_label, edge_label,$/;" function line:542 function:find_all_path_as_trie
wrapper_find_all_path_as_trie /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def wrapper_find_all_path_as_trie(length, ds_attrs, node_label, $/;" function line:593
paths2labelseqs /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/untilHPathKernel.py /^def paths2labelseqs(plist, G, ds_attrs, node_label, edge_label):$/;" function line:601
weisfeilerlehmankernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def weisfeilerlehmankernel(*args, $/;" function line:25
base_kernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ base_kernel = base_kernel.lower()$/;" variable line:74
Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list$/;" variable line:75
Gn /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Gn = [g.copy() for g in Gn]$/;" variable line:76
ds_attrs /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'], $/;" variable line:77
node_label /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ node_label=node_label)$/;" variable line:78
start_time /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ start_time = time.time()$/;" variable line:83
Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose)$/;" variable line:87
Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)$/;" variable line:91
Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)$/;" variable line:95
Kmatrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)$/;" variable line:99
run_time /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ run_time = time.time() - start_time$/;" variable line:101
_wl_kernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose):$/;" function line:109
wl_iteration /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def wl_iteration(G, node_label):$/;" function line:256
wrapper_wl_iteration /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def wrapper_wl_iteration(node_label, itr_item):$/;" function line:293
compute_kernel_matrix /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose):$/;" function line:300
init_worker /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^ def init_worker(alllabels_toshare):$/;" function line:305 function:compute_kernel_matrix
compute_subtree_kernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):$/;" function line:319
wrapper_compute_subtree_kernel /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def wrapper_compute_subtree_kernel(Kmatrix, itr):$/;" function line:333
_wl_spkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def _wl_spkernel_do(Gn, node_label, edge_label, height):$/;" function line:339
_wl_edgekernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def _wl_edgekernel_do(Gn, node_label, edge_label, height):$/;" function line:421
_wl_userkernel_do /media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py /^def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):$/;" function line:498

gklearn/kernels/else/rwalk_sym.py (+842, -0)

@@ -0,0 +1,842 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:53:57 2018

@author: ljia
@references: S. V. N. Vishwanathan, Nicol N. Schraudolph, Risi Kondor, and
Karsten M. Borgwardt. Graph kernels. Journal of Machine Learning Research,
11(Apr):1201–1242, 2010.
"""

import sys
sys.path.insert(0, "../")
import time
from functools import partial
from tqdm import tqdm

import networkx as nx
import numpy as np
from scipy.sparse import identity, kron
from scipy.sparse.linalg import cg
from scipy.optimize import fixed_point

from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm

def randomwalkkernel(*args,
# params for all method.
compute_method=None,
weight=1,
p=None,
q=None,
edge_weight=None,
# params for conjugate and fp method.
node_kernels=None,
edge_kernels=None,
node_label='atom',
edge_label='bond_type',
# params for spectral method.
sub_kernel=None,
n_jobs=None):
"""Calculate random walk graph kernels.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
    weight : float
        Weight (decay) coefficient of the walks.
    compute_method : string
        Method used to compute the random walk kernel. Available methods are 'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'.

Return
------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the random walk kernel between 2 graphs.
"""
compute_method = compute_method.lower()
Gn = args[0] if len(args) == 1 else [args[0], args[1]]

eweight = None
if edge_weight == None:
        print('\n No edge weight specified. Set all weights to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, float) or isinstance(some_weight, int):
eweight = edge_weight
else:
print(
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
% edge_weight)
except:
print(
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
% edge_weight)

ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
'edge_attr_dim', 'is_directed'],
node_label=node_label,
edge_label=edge_label)
ds_attrs['node_attr_dim'] = 0
ds_attrs['edge_attr_dim'] = 0
# remove graphs with no edges, as no walk can be found in their structures,
# so the weight matrix between such a graph and itself might be zero.
len_gn = len(Gn)
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
idx = [G[0] for G in Gn]
Gn = [G[1] for G in Gn]
if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn)))

start_time = time.time()
# # get vertex and edge concatenated labels for each graph
# label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
# gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])

if compute_method == 'sylvester':
import warnings
warnings.warn('All labels are ignored.')
Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs)

elif compute_method == 'conjugate':
Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs,
node_kernels, edge_kernels,
node_label, edge_label, eweight, n_jobs)
elif compute_method == 'fp':
Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
edge_kernels, node_label, edge_label,
eweight, n_jobs)

elif compute_method == 'spectral':
import warnings
warnings.warn('All labels are ignored. Only works for undirected graphs.')
Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs)

    elif compute_method == 'kron':
        Kmatrix = np.zeros((len(Gn), len(Gn)))
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
                                                       node_label, edge_label)
                Kmatrix[j][i] = Kmatrix[i][j]
else:
raise Exception(
'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
)

run_time = time.time() - start_time
print(
"\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time, idx
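
# Usage sketch (added for illustration, not part of the original module): a
# minimal, hedged example of calling randomwalkkernel. The graphs, the dummy
# 'atom' labels and the chosen weight are made up; the 'sylvester' method
# ignores labels and relies on the optional `control` package for dlyap.
#
# g1 = nx.path_graph(4)
# g2 = nx.cycle_graph(5)
# for g in (g1, g2):
#     nx.set_node_attributes(g, 'C', 'atom')  # dummy symbolic node labels
# Kmatrix, run_time, idx = randomwalkkernel(
#     [g1, g2], compute_method='sylvester', weight=1e-3, n_jobs=2)
# # Kmatrix is a symmetric 2x2 matrix; idx lists the indices of the graphs
# # kept after edgeless graphs are removed.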


###############################################################################
def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs):
"""Calculate walk graph kernels up to n between 2 graphs using Sylvester method.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""
Kmatrix = np.zeros((len(Gn), len(Gn)))

if q == None:
        # don't normalize adjacency matrices if q is a uniform vector. Note
        # A_wave_list actually contains the transposes of the adjacency matrices.
A_wave_list = [
nx.adjacency_matrix(G, eweight).todense().transpose() for G in tqdm(
Gn, desc='compute adjacency matrices', file=sys.stdout)
]
# # normalized adjacency matrices
# A_wave_list = []
# for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
# A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
# norm = A_tilde.sum(axis=0)
# norm[norm == 0] = 1
# A_wave_list.append(A_tilde / norm)
if p == None: # p is uniform distribution as default.
def init_worker(Awl_toshare):
global G_Awl
G_Awl = Awl_toshare
do_partial = partial(wrapper_se_do, lmda)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(A_wave_list,), n_jobs=n_jobs)
# pbar = tqdm(
# total=(1 + len(Gn)) * len(Gn) / 2,
# desc='calculating kernels',
# file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
# S = lmda * A_wave_list[j]
# T_t = A_wave_list[i]
# # use uniform distribution if there is no prior knowledge.
# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
# p_times_uni = 1 / nb_pd
# M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
# X = dlyap(S, T_t, M0)
# X = np.reshape(X, (-1, 1), order='F')
# # use uniform distribution if there is no prior knowledge.
# q_times = np.full((1, nb_pd), p_times_uni)
# Kmatrix[i][j] = np.dot(q_times, X)
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)

return Kmatrix


def wrapper_se_do(lmda, itr):
i = itr[0]
j = itr[1]
return i, j, _se_do(G_Awl[i], G_Awl[j], lmda)


def _se_do(A_wave1, A_wave2, lmda):
from control import dlyap
S = lmda * A_wave2
T_t = A_wave1
# use uniform distribution if there is no prior knowledge.
nb_pd = len(A_wave1) * len(A_wave2)
p_times_uni = 1 / nb_pd
M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
X = dlyap(S, T_t, M0)
X = np.reshape(X, (-1, 1), order='F')
# use uniform distribution if there is no prior knowledge.
q_times = np.full((1, nb_pd), p_times_uni)
return np.dot(q_times, X)
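
# Sketch (added for illustration, not part of the original module): for
# undirected graphs with uniform p and q, _se_do should agree with the direct
# geometric formula q^T (I - lmda * W_x)^{-1} p evaluated on the Kronecker
# (direct) product of the adjacency matrices. The matrices and lmda below are
# made up.
#
# A1 = np.array([[0., 1.], [1., 0.]])                        # 2-node path
# A2 = np.array([[0., 1., 0.], [1., 0., 1.], [0., 1., 0.]])  # 3-node path
# lmda = 0.01
# n = A1.shape[0] * A2.shape[0]
# W = np.kron(A1, A2)                        # direct-product adjacency matrix
# p = np.full((n, 1), 1. / n)                # uniform starting distribution
# q_T = np.full((1, n), 1. / n)              # uniform stopping distribution
# k_direct = np.dot(q_T, np.linalg.solve(np.identity(n) - lmda * W, p))[0, 0]
# # _se_do(A1, A2, lmda) should match k_direct up to numerical error.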


###############################################################################
def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
node_label, edge_label, eweight, n_jobs):
"""Calculate walk graph kernels up to n between 2 graphs using conjugate method.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""
Kmatrix = np.zeros((len(Gn), len(Gn)))
# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
# # this is faster from unlabeled graphs. @todo: why?
# if q == None:
# # don't normalize adjacency matrices if q is a uniform vector. Note
# # A_wave_list accually contains the transposes of the adjacency matrices.
# A_wave_list = [
# nx.adjacency_matrix(G, eweight).todense().transpose() for G in
# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
# ]
# if p == None: # p is uniform distribution as default.
# def init_worker(Awl_toshare):
# global G_Awl
# G_Awl = Awl_toshare
# do_partial = partial(wrapper_cg_unlabled_do, lmda)
# parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
# glbv=(A_wave_list,), n_jobs=n_jobs)
# else:
# reindex nodes using consecutive integers for convenience of kernel calculation.
Gn = [nx.convert_node_labels_to_integers(
g, first_label=0, label_attribute='label_orignal') for g in tqdm(
Gn, desc='reindex vertices', file=sys.stdout)]
if p == None and q == None: # p and q are uniform distributions as default.
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
node_label, edge_kernels, edge_label, lmda)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(Gn,), n_jobs=n_jobs)
# pbar = tqdm(
# total=(1 + len(Gn)) * len(Gn) / 2,
# desc='calculating kernels',
# file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
# result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
# node_label, edge_kernels, edge_label, lmda)
# Kmatrix[i][j] = result
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
return Kmatrix


def wrapper_cg_unlabled_do(lmda, itr):
i = itr[0]
j = itr[1]
return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda)


def _cg_unlabled_do(A_wave1, A_wave2, lmda):
nb_pd = len(A_wave1) * len(A_wave2)
p_times_uni = 1 / nb_pd
w_times = kron(A_wave1, A_wave2).todense()
A = identity(w_times.shape[0]) - w_times * lmda
b = np.full((nb_pd, 1), p_times_uni)
x, _ = cg(A, b)
# use uniform distribution if there is no prior knowledge.
q_times = np.full((1, nb_pd), p_times_uni)
return np.dot(q_times, x)


def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
edge_label, lmda, itr):
i = itr[0]
j = itr[1]
return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
node_label, edge_kernels, edge_label, lmda)


def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
edge_kernels, edge_label, lmda):
    # First, compute kernels between all pairs of nodes using the method
    # borrowed from FCSP. It is faster than directly computing all edge kernels
    # when $d_1 d_2 > 2$, where $d_1$ and $d_2$ are the vertex degrees of the
    # two graphs compared, which is the most common case we encountered. For
    # very sparse graphs, this would be slow.
vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
# Compute weight matrix of the direct product graph.
w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
edge_kernels, edge_label)
# use uniform distribution if there is no prior knowledge.
p_times_uni = 1 / w_dim
A = identity(w_times.shape[0]) - w_times * lmda
b = np.full((w_dim, 1), p_times_uni)
x, _ = cg(A, b)
# use uniform distribution if there is no prior knowledge.
q_times = np.full((1, w_dim), p_times_uni)
return np.dot(q_times, x)
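
# Sketch (added for illustration, not part of the original module): the core
# step of the conjugate-gradient variant is solving (I - lmda * W) x = p with
# scipy's cg instead of inverting the direct-product weight matrix W. The toy
# matrix and numbers below are made up.
#
# W_toy = np.array([[0., 0.5], [0.5, 0.]])
# lmda = 0.1
# p = np.full((2, 1), 0.5)                   # uniform initial distribution
# A = np.identity(2) - lmda * W_toy
# x, info = cg(A, p)                         # info == 0 on convergence
# kernel = np.dot(np.full((1, 2), 0.5), x)[0]  # q^T x with uniform q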


###############################################################################
def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
node_label, edge_label, eweight, n_jobs):
"""Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""

Kmatrix = np.zeros((len(Gn), len(Gn)))
# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
# # this is faster from unlabeled graphs. @todo: why?
# if q == None:
# # don't normalize adjacency matrices if q is a uniform vector. Note
# # A_wave_list accually contains the transposes of the adjacency matrices.
# A_wave_list = [
# nx.adjacency_matrix(G, eweight).todense().transpose() for G in
# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
# ]
# if p == None: # p is uniform distribution as default.
# pbar = tqdm(
# total=(1 + len(Gn)) * len(Gn) / 2,
# desc='calculating kernels',
# file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
# # use uniform distribution if there is no prior knowledge.
# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
# p_times_uni = 1 / nb_pd
# w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
# p_times = np.full((nb_pd, 1), p_times_uni)
# x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
# # use uniform distribution if there is no prior knowledge.
# q_times = np.full((1, nb_pd), p_times_uni)
# Kmatrix[i][j] = np.dot(q_times, x)
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
# else:
# reindex nodes using consecutive integers for convenience of kernel calculation.
Gn = [nx.convert_node_labels_to_integers(
g, first_label=0, label_attribute='label_orignal') for g in tqdm(
Gn, desc='reindex vertices', file=sys.stdout)]
if p == None and q == None: # p and q are uniform distributions as default.
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
node_label, edge_kernels, edge_label, lmda)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(Gn,), n_jobs=n_jobs)
return Kmatrix


def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
edge_label, lmda, itr):
i = itr[0]
j = itr[1]
return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
node_label, edge_kernels, edge_label, lmda)


def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
edge_kernels, edge_label, lmda):
    # First, compute kernels between all pairs of nodes using the method
    # borrowed from FCSP. It is faster than directly computing all edge kernels
    # when $d_1 d_2 > 2$, where $d_1$ and $d_2$ are the vertex degrees of the
    # two graphs compared, which is the most common case we encountered. For
    # very sparse graphs, this would be slow.
vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
# Compute weight matrix of the direct product graph.
w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
edge_kernels, edge_label)
# use uniform distribution if there is no prior knowledge.
p_times_uni = 1 / w_dim
p_times = np.full((w_dim, 1), p_times_uni)
x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times),
xtol=1e-06, maxiter=1000)
# use uniform distribution if there is no prior knowledge.
q_times = np.full((1, w_dim), p_times_uni)
return np.dot(q_times, x)


def func_fp(x, p_times, lmda, w_times):
    return p_times + lmda * np.dot(w_times, x)
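
# Sketch (added for illustration, not part of the original module): func_fp is
# the map x -> p + lmda * W x; scipy.optimize.fixed_point iterates it, and its
# fixed point is the same solution of (I - lmda * W) x = p that the
# conjugate-gradient variant computes. The toy numbers below are made up.
#
# W_toy = np.array([[0., 0.5], [0.5, 0.]])
# lmda = 0.1
# p = np.full((2, 1), 0.5)
# x_fp = fixed_point(func_fp, p, args=(p, lmda, W_toy), xtol=1e-06, maxiter=1000)
# x_direct = np.linalg.solve(np.identity(2) - lmda * W_toy, p)
# # x_fp and x_direct should agree up to xtol.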


###############################################################################
def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs):
"""Calculate walk graph kernels up to n between 2 unlabeled graphs using
spectral decomposition method. Labels will be ignored.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""
Kmatrix = np.zeros((len(Gn), len(Gn)))

if q == None:
# precompute the spectral decomposition of each graph.
P_list = []
D_list = []
for G in tqdm(Gn, desc='spectral decompose', file=sys.stdout):
            # don't normalize adjacency matrices if q is a uniform vector. Note
            # A actually is the transpose of the adjacency matrix.
A = nx.adjacency_matrix(G, eweight).todense().transpose()
ew, ev = np.linalg.eig(A)
D_list.append(ew)
P_list.append(ev)
# P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs?

if p == None: # p is uniform distribution as default.
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn]
# q_T_list = [q.T for q in q_list]
def init_worker(q_T_toshare, P_toshare, D_toshare):
global G_q_T, G_P, G_D
G_q_T = q_T_toshare
G_P = P_toshare
G_D = D_toshare
do_partial = partial(wrapper_sd_do, weight, sub_kernel)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs)
# pbar = tqdm(
# total=(1 + len(Gn)) * len(Gn) / 2,
# desc='calculating kernels',
# file=sys.stdout)
# for i in range(0, len(Gn)):
# for j in range(i, len(Gn)):
# result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
# D_list[i], D_list[j], weight, sub_kernel)
# Kmatrix[i][j] = result
# Kmatrix[j][i] = Kmatrix[i][j]
# pbar.update(1)
return Kmatrix


def wrapper_sd_do(weight, sub_kernel, itr):
i = itr[0]
j = itr[1]
return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j],
weight, sub_kernel)


def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel):
# use uniform distribution if there is no prior knowledge.
kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense()
    # @todo: this may not be needed when p = q (kr = kl.T) for undirected graphs
# kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense()
if sub_kernel == 'exp':
D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
kmiddle = np.diag(np.exp(weight * D_diag))
elif sub_kernel == 'geo':
D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
kmiddle = np.diag(weight * D_diag)
kmiddle = np.identity(len(kmiddle)) - weight * kmiddle
kmiddle = np.linalg.inv(kmiddle)
return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
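
# Sketch (added for illustration, not part of the original module): with the
# eigendecompositions A_i = P_i D_i P_i^{-1} precomputed, _sd_do only has to
# build a diagonal factor over products of eigenvalues ('exp' or 'geo'
# sub-kernel). The adjacency matrix and weight below are made up.
#
# A = np.array([[0., 1., 0.], [1., 0., 1.], [0., 1., 0.]])   # 3-node path
# ew, ev = np.linalg.eig(A)                  # eigenvalues D and eigenvectors P
# q_T = np.full((1, 3), 1. / 3)              # uniform stopping distribution
# k = _sd_do(q_T, q_T, ev, ev, ew, ew, 0.05, 'exp')
# # k is the kernel of this graph with itself under the 'exp' sub-kernel.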


###############################################################################
def _randomwalkkernel_kron(G1, G2, node_label, edge_label):
"""Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method.

Parameters
----------
G1, G2 : NetworkX graph
Graphs between which the kernel is calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.

Return
------
kernel : float
Kernel between 2 graphs.
"""
pass


###############################################################################
def getLabels(Gn, node_label, edge_label, directed):
"""Get symbolic labels of a graph dataset, where vertex labels are dealt
with by concatenating them to the edge labels of adjacent edges.
"""
label_list = []
label_set = set()
for g in Gn:
label_g = {}
for e in g.edges(data=True):
nl1 = g.node[e[0]][node_label]
nl2 = g.node[e[1]][node_label]
if not directed and nl1 > nl2:
nl1, nl2 = nl2, nl1
label = (nl1, e[2][edge_label], nl2)
label_g[(e[0], e[1])] = label
label_list.append(label_g)
label_set = set([l for lg in label_list for l in lg.values()])
return label_list, len(label_set)


def filterGramMatrix(gmt, label_dict, label, directed):
"""Compute (the transpose of) the Gram matrix filtered by a label.
"""
gmf = np.zeros(gmt.shape)
for (n1, n2), l in label_dict.items():
if l == label:
gmf[n2, n1] = gmt[n2, n1]
if not directed:
gmf[n1, n2] = gmt[n1, n2]
return gmf
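

# A small, self-contained sketch of filterGramMatrix with hypothetical inputs:
# only the entries whose product-graph node pair carries the requested label
# are copied from the input matrix; everything else stays zero.
if __name__ == '__main__':
    gmt_demo = np.arange(9, dtype=float).reshape(3, 3)
    label_dict_demo = {(0, 1): 'a', (1, 2): 'b'}
    gmf_demo = filterGramMatrix(gmt_demo, label_dict_demo, 'a', False)
    # only gmf_demo[1, 0] and gmf_demo[0, 1] are copied; the rest remain 0.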


def computeVK(g1, g2, ds_attrs, node_kernels, node_label):
'''Compute vertex kernels between vertices of two graphs.
'''
vk_dict = {} # dict of vertex kernels
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
pass
return vk_dict
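

# A hypothetical sketch of the node_kernels dict expected by computeVK: a
# delta kernel for symbolic labels, a Gaussian kernel for non-symbolic
# attribute vectors, and their product for the mixed case. The helper names
# and the gamma value are illustrative, not part of this module.
def _demo_delta_kernel(x, y):
    return 1.0 if x == y else 0.0

def _demo_gaussian_kernel(x, y, gamma=1.0):
    d = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.exp(-gamma * np.dot(d, d))

demo_node_kernels = {
    'symb': _demo_delta_kernel,
    'nsymb': _demo_gaussian_kernel,
    'mix': lambda sl1, sl2, al1, al2: (_demo_delta_kernel(sl1, sl2)
                                       * _demo_gaussian_kernel(al1, al2)),
}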


def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):
'''Compute weight matrix of the direct product graph.
'''
w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
w_times = np.zeros((w_dim, w_dim))
if vk_dict: # node labeled
if ds_attrs['is_directed']:
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])]
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])]
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])]
# edge unlabeled
else:
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* vk_dict[(e1[1], e2[1])]
else: # undirected
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])] \
+ vk_dict[(e1[0], e2[1])] \
* ek_temp * vk_dict[(e1[1], e2[0])]
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])] \
+ vk_dict[(e1[0], e2[1])] \
* ek_temp * vk_dict[(e1[1], e2[0])]
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* ek_temp * vk_dict[(e1[1], e2[1])] \
+ vk_dict[(e1[0], e2[1])] \
* ek_temp * vk_dict[(e1[1], e2[0])]
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
# edge unlabeled
else:
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
* vk_dict[(e1[1], e2[1])] \
+ vk_dict[(e1[0], e2[1])] \
* vk_dict[(e1[1], e2[0])]
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
else: # node unlabeled
if ds_attrs['is_directed']:
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
# edge unlabeled
else:
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = 1
else: # undirected
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = ek_temp
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
# edge unlabeled
else:
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
e1[1] * nx.number_of_nodes(g2) + e2[1])
w_times[w_idx] = 1
w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
e1[1] * nx.number_of_nodes(g2) + e2[0])
w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
return w_times, w_dim
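

# The index convention used throughout computeW, spelled out as a small
# sketch: a node pair (u, v) with u in g1 and v in g2 is flattened to
# u * |V(g2)| + v, so w_times is a square matrix over node pairs of the
# direct product graph. The helper names below are illustrative only.
def _demo_pair_to_index(u, v, n2):
    """Flatten a product-graph node (u, v) to a row/column index of w_times."""
    return u * n2 + v

def _demo_index_to_pair(idx, n2):
    """Inverse mapping, recovering (u, v) from a flattened index."""
    return divmod(idx, n2)

# e.g. with |V(g2)| = 4, the pair (2, 3) maps to index 2 * 4 + 3 == 11, and
# divmod(11, 4) recovers (2, 3).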

+ 200  - 0  gklearn/kernels/else/sp_sym.py

@@ -0,0 +1,200 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 21 18:02:00 2018

@author: ljia
"""

import sys
import time
from itertools import product
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np

from gklearn.utils.utils import getSPGraph
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm
sys.path.insert(0, "../")

def spkernel(*args,
node_label='atom',
edge_weight=None,
node_kernels=None,
n_jobs=None):
"""Calculate shortest-path kernels between graphs.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_weight : string
Edge attribute name corresponding to the edge weight.
node_kernels: dict
A dictionary of kernel functions for nodes, including 3 items: 'symb'
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
for both labels. The first 2 functions take two node labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each of the two nodes. Each label is given as a 2-D
array of shape (n_samples, n_features). Each function returns a
number as the kernel value. Ignored when nodes are unlabeled.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the sp kernel between 2 graphs.
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
weight = None
if edge_weight is None:
print('\n No edge weight specified. Setting all weights to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, (float, int)):
weight = edge_weight
else:
print(
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
% edge_weight)
except:
print(
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
% edge_weight)
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
node_label=node_label)
ds_attrs['node_attr_dim'] = 0

# remove graphs with no edges, as no sp can be found in their structures,
# so the kernel between such a graph and itself will be zero.
len_gn = len(Gn)
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
idx = [G[0] for G in Gn]
Gn = [G[1] for G in Gn]
if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn)))

start_time = time.time()

pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrapper_getSPGraph, weight)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 100 * n_jobs:
# # use default chunksize as pool.map when iterable is less than 100
# chunksize, extra = divmod(len(Gn), n_jobs * 4)
# if extra:
# chunksize += 1
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
for i, g in tqdm(
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting sp graphs', file=sys.stdout):
Gn[i] = g
pool.close()
pool.join()

Kmatrix = np.zeros((len(Gn), len(Gn)))

# ---- use pool.imap_unordered to parallel and track progress. ----
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(Gn,), n_jobs=n_jobs)

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time, idx


def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
kernel = 0

# compute vertex kernels first, an idea borrowed from FCSP.
vk_dict = {} # dict of vertex kernels
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kernel += 1
return kernel

# compute graph kernels
if ds_attrs['is_directed']:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kernel += kn1
else:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
# each edge walk is counted twice, starting from both its extreme nodes.
nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
e1[0], e2[1])], vk_dict[(e1[1],
e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kn2 = nk12 * nk21
kernel += kn1 + kn2

return kernel


def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
i = itr[0]
j = itr[1]
return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels)


def wrapper_getSPGraph(weight, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, getSPGraph(g, edge_weight=weight)
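

# A hypothetical usage sketch of spkernel on two tiny 'atom'-labelled graphs,
# assuming gklearn and its dependencies are importable as at the top of this
# file. The multiprocessing pools require the __main__ guard on platforms that
# spawn worker processes; the delta kernel below is illustrative only.
def _demo_delta(x, y):
    """Hypothetical delta kernel on symbolic node labels."""
    return 1.0 if x == y else 0.0

if __name__ == '__main__':
    g1_demo = nx.Graph()
    g1_demo.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g1_demo.add_edge(0, 1)
    g2_demo = nx.Graph()
    g2_demo.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}),
                            (2, {'atom': 'O'})])
    g2_demo.add_edges_from([(0, 1), (1, 2)])

    demo_node_kernels = {'symb': _demo_delta, 'nsymb': None, 'mix': None}
    Kmatrix, run_time, idx = spkernel([g1_demo, g2_demo], node_label='atom',
                                      node_kernels=demo_node_kernels, n_jobs=2)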

+ 464  - 0  gklearn/kernels/else/ssp_sym.py

@@ -0,0 +1,464 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:42:48 2018

@author: ljia
"""

import sys
import time
from itertools import combinations, product
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np

from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.parallel import parallel_gm

sys.path.insert(0, "../")


def structuralspkernel(*args,
node_label='atom',
edge_weight=None,
edge_label='bond_type',
node_kernels=None,
edge_kernels=None,
n_jobs=None):
"""Calculate mean average structural shortest path kernels between graphs.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_weight : string
Edge attribute name corresponding to the edge weight.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
node_kernels: dict
A dictionary of kernel functions for nodes, including 3 items: 'symb'
for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
for both labels. The first 2 functions take two node labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each of the two nodes. Each label is given as a 2-D
array of shape (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when nodes are unlabeled.
edge_kernels: dict
A dictionary of kernel functions for edges, including 3 items: 'symb'
for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
for both labels. The first 2 functions take two edge labels as
parameters, and the 'mix' function takes 4 parameters, a symbolic and a
non-symbolic label for each of the two edges. Each label is given as a 2-D
array of shape (n_samples, n_features). Each function returns a number
as the kernel value. Ignored when edges are unlabeled.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the mean average structural
shortest path kernel between 2 graphs.
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
weight = None
if edge_weight is None:
print('\n No edge weight specified. Setting all weights to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, (float, int)):
weight = edge_weight
else:
print(
'\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
% edge_weight)
except:
print(
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
% edge_weight)
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
'edge_attr_dim', 'is_directed'],
node_label=node_label, edge_label=edge_label)
ds_attrs['node_attr_dim'] = 0
ds_attrs['edge_attr_dim'] = 0

start_time = time.time()

# get shortest paths of each graph in Gn
splist = [None] * len(Gn)
pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 100 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
# chunksize = 300 # int(len(list(itr)) / n_jobs)
for i, sp in tqdm(
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting shortest paths',
file=sys.stdout):
splist[i] = sp
# time.sleep(10)
pool.close()
pool.join()
# # get shortest paths of each graph in Gn
# splist = [[] for _ in range(len(Gn))]
# # get shortest path graphs of Gn
# getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
# itr = zip(Gn, range(0, len(Gn)))
# if len(Gn) < 1000 * n_jobs:
# chunksize = int(len(Gn) / n_jobs) + 1
# else:
# chunksize = 1000
# # chunksize = 300 # int(len(list(itr)) / n_jobs)
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
## for i, sp in tqdm(
# res = pool.imap_unordered(getsp_partial, itr, 10)
## desc='getting shortest paths',
## file=sys.stdout):
## splist[i] = sp
## time.sleep(10)
# pool.close()
# pool.join()
# ss = 0
# ss += sys.getsizeof(splist)
# for spss in splist:
# ss += sys.getsizeof(spss)
# for spp in spss:
# ss += sys.getsizeof(spp)
# time.sleep(20)
# # ---- direct running, normally use single CPU core. ----
# splist = []
# for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
# splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
# sp_ml = [0] * len(Gn) # shortest path matrices
# for i in result_sp:
# sp_ml[i[0]] = i[1]
# edge_x_g = [[] for i in range(len(sp_ml))]
# edge_y_g = [[] for i in range(len(sp_ml))]
# edge_w_g = [[] for i in range(len(sp_ml))]
# for idx, item in enumerate(sp_ml):
# for i1 in range(len(item)):
# for i2 in range(i1 + 1, len(item)):
# if item[i1, i2] != np.inf:
# edge_x_g[idx].append(i1)
# edge_y_g[idx].append(i2)
# edge_w_g[idx].append(item[i1, i2])
# print(len(edge_x_g[0]))
# print(len(edge_y_g[0]))
# print(len(edge_w_g[0]))

Kmatrix = np.zeros((len(Gn), len(Gn)))
# ---- use pool.imap_unordered to parallel and track progress. ----
def init_worker(spl_toshare, gs_toshare):
global G_spl, G_gs
G_spl = spl_toshare
G_gs = gs_toshare
do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(splist, Gn), n_jobs=n_jobs)

# # ---- use pool.imap_unordered to parallel and track progress. ----
# pool = Pool(n_jobs)
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, chunksize),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
# # ---- use pool.map to parallel. ----
# pool = Pool(n_jobs)
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# for i, j, kernel in tqdm(
# pool.map(do_partial, itr), desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()

# # ---- use pool.imap_unordered to parallel and track progress. ----
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
# combinations_with_replacement(range(0, len(Gn)), 2))
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
# if len_itr < 1000 * n_jobs:
# chunksize = int(len_itr / n_jobs) + 1
# else:
# chunksize = 1000
# from contextlib import closing
# with closing(Pool(n_jobs)) as pool:
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, 1000),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()


# # ---- direct running, normally use single CPU core. ----
# from itertools import combinations_with_replacement
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
# ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
## if(kernel > 1):
## print("error here ")
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time


def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
node_kernels, edge_kernels):
kernel = 0

# First, compute kernels between all pairs of vertices, an idea borrowed from FCSP.
vk_dict = {} # dict of vertex kernels
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
pass

# Then, compute kernels between all pairs of edges; the idea is an
# extension of FCSP. It suits sparse graphs, which is the most common
# case we encounter. For dense graphs, this would be slow.
ek_dict = {} # dict of edge kernels
if ds_attrs['edge_labeled']:
# edge symb and non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['mix']
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
# edge symb labeled
else:
ke = edge_kernels['symb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
else:
# edge non-symb labeled
if ds_attrs['edge_attr_dim'] > 0:
ke = edge_kernels['nsymb']
for e1 in g1.edges(data=True):
for e2 in g2.edges(data=True):
ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
# edge unlabeled
else:
pass

# compute graph kernels
if vk_dict:
if ek_dict:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
kpath = vk_dict[(p1[0], p2[0])]
if kpath:
for idx in range(1, len(p1)):
kpath *= vk_dict[(p1[idx], p2[idx])] * \
ek_dict[((p1[idx-1], p1[idx]),
(p2[idx-1], p2[idx]))]
if not kpath:
break
kernel += kpath # add up kernels of all paths
else:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
kpath = vk_dict[(p1[0], p2[0])]
if kpath:
for idx in range(1, len(p1)):
kpath *= vk_dict[(p1[idx], p2[idx])]
if not kpath:
break
kernel += kpath # add up kernels of all paths
else:
if ek_dict:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
if len(p1) == 0:
kernel += 1
else:
kpath = 1
for idx in range(0, len(p1) - 1):
kpath *= ek_dict[((p1[idx], p1[idx+1]),
(p2[idx], p2[idx+1]))]
if not kpath:
break
kernel += kpath # add up kernels of all paths
else:
for p1, p2 in product(spl1, spl2):
if len(p1) == len(p2):
kernel += 1

kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average

# # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation
# # compute vertex kernel matrix
# try:
# vk_mat = np.zeros((nx.number_of_nodes(g1),
# nx.number_of_nodes(g2)))
# g1nl = enumerate(g1.nodes(data=True))
# g2nl = enumerate(g2.nodes(data=True))
# for i1, n1 in g1nl:
# for i2, n2 in g2nl:
# vk_mat[i1][i2] = kn(
# n1[1][node_label], n2[1][node_label],
# [n1[1]['attributes']], [n2[1]['attributes']])

# range1 = range(0, len(edge_w_g[i]))
# range2 = range(0, len(edge_w_g[j]))
# for i1 in range1:
# x1 = edge_x_g[i][i1]
# y1 = edge_y_g[i][i1]
# w1 = edge_w_g[i][i1]
# for i2 in range2:
# x2 = edge_x_g[j][i2]
# y2 = edge_y_g[j][i2]
# w2 = edge_w_g[j][i2]
# ke = (w1 == w2)
# if ke > 0:
# kn1 = vk_mat[x1][x2] * vk_mat[y1][y2]
# kn2 = vk_mat[x1][y2] * vk_mat[y1][x2]
# Kmatrix += kn1 + kn2
return kernel


def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
edge_kernels, itr):
i = itr[0]
j = itr[1]
return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
ds_attrs, node_label, edge_label,
node_kernels, edge_kernels)


def get_shortest_paths(G, weight, directed):
"""Get all shortest paths of a graph.

Parameters
----------
G : NetworkX graphs
The graphs whose paths are calculated.
weight : string/None
edge attribute used as weight to calculate the shortest path.
directed: boolean
Whether graph is directed.

Return
------
sp : list of list
List of shortest paths of the graph, where each path is represented by a list of nodes.
"""
sp = []
for n1, n2 in combinations(G.nodes(), 2):
try:
spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
except nx.NetworkXNoPath: # nodes not connected
# sp.append([])
pass
else:
sp += spltemp
# each edge walk is counted twice, starting from both its extreme nodes.
if not directed:
sp += [sptemp[::-1] for sptemp in spltemp]
# add single nodes as length 0 paths.
sp += [[n] for n in G.nodes()]
return sp
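

# A tiny illustrative check of get_shortest_paths: for the undirected path
# graph 0-1-2 it returns every shortest path in both directions plus the
# single-node (length-0) paths.
if __name__ == '__main__':
    demo_paths = get_shortest_paths(nx.path_graph(3), None, False)
    # demo_paths == [[0, 1], [1, 0], [0, 1, 2], [2, 1, 0], [1, 2], [2, 1],
    #                [0], [1], [2]]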


def wrapper_getSP(weight, directed, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, get_shortest_paths(g, weight, directed)

+ 147  - 0  gklearn/kernels/unfinished/cyclicPatternKernel.py

@@ -0,0 +1,147 @@
"""
@author: linlin <jajupmochi@gmail.com>
@references:
[1] Tamás Horváth, Thomas Gärtner, and Stefan Wrobel. Cyclic pattern kernels for predictive graph mining. In Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining, pages 158–167. ACM, 2004.
[2] Hopcroft, J.; Tarjan, R. (1973). “Efficient algorithms for graph manipulation”. Communications of the ACM 16: 372–378. doi:10.1145/362248.362272.
[3] Finding all the elementary circuits of a directed graph. D. B. Johnson, SIAM Journal on Computing 4, no. 1, 77-84, 1975. http://dx.doi.org/10.1137/0204007
"""

import sys
import pathlib
sys.path.insert(0, "../")
import time

import networkx as nx
import numpy as np

from tqdm import tqdm


def cyclicpatternkernel(*args, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):
"""Calculate cyclic pattern graph kernels between graphs.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.
cycle_bound : integer
Upper bound on the total number of simple cycles considered; if it is exceeded, an empty pattern set is returned. The default is None (no bound).

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the cyclic pattern kernel between 2 graphs.
"""
Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
Kmatrix = np.zeros((len(Gn), len(Gn)))

start_time = time.time()

# get all cyclic and tree patterns of all graphs before calculating kernels to save time, but this may consume a lot of memory for large dataset.
all_patterns = [ get_patterns(Gn[i], node_label=node_label, edge_label = edge_label, labeled = labeled, cycle_bound = cycle_bound)
for i in tqdm(range(0, len(Gn)), desc='retrieve patterns', file=sys.stdout) ]

for i in tqdm(range(0, len(Gn)), desc='calculate kernels', file=sys.stdout):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _cyclicpatternkernel_do(all_patterns[i], all_patterns[j])
Kmatrix[j][i] = Kmatrix[i][j]

run_time = time.time() - start_time
print("\n --- kernel matrix of cyclic pattern kernel of size %d built in %s seconds ---" % (len(Gn), run_time))

return Kmatrix, run_time


def _cyclicpatternkernel_do(patterns1, patterns2):
"""Calculate path graph kernels up to depth d between 2 graphs.

Parameters
----------
paths1, paths2 : list
List of paths in 2 graphs, where for unlabeled graphs, each path is represented by a list of nodes; while for labeled graphs, each path is represented by a string consists of labels of nodes and edges on that path.
k_func : function
A kernel function used using different notions of fingerprint similarity.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.

Return
------
kernel : float
Treelet Kernel between 2 graphs.
"""
return len(set(patterns1) & set(patterns2))
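

# A tiny check of the intersection kernel above, using hypothetical canonical
# key strings: the two pattern sets share exactly one key, so the kernel is 1.
if __name__ == '__main__':
    assert _cyclicpatternkernel_do(['1C1O1N', '1C1C'], ['1C1O1N', '1N1N']) == 1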


def get_patterns(G, node_label = 'atom', edge_label = 'bond_type', labeled = True, cycle_bound = None):
"""Find all cyclic and tree patterns in a graph.

Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
cycle_bound : integer
Upper bound on the total number of simple cycles considered; None means no bound.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.

Return
------
patterns : list
List of patterns found in the graph, each represented by its canonical key string.
"""
number_simplecycles = 0
bridges = nx.Graph()
patterns = []

bicomponents = nx.biconnected_component_subgraphs(G) # all biconnected components of G. This function uses the algorithm in reference [2], which (I guess) is slightly different from the one used in paper [1]
for subgraph in bicomponents:
if nx.number_of_edges(subgraph) > 1:
simple_cycles = list(nx.simple_cycles(subgraph.to_directed())) # all simple cycles in this biconnected component. This function uses the algorithm in reference [3], which has time complexity O((n+e)(N+1)) for n nodes, e edges and N simple cycles. It might be slower than the algorithm applied in paper [1]
if cycle_bound is not None and len(simple_cycles) > cycle_bound - number_simplecycles: # in paper [1], when applying another algorithm (subroutine RT), this becomes len(simple_cycles) == cycle_bound - number_simplecycles + 1; check again.
return []
else:

# calculate canonical representation for each simple cycle
all_canonkeys = []
for cycle in simple_cycles:
canonlist = [ G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label] for node in cycle[:-1] ]
canonkey = ''.join(canonlist)
canonkey = canonkey if canonkey < canonkey[::-1] else canonkey[::-1]
for i in range(1, len(cycle[:-1])):
canonlist = [ G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label] for node in cycle[i:-1] + cycle[:i] ]
canonkey_t = ''.join(canonlist)
canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1]
canonkey = canonkey if canonkey < canonkey_t else canonkey_t
all_canonkeys.append(canonkey)

patterns = list(set(patterns) | set(all_canonkeys))
number_simplecycles += len(simple_cycles)
else:
bridges.add_edges_from(subgraph.edges(data=True))

# calculate canonical representation for each connected component in bridge set
components = list(nx.connected_component_subgraphs(bridges)) # all connected components in the bridge
tree_patterns = []
for tree in components:
break  # @todo: canonical keys of tree patterns are not computed yet.



# patterns += pi(bridges)
return patterns
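

# A small, self-contained sketch of the canonical key idea used above: every
# rotation of a labelled cycle (and its reversal as a string) is mapped to the
# same, lexicographically smallest key. The helper and the labels below are
# illustrative only, not the exact routine used in get_patterns.
def _demo_cycle_canonical_key(node_labels, edge_labels):
    """node_labels[i] labels node i of the cycle; edge_labels[i] labels the
    edge from node i to node (i + 1) modulo the cycle length."""
    n = len(node_labels)
    keys = []
    for shift in range(n):
        s = ''.join(node_labels[(i + shift) % n] + edge_labels[(i + shift) % n]
                    for i in range(n))
        keys.append(min(s, s[::-1]))
    return min(keys)

if __name__ == '__main__':
    # the same labelled triangle, written starting from two different nodes,
    # gives the same canonical key:
    assert _demo_cycle_canonical_key(['C', 'O', 'N'], ['1', '1', '1']) == \
           _demo_cycle_canonical_key(['O', 'N', 'C'], ['1', '1', '1'])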

+ 234  - 0  gklearn/kernels/unfinished/pathKernel.py

@@ -0,0 +1,234 @@
"""
@author: linlin
@references: Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360).
"""

import sys
import pathlib
sys.path.insert(0, "../")
import time
import itertools
from tqdm import tqdm

import networkx as nx
import numpy as np

from gklearn.kernels.deltaKernel import deltakernel
from gklearn.utils.graphdataset import get_dataset_attributes


def pathkernel(*args, node_label='atom', edge_label='bond_type'):
"""Calculate mean average path kernels between graphs.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.

Return
------
Kmatrix/kernel : Numpy matrix/float
Kernel matrix, each element of which is the path kernel between 2 graphs. / Path kernel between 2 graphs.
"""
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
Kmatrix = np.zeros((len(Gn), len(Gn)))
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
node_label=node_label,
edge_label=edge_label)
try:
some_weight = list(nx.get_edge_attributes(Gn[0],
edge_label).values())[0]
weight = edge_label if isinstance(some_weight, float) or isinstance(
some_weight, int) else None
except:
weight = None

start_time = time.time()

splist = [
get_shortest_paths(Gn[i], weight) for i in tqdm(
range(0, len(Gn)), desc='getting shortest paths', file=sys.stdout)
]

pbar = tqdm(
total=((len(Gn) + 1) * len(Gn) / 2),
desc='calculating kernels',
file=sys.stdout)
if ds_attrs['node_labeled']:
if ds_attrs['edge_labeled']:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _pathkernel_do_l(Gn[i], Gn[j], splist[i],
splist[j], node_label,
edge_label)
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
else:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _pathkernel_do_nl(Gn[i], Gn[j], splist[i],
splist[j], node_label)
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

else:
if ds_attrs['edge_labeled']:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _pathkernel_do_el(Gn[i], Gn[j], splist[i],
splist[j], edge_label)
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)
else:
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _pathkernel_do_unl(Gn[i], Gn[j], splist[i],
splist[j])
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

run_time = time.time() - start_time
print(
"\n --- mean average path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time


def _pathkernel_do_l(G1, G2, sp1, sp2, node_label, edge_label):
"""Calculate mean average path kernel between 2 fully-labeled graphs.

Parameters
----------
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
sp1, sp2 : list of list
List of shortest paths of 2 graphs, where each path is represented by a list of nodes.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.

Return
------
kernel : float
Path Kernel between 2 graphs.
"""
# calculate kernel
kernel = 0
# if len(sp1) == 0 or len(sp2) == 0:
# return 0 # @todo: should it be zero?
for path1 in sp1:
for path2 in sp2:
if len(path1) == len(path2):
kernel_path = (G1.node[path1[0]][node_label] == G2.node[path2[
0]][node_label])
if kernel_path:
for i in range(1, len(path1)):
# kernel = 1 if all corresponding nodes and edges in the 2 paths have same labels, otherwise 0
if G1[path1[i - 1]][path1[i]][edge_label] != G2[path2[i - 1]][path2[i]][edge_label] or G1.node[path1[i]][node_label] != G2.node[path2[i]][node_label]:
kernel_path = 0
break
kernel += kernel_path # add up kernels of all paths

kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average

return kernel


def _pathkernel_do_nl(G1, G2, sp1, sp2, node_label):
"""Calculate mean average path kernel between 2 node-labeled graphs.
"""
# calculate kernel
kernel = 0
# if len(sp1) == 0 or len(sp2) == 0:
# return 0 # @todo: should it be zero?
for path1 in sp1:
for path2 in sp2:
if len(path1) == len(path2):
kernel_path = 1
for i in range(0, len(path1)):
# kernel = 1 if all corresponding nodes in the 2 paths have same labels, otherwise 0
if G1.node[path1[i]][node_label] != G2.node[path2[i]][node_label]:
kernel_path = 0
break
kernel += kernel_path

kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average

return kernel


def _pathkernel_do_el(G1, G2, sp1, sp2, edge_label):
"""Calculate mean average path kernel between 2 edge-labeled graphs.
"""
# calculate kernel
kernel = 0
for path1 in sp1:
for path2 in sp2:
if len(path1) == len(path2):
if len(path1) == 0:
kernel += 1
else:
kernel_path = 1
for i in range(0, len(path1) - 1):
# kernel = 1 if all corresponding edges in the 2 paths have same labels, otherwise 0
if G1[path1[i]][path1[i + 1]][edge_label] != G2[path2[
i]][path2[i + 1]][edge_label]:
kernel_path = 0
break
kernel += kernel_path

kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average

return kernel


def _pathkernel_do_unl(G1, G2, sp1, sp2):
"""Calculate mean average path kernel between 2 unlabeled graphs.
"""
# calculate kernel
kernel = 0
for path1 in sp1:
for path2 in sp2:
if len(path1) == len(path2):
kernel += 1

kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average

return kernel


def get_shortest_paths(G, weight):
"""Get all shortest paths of a graph.

Parameters
----------
G : NetworkX graphs
The graphs whose paths are calculated.
weight : string/None
edge attribute used as weight to calculate the shortest path.

Return
------
sp : list of list
List of shortest paths of the graph, where each path is represented by a list of nodes.
"""
sp = []
for n1, n2 in itertools.combinations(G.nodes(), 2):
try:
sp.append(nx.shortest_path(G, n1, n2, weight=weight))
except nx.NetworkXNoPath: # nodes not connected
sp.append([])
# add single nodes as length 0 paths.
sp += [[n] for n in G.nodes()]
return sp
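

# A hypothetical usage sketch of pathkernel on two small labelled graphs. This
# file relies on the NetworkX 1.x G.node accessor throughout, so the sketch
# assumes that same API and the imports at the top of this file.
if __name__ == '__main__':
    g1_demo = nx.Graph()
    g1_demo.add_node(0, atom='C')
    g1_demo.add_node(1, atom='O')
    g1_demo.add_edge(0, 1, bond_type='1')

    g2_demo = nx.Graph()
    g2_demo.add_node(0, atom='C')
    g2_demo.add_node(1, atom='C')
    g2_demo.add_node(2, atom='O')
    g2_demo.add_edge(0, 1, bond_type='1')
    g2_demo.add_edge(1, 2, bond_type='1')

    Kmatrix, run_time = pathkernel([g1_demo, g2_demo], node_label='atom',
                                   edge_label='bond_type')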

+ 241  - 0  gklearn/kernels/unfinished/treePatternKernel.py

@@ -0,0 +1,241 @@
"""
@author: linlin
@references: Pierre Mahé and Jean-Philippe Vert. Graph kernels based on tree patterns for molecules. Machine learning, 75(1):3–35, 2009.
"""

import sys
import pathlib
sys.path.insert(0, "../")
import time

import networkx as nx
import numpy as np

from collections import Counter
from tqdm import tqdm
tqdm.monitor_interval = 0

from gklearn.utils.utils import untotterTransformation


def treepatternkernel(*args,
node_label='atom',
edge_label='bond_type',
labeled=True,
kernel_type='untiln',
lmda=1,
h=1,
remove_totters=True):
"""Calculate tree pattern graph kernels between graphs.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.
kernel_type : string
Type of tree pattern kernel, could be 'untiln', 'size' or 'branching'.
lmda : float
Weight deciding whether linear patterns or tree patterns of increasing complexity are favored.
h : integer
The upper bound of the height of tree patterns.
remove_totters : boolean
whether to remove totters. The default value is True.

Return
------
Kmatrix: Numpy matrix
Kernel matrix, each element of which is the tree pattern graph kernel between 2 graphs.
"""
if h < 1:
raise Exception('h > 0 is requested.')
kernel_type = kernel_type.lower()
# arrange all graphs in a list
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
Kmatrix = np.zeros((len(Gn), len(Gn)))
h = int(h)

start_time = time.time()

if remove_totters:
Gn = [untotterTransformation(G, node_label, edge_label) for G in Gn]

pbar = tqdm(
total=(1 + len(Gn)) * len(Gn) / 2,
desc='calculate kernels',
file=sys.stdout)
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
Kmatrix[i][j] = _treepatternkernel_do(Gn[i], Gn[j], node_label,
edge_label, labeled,
kernel_type, lmda, h)
Kmatrix[j][i] = Kmatrix[i][j]
pbar.update(1)

run_time = time.time() - start_time
print(
"\n --- kernel matrix of tree pattern kernel of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time


def _treepatternkernel_do(G1, G2, node_label, edge_label, labeled, kernel_type,
lmda, h):
"""Calculate tree pattern graph kernels between 2 graphs.

Parameters
----------
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.
kernel_type : string
Type of tree pattern kernel, could be 'untiln', 'size' or 'branching'.
lmda : float
Weight deciding whether linear patterns or tree patterns of increasing complexity are favored.
h : integer
The upper bound of the height of tree patterns.

Return
------
kernel : float
Tree pattern kernel between 2 graphs.
"""

def matchingset(n1, n2):
"""Get neiborhood matching set of two nodes in two graphs.
"""

def mset_com(allpairs, length):
"""Find all sets R of pairs by combination.
"""
if length == 1:
mset = [[pair] for pair in allpairs]
return mset, mset
else:
mset, mset_l = mset_com(allpairs, length - 1)
mset_tmp = []
for pairset in mset_l: # for each pair set of length l-1
nodeset1 = [pair[0] for pair in pairset
] # nodes already in the set
nodeset2 = [pair[1] for pair in pairset]
for pair in allpairs:
if (pair[0] not in nodeset1) and (
pair[1] not in nodeset2
): # nodes in R should be unique
mset_tmp.append(
pairset + [pair]
) # add this pair to the pair set of length l-1, constructing a new set of length l
nodeset1.append(pair[0])
nodeset2.append(pair[1])

mset.extend(mset_tmp)

return mset, mset_tmp

allpairs = [
] # all pairs those have the same node labels and edge labels
for neighbor1 in G1[n1]:
for neighbor2 in G2[n2]:
if G1.node[neighbor1][node_label] == G2.node[neighbor2][node_label] \
and G1[n1][neighbor1][edge_label] == G2[n2][neighbor2][edge_label]:
allpairs.append([neighbor1, neighbor2])

if allpairs != []:
mset, _ = mset_com(allpairs, len(allpairs))
else:
mset = []

return mset

def kernel_h(h):
"""Calculate kernel of h-th iteration.
"""

if kernel_type == 'untiln':
all_kh = { str(n1) + '.' + str(n2) : (G1.node[n1][node_label] == G2.node[n2][node_label]) \
for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ]
all_kh_tmp = all_kh.copy()
for i in range(2, h + 1):
for n1 in G1.nodes():
for n2 in G2.nodes():
kh = 0
mset = all_msets[str(n1) + '.' + str(n2)]
for R in mset:
kh_tmp = 1
for pair in R:
kh_tmp *= lmda * all_kh[str(pair[0])
+ '.' + str(pair[1])]
kh += 1 / lmda * kh_tmp
kh = (G1.node[n1][node_label] == G2.node[n2][
node_label]) * (1 + kh)
all_kh_tmp[str(n1) + '.' + str(n2)] = kh
all_kh = all_kh_tmp.copy()

elif kernel_type == 'size':
all_kh = { str(n1) + '.' + str(n2) : lmda * (G1.node[n1][node_label] == G2.node[n2][node_label]) \
for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ]
all_kh_tmp = all_kh.copy()
for i in range(2, h + 1):
for n1 in G1.nodes():
for n2 in G2.nodes():
kh = 0
mset = all_msets[str(n1) + '.' + str(n2)]
for R in mset:
kh_tmp = 1
for pair in R:
kh_tmp *= lmda * all_kh[str(pair[0])
+ '.' + str(pair[1])]
kh += kh_tmp
kh *= lmda * (
G1.node[n1][node_label] == G2.node[n2][node_label])
all_kh_tmp[str(n1) + '.' + str(n2)] = kh
all_kh = all_kh_tmp.copy()

elif kernel_type == 'branching':
all_kh = { str(n1) + '.' + str(n2) : (G1.node[n1][node_label] == G2.node[n2][node_label]) \
for n1 in G1.nodes() for n2 in G2.nodes() } # kernels between all pair of nodes with h = 1 ]
all_kh_tmp = all_kh.copy()
for i in range(2, h + 1):
for n1 in G1.nodes():
for n2 in G2.nodes():
kh = 0
mset = all_msets[str(n1) + '.' + str(n2)]
for R in mset:
kh_tmp = 1
for pair in R:
kh_tmp *= lmda * all_kh[str(pair[0])
+ '.' + str(pair[1])]
kh += 1 / lmda * kh_tmp
kh *= (
G1.node[n1][node_label] == G2.node[n2][node_label])
all_kh_tmp[str(n1) + '.' + str(n2)] = kh
all_kh = all_kh_tmp.copy()

return all_kh

# calculate matching sets for every pair of nodes at first to avoid calculating in every iteration.
all_msets = ({ str(node1) + '.' + str(node2) : matchingset(node1, node2) for node1 in G1.nodes() \
for node2 in G2.nodes() } if h > 1 else {})

all_kh = kernel_h(h)
kernel = sum(all_kh.values())

if kernel_type == 'size':
kernel = kernel / (lmda**h)

return kernel
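

# A hypothetical usage sketch of treepatternkernel (NetworkX 1.x attribute
# access, as assumed throughout this file): the 'untiln' tree pattern kernel
# of height 2 between two small labelled graphs, with totter removal disabled.
if __name__ == '__main__':
    g1_demo = nx.Graph()
    g1_demo.add_node(0, atom='C')
    g1_demo.add_node(1, atom='O')
    g1_demo.add_edge(0, 1, bond_type='1')

    g2_demo = nx.Graph()
    g2_demo.add_node(0, atom='C')
    g2_demo.add_node(1, atom='O')
    g2_demo.add_node(2, atom='C')
    g2_demo.add_edge(0, 1, bond_type='1')
    g2_demo.add_edge(1, 2, bond_type='1')

    Kmatrix, run_time = treepatternkernel([g1_demo, g2_demo],
                                          kernel_type='untiln', lmda=0.5,
                                          h=2, remove_totters=False)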

+ 403  - 0  gklearn/kernels/unfinished/weisfeilerLehmanKernel.py

@@ -0,0 +1,403 @@
"""
@author: linlin
@references:
[1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.
"""

import sys
import pathlib
from collections import Counter
sys.path.insert(0, "../")

import networkx as nx
import numpy as np
import time

from gklearn.kernels.pathKernel import pathkernel

def weisfeilerlehmankernel(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'):
"""Calculate Weisfeiler-Lehman kernels between graphs.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
height : int
subtree height
base_kernel : string
base kernel used in each iteration of the WL kernel. The default base kernel is the subtree kernel. For a user-defined kernel, base_kernel is the name of the base kernel function used in each iteration of the WL kernel. This function returns a Numpy matrix, each element of which is the user-defined Weisfeiler-Lehman kernel between 2 graphs.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.

Notes
-----
This function now supports WL subtree kernel, WL shortest path kernel and WL edge kernel.
"""
base_kernel = base_kernel.lower()
Gn = args[0] if len(args) == 1 else [args[0], args[1]] # arrange all graphs in a list
Kmatrix = np.zeros((len(Gn), len(Gn)))

start_time = time.time()

# for WL subtree kernel
if base_kernel == 'subtree':
Kmatrix = _wl_subtreekernel_do(args[0], node_label, edge_label, height)

# for WL shortest path kernel
elif base_kernel == 'sp':
Kmatrix = _wl_spkernel_do(args[0], node_label, edge_label, height)

# for WL edge kernel
elif base_kernel == 'edge':
Kmatrix = _wl_edgekernel_do(args[0], node_label, edge_label, height)

# for user defined base kernel
else:
Kmatrix = _wl_userkernel_do(args[0], node_label, edge_label, height, base_kernel)

run_time = time.time() - start_time
print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), run_time))

return Kmatrix, run_time



def _wl_subtreekernel_do(Gn, node_label, edge_label, height):
"""Calculate Weisfeiler-Lehman subtree kernels between graphs.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.
height : int
subtree height.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
height = int(height)
Kmatrix = np.zeros((len(Gn), len(Gn)))
all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs

# initial for height = 0
all_labels_ori = set() # all unique original labels in all graphs in this iteration
all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs

# for each graph
for G in Gn:
# get the set of original labels
labels_ori = list(nx.get_node_attributes(G, node_label).values())
all_labels_ori.update(labels_ori)
num_of_each_label = dict(Counter(labels_ori)) # number of occurrences of each label in the graph
all_num_of_each_label.append(num_of_each_label)
num_of_labels = len(num_of_each_label) # number of all unique labels

all_labels_ori.update(labels_ori)

all_num_of_labels_occured += len(all_labels_ori)

# calculate subtree kernel with the 0th iteration and add it to the final kernel
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
Kmatrix[j][i] = Kmatrix[i][j]

# iterate each height
for h in range(1, height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs
all_labels_ori = set()
all_num_of_each_label = []

# for each graph
for idx, G in enumerate(Gn):

set_multisets = []
for node in G.nodes(data = True):
# Multiset-label determination.
multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
# sorting each multiset
multiset.sort()
multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
set_multisets.append(multiset)

# label compression
set_unique = list(set(set_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# if a label occurred before, assign its former compressed label, else assign the number of labels occurred + 1 as the compressed label
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed.update({ value : all_set_compressed[value] })
else:
set_compressed.update({ value : str(num_of_labels_occured + 1) })
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)

# relabel nodes
for node in G.nodes(data = True):
node[1][node_label] = set_compressed[set_multisets[node[0]]]

# get the set of compressed labels
labels_comp = list(nx.get_node_attributes(G, node_label).values())
all_labels_ori.update(labels_comp)
num_of_each_label = dict(Counter(labels_comp))
all_num_of_each_label.append(num_of_each_label)

all_num_of_labels_occured += len(all_labels_ori)

# calculate subtree kernel with h iterations and add it to the final kernel
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
Kmatrix[j][i] = Kmatrix[i][j]

return Kmatrix
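

# A minimal, self-contained illustration of one WL relabelling step as
# implemented above (hypothetical labels): every node label is extended by the
# sorted labels of its neighbours, and the resulting multiset strings are then
# compressed to fresh labels.
if __name__ == '__main__':
    demo_g = nx.path_graph(3)
    demo_labels = {0: 'C', 1: 'O', 2: 'C'}
    demo_multisets = {v: demo_labels[v]
                      + ''.join(sorted(demo_labels[u] for u in demo_g[v]))
                      for v in demo_g.nodes()}
    # demo_multisets == {0: 'CO', 1: 'OCC', 2: 'CO'}
    demo_compressed = {m: str(i + 1) for i, m in
                       enumerate(sorted(set(demo_multisets.values())))}
    demo_new_labels = {v: demo_compressed[m]
                       for v, m in demo_multisets.items()}
    # nodes 0 and 2 receive the same compressed label; node 1 a different one.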


def _wl_spkernel_do(Gn, node_label, edge_label, height):
"""Calculate Weisfeiler-Lehman shortest path kernels between graphs.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.
height : int
subtree height.
Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
from gklearn.utils.utils import getSPGraph
# init.
height = int(height)
Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel

Gn = [ getSPGraph(G, edge_weight = edge_label) for G in Gn ] # get shortest path graphs of Gn
# initial for height = 0
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data = True):
for e2 in Gn[j].edges(data = True):
if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
# iterate each height
for h in range(1, height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs
for G in Gn: # for each graph
set_multisets = []
for node in G.nodes(data = True):
# Multiset-label determination.
multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ]
# sorting each multiset
multiset.sort()
multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
set_multisets.append(multiset)

# label compression
set_unique = list(set(set_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# if a label occurred before, assign its former compressed label; otherwise assign (number of labels occurred so far + 1) as the compressed label
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed.update({ value : all_set_compressed[value] })
else:
set_compressed.update({ value : str(num_of_labels_occured + 1) })
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)
# relabel nodes
for node in G.nodes(data = True):
node[1][node_label] = set_compressed[set_multisets[node[0]]]
# calculate subtree kernel with h iterations and add it to the final kernel
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data = True):
for e2 in Gn[j].edges(data = True):
if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix
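
# A minimal usage sketch of _wl_spkernel_do (illustrative only; assumes two toy
# molecule-like graphs carrying 'atom' node labels and numeric 'bond_type' edge labels,
# the attribute names used elsewhere in gklearn):
def _wl_spkernel_usage_sketch():
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g1.add_edge(0, 1, bond_type=1)
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
    g2.add_edges_from([(0, 1, {'bond_type': 1}), (1, 2, {'bond_type': 1})])
    return _wl_spkernel_do([g1, g2], 'atom', 'bond_type', 2)  # 2x2 kernel matrix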



def _wl_edgekernel_do(Gn, node_label, edge_label, height):
"""Calculate Weisfeiler-Lehman edge kernels between graphs.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.
height : int
subtree height.
Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
# init.
height = int(height)
Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
# initial for height = 0
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data = True):
for e2 in Gn[j].edges(data = True):
if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
# iterate each height
for h in range(1, height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far in this iteration, across all graphs
for G in Gn: # for each graph
set_multisets = []
for node in G.nodes(data = True):
# Multiset-label determination.
multiset = [ G.nodes[neighbor][node_label] for neighbor in G[node[0]] ]
# sorting each multiset
multiset.sort()
multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
set_multisets.append(multiset)

# label compression
set_unique = list(set(set_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# if a label occurred before, assign its former compressed label; otherwise assign (number of labels occurred so far + 1) as the compressed label
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed.update({ value : all_set_compressed[value] })
else:
set_compressed.update({ value : str(num_of_labels_occured + 1) })
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)
# relabel nodes
for node in G.nodes(data = True):
node[1][node_label] = set_compressed[set_multisets[node[0]]]
# calculate subtree kernel with h iterations and add it to the final kernel
for i in range(0, len(Gn)):
for j in range(i, len(Gn)):
for e1 in Gn[i].edges(data = True):
for e2 in Gn[j].edges(data = True):
if e1[2][edge_label] == e2[2][edge_label] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
Kmatrix[i][j] += 1
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix
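
# The long matching condition above just tests whether two edges coincide as undirected
# node pairs and carry the same label. An equivalent standalone helper, shown only for
# clarity (not used by this module):
def _edges_match_undirected(e1, e2, edge_label):
    same_ends = (e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])
    return same_ends and e1[2][edge_label] == e2[2][edge_label]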


def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
"""Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs.
Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
node_label : string
node attribute used as label.
edge_label : string
edge attribute used as label.
height : int
subtree height.
base_kernel : function
Base kernel function applied in each iteration of the WL kernel. It takes (Gn, node_label, edge_label) as arguments and returns a Numpy matrix, each element of which is the base kernel between 2 graphs.
Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 graphs.
"""
# init.
height = int(height)
Kmatrix = np.zeros((len(Gn), len(Gn))) # init kernel
# initial for height = 0
Kmatrix = base_kernel(Gn, node_label, edge_label)
# iterate each height
for h in range(1, height + 1):
all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
num_of_labels_occured = 0 # number of distinct compressed labels assigned so far in this iteration, across all graphs
for G in Gn: # for each graph
set_multisets = []
for node in G.nodes(data = True):
# Multiset-label determination.
multiset = [ G.nodes[neighbor][node_label] for neighbor in G[node[0]] ]
# sorting each multiset
multiset.sort()
multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix
set_multisets.append(multiset)

# label compression
set_unique = list(set(set_multisets)) # set of unique multiset labels
# a dictionary mapping original labels to new ones.
set_compressed = {}
# if a label occurred before, assign its former compressed label; otherwise assign (number of labels occurred so far + 1) as the compressed label
for value in set_unique:
if value in all_set_compressed.keys():
set_compressed.update({ value : all_set_compressed[value] })
else:
set_compressed.update({ value : str(num_of_labels_occured + 1) })
num_of_labels_occured += 1

all_set_compressed.update(set_compressed)
# relabel nodes
for node in G.nodes(data = True):
node[1][node_label] = set_compressed[set_multisets[node[0]]]
# calculate kernel with h iterations and add it to the final kernel
Kmatrix += base_kernel(Gn, node_label, edge_label)
return Kmatrix
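
# A sketch of a user-defined base kernel that _wl_userkernel_do can accept (illustrative
# only): it must take (Gn, node_label, edge_label) and return a full kernel matrix. Here,
# a simple vertex-label-histogram dot product:
def _vertex_histogram_base_kernel(Gn, node_label, edge_label):
    from collections import Counter
    counts = [Counter(nx.get_node_attributes(g, node_label).values()) for g in Gn]
    K = np.zeros((len(Gn), len(Gn)))
    for i in range(len(Gn)):
        for j in range(i, len(Gn)):
            labels = set(counts[i]) | set(counts[j])
            K[i][j] = sum(counts[i].get(lb, 0) * counts[j].get(lb, 0) for lb in labels)
            K[j][i] = K[i][j]
    return K
# e.g.: _wl_userkernel_do(Gn, 'atom', 'bond_type', 2, _vertex_histogram_base_kernel)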

+ 17 - 0   gklearn/preimage/common_types.py

@@ -0,0 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 19 18:17:38 2020

@author: ljia
"""

from enum import Enum, auto

class AlgorithmState(Enum):
"""can be used to specify the state of an algorithm.
"""
CALLED = auto # The algorithm has been called.
INITIALIZED = auto # The algorithm has been initialized.
CONVERGED = auto # The algorithm has converged.
TERMINATED = auto # The algorithm has terminated.
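
# A minimal usage sketch (illustrative only): members are created with auto() and
# compared by identity.
def _algorithm_state_usage_sketch():
    state = AlgorithmState.INITIALIZED
    state = AlgorithmState.CONVERGED
    return state == AlgorithmState.CONVERGED  # True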

+ 134 - 0   gklearn/preimage/cpp2python.py

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 20 11:09:04 2020

@author: ljia
"""
import re

def convert_function(cpp_code):
# f_cpp = open('cpp_code.cpp', 'r')
# # f_cpp = open('cpp_ext/src/median_graph_estimator.ipp', 'r')
# cpp_code = f_cpp.read()
python_code = cpp_code.replace('else if (', 'elif ')
python_code = python_code.replace('if (', 'if ')
python_code = python_code.replace('else {', 'else:')
python_code = python_code.replace(') {', ':')
python_code = python_code.replace(';\n', '\n')
python_code = re.sub('\n(.*)}\n', '\n\n', python_code)
# python_code = python_code.replace('}\n', '')
python_code = python_code.replace('throw', 'raise')
python_code = python_code.replace('error', 'Exception')
python_code = python_code.replace('"', '\'')
python_code = python_code.replace('\\\'', '"')
python_code = python_code.replace('try {', 'try:')
python_code = python_code.replace('true', 'True')
python_code = python_code.replace('false', 'False')
python_code = python_code.replace('catch (...', 'except')
# python_code = re.sub('std::string\(\'(.*)\'\)', '$1', python_code)
return python_code



# # python_code = python_code.replace('}\n', '')




# python_code = python_code.replace('option.first', 'opt_name')
# python_code = python_code.replace('option.second', 'opt_val')
# python_code = python_code.replace('ged::Error', 'Exception')
# python_code = python_code.replace('std::string(\'Invalid argument "\')', '\'Invalid argument "\'')


# f_cpp.close()
# f_python = open('python_code.py', 'w')
# f_python.write(python_code)
# f_python.close()


def convert_function_comment(cpp_fun_cmt, param_types):
cpp_fun_cmt = cpp_fun_cmt.replace('\t', '')
cpp_fun_cmt = cpp_fun_cmt.replace('\n * ', ' ')
# split the input comment according to key words.
param_split = None
note = None
cmt_split = cpp_fun_cmt.split('@brief')[1]
brief = cmt_split
if '@param' in cmt_split:
cmt_split = cmt_split.split('@param')
brief = cmt_split[0]
param_split = cmt_split[1:]
if '@note' in cmt_split[-1]:
note_split = cmt_split[-1].split('@note')
if param_split is not None:
param_split.pop()
param_split.append(note_split[0])
else:
brief = note_split[0]
note = note_split[1]
# get parameters.
if param_split is not None:
for idx, param in enumerate(param_split):
_, param_name, param_desc = param.split(' ', 2)
param_name = function_comment_strip(param_name, ' *\n\t/')
param_desc = function_comment_strip(param_desc, ' *\n\t/')
param_split[idx] = (param_name, param_desc)
# strip comments.
brief = function_comment_strip(brief, ' *\n\t/')
if note is not None:
note = function_comment_strip(note, ' *\n\t/')
# construct the Python function comment.
python_fun_cmt = '"""'
python_fun_cmt += brief + '\n'
if param_split is not None and len(param_split) > 0:
python_fun_cmt += '\nParameters\n----------'
for idx, param in enumerate(param_split):
python_fun_cmt += '\n' + param[0] + ' : ' + param_types[idx]
python_fun_cmt += '\n\t' + param[1] + '\n'
if note is not None:
python_fun_cmt += '\nNote\n----\n' + note + '\n'
python_fun_cmt += '"""'
return python_fun_cmt


def function_comment_strip(comment, bad_chars):
head_removed, tail_removed = False, False
while not head_removed or not tail_removed:
if comment[0] in bad_chars:
comment = comment[1:]
head_removed = False
else:
head_removed = True
if comment[-1] in bad_chars:
comment = comment[:-1]
tail_removed = False
else:
tail_removed = True
return comment

if __name__ == '__main__':
# python_code = convert_function("""
# if (print_to_stdout_ == 2) {
# std::cout << "\n===========================================================\n";
# std::cout << "Block gradient descent for initial median " << median_pos + 1 << " of " << medians.size() << ".\n";
# std::cout << "-----------------------------------------------------------\n";
# }
# """)
python_fun_cmt = convert_function_comment("""
/*!
* @brief Returns the sum of distances.
* @param[in] state The state of the estimator.
* @return The sum of distances of the median when the estimator was in the state @p state during the last call to run().
*/
""", ['string', 'string'])

+ 170 - 0   gklearn/preimage/find_best_k.py

@@ -0,0 +1,170 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 9 11:54:32 2020

@author: ljia
"""
import numpy as np
import random
import csv

from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs

def find_best_k():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
gkernel = 'treeletkernel'
node_label = 'atom'
edge_label = 'bond_type'
ds_name = 'mono'
dir_output = 'results/test_find_best_k/'
repeats = 50
k_list = range(2, 11)
fit_method = 'k-graphs'
# fitted on the whole dataset - treelet - mono
edit_costs = [0.1268873773592978, 0.004084633224249829, 0.0897581955378986, 0.15328856114451297, 0.3109956881625734, 0.0]
# create result files.
fn_output_detail = 'results_detail.' + fit_method + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM'])
f_detail.close()
fn_output_summary = 'results_summary.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
'repeats better dis_k gi -> GM'])
f_summary.close()
random.seed(1)
rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
for k in k_list:
print('\n--------- k =', k, '----------')
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
nb_dis_k_gi2gm = [0, 0, 0]
repeats_better_sod_sm2gm = []
repeats_better_dis_k_sm2gm = []
repeats_better_dis_k_gi2sm = []
repeats_better_dis_k_gi2gm = []
for repeat in range(repeats):
print('\nrepeat =', repeat)
random.seed(rdn_seed_list[repeat])
median_set_idx = random.sample(range(0, len(Gn)), k)
print('median set: ', median_set_idx)
sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
= median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
fit_method='k-graphs',
edit_costs=edit_costs,
group_min=median_set_idx,
parallel=False)
# write result detail.
sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k, repeat,
median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm,
dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
dis_k_gi2gm])
f_detail.close()
# compute result summary.
sod_sm_list.append(sod_sm)
sod_gm_list.append(sod_gm)
dis_k_sm_list.append(dis_k_sm)
dis_k_gm_list.append(dis_k_gm)
dis_k_gi_min_list.append(dis_k_gi_min)
# # SOD SM -> GM
if sod_sm > sod_gm:
nb_sod_sm2gm[0] += 1
repeats_better_sod_sm2gm.append(repeat)
elif sod_sm == sod_gm:
nb_sod_sm2gm[1] += 1
elif sod_sm < sod_gm:
nb_sod_sm2gm[2] += 1
# # dis_k SM -> GM
if dis_k_sm > dis_k_gm:
nb_dis_k_sm2gm[0] += 1
repeats_better_dis_k_sm2gm.append(repeat)
elif dis_k_sm == dis_k_gm:
nb_dis_k_sm2gm[1] += 1
elif dis_k_sm < dis_k_gm:
nb_dis_k_sm2gm[2] += 1
# # dis_k gi -> SM
if dis_k_gi_min > dis_k_sm:
nb_dis_k_gi2sm[0] += 1
repeats_better_dis_k_gi2sm.append(repeat)
elif dis_k_gi_min == dis_k_sm:
nb_dis_k_gi2sm[1] += 1
elif dis_k_gi_min < dis_k_sm:
nb_dis_k_gi2sm[2] += 1
# # dis_k gi -> GM
if dis_k_gi_min > dis_k_gm:
nb_dis_k_gi2gm[0] += 1
repeats_better_dis_k_gi2gm.append(repeat)
elif dis_k_gi_min == dis_k_gm:
nb_dis_k_gi2gm[1] += 1
elif dis_k_gi_min < dis_k_gm:
nb_dis_k_gi2gm[2] += 1
# write result summary.
sod_sm_mean = np.mean(sod_sm_list)
sod_gm_mean = np.mean(sod_gm_list)
dis_k_sm_mean = np.mean(dis_k_sm_list)
dis_k_gm_mean = np.mean(dis_k_gm_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k,
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
f_summary.close()
print('\ncomplete.')
return


def getRelations(sign):
if sign == -1:
return 'better'
elif sign == 0:
return 'same'
elif sign == 1:
return 'worse'


if __name__ == '__main__':
find_best_k()

+ 430 - 0   gklearn/preimage/fitDistance.py

@@ -0,0 +1,430 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 16 14:20:06 2019

@author: ljia
"""
import numpy as np
from tqdm import tqdm
from itertools import combinations_with_replacement, combinations
import multiprocessing
from multiprocessing import Pool
from functools import partial
import time
import random
import sys

from scipy import optimize
from scipy.optimize import minimize
import cvxpy as cp

from gklearn.preimage.ged import GED, get_nb_edit_operations, get_nb_edit_operations_letter, get_nb_edit_operations_nonsymbolic
from gklearn.preimage.utils import kernel_distance_matrix

def fit_GED_to_kernel_distance(Gn, node_label, edge_label, gkernel, itr_max,
params_ged={'lib': 'gedlibpy', 'cost': 'CONSTANT',
'method': 'IPFP', 'stabilizer': None},
init_costs=[3, 3, 1, 3, 3, 1],
dataset='monoterpenoides', Kmatrix=None,
parallel=True):
# dataset = dataset.lower()
# c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
# random.seed(1)
# cost_rdm = random.sample(range(1, 10), 6)
# init_costs = cost_rdm + [0]
# init_costs = cost_rdm
# init_costs = [3, 3, 1, 3, 3, 1]
# init_costs = [i * 0.01 for i in cost_rdm] + [0]
# init_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
# init_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
# init_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
# idx_cost_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
# compute distances in feature space.
dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
Kmatrix=Kmatrix, gkernel=gkernel)
dis_k_vec = []
for i in range(len(dis_k_mat)):
# for j in range(i, len(dis_k_mat)):
for j in range(i + 1, len(dis_k_mat)):
dis_k_vec.append(dis_k_mat[i, j])
dis_k_vec = np.array(dis_k_vec)
# init ged.
print('\ninitial:')
time0 = time.time()
params_ged['dataset'] = dataset
params_ged['edit_cost_constant'] = init_costs
ged_vec_init, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
parallel=parallel)
residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
time_list = [time.time() - time0]
edit_cost_list = [init_costs]
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list = [nb_cost_mat]
print('edit_costs:', init_costs)
print('residual_list:', residual_list)
for itr in range(itr_max):
print('\niteration', itr)
time0 = time.time()
# "fit" geds to distances in feature space by tuning edit costs using the
# Least Squares Method.
np.savez('results/xp_fit_method/fit_data_debug' + str(itr) + '.gm',
nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec,
n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init,
ged_mat=ged_mat)
edit_costs_new, residual = update_costs(nb_cost_mat, dis_k_vec,
dataset=dataset, cost=params_ged['cost'])
for i in range(len(edit_costs_new)):
if -1e-9 <= edit_costs_new[i] <= 1e-9:
edit_costs_new[i] = 0
if edit_costs_new[i] < 0:
raise ValueError('The edit cost is negative.')
# for i in range(len(edit_costs_new)):
# if edit_costs_new[i] < 0:
# edit_costs_new[i] = 0

# compute new GEDs and numbers of edit operations.
params_ged['edit_cost_constant'] = edit_costs_new # np.array([edit_costs_new[0], edit_costs_new[1], 0.75])
ged_vec, ged_mat, n_edit_operations = compute_geds(Gn, params_ged,
parallel=parallel)
residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
time_list.append(time.time() - time0)
edit_cost_list.append(edit_costs_new)
nb_cost_mat = np.array(n_edit_operations)
nb_cost_mat_list.append(nb_cost_mat)
print('edit_costs:', edit_costs_new)
print('residual_list:', residual_list)
return edit_costs_new, residual_list, edit_cost_list, dis_k_mat, ged_mat, \
time_list, nb_cost_mat_list


def compute_geds(Gn, params_ged, parallel=False):
edit_cost_name = params_ged['cost']
if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2':
get_nb_eo = get_nb_edit_operations_letter
elif edit_cost_name == 'NON_SYMBOLIC':
get_nb_eo = get_nb_edit_operations_nonsymbolic
else:
get_nb_eo = get_nb_edit_operations
ged_mat = np.zeros((len(Gn), len(Gn)))
if parallel:
# print('parallel')
# len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
len_itr = int(len(Gn) * (len(Gn) - 1) / 2)
ged_vec = [0 for i in range(len_itr)]
n_edit_operations = [0 for i in range(len_itr)]
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
itr = combinations(range(0, len(Gn)), 2)
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(_wrapper_compute_ged_parallel, params_ged, get_nb_eo)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
# iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, j, dis, n_eo_tmp in iterator:
idx_itr = int(len(Gn) * i + j - (i + 1) * (i + 2) / 2)
ged_vec[idx_itr] = dis
ged_mat[i][j] = dis
ged_mat[j][i] = dis
n_edit_operations[idx_itr] = n_eo_tmp
# print('\n-------------------------------------------')
# print(i, j, idx_itr, dis)
pool.close()
pool.join()
else:
ged_vec = []
n_edit_operations = []
for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
# for i in range(len(Gn)):
for j in range(i + 1, len(Gn)):
dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], **params_ged)
ged_vec.append(dis)
ged_mat[i][j] = dis
ged_mat[j][i] = dis
n_eo_tmp = get_nb_eo(Gn[i], Gn[j], pi_forward, pi_backward)
n_edit_operations.append(n_eo_tmp)
return ged_vec, ged_mat, n_edit_operations
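
# The flat index used above maps each pair (i, j) with i < j to its position in
# itertools.combinations order; a quick self-contained check of that formula:
def _pair_index_check(n=5):
    from itertools import combinations
    for flat, (i, j) in enumerate(combinations(range(n), 2)):
        assert flat == int(n * i + j - (i + 1) * (i + 2) / 2)
    return True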

def _wrapper_compute_ged_parallel(params_ged, get_nb_eo, itr):
i = itr[0]
j = itr[1]
dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], params_ged, get_nb_eo)
return i, j, dis, n_eo_tmp


def _compute_ged_parallel(g1, g2, params_ged, get_nb_eo):
dis, pi_forward, pi_backward = GED(g1, g2, **params_ged)
n_eo_tmp = get_nb_eo(g1, g2, pi_forward, pi_backward) # [0,0,0,0,0,0]
return dis, n_eo_tmp


def update_costs(nb_cost_mat, dis_k_vec, dataset='monoterpenoides',
cost='CONSTANT', rw_constraints='inequality'):
# if dataset == 'Letter-high':
if cost == 'LETTER':
pass
# # method 1: set alpha automatically, just tune c_vir and c_eir by
# # LMS using cvxpy.
# alpha = 0.5
# coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec)
## if np.count_nonzero(nb_cost_mat[:,4]) == 0:
## alpha = 0.75
## else:
## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0])
## alpha = alpha * 0.99
# param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1])
# param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5])
# nb_cost_mat_new = np.column_stack((param_vir, param_eir))
# dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3]
#
# x = cp.Variable(nb_cost_mat_new.shape[1])
# cost = cp.sum_squares(nb_cost_mat_new * x - dis_new)
# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
# prob = cp.Problem(cp.Minimize(cost), constraints)
# prob.solve()
# edit_costs_new = x.value
# edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha])
# residual = np.sqrt(prob.value)
# # method 2: tune c_vir, c_eir and alpha by nonlinear programming by
# # scipy.optimize.minimize.
# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
# w2 = nb_cost_mat[:,3]
# w3 = dis_k_vec
# func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
# + w2 * x[2] - w3 * x[3]) ** 2)
# bounds = ((0, None), (0., None), (0.5, 0.5), (0, None))
# res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds)
# edit_costs_new = res.x[0:3]
# residual = res.fun
# method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy.
# # method 4: tune c_vir, c_eir and alpha by QP function
# # scipy.optimize.least_squares. An initial guess is required.
# w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1]
# w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5]
# w2 = nb_cost_mat[:,3]
# w3 = dis_k_vec
# func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \
# + w2 * x[2] - w3 * x[3]) ** 2
# res = optimize.root(func, [0.9, 1.7, 0.75, 100])
# edit_costs_new = res.x
# residual = None
elif cost == 'LETTER2':
# # 1. if c_vi != c_vr, c_ei != c_er.
# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
# x = cp.Variable(nb_cost_mat_new.shape[1])
# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
## # 1.1 no constraints.
## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
# # 1.2 c_vs <= c_vi + c_vr.
# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
## # 2. if c_vi == c_vr, c_ei == c_er.
## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]]
## nb_cost_mat_new[:,0] += nb_cost_mat[:,1]
## nb_cost_mat_new[:,2] += nb_cost_mat[:,5]
## x = cp.Variable(nb_cost_mat_new.shape[1])
## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
## # 2.1 no constraints.
## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
### # 2.2 c_vs <= c_vi + c_vr.
### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
### np.array([2.0, -1.0, 0.0]).T@x >= 0.0]
#
# prob = cp.Problem(cp.Minimize(cost_fun), constraints)
# prob.solve()
# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
# edit_costs_new = np.array(edit_costs_new)
# residual = np.sqrt(prob.value)
if rw_constraints == 'inequality':
# c_vs <= c_vi + c_vr.
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
try:
prob.solve(verbose=True)
except MemoryError as error0:
print('\nUsing solver "OSQP" caused a memory error.')
print('the original error message is\n', error0)
print('solver status: ', prob.status)
print('trying solver "CVXOPT" instead...\n')
try:
prob.solve(solver=cp.CVXOPT, verbose=True)
except Exception as error1:
print('\nAn error occurred when using solver "CVXOPT".')
print('the original error message is\n', error1)
print('solver status: ', prob.status)
print('trying solver "MOSEK" instead. Notice this solver is commercial and a lisence is required.\n')
prob.solve(solver=cp.MOSEK, verbose=True)
else:
print('solver status: ', prob.status)
else:
print('solver status: ', prob.status)
print()
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif rw_constraints == '2constraints':
# c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif rw_constraints == 'no-constraint':
# no constraint.
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
edit_costs_new = x.value
residual = np.sqrt(prob.value)
# elif method == 'inequality_modified':
# # c_vs <= c_vi + c_vr.
# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
# x = cp.Variable(nb_cost_mat_new.shape[1])
# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
# prob = cp.Problem(cp.Minimize(cost_fun), constraints)
# prob.solve()
# # use same costs for insertion and removal rather than the fitted costs.
# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
# edit_costs_new = np.array(edit_costs_new)
# residual = np.sqrt(prob.value)
elif cost == 'NON_SYMBOLIC':
is_n_attr = np.count_nonzero(nb_cost_mat[:,2])
is_e_attr = np.count_nonzero(nb_cost_mat[:,5])
if dataset == 'SYNTHETICnew':
# nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
# np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
# constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]]
constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])],
np.array([0.0, 1.0, -1.0]).T@x == 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
# print(x.value)
edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value,
np.array([0.0])))
residual = np.sqrt(prob.value)
elif rw_constraints == 'inequality':
# c_vs <= c_vi + c_vr.
if is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif is_n_attr and not is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
print(x.value)
edit_costs_new = np.concatenate((x.value, np.array([0.0])))
residual = np.sqrt(prob.value)
elif not is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
residual = np.sqrt(prob.value)
else:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
x.value[2:], np.array([0.0])))
residual = np.sqrt(prob.value)
else:
# # method 1: simple least square method.
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
# rcond=None)
# # method 2: least square method with x_i >= 0.
# edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec)
# method 3: solve as a quadratic program with constraints.
# P = np.dot(nb_cost_mat.T, nb_cost_mat)
# q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
# G = -1 * np.identity(nb_cost_mat.shape[1])
# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
# A = np.array([1 for i in range(nb_cost_mat.shape[1])])
# b = 1
# x = cp.Variable(nb_cost_mat.shape[1])
# prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
# [G@x <= h])
# prob.solve()
# edit_costs_new = x.value
# residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
# G = -1 * np.identity(nb_cost_mat.shape[1])
# h = np.array([0 for i in range(nb_cost_mat.shape[1])])
x = cp.Variable(nb_cost_mat.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
edit_costs_new = x.value
residual = np.sqrt(prob.value)
# method 4:
return edit_costs_new, residual
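
# A stripped-down sketch of the fit performed above (illustrative only, random data):
# find non-negative edit costs x minimizing ||N x - d||, with each substitution cost
# bounded by the corresponding insertion + removal costs, as in the 'CONSTANT' case.
def _update_costs_sketch():
    rng = np.random.default_rng(0)
    N = rng.random((20, 6))  # per-pair counts of the 6 edit operations
    d = rng.random(20)       # target kernel-induced distances
    x = cp.Variable(6)
    constraints = [x >= 0.0,
                   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T @ x >= 0.0,  # c_vs <= c_vi + c_vr
                   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T @ x >= 0.0]  # c_es <= c_ei + c_er
    prob = cp.Problem(cp.Minimize(cp.sum_squares(N @ x - d)), constraints)
    prob.solve()
    return x.value, np.sqrt(prob.value)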


if __name__ == '__main__':
print('check test_fitDistance.py')

+ 467 - 0   gklearn/preimage/ged.py

@@ -0,0 +1,467 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 17 18:44:59 2019

@author: ljia
"""
import numpy as np
import networkx as nx
from tqdm import tqdm
import sys
import multiprocessing
from multiprocessing import Pool
from functools import partial

#from gedlibpy_linlin import librariesImport, gedlibpy
from gklearn.gedlib import librariesImport, gedlibpy

def GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy', cost='CHEM_1', method='IPFP',
edit_cost_constant=[], algo_options='', stabilizer='min', repeat=50):
"""
Compute GED for 2 graphs.
"""
# dataset = dataset.lower()
if lib == 'gedlibpy':
gedlibpy.restart_env()
gedlibpy.add_nx_graph(convertGraph(g1, cost), "")
gedlibpy.add_nx_graph(convertGraph(g2, cost), "")

listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
gedlibpy.init()
gedlibpy.set_method(method, algo_options)
gedlibpy.init_method()

g = listID[0]
h = listID[1]
if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
upper = gedlibpy.get_upper_bound(g, h)
lower = gedlibpy.get_lower_bound(g, h)
elif stabilizer == 'mean':
# @todo: to be finished...
upper_list = [np.inf] * repeat
for itr in range(repeat):
gedlibpy.run_method(g, h)
upper_list[itr] = gedlibpy.get_upper_bound(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
lower = gedlibpy.get_lower_bound(g, h)
upper = np.mean(upper_list)
elif stabilizer == 'median':
if repeat % 2 == 0:
repeat += 1
upper_list = [np.inf] * repeat
pi_forward_list = [0] * repeat
pi_backward_list = [0] * repeat
for itr in range(repeat):
gedlibpy.run_method(g, h)
upper_list[itr] = gedlibpy.get_upper_bound(g, h)
pi_forward_list[itr] = gedlibpy.get_forward_map(g, h)
pi_backward_list[itr] = gedlibpy.get_backward_map(g, h)
lower = gedlibpy.get_lower_bound(g, h)
upper = np.median(upper_list)
idx_median = upper_list.index(upper)
pi_forward = pi_forward_list[idx_median]
pi_backward = pi_backward_list[idx_median]
elif stabilizer == 'min':
upper = np.inf
for itr in range(repeat):
gedlibpy.run_method(g, h)
upper_tmp = gedlibpy.get_upper_bound(g, h)
if upper_tmp < upper:
upper = upper_tmp
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
lower = gedlibpy.get_lower_bound(g, h)
if upper == 0:
break
elif stabilizer == 'max':
upper = 0
for itr in range(repeat):
gedlibpy.run_method(g, h)
upper_tmp = gedlibpy.get_upper_bound(g, h)
if upper_tmp > upper:
upper = upper_tmp
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
lower = gedlibpy.get_lower_bound(g, h)
elif stabilizer == 'gaussian':
pass
dis = upper
elif lib == 'gedlib-bash':
import time
import random
import os
from gklearn.utils.graphfiles import saveDataset
tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/'
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, int(1e9)))
xparams = {'method': 'gedlib', 'graph_dir': fn_collection}
saveDataset([g1, g2], ['dummy', 'dummy'], gformat='gxl', group='xml',
filename=fn_collection, xparams=xparams)
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/others/gedlib/gedlib2\'\n'
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
command += 'export LD_LIBRARY_PATH\n'
command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n'
command += './ged_for_python_bash monoterpenoides ' + fn_collection \
+ ' \'' + algo_options + '\' '
for ec in edit_cost_constant:
command += str(ec) + ' '
# output = os.system(command)
stream = os.popen(command)
output = stream.readlines()
# print(output)
dis = float(output[0].strip())
runtime = float(output[1].strip())
size_forward = int(output[2].strip())
pi_forward = [int(item.strip()) for item in output[3:3+size_forward]]
pi_backward = [int(item.strip()) for item in output[3+size_forward:]]

# print(dis)
# print(runtime)
# print(size_forward)
# print(pi_forward)
# print(pi_backward)
# convert the index-based maps to node-id maps (removed nodes are mapped to np.inf)
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
# print(pi_forward)
return dis, pi_forward, pi_backward
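
# A minimal usage sketch of GED (illustrative only; requires the gedlibpy bindings to be
# built, and graphs carrying 'atom'/'bond_type' attributes for the default 'CHEM_1' cost):
def _ged_usage_sketch():
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g1.add_edge(0, 1, bond_type=1)
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'N'})])
    g2.add_edge(0, 1, bond_type=1)
    dis, pi_forward, pi_backward = GED(g1, g2, stabilizer='min', repeat=10)
    return dis, pi_forward, pi_backward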


def convertGraph(G, cost):
"""Convert a graph to the proper NetworkX format that can be
recognized by library gedlibpy.
"""
G_new = nx.Graph()
if cost == 'LETTER' or cost == 'LETTER2':
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
y=str(attrs['attributes'][1]))
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2))
elif cost == 'NON_SYMBOLIC':
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd))
for a_name in G.graph['node_attrs']:
G_new.nodes[str(nd)][a_name] = str(attrs[a_name])
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2))
for a_name in G.graph['edge_attrs']:
G_new.edges[str(nd1), str(nd2)][a_name] = str(attrs[a_name])
else:
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
# G_new.add_edge(str(nd1), str(nd2))
return G_new


def GED_n(Gn, lib='gedlibpy', cost='CHEM_1', method='IPFP',
edit_cost_constant=[], stabilizer='min', repeat=50):
"""
Compute GEDs for a group of graphs.
"""
if lib == 'gedlibpy':
def convertGraph(G):
"""Convert a graph to the proper NetworkX format that can be
recognized by library gedlibpy.
"""
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
for nd1, nd2, attrs in G.edges(data=True):
# G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
G_new.add_edge(str(nd1), str(nd2))
return G_new
gedlibpy.restart_env()
gedlibpy.add_nx_graph(convertGraph(g1), "")
gedlibpy.add_nx_graph(convertGraph(g2), "")

listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost(cost, edit_cost_constant=edit_cost_constant)
gedlibpy.init()
gedlibpy.set_method(method, "")
gedlibpy.init_method()

g = listID[0]
h = listID[1]
if stabilizer is None:
gedlibpy.run_method(g, h)
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
upper = gedlibpy.get_upper_bound(g, h)
lower = gedlibpy.get_lower_bound(g, h)
elif stabilizer == 'min':
upper = np.inf
for itr in range(repeat):
gedlibpy.run_method(g, h)
upper_tmp = gedlibpy.get_upper_bound(g, h)
if upper_tmp < upper:
upper = upper_tmp
pi_forward = gedlibpy.get_forward_map(g, h)
pi_backward = gedlibpy.get_backward_map(g, h)
lower = gedlibpy.get_lower_bound(g, h)
if upper == 0:
break
dis = upper
# convert the index-based maps to node-id maps (removed nodes are mapped to np.inf)
nodes1 = [n for n in g1.nodes()]
nodes2 = [n for n in g2.nodes()]
nb1 = nx.number_of_nodes(g1)
nb2 = nx.number_of_nodes(g2)
pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
return dis, pi_forward, pi_backward


def ged_median(Gn, Gn_median, verbose=False, params_ged={'lib': 'gedlibpy',
'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [],
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1',
'stabilizer': None}, parallel=False):
if parallel:
len_itr = int(len(Gn))
pi_forward_list = [[] for i in range(len_itr)]
dis_list = [0 for i in range(len_itr)]
itr = range(0, len_itr)
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
def init_worker(gn_toshare, gn_median_toshare):
global G_gn, G_gn_median
G_gn = gn_toshare
G_gn_median = gn_median_toshare
do_partial = partial(_compute_ged_median, params_ged)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn, Gn_median))
if verbose:
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout)
else:
iterator = pool.imap_unordered(do_partial, itr, chunksize)
for i, dis_sum, pi_forward in iterator:
pi_forward_list[i] = pi_forward
dis_list[i] = dis_sum
# print('\n-------------------------------------------')
# print(i, j, idx_itr, dis)
pool.close()
pool.join()
else:
dis_list = []
pi_forward_list = []
for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
file=sys.stdout) if verbose else enumerate(Gn):
dis_sum = 0
pi_forward_list.append([])
for G_p in Gn_median:
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
**params_ged)
pi_forward_list[idx].append(pi_tmp_forward)
dis_sum += dis_tmp
dis_list.append(dis_sum)
return dis_list, pi_forward_list


def _compute_ged_median(params_ged, itr):
# print(itr)
dis_sum = 0
pi_forward = []
for G_p in G_gn_median:
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_gn[itr], G_p,
**params_ged)
pi_forward.append(pi_tmp_forward)
dis_sum += dis_tmp
return itr, dis_sum, pi_forward


def get_nb_edit_operations(g1, g2, forward_map, backward_map):
"""Compute the number of each edit operations.
"""
n_vi = 0
n_vr = 0
n_vs = 0
n_ei = 0
n_er = 0
n_es = 0
nodes1 = [n for n in g1.nodes()]
for i, map_i in enumerate(forward_map):
if map_i == np.inf:
n_vr += 1
elif g1.nodes[nodes1[i]]['atom'] != g2.nodes[map_i]['atom']:
n_vs += 1
for map_i in backward_map:
if map_i == np.inf:
n_vi += 1
# idx_nodes1 = range(0, len(node1))
edges1 = [e for e in g1.edges()]
nb_edges2_cnted = 0
for n1, n2 in edges1:
idx1 = nodes1.index(n1)
idx2 = nodes1.index(n2)
# one of the nodes is removed, thus the edge is removed.
if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf:
n_er += 1
# corresponding edge is in g2.
elif (forward_map[idx1], forward_map[idx2]) in g2.edges():
nb_edges2_cnted += 1
# edge labels are different.
if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \
!= g1.edges[(n1, n2)]['bond_type']:
n_es += 1
elif (forward_map[idx2], forward_map[idx1]) in g2.edges():
nb_edges2_cnted += 1
# edge labels are different.
if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \
!= g1.edges[(n1, n2)]['bond_type']:
n_es += 1
# corresponding nodes are in g2, however the edge is removed.
else:
n_er += 1
n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
return n_vi, n_vr, n_vs, n_ei, n_er, n_es
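
# A small worked example (illustrative only): mapping the two nodes of g1 directly onto
# g2 yields one node substitution ('O' -> 'N') and one edge substitution (bond_type 1 -> 2).
def _nb_edit_operations_sketch():
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g1.add_edge(0, 1, bond_type=1)
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'N'})])
    g2.add_edge(0, 1, bond_type=2)
    # forward_map[i] maps the i-th node of g1 onto a node of g2 (np.inf = removed);
    # backward_map does the reverse for g2's nodes.
    return get_nb_edit_operations(g1, g2, [0, 1], [0, 1])
    # -> (0, 0, 1, 0, 0, 1): n_vi, n_vr, n_vs, n_ei, n_er, n_es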


def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map):
"""Compute the number of each edit operations.
"""
n_vi = 0
n_vr = 0
n_vs = 0
sod_vs = 0
n_ei = 0
n_er = 0
nodes1 = [n for n in g1.nodes()]
for i, map_i in enumerate(forward_map):
if map_i == np.inf:
n_vr += 1
else:
n_vs += 1
diff_x = float(g1.nodes[nodes1[i]]['x']) - float(g2.nodes[map_i]['x'])
diff_y = float(g1.nodes[nodes1[i]]['y']) - float(g2.nodes[map_i]['y'])
sod_vs += np.sqrt(np.square(diff_x) + np.square(diff_y))
for map_i in backward_map:
if map_i == np.inf:
n_vi += 1
# idx_nodes1 = range(0, len(node1))
edges1 = [e for e in g1.edges()]
nb_edges2_cnted = 0
for n1, n2 in edges1:
idx1 = nodes1.index(n1)
idx2 = nodes1.index(n2)
# one of the nodes is removed, thus the edge is removed.
if forward_map[idx1] == np.inf or forward_map[idx2] == np.inf:
n_er += 1
# corresponding edge is in g2. Edge label is not considered.
elif (forward_map[idx1], forward_map[idx2]) in g2.edges() or \
(forward_map[idx2], forward_map[idx1]) in g2.edges():
nb_edges2_cnted += 1
# corresponding nodes are in g2, however the edge is removed.
else:
n_er += 1
n_ei = nx.number_of_edges(g2) - nb_edges2_cnted
return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er


def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map):
"""Compute the number of each edit operations.
"""
n_vi = 0
n_vr = 0
n_vs = 0
sod_vs = 0
n_ei = 0
n_er = 0
n_es = 0
sod_es = 0
nodes1 = [n for n in g1.nodes()]
for i, map_i in enumerate(forward_map):
if map_i == np.inf:
n_vr += 1
else:
n_vs += 1
sum_squares = 0
for a_name in g1.graph['node_attrs']:
diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name])
sum_squares += np.square(diff)
sod_vs += np.sqrt(sum_squares)
for map_i in backward_map:
if map_i == np.inf:
n_vi += 1
# idx_nodes1 = range(0, len(node1))
edges1 = [e for e in g1.edges()]
for n1, n2 in edges1:
idx1 = nodes1.index(n1)
idx2 = nodes1.index(n2)
n1_g2 = forward_map[idx1]
n2_g2 = forward_map[idx2]
# one of the nodes is removed, thus the edge is removed.
if n1_g2 == np.inf or n2_g2 == np.inf:
n_er += 1
# corresponding edge is in g2.
elif (n1_g2, n2_g2) in g2.edges():
n_es += 1
sum_squares = 0
for a_name in g1.graph['edge_attrs']:
diff = float(g1.edges[n1, n2][a_name]) - float(g2.edges[n1_g2, n2_g2][a_name])
sum_squares += np.square(diff)
sod_es += np.sqrt(sum_squares)
elif (n2_g2, n1_g2) in g2.edges():
n_es += 1
sum_squares = 0
for a_name in g1.graph['edge_attrs']:
diff = float(g1.edges[n2, n1][a_name]) - float(g2.edges[n2_g2, n1_g2][a_name])
sum_squares += np.square(diff)
sod_es += np.sqrt(sum_squares)
# corresponding nodes are in g2, however the edge is removed.
else:
n_er += 1
n_ei = nx.number_of_edges(g2) - n_es
return n_vi, n_vr, sod_vs, n_ei, n_er, sod_es


if __name__ == '__main__':
print('check test_ged.py')

+ 775 - 0   gklearn/preimage/iam.py

@@ -0,0 +1,775 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 11:49:12 2019

Iterative alternate minimizations using GED.
@author: ljia
"""
import numpy as np
import random
import networkx as nx
from tqdm import tqdm

from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels
from gklearn.preimage.ged import GED, ged_median


def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
epsilon=0.001, node_label='atom', edge_label='bond_type',
connected=False, removeNodes=True, allBestInit=False, allBestNodes=False,
allBestEdges=False, allBestOutput=False,
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
'edit_cost_constant': [], 'stabilizer': None,
'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}):
"""See my name, then you know what I do.
"""
# Gn_median = Gn_median[0:10]
# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
node_ir = np.inf # special value denoting node removal / insertion.
label_r = 'thanksdanny' # special label denoting node removal. # @todo: make sure this label cannot clash with existing labels.
ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
edge_label=edge_label)
node_label_set = get_node_labels(Gn_median, node_label)
edge_label_set = get_edge_labels(Gn_median, edge_label)

def generate_graph(G, pi_p_forward):
G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
# nx.draw_networkx(G)
# import matplotlib.pyplot as plt
# plt.show()
# print(pi_p_forward)
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
h_i0_list = []
label_list = []
for label in node_label_set:
h_i0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# case when the node is to be removed.
if removeNodes:
h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above.
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i == node_ir:
h_i0_remove += 1
h_i0_list.append(h_i0_remove)
label_list.append(label_r)
# get the best labels.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
if allBestNodes: # choose all best graphs.
nlabel_best = [label_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels.
G_new_list_nd = []
for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
for nl in nlabel_best:
g_tmp = g.copy()
if nl == label_r:
g_tmp.remove_node(nd)
else:
g_tmp.nodes[nd][node_label] = nl
G_new_list_nd.append(g_tmp)
# nx.draw_networkx(g_tmp)
# import matplotlib.pyplot as plt
# plt.show()
# print(g_tmp.nodes(data=True))
# print(g_tmp.edges(data=True))
G_new_list = [ggg.copy() for ggg in G_new_list_nd]
else:
# choose one of the best randomly.
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
h_i0_max = h_i0_list[idx_max[idx_rdm]]

g_new = G_new_list[0]
if best_label == label_r:
g_new.remove_node(nd)
else:
g_new.nodes[nd][node_label] = best_label
G_new_list = [g_new]
else: # labels are non-symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new_list[0].nodes[nd]['attributes'] = phi_i_bar
# for g in G_new_list:
# import matplotlib.pyplot as plt
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
G_new_list_edge = []
for g_new in G_new_list:
nd_list = [n for n in g_new.nodes()]
g_tmp_list = [g_new.copy()]
for nd1i in range(nx.number_of_nodes(g_new)):
nd1 = nd_list[nd1i]# @todo: not just edges, but all pairs of nodes
for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)):
nd2 = nd_list[nd2i]
# for nd1, nd2, _ in g_new.edges(data=True):
h_ij0_list = []
label_list = []
for label in edge_label_set:
h_ij0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# get the best labels.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
if allBestEdges: # choose all best graphs.
elabel_best = [label_list[idx] for idx in idx_max]
h_ij0_max = [h_ij0_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels.
G_new_list_ed = []
for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
for idxl, el in enumerate(elabel_best):
g_tmp_copy = g_tmp.copy()
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and \
g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \
sij_norm * (1 - (c_er + c_ei) / c_es):
if not g_tmp_copy.has_edge(nd1, nd2):
g_tmp_copy.add_edge(nd1, nd2)
g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl]
else:
if g_tmp_copy.has_edge(nd1, nd2):
g_tmp_copy.remove_edge(nd1, nd2)
G_new_list_ed.append(g_tmp_copy)
g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
else: # choose one of the best randomly.
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
h_ij0_max = h_ij0_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not g_new.has_edge(nd1, nd2):
g_new.add_edge(nd1, nd2)
g_new.edges[nd1, nd2][edge_label] = best_label
else:
# elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if g_new.has_edge(nd1, nd2):
g_new.remove_edge(nd1, nd2)
g_tmp_list = [g_new]
G_new_list_edge += g_tmp_list
G_new_list = [ggg.copy() for ggg in G_new_list_edge]
else: # if edges are unlabeled
# @todo: is this even right? G or g_tmp? check if the new one is right
# @todo: works only for undirected graphs.
for g_tmp in G_new_list:
nd_list = [n for n in g_tmp.nodes()]
for nd1i in range(nx.number_of_nodes(g_tmp)):
nd1 = nd_list[nd1i]
for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)):
nd2 = nd_list[nd2i]
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
# @todo: should we consider if nd1 and nd2 in g_tmp?
# or just add the edge anyway?
if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
and not g_tmp.has_edge(nd1, nd2):
g_tmp.add_edge(nd1, nd2)
else: # @todo: which to use?
# elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
if g_tmp.has_edge(nd1, nd2):
g_tmp.remove_edge(nd1, nd2)
# do not change anything when equal.
# for i, g in enumerate(G_new_list):
# import matplotlib.pyplot as plt
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# # find the best graph generated in this iteration and update pi_p.
# @todo: should we update all graphs generated or just the best ones?
dis_list, pi_forward_list = ged_median(G_new_list, Gn_median,
params_ged=params_ged)
# @todo: should we remove the identical and connectivity check?
# Don't know which is faster.
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_new_list, idx_list = remove_duplicates(G_new_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
dis_list = [dis_list[idx] for idx in idx_list]
# if connected == True:
# G_new_list, idx_list = remove_disconnected(G_new_list)
# pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
# dis_min = dis_list[idx_min_tmp_list[0]]
# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
# G_new_list = [G_new_list[idx] for idx in idx_min_list]
# for g in G_new_list:
# import matplotlib.pyplot as plt
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_new_list, pi_forward_list, dis_list
def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]]
pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
return G_min_list, pi_forward_min_list, dis_min
def iteration_proc(G, pi_p_forward, cur_sod):
G_list = [G]
pi_forward_list = [pi_p_forward]
old_sod = cur_sod * 2
sod_list = [cur_sod]
dis_list = [cur_sod]
# iterations.
itr = 0
# @todo: what if difference == 0?
# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or
# np.abs(old_sod - cur_sod) == 0):
while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
# while itr < ite_max:
# for itr in range(0, 5): # the convergence condition?
print('itr_iam is', itr)
G_new_list = []
pi_forward_new_list = []
dis_new_list = []
for idx, g in enumerate(G_list):
# label_set = get_node_labels(Gn_median + [g], node_label)
G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
g, pi_forward_list[idx])
G_new_list += G_tmp_list
pi_forward_new_list += pi_forward_tmp_list
dis_new_list += dis_tmp_list
# @todo: need to remove duplicates here?
G_list = [ggg.copy() for ggg in G_new_list]
pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list]
dis_list = dis_new_list[:]
old_sod = cur_sod
cur_sod = np.min(dis_list)
sod_list.append(cur_sod)
itr += 1
# @todo: do we return all graphs or the best ones?
# get the best ones of the generated graphs.
G_list, pi_forward_list, dis_min = best_median_graphs(
G_list, pi_forward_list, dis_list)
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# dis_list = [dis_list[idx] for idx in idx_list]
# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
print('\nsods:', sod_list, '\n')
return G_list, pi_forward_list, dis_min, sod_list
def remove_duplicates(Gn):
"""Remove duplicate graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
dupl = False
for g_new in Gn_new:
if graph_isIdentical(g_new, g):
dupl = True
break
if not dupl:
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list
def remove_disconnected(Gn):
"""Remove disconnected graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
if nx.is_connected(g):
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list

###########################################################################
# phase 1: initialize.
# compute set-median.
dis_min = np.inf
dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median,
params_ged=params_ged, parallel=True)
print('finish computing GEDs.')
# find all smallest distances.
if allBestInit: # try all best init graphs.
idx_min_list = range(len(dis_list))
dis_min = dis_list
else:
idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)
idx_min_rdm = random.randint(0, len(idx_min_list) - 1)
idx_min_list = [idx_min_list[idx_min_rdm]]
sod_set_median = np.min(dis_min)
# phase 2: iteration.
G_list = []
dis_list = []
pi_forward_list = []
G_set_median_list = []
# sod_list = []
for idx_tmp, idx_min in enumerate(idx_min_list):
# print('idx_min is', idx_min)
G = Gn_candidate[idx_min].copy()
G_set_median_list.append(G.copy())
# list of edit operations.
pi_p_forward = pi_forward_all[idx_min]
# pi_p_backward = pi_all_backward[idx_min]
Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(G,
pi_p_forward, dis_min[idx_tmp])
G_list += Gi_list
dis_list += [dis_i_min] * len(Gi_list)
pi_forward_list += pi_i_forward_list
if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
G_list, idx_list = remove_duplicates(G_list)
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
if connected == True:
G_list_con, idx_list = remove_disconnected(G_list)
# if there are no connected graphs at all, keep the disconnected ones.
if len(G_list_con) > 0: # @todo: ??????????????????????????
G_list = G_list_con
dis_list = [dis_list[idx] for idx in idx_list]
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]

# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# get the best median graphs
G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs(
G_list, pi_forward_list, dis_list)
# for g in G_gen_median_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
if not allBestOutput:
# randomly choose one graph.
idx_rdm = random.randint(0, len(G_gen_median_list) - 1)
G_gen_median_list = [G_gen_median_list[idx_rdm]]
return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median


def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT', initial_solutions=1,
dataset='monoterpenoides',
graph_dir=''):
"""Compute the iam by c++ implementation (gedlib) through bash.
"""
import os
import time

def createCollectionFile(Gn_names, y, filename):
"""Create collection file.
"""
dirname_ds = os.path.dirname(filename)
if dirname_ds != '':
dirname_ds += '/'
if not os.path.exists(dirname_ds) :
os.makedirs(dirname_ds)
with open(filename + '.xml', 'w') as fgroup:
fgroup.write("<?xml version=\"1.0\"?>")
fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
fgroup.write("\n<GraphCollection>")
for idx, fname in enumerate(Gn_names):
fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
fgroup.write("\n</GraphCollection>")
fgroup.close()
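# For illustration, the collection file written above looks roughly like this
# (the graph file names and classes below are made-up examples):
# <?xml version="1.0"?>
# <!DOCTYPE GraphCollection SYSTEM "http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd">
# <GraphCollection>
#     <graph file="molecule_1.gxl" class="dummy"/>
#     <graph file="molecule_2.gxl" class="dummy"/>
# </GraphCollection>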

tmp_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/'
fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, int(1e9)))  # randint requires integer bounds
createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)
# fn_collection = tmp_dir + 'collection_for_debug'
# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/gxl'
# if dataset == 'Letter-high' or dataset == 'Fingerprint':
# dataset = 'letter'
command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n'
command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
command += 'export LD_LIBRARY_PATH\n'
command += 'cd \'' + os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/bin\'\n'
command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \
+ ' \'' + graph_dir + '\' ' + ' ' + cost + ' ' + str(initial_solutions) + ' '
if edit_cost_constant is None:
command += 'None'
else:
for ec in edit_cost_constant:
command += str(ec) + ' '
# output = os.system(command)
stream = os.popen(command)

output = stream.readlines()
# print(output)
sod_sm = float(output[0].strip())
sod_gm = float(output[1].strip())
fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
return sod_sm, sod_gm, fname_sm, fname_gm
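# A minimal usage sketch of iam_bash() (hypothetical file names; assumes the compiled
# iam_for_python_bash binary and the hard-coded GEDLIB paths above are available):
# names = ['molecule_1.gxl', 'molecule_2.gxl', 'molecule_3.gxl']
# sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(names, [3, 3, 1, 3, 3, 1],
#                                               graph_dir='/path/to/gxl/dir')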



###############################################################################
# Old implementations.
def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
connected=True):
"""See my name, then you know what I do.
"""
# Gn = Gn[0:10]
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
# phase 1: initialize.
# compute set-median.
dis_min = np.inf
pi_p = []
pi_all = []
for idx1, G_p in enumerate(Gn):
dist_sum = 0
pi_all.append([])
for idx2, G_p_prime in enumerate(Gn):
dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
pi_all[idx1].append(pi_tmp)
dist_sum += dist_tmp
if dist_sum < dis_min:
dis_min = dist_sum
G = G_p.copy()
idx_min = idx1
# list of edit operations.
pi_p = pi_all[idx_min]
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
for itr in range(0, 10): # @todo: the convergence condition?
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for nd, _ in G.nodes(data=True):
h_i0_list = []
label_list = []
for label in get_node_labels(Gn, node_label):
h_i0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
idx_rdm = random.randint(0, len(idx_max) - 1)
G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
else: # labels are non-symbolic
for nd, _ in G.nodes(data=True):
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd1]
pi_j = pi_p[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd1]
pi_j = pi_p[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
for nd1, nd2, _ in G.edges(data=True):
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p[idx][nd1]
pi_j = pi_p[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn) * c_er / (c_er + c_ei):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
G = G_new.copy()
# update pi_p
pi_p = []
for idx1, G_p in enumerate(Gn):
dist_tmp, pi_tmp, _ = GED(G, G_p)
pi_p.append(pi_tmp)
return G

# --------------------------- These are tests --------------------------------#
def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1,
node_label='atom', edge_label='bond_type'):
"""See my name, then you know what I do.
"""
# Gn = Gn[0:10]
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
# phase 1: initialize.
# compute set-median.
dis_min = np.inf
# pi_p = []
pi_all_forward = []
pi_all_backward = []
for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout):
dist_sum = 0
pi_all_forward.append([])
pi_all_backward.append([])
for idx2, G_p_prime in enumerate(Gn):
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime)
pi_all_forward[idx1].append(pi_tmp_forward)
pi_all_backward[idx1].append(pi_tmp_backward)
dist_sum += dist_tmp
if dist_sum <= dis_min:
dis_min = dist_sum
G = G_p.copy()
idx_min = idx1
# list of edit operations.
pi_p_forward = pi_all_forward[idx_min]
pi_p_backward = pi_all_backward[idx_min]
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
label_set = get_node_labels(Gn + [G], node_label)
for itr in range(0, 10): # @todo: the convergence condition?
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for nd in G.nodes():
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
idx_rdm = random.randint(0, len(idx_max) - 1)
G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
else: # labels are non-symbolic
for nd in G.nodes():
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
# @todo: works only for undirected graphs.
for nd1 in range(nx.number_of_nodes(G)):
for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn) * c_er / (c_er + c_ei):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
# do not change anything when equal.
G = G_new.copy()
# update pi_p
pi_p_forward = []
for G_p in Gn:
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
pi_p_forward.append(pi_tmp_forward)
return G


###############################################################################

if __name__ == '__main__':
from gklearn.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])

iam(Gn)

+ 114  - 0   gklearn/preimage/knn.py

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 13:22:04 2020

@author: ljia
"""
import numpy as np
#import matplotlib.pyplot as plt
from tqdm import tqdm
import random
#import csv
from shutil import copyfile
import os
import sys  # needed by tqdm(..., file=sys.stdout) in knn() below

from gklearn.preimage.iam import iam_bash
from gklearn.utils.graphfiles import loadDataset, loadGXL
from gklearn.preimage.ged import GED
from gklearn.preimage.utils import get_same_item_indices

def test_knn():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
# gkernel = 'treeletkernel'
# node_label = 'atom'
# edge_label = 'bond_type'
# ds_name = 'mono'
dir_output = 'results/knn/'
graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'  # note: the original string was missing the '/' before '..'
k_nn = 1
percent = 0.1
repeats = 50
edit_cost_constant = [3, 3, 1, 3, 3, 1]
# get indices by classes.
y_idx = get_same_item_indices(y_all)
# sod_sm_list_list  # note: the original bare, undefined name would raise NameError and is never used; commented out
for repeat in range(0, repeats):
print('\n---------------------------------')
print('repeat =', repeat)
accuracy_sm_list = []
accuracy_gm_list = []
sod_sm_list = []
sod_gm_list = []
random.seed(repeat)
set_median_list = []
gen_median_list = []
train_y_set = []
for y, values in y_idx.items():
print('\ny =', y)
size_median_set = int(len(values) * percent)
median_set_idx = random.sample(values, size_median_set)
print('median set: ', median_set_idx)
# compute set median and gen median using IAM (C++ through bash).
# Gn_median = [Gn[idx] for idx in median_set_idx]
group_fnames = [Gn[g].graph['filename'] for g in median_set_idx]
sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
graph_dir=graph_dir)
print('sod_sm, sod_gm:', sod_sm, sod_gm)
sod_sm_list.append(sod_sm)
sod_gm_list.append(sod_gm)
fname_sm_new = dir_output + 'medians/set_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
copyfile(fname_sm, fname_sm_new)
fname_gm_new = dir_output + 'medians/gen_median.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
copyfile(fname_gm, fname_gm_new)
set_median_list.append(loadGXL(fname_sm_new))
gen_median_list.append(loadGXL(fname_gm_new))
train_y_set.append(int(y))
print(sod_sm, sod_gm)
# do 1-nn.
test_y_set = [int(y) for y in y_all]
accuracy_sm = knn(set_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')
accuracy_gm = knn(gen_median_list, train_y_set, Gn, test_y_set, k=k_nn, distance='ged')  # note: the original passed set_median_list here as well, which looks like a copy-paste slip
accuracy_sm_list.append(accuracy_sm)
accuracy_gm_list.append(accuracy_gm)
print('current accuracy sm and gm:', accuracy_sm, accuracy_gm)
# output
accuracy_sm_mean = np.mean(accuracy_sm_list)
accuracy_gm_mean = np.mean(accuracy_gm_list)
print('\ntotal average accuracy sm and gm:', accuracy_sm_mean, accuracy_gm_mean)

def knn(train_set, train_y_set, test_set, test_y_set, k=1, distance='ged'):
if k == 1 and distance == 'ged':
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options, 'stabilizer': None}
accuracy = 0
for idx_test, g_test in tqdm(enumerate(test_set), desc='computing 1-nn',
file=sys.stdout):
dis = np.inf
for idx_train, g_train in enumerate(train_set):
dis_cur, _, _ = GED(g_test, g_train, **params_ged)
if dis_cur < dis:
dis = dis_cur
test_y_cur = train_y_set[idx_train]
if test_y_cur == test_y_set[idx_test]:
accuracy += 1
accuracy = accuracy / len(test_set)
return accuracy
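# A minimal usage sketch of knn() (variable names taken from test_knn() above; the
# medians and their classes act as the training set, the full dataset as the test set):
# acc = knn(gen_median_list, train_y_set, Gn, test_y_set, k=1, distance='ged')
# Note that only the combination k == 1 with distance == 'ged' is handled above.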


if __name__ == '__main__':
test_knn()

+ 6  - 0   gklearn/preimage/libs.py

@@ -0,0 +1,6 @@
import sys
import pathlib

# insert gedlibpy library.
sys.path.insert(0, "../../../")
from gedlibpy import librariesImport, gedlibpy

+ 218  - 0   gklearn/preimage/median.py

@@ -0,0 +1,218 @@
import sys
sys.path.insert(0, "../")
#import pathlib
import numpy as np
import networkx as nx
import time
from gedlibpy import librariesImport, gedlibpy
#import script
sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
import gklearn
from gklearn.utils.graphfiles import loadDataset
def replace_graph_in_env(script, graph, old_id, label='median'):
"""
Replace a graph in the gedlib environment `script`.
If old_id is -1, add a new graph to the environment instead.
"""
if(old_id > -1):
script.PyClearGraph(old_id)
new_id = script.PyAddGraph(label)
for i in graph.nodes():
script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib
for e in graph.edges:
script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
script.PyInitEnv()
script.PySetMethod("IPFP", "")
script.PyInitMethod()
return new_id
# Draw the current median
def draw_Letter_graph(graph, savepath=''):
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.node[n]['attributes'][0]),
float(graph.node[n]['attributes'][1])])
nx.draw_networkx(graph, pos)
if savepath != '':
plt.savefig(savepath + str(time.time()) + '.eps', format='eps', dpi=300)
plt.show()
plt.clf()
#compute new mappings
def update_mappings(script,median_id,listID):
med_distances = {}
med_mappings = {}
sod = 0
for i in range(0,len(listID)):
script.PyRunMethod(median_id,listID[i])
med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
sod += med_distances[i]
return med_distances, med_mappings, sod
def calcul_Sij(all_mappings, all_graphs,i,j):
s_ij = 0
for k in range(0,len(all_mappings)):
cur_graph = all_graphs[k]
cur_mapping = all_mappings[k]
size_graph = cur_graph.order()
if ((cur_mapping[i] < size_graph) and
(cur_mapping[j] < size_graph) and
(cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
s_ij += 1
return s_ij
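# calcul_Sij() above counts, over all mappings, how many graphs map the median node
# pair (i, j) onto an existing edge; it feeds the edge-update threshold used below.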
# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
# from scipy.stats.mstats import gmean
# for i in median.nodes():
# for k in listIdSet:
# vectors = [] #np.zeros((len(listIdSet),2))
# if(k != median_id):
# phi_i = mappings[k][i]
# if(phi_i < dataset[k].order()):
# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
# new_labels = gmean(vectors)
# median.node[i]['x'] = str(new_labels[0])
# median.node[i]['y'] = str(new_labels[1])
# return median
def update_median_nodes(median,dataset,mappings):
#update node attributes
for i in median.nodes():
nb_sub=0
mean_label = {'x' : 0, 'y' : 0}
for k in range(0,len(mappings)):
phi_i = mappings[k][i]
if ( phi_i < dataset[k].order() ):
nb_sub += 1
mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
return median
def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
#for letter high, ceir = 1.7, alpha = 0.75
size_dataset = len(dataset)
ratio_cei_cer = cer/(cei + cer)
threshold = size_dataset*ratio_cei_cer
order_graph_median = median.order()
for i in range(0,order_graph_median):
for j in range(i+1,order_graph_median):
s_ij = calcul_Sij(mappings,dataset,i,j)
if(s_ij > threshold):
median.add_edge(i,j)
else:
if(median.has_edge(i,j)):
median.remove_edge(i,j)
return median
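# A worked example of the threshold above (numbers chosen for illustration only):
# with 10 graphs in the dataset and cei = cer = 0.425, ratio_cei_cer = 0.5 and
# threshold = 5, so edge (i, j) is kept in the median only if s_ij > 5, i.e. if it
# is supported by more than half of the graphs.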
def compute_median(script, listID, dataset,verbose=False):
"""Compute a graph median of a dataset according to an environment
Parameters
script : An gedlib initialized environnement
listID (list): a list of ID in script: encodes the dataset
dataset (list): corresponding graphs in networkX format. We assume that graph
listID[i] corresponds to dataset[i]
Returns:
A networkX graph, which is the median, with corresponding sod
"""
print(len(listID))
median_set_index, median_set_sod = compute_median_set(script, listID)
print(median_set_index)
print(median_set_sod)
sods = []
# Add the median to the environment
set_median = dataset[median_set_index].copy()
median = dataset[median_set_index].copy()
cur_med_id = replace_graph_in_env(script,median,-1)
med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
sods.append(cur_sod)
if(verbose):
print(cur_sod)
ite_max = 50
old_sod = cur_sod * 2
ite = 0
epsilon = 0.001
# best_median  # note: the original bare, undefined name would raise NameError and is unused; commented out
while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )):
median = update_median_nodes(median,dataset, med_mappings)
median = update_median_edges(dataset,med_mappings,median)
cur_med_id = replace_graph_in_env(script,median,cur_med_id)
med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
sods.append(cur_sod)
if(verbose):
print(cur_sod)
ite += 1
return median, cur_sod, sods, set_median
# draw_Letter_graph(median)  # unreachable after the return above; kept as a comment
def compute_median_set(script,listID):
"""Return the index (into listID/dataset) of the set median, together with its SOD."""
# Compute the median set
N=len(listID)
map_id_to_index = {}
map_index_to_id = {}
for i in range(0,len(listID)):
map_id_to_index[listID[i]] = i
map_index_to_id[i] = listID[i]
distances = np.zeros((N,N))
for i in listID:
for j in listID:
script.PyRunMethod(i,j)
distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
median_set_index = np.argmin(np.sum(distances,0))
sod = np.min(np.sum(distances,0))
return median_set_index, sod
if __name__ == "__main__":
# Load the dataset
script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
script.PySetEditCost("LETTER")
script.PyInitEnv()
script.PySetMethod("IPFP", "")
script.PyInitMethod()
dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
listID = script.PyGetAllGraphIds()
median, sod, sods, set_median = compute_median(script, listID, dataset, verbose=True)  # compute_median returns four values
print(sod)
draw_Letter_graph(median)
#if __name__ == '__main__':
# # test draw_Letter_graph
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# print(y_all)
# for g in Gn:
# draw_Letter_graph(g)

+ 201  - 0   gklearn/preimage/median_benoit.py

@@ -0,0 +1,201 @@
import sys
import pathlib
import numpy as np
import networkx as nx
import librariesImport
import script
sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
import gklearn
def replace_graph_in_env(script, graph, old_id, label='median'):
"""
Replace a graph in the gedlib environment `script`.
If old_id is -1, add a new graph to the environment instead.
"""
if(old_id > -1):
script.PyClearGraph(old_id)
new_id = script.PyAddGraph(label)
for i in graph.nodes():
script.PyAddNode(new_id, str(i), graph.node[i]) # !! strings are required by gedlib
for e in graph.edges:
script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
script.PyInitEnv()
script.PySetMethod("IPFP", "")
script.PyInitMethod()
return new_id
# Draw the current median
def draw_Letter_graph(graph):
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
nx.draw_networkx(graph,pos)
plt.show()
#compute new mappings
def update_mappings(script,median_id,listID):
med_distances = {}
med_mappings = {}
sod = 0
for i in range(0,len(listID)):
script.PyRunMethod(median_id,listID[i])
med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
sod += med_distances[i]
return med_distances, med_mappings, sod
def calcul_Sij(all_mappings, all_graphs,i,j):
s_ij = 0
for k in range(0,len(all_mappings)):
cur_graph = all_graphs[k]
cur_mapping = all_mappings[k]
size_graph = cur_graph.order()
if ((cur_mapping[i] < size_graph) and
(cur_mapping[j] < size_graph) and
(cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
s_ij += 1
return s_ij
# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
# from scipy.stats.mstats import gmean
# for i in median.nodes():
# for k in listIdSet:
# vectors = [] #np.zeros((len(listIdSet),2))
# if(k != median_id):
# phi_i = mappings[k][i]
# if(phi_i < dataset[k].order()):
# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
# new_labels = gmean(vectors)
# median.node[i]['x'] = str(new_labels[0])
# median.node[i]['y'] = str(new_labels[1])
# return median
def update_median_nodes(median,dataset,mappings):
#update node attributes
for i in median.nodes():
nb_sub=0
mean_label = {'x' : 0, 'y' : 0}
for k in range(0,len(mappings)):
phi_i = mappings[k][i]
if ( phi_i < dataset[k].order() ):
nb_sub += 1
mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
return median
def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
#for letter high, ceir = 1.7, alpha = 0.75
size_dataset = len(dataset)
ratio_cei_cer = cer/(cei + cer)
threshold = size_dataset*ratio_cei_cer
order_graph_median = median.order()
for i in range(0,order_graph_median):
for j in range(i+1,order_graph_median):
s_ij = calcul_Sij(mappings,dataset,i,j)
if(s_ij > threshold):
median.add_edge(i,j)
else:
if(median.has_edge(i,j)):
median.remove_edge(i,j)
return median
def compute_median(script, listID, dataset,verbose=False):
"""Compute a graph median of a dataset according to an environment
Parameters
script : An gedlib initialized environnement
listID (list): a list of ID in script: encodes the dataset
dataset (list): corresponding graphs in networkX format. We assume that graph
listID[i] corresponds to dataset[i]
Returns:
A networkX graph, which is the median, with corresponding sod
"""
print(len(listID))
median_set_index, median_set_sod = compute_median_set(script, listID)
print(median_set_index)
print(median_set_sod)
sods = []
# Add the median to the environment
set_median = dataset[median_set_index].copy()
median = dataset[median_set_index].copy()
cur_med_id = replace_graph_in_env(script,median,-1)
med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
sods.append(cur_sod)
if(verbose):
print(cur_sod)
ite_max = 50
old_sod = cur_sod * 2
ite = 0
epsilon = 0.001
# best_median  # note: the original bare, undefined name would raise NameError and is unused; commented out
while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )):
median = update_median_nodes(median,dataset, med_mappings)
median = update_median_edges(dataset,med_mappings,median)
cur_med_id = replace_graph_in_env(script,median,cur_med_id)
med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
sods.append(cur_sod)
if(verbose):
print(cur_sod)
ite += 1
return median, cur_sod, sods, set_median
# draw_Letter_graph(median)  # unreachable after the return above; kept as a comment
def compute_median_set(script,listID):
"""Return the index (into listID/dataset) of the set median, together with its SOD."""
# Compute the median set
N=len(listID)
map_id_to_index = {}
map_index_to_id = {}
for i in range(0,len(listID)):
map_id_to_index[listID[i]] = i
map_index_to_id[i] = listID[i]
distances = np.zeros((N,N))
for i in listID:
for j in listID:
script.PyRunMethod(i,j)
distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
median_set_index = np.argmin(np.sum(distances,0))
sod = np.min(np.sum(distances,0))
return median_set_index, sod
if __name__ == "__main__":
# Load the dataset
script.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
script.PySetEditCost("LETTER")
script.PyInitEnv()
script.PySetMethod("IPFP", "")
script.PyInitMethod()
dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
listID = script.PyGetAllGraphIds()
median, sod, sods, set_median = compute_median(script, listID, dataset, verbose=True)  # compute_median returns four values
print(sod)
draw_Letter_graph(median)

+ 826  - 0   gklearn/preimage/median_graph_estimator.py

@@ -0,0 +1,826 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 16 18:04:55 2020

@author: ljia
"""
import numpy as np
from gklearn.preimage.common_types import AlgorithmState
from gklearn.preimage import misc
from gklearn.preimage.timer import Timer
from gklearn.utils.utils import graph_isIdentical
import time
from tqdm import tqdm
import sys
import networkx as nx


class MedianGraphEstimator(object):
def __init__(self, ged_env, constant_node_costs):
"""Constructor.
Parameters
----------
ged_env : gklearn.gedlib.gedlibpy.GEDEnv
Initialized GED environment. The edit costs must be set by the user.
constant_node_costs : Boolean
Set to True if the node relabeling costs are constant.
"""
self.__ged_env = ged_env
self.__init_method = 'BRANCH_FAST'
self.__init_options = ''
self.__descent_method = 'BRANCH_FAST'
self.__descent_options = ''
self.__refine_method = 'IPFP'
self.__refine_options = ''
self.__constant_node_costs = constant_node_costs
self.__labeled_nodes = (ged_env.get_num_node_labels() > 1)
self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1))
self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1))
self.__labeled_edges = (ged_env.get_num_edge_labels() > 1)
self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1))
self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1))
self.__init_type = 'RANDOM'
self.__num_random_inits = 10
self.__desired_num_random_inits = 10
self.__use_real_randomness = True
self.__seed = 0
self.__refine = True
self.__time_limit_in_sec = 0
self.__epsilon = 0.0001
self.__max_itrs = 100
self.__max_itrs_without_update = 3
self.__num_inits_increase_order = 10
self.__init_type_increase_order = 'K-MEANS++'
self.__max_itrs_increase_order = 10
self.__print_to_stdout = 2
self.__median_id = np.inf # @todo: check
self.__median_node_id_prefix = '' # @todo: check
self.__node_maps_from_median = {}
self.__sum_of_distances = 0
self.__best_init_sum_of_distances = np.inf
self.__converged_sum_of_distances = np.inf
self.__runtime = None
self.__runtime_initialized = None
self.__runtime_converged = None
self.__itrs = [] # @todo: check: {} ?
self.__num_decrease_order = 0
self.__num_increase_order = 0
self.__num_converged_descents = 0
self.__state = AlgorithmState.TERMINATED
if ged_env is None:
raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.')
elif not ged_env.is_initialized():
raise Exception('The GED environment is uninitialized. Call gedlibpy.GEDEnv.init() before passing it to the constructor of MedianGraphEstimator.')
def set_options(self, options):
"""Sets the options of the estimator.

Parameters
----------
options : string
String that specifies with which options to run the estimator.
"""
self.__set_default_options()
options_map = misc.options_string_to_options_map(options)
for opt_name, opt_val in options_map.items():
if opt_name == 'init-type':
self.__init_type = opt_val
if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN':
raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"')
elif opt_name == 'random-inits':
try:
self.__num_random_inits = int(opt_val)
self.__desired_num_random_inits = self.__num_random_inits
except:
raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')

if self.__num_random_inits <= 0:
raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')
elif opt_name == 'randomness':
if opt_val == 'PSEUDO':
self.__use_real_randomness = False
elif opt_val == 'REAL':
self.__use_real_randomness = True
else:
raise Exception('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"')
elif opt_name == 'stdout':
if opt_val == '0':
self.__print_to_stdout = 0
elif opt_val == '1':
self.__print_to_stdout = 1
elif opt_val == '2':
self.__print_to_stdout = 2
else:
raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"')
elif opt_name == 'refine':
if opt_val == 'TRUE':
self.__refine = True
elif opt_val == 'FALSE':
self.__refine = False
else:
raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"')
elif opt_name == 'time-limit':
try:
self.__time_limit_in_sec = float(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit <convertible to double>] [...]')
elif opt_name == 'max-itrs':
try:
self.__max_itrs = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs <convertible to int>] [...]')
elif opt_name == 'max-itrs-without-update':
try:
self.__max_itrs_without_update = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update <convertible to int>] [...]')
elif opt_name == 'seed':
try:
self.__seed = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed <convertible to int greater equal 0>] [...]')
elif opt_name == 'epsilon':
try:
self.__epsilon = float(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]')
if self.__epsilon <= 0:
raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]')
elif opt_name == 'inits-increase-order':
try:
self.__num_inits_increase_order = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')
if self.__num_inits_increase_order <= 0:
raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')

elif opt_name == 'init-type-increase-order':
self.__init_type_increase_order = opt_val
if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++':
raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"')
elif opt_name == 'max-itrs-increase-order':
try:
self.__max_itrs_increase_order = int(opt_val)
except:
raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order <convertible to int>] [...]')

else:
valid_options = '[--init-type <arg>] [--random-inits <arg>] [--randomness <arg>] [--seed <arg>] [--stdout <arg>] '
valid_options += '[--time-limit <arg>] [--max-itrs <arg>] [--epsilon <arg>] '
valid_options += '[--inits-increase-order <arg>] [--init-type-increase-order <arg>] [--max-itrs-increase-order <arg>]'
raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"')
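# An example of an options string accepted by set_options() above (the values are
# illustrative only): '--init-type MEDOID --max-itrs 50 --seed 42 --stdout 1 --refine TRUE'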
def set_init_method(self, init_method, init_options=''):
"""Selects method to be used for computing the initial medoid graph.
Parameters
----------
init_method : string
The selected method. Default: ged::Options::GEDMethod::BRANCH_UNIFORM.
init_options : string
The options for the selected method. Default: "".
Notes
-----
Has no effect unless "--init-type MEDOID" is passed to set_options().
"""
self.__init_method = init_method
self.__init_options = init_options
def set_descent_method(self, descent_method, descent_options=''):
"""Selects method to be used for block gradient descent..
Parameters
----------
descent_method : string
The selected method. Default: ged::Options::GEDMethod::BRANCH_FAST.
descent_options : string
The options for the selected method. Default: "".
Notes
-----
Has no effect unless "--init-type MEDOID" is passed to set_options().
"""
self.__descent_method = descent_method
self.__descent_options = descent_options

def set_refine_method(self, refine_method, refine_options):
"""Selects method to be used for improving the sum of distances and the node maps for the converged median.
Parameters
----------
refine_method : string
The selected method. Default: "IPFP".
refine_options : string
The options for the selected method. Default: "".
Notes
-----
Has no effect if "--refine FALSE" is passed to set_options().
"""
self.__refine_method = refine_method
self.__refine_options = refine_options

def run(self, graph_ids, set_median_id, gen_median_id):
"""Computes a generalized median graph.
Parameters
----------
graph_ids : list[integer]
The IDs of the graphs for which the median should be computed. Must have been added to the environment passed to the constructor.
set_median_id : integer
The ID of the computed set-median. A dummy graph with this ID must have been added to the environment passed to the constructor. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph().


gen_median_id : integer
The ID of the computed generalized median. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph().
"""
# Sanity checks.
if len(graph_ids) == 0:
raise Exception('Empty vector of graph IDs, unable to compute median.')
all_graphs_empty = True
for graph_id in graph_ids:
if self.__ged_env.get_graph_num_nodes(graph_id) > 0:
self.__median_node_id_prefix = self.__ged_env.get_original_node_ids(graph_id)[0]
all_graphs_empty = False
break
if all_graphs_empty:
raise Exception('All graphs in the collection are empty.')
# Start timer and record start time.
start = time.time()
timer = Timer(self.__time_limit_in_sec)
self.__median_id = gen_median_id
self.__state = AlgorithmState.TERMINATED
# Get ExchangeGraph representations of the input graphs.
graphs = {}
for graph_id in graph_ids:
# @todo: get_nx_graph() function may need to be modified according to the coming code.
graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id, True, True, False)
# print(self.__ged_env.get_graph_internal_id(0))
# print(graphs[0].graph)
# print(graphs[0].nodes(data=True))
# print(graphs[0].edges(data=True))
# print(nx.adjacency_matrix(graphs[0]))

# Construct initial medians.
medians = []
self.__construct_initial_medians(graph_ids, timer, medians)
end_init = time.time()
self.__runtime_initialized = end_init - start
# print(medians[0].graph)
# print(medians[0].nodes(data=True))
# print(medians[0].edges(data=True))
# print(nx.adjacency_matrix(medians[0]))
# Reset information about iterations and number of times the median decreases and increases.
self.__itrs = [0] * len(medians)
self.__num_decrease_order = 0
self.__num_increase_order = 0
self.__num_converged_descents = 0
# Initialize the best median.
best_sum_of_distances = np.inf
self.__best_init_sum_of_distances = np.inf
node_maps_from_best_median = {}
# Run block gradient descent from all initial medians.
self.__ged_env.set_method(self.__descent_method, self.__descent_options)
for median_pos in range(0, len(medians)):
# Terminate if the timer has expired and at least one SOD has been computed.
if timer.expired() and median_pos > 0:
break
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n===========================================================')
print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.')
print('-----------------------------------------------------------')
# Get reference to the median.
median = medians[median_pos]
# Load initial median into the environment.
self.__ged_env.load_nx_graph(median, gen_median_id)
self.__ged_env.init(self.__ged_env.get_init_type())
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rComputing initial node maps', total=len(graph_ids), file=sys.stdout)
# Compute node maps and sum of distances for initial median.
self.__sum_of_distances = 0
self.__node_maps_from_median.clear() # @todo
for graph_id in graph_ids:
self.__ged_env.run_method(gen_median_id, graph_id)
self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id)
# print(self.__node_maps_from_median[graph_id])
self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: the C++ implementation for this function in GedLibBind.ipp re-call get_node_map() once more, this is not neccessary.
# print(self.__sum_of_distances)
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress.update(1)
self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances)
self.__ged_env.load_nx_graph(median, set_median_id)
# print(self.__best_init_sum_of_distances)
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')
# Run block gradient descent from initial median.
converged = False
itrs_without_update = 0
while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n===========================================================')
print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.')
print('-----------------------------------------------------------')
# Initialize flags that tell us what happened in the iteration.
median_modified = False
node_maps_modified = False
decreased_order = False
increased_order = False
# Update the median. # @todo!!!!!!!!!!!!!!!!!!!!!!
median_modified = self.__update_median(graphs, median)
if not median_modified or self.__itrs[median_pos] == 0:
decreased_order = False
if not decreased_order or self.__itrs[median_pos] == 0:
increased_order = False
# Update the number of iterations without update of the median.
if median_modified or decreased_order or increased_order:
itrs_without_update = 0
else:
itrs_without_update += 1
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('Loading median to environment: ... ', end='')
# Load the median into the environment.
# @todo: should this function use the original node label?
self.__ged_env.load_nx_graph(median, gen_median_id)
self.__ged_env.init(self.__ged_env.get_init_type())
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('done.')
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('Updating induced costs: ... ', end='')

# Compute induced costs of the old node maps w.r.t. the updated median.
for graph_id in graph_ids:
# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id))
# @todo: watch out if compute_induced_cost is correct, this may influence: increase/decrease order, induced_cost() in the following code.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
self.__ged_env.compute_induced_cost(gen_median_id, graph_id)
# print('---------------------------------------')
# print(self.__ged_env.get_induced_cost(gen_median_id, graph_id))
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('done.')
# Update the node maps.
node_maps_modified = self.__update_node_maps() # @todo

# Update the order of the median if no improvement can be found with the current order.
# Update the sum of distances.
old_sum_of_distances = self.__sum_of_distances
self.__sum_of_distances = 0
for graph_id in self.__node_maps_from_median:
self.__sum_of_distances += self.__ged_env.get_induced_cost(gen_median_id, graph_id) # @todo: see above.
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('Old local SOD: ', old_sum_of_distances)
print('New local SOD: ', self.__sum_of_distances)
print('Best converged SOD: ', best_sum_of_distances)
print('Modified median: ', median_modified)
print('Modified node maps: ', node_maps_modified)
print('Decreased order: ', decreased_order)
print('Increased order: ', increased_order)
print('===========================================================\n')
converged = not (median_modified or node_maps_modified or decreased_order or increased_order)
self.__itrs[median_pos] += 1
# Update the best median.
if self.__sum_of_distances < self.__best_init_sum_of_distances:
best_sum_of_distances = self.__sum_of_distances
node_maps_from_best_median = self.__node_maps_from_median
best_median = median
# Update the number of converged descents.
if converged:
self.__num_converged_descents += 1
# Store the best encountered median.
self.__sum_of_distances = best_sum_of_distances
self.__node_maps_from_median = node_maps_from_best_median
self.__ged_env.load_nx_graph(best_median, gen_median_id)
self.__ged_env.init(self.__ged_env.get_init_type())
end_descent = time.time()
self.__runtime_converged = end_descent - start
# Refine the sum of distances and the node maps for the converged median.
self.__converged_sum_of_distances = self.__sum_of_distances
if self.__refine:
self.__improve_sum_of_distances(timer) # @todo
# Record end time, set runtime and reset the number of initial medians.
end = time.time()
self.__runtime = end - start
self.__num_random_inits = self.__desired_num_random_inits
# Print global information.
if self.__print_to_stdout != 0:
print('\n===========================================================')
print('Finished computation of generalized median graph.')
print('-----------------------------------------------------------')
print('Best SOD after initialization: ', self.__best_init_sum_of_distances)
print('Converged SOD: ', self.__converged_sum_of_distances)
if self.__refine:
print('Refined SOD: ', self.__sum_of_distances)
print('Overall runtime: ', self.__runtime)
print('Runtime of initialization: ', self.__runtime_initialized)
print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized)
if self.__refine:
print('Runtime of refinement: ', self.__runtime - self.__runtime_converged)
print('Number of initial medians: ', len(medians))
total_itr = 0
num_started_descents = 0
for itr in self.__itrs:
total_itr += itr
if itr > 0:
num_started_descents += 1
print('Size of graph collection: ', len(graph_ids))
print('Number of started descents: ', num_started_descents)
print('Number of converged descents: ', self.__num_converged_descents)
print('Overall number of iterations: ', total_itr)
print('Overall number of times the order decreased: ', self.__num_decrease_order)
print('Overall number of times the order increased: ', self.__num_increase_order)
print('===========================================================\n')
def get_sum_of_distances(self, state=''):
"""Returns the sum of distances.
Parameters
----------
state : string
The state of the estimator. Can be 'initialized' or 'converged'. Default: ""
Returns
-------
float
The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned.
"""
if not self.__median_available():
raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().')
if state == 'initialized':
return self.__best_init_sum_of_distances
if state == 'converged':
return self.__converged_sum_of_distances
return self.__sum_of_distances
def __set_default_options(self):
self.__init_type = 'RANDOM'
self.__num_random_inits = 10
self.__desired_num_random_inits = 10
self.__use_real_randomness = True
self.__seed = 0
self.__refine = True
self.__time_limit_in_sec = 0
self.__epsilon = 0.0001
self.__max_itrs = 100
self.__max_itrs_without_update = 3
self.__num_inits_increase_order = 10
self.__init_type_increase_order = 'K-MEANS++'
self.__max_itrs_increase_order = 10
self.__print_to_stdout = 2
def __construct_initial_medians(self, graph_ids, timer, initial_medians):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n===========================================================')
print('Constructing initial median(s).')
print('-----------------------------------------------------------')
# Compute or sample the initial median(s).
initial_medians.clear()
if self.__init_type == 'MEDOID':
self.__compute_medoid(graph_ids, timer, initial_medians)
elif self.__init_type == 'MAX':
pass # @todo
# compute_max_order_graph_(graph_ids, initial_medians)
elif self.__init_type == 'MIN':
pass # @todo
# compute_min_order_graph_(graph_ids, initial_medians)
elif self.__init_type == 'MEAN':
pass # @todo
# compute_mean_order_graph_(graph_ids, initial_medians)
else:
pass # @todo
# sample_initial_medians_(graph_ids, initial_medians)

# Print information about current iteration.
if self.__print_to_stdout == 2:
print('===========================================================')
def __compute_medoid(self, graph_ids, timer, initial_medians):
# Use method selected for initialization phase.
self.__ged_env.set_method(self.__init_method, self.__init_options)
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rComputing medoid', total=len(graph_ids), file=sys.stdout)
# Compute the medoid.
medoid_id = graph_ids[0]
best_sum_of_distances = np.inf
for g_id in graph_ids:
if timer.expired():
self.__state = AlgorithmState.CALLED
break
sum_of_distances = 0
for h_id in graph_ids:
self.__ged_env.run_method(g_id, h_id)
sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id)
if sum_of_distances < best_sum_of_distances:
best_sum_of_distances = sum_of_distances
medoid_id = g_id
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress.update(1)
initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')
def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
if self.__state == AlgorithmState.TERMINATED:
self.__state = AlgorithmState.INITIALIZED
return True
return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
def __update_median(self, graphs, median):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('Updating median: ', end='')
# Store copy of the old median.
old_median = median.copy() # @todo: this is just a shallow copy.
# Update the node labels.
if self.__labeled_nodes:
self.__update_node_labels(graphs, median)
# Update the edges and their labels.
self.__update_edges(graphs, median)
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('done.')
return not self.__are_graphs_equal(median, old_median)
def __update_node_labels(self, graphs, median):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('nodes ... ', end='')
# Iterate through all nodes of the median.
for i in range(0, nx.number_of_nodes(median)):
# print('i: ', i)
# Collect the labels of the substituted nodes.
node_labels = []
for graph_id, graph in graphs.items():
# print('graph_id: ', graph_id)
# print(self.__node_maps_from_median[graph_id])
k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i)
# print('k: ', k)
if k != np.inf:
node_labels.append(graph.nodes[k])
# Compute the median label and update the median.
if len(node_labels) > 0:
median_label = self.__ged_env.get_median_node_label(node_labels)
if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon:
nx.set_node_attributes(median, {i: median_label})
def __update_edges(self, graphs, median):
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('edges ... ', end='')
# Clear the adjacency lists of the median and reset number of edges to 0.
median_edges = list(median.edges)
for (head, tail) in median_edges:
median.remove_edge(head, tail)
# @todo: what if edge is not labeled?
# Iterate through all possible edges (i,j) of the median.
for i in range(0, nx.number_of_nodes(median)):
for j in range(i + 1, nx.number_of_nodes(median)):
# Collect the labels of the edges to which (i,j) is mapped by the node maps.
edge_labels = []
for graph_id, graph in graphs.items():
k = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], i)
l = self.__get_node_image_from_map(self.__node_maps_from_median[graph_id], j)
if k != np.inf and l != np.inf:
if graph.has_edge(k, l):
edge_labels.append(graph.edges[(k, l)])
# Compute the median edge label and the overall edge relabeling cost.
rel_cost = 0
median_label = self.__ged_env.get_edge_label(1)
if median.has_edge(i, j):
median_label = median.edges[(i, j)]
if self.__labeled_edges and len(edge_labels) > 0:
new_median_label = self.__ged_env.median_edge_label(edge_labels)
if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon:
median_label = new_median_label
for edge_label in edge_labels:
rel_cost += self.__ged_env.get_edge_rel_cost(median_label, edge_label)
# Update the median.
if rel_cost < (self.__edge_ins_cost + self.__edge_del_cost) * len(edge_labels) - self.__edge_del_cost * len(graphs):
median.add_edge(i, j, **median_label)
else:
if median.has_edge(i, j):
median.remove_edge(i, j)


def __update_node_maps(self):
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='\rUpdating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
# Update the node maps.
node_maps_were_modified = False
for graph_id in self.__node_maps_from_median:
self.__ged_env.run_method(self.__median_id, graph_id)
if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < self.__ged_env.get_induced_cost(self.__median_id, graph_id) - self.__epsilon: # @todo: see above.
self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) # @todo: node_map may not assigned.
node_maps_were_modified = True
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress.update(1)
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')
# Return true if the node maps were modified.
return node_maps_were_modified
def __improve_sum_of_distances(self, timer):
pass
def __median_available(self):
return self.__median_id != np.inf
def __get_node_image_from_map(self, node_map, node):
"""
Return ID of the node mapping of `node` in `node_map`.

Parameters
----------
node_map : list[tuple(int, int)]
List of node maps where the mapping node is found.
node : int
The node whose image (mapped node) is returned.

Raises
------
Exception
If the node with ID `node` is not contained in the source nodes of the node map.

Returns
-------
int
ID of the mapping of `node`.
Notes
-----
This function is not implemented in the `ged::MedianGraphEstimator` class of the `GEDLIB` library. Instead it is a Python implementation of the `ged::NodeMap::image` function.
"""
if node < len(node_map):
return node_map[node][1] if node_map[node][1] < len(node_map) else np.inf
else:
raise Exception('The node with ID ' + str(node) + ' is not contained in the source nodes of the node map.')
return np.inf
def __are_graphs_equal(self, g1, g2):
"""
Check if the two graphs are equal.

Parameters
----------
g1 : NetworkX graph object
Graph 1 to be compared.
g2 : NetworkX graph object
Graph 2 to be compared.

Returns
-------
bool
True if the two graphs are equal.

Notes
-----
This is not an identity check. Here the two graphs are equal if and only if their original_node_ids, nodes, all node labels, edges and all edge labels are equal. This function is specifically designed for class `MedianGraphEstimator` and should not be used elsewhere.
"""
# check original node ids.
if not g1.graph['original_node_ids'] == g2.graph['original_node_ids']:
return False
# check nodes.
nlist1 = [n for n in g1.nodes(data=True)]
nlist2 = [n for n in g2.nodes(data=True)]
if not nlist1 == nlist2:
return False
# check edges.
elist1 = [n for n in g1.edges(data=True)]
elist2 = [n for n in g2.edges(data=True)]
if not elist1 == elist2:
return False

return True
def compute_my_cost(g, h, node_map):
cost = 0.0
for node in g.nodes:
cost += 0

+ 215
- 0
gklearn/preimage/median_linlin.py View File

@@ -0,0 +1,215 @@
import sys
import pathlib
import numpy as np
import networkx as nx
from gedlibpy import librariesImport, gedlibpy
sys.path.insert(0, "/home/bgauzere/dev/optim-graphes/")
import gklearn
def replace_graph_in_env(script, graph, old_id, label='median'):
"""
Replace a graph in script
If old_id is -1, add a new graph to the environment
"""
if(old_id > -1):
script.PyClearGraph(old_id)
new_id = script.PyAddGraph(label)
for i in graph.nodes():
script.PyAddNode(new_id,str(i),graph.node[i]) # !! strings are required by gedlib
for e in graph.edges:
script.PyAddEdge(new_id, str(e[0]),str(e[1]), {})
script.PyInitEnv()
script.PySetMethod("IPFP", "")
script.PyInitMethod()
return new_id
#Draw the current median
def draw_Letter_graph(graph):
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
nx.draw_networkx(graph,pos)
plt.show()
#compute new mappings
def update_mappings(script,median_id,listID):
med_distances = {}
med_mappings = {}
sod = 0
for i in range(0,len(listID)):
script.PyRunMethod(median_id,listID[i])
med_distances[i] = script.PyGetUpperBound(median_id,listID[i])
med_mappings[i] = script.PyGetForwardMap(median_id,listID[i])
sod += med_distances[i]
return med_distances, med_mappings, sod
def calcul_Sij(all_mappings, all_graphs,i,j):
s_ij = 0
for k in range(0,len(all_mappings)):
cur_graph = all_graphs[k]
cur_mapping = all_mappings[k]
size_graph = cur_graph.order()
if ((cur_mapping[i] < size_graph) and
(cur_mapping[j] < size_graph) and
(cur_graph.has_edge(cur_mapping[i], cur_mapping[j]) == True)):
s_ij += 1
return s_ij
# def update_median_nodes_L1(median,listIdSet,median_id,dataset, mappings):
# from scipy.stats.mstats import gmean
# for i in median.nodes():
# for k in listIdSet:
# vectors = [] #np.zeros((len(listIdSet),2))
# if(k != median_id):
# phi_i = mappings[k][i]
# if(phi_i < dataset[k].order()):
# vectors.append([float(dataset[k].node[phi_i]['x']),float(dataset[k].node[phi_i]['y'])])
# new_labels = gmean(vectors)
# median.node[i]['x'] = str(new_labels[0])
# median.node[i]['y'] = str(new_labels[1])
# return median
def update_median_nodes(median,dataset,mappings):
#update node attributes
for i in median.nodes():
nb_sub=0
mean_label = {'x' : 0, 'y' : 0}
for k in range(0,len(mappings)):
phi_i = mappings[k][i]
if ( phi_i < dataset[k].order() ):
nb_sub += 1
mean_label['x'] += 0.75*float(dataset[k].node[phi_i]['x'])
mean_label['y'] += 0.75*float(dataset[k].node[phi_i]['y'])
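# the 0.75 weight (the alpha used for Letter-high) cancels with the 1/0.75
# factor below, so the new coordinates are simply the arithmetic mean of the
# mapped node coordinates.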
median.node[i]['x'] = str((1/0.75)*(mean_label['x']/nb_sub))
median.node[i]['y'] = str((1/0.75)*(mean_label['y']/nb_sub))
return median
def update_median_edges(dataset, mappings, median, cei=0.425,cer=0.425):
#for letter high, ceir = 1.7, alpha = 0.75
size_dataset = len(dataset)
ratio_cei_cer = cer/(cei + cer)
threshold = size_dataset*ratio_cei_cer
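# An edge (i, j) is kept iff enough graphs contain the corresponding mapped edge:
# keeping it costs (size_dataset - s_ij) * cer deletions in the graphs lacking it,
# dropping it costs s_ij * cei insertions in the graphs having it, so the edge is
# added exactly when s_ij > size_dataset * cer / (cei + cer).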
order_graph_median = median.order()
for i in range(0,order_graph_median):
for j in range(i+1,order_graph_median):
s_ij = calcul_Sij(mappings,dataset,i,j)
if(s_ij > threshold):
median.add_edge(i,j)
else:
if(median.has_edge(i,j)):
median.remove_edge(i,j)
return median
def compute_median(script, listID, dataset,verbose=False):
"""Compute a graph median of a dataset according to an environment
Parameters
script : a gedlib-initialized environment
listID (list): a list of graph IDs in script; encodes the dataset
dataset (list): corresponding graphs in networkX format. We assume that graph
listID[i] corresponds to dataset[i]
Returns:
A networkX graph, which is the median, with corresponding sod
"""
print(len(listID))
median_set_index, median_set_sod = compute_median_set(script, listID)
print(median_set_index)
print(median_set_sod)
sods = []
#Add the median to the environment
set_median = dataset[median_set_index].copy()
median = dataset[median_set_index].copy()
cur_med_id = replace_graph_in_env(script,median,-1)
med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
sods.append(cur_sod)
if(verbose):
print(cur_sod)
ite_max = 50
old_sod = cur_sod * 2
ite = 0
epsilon = 0.001
# best_median # @todo: never assigned nor used.
while((ite < ite_max) and (np.abs(old_sod - cur_sod) > epsilon )):
median = update_median_nodes(median,dataset, med_mappings)
median = update_median_edges(dataset,med_mappings,median)
cur_med_id = replace_graph_in_env(script,median,cur_med_id)
med_distances, med_mappings, cur_sod = update_mappings(script,cur_med_id,listID)
sods.append(cur_sod)
if(verbose):
print(cur_sod)
ite += 1
return median, cur_sod, sods, set_median
# draw_Letter_graph(median) # unreachable: placed after the return above.
def compute_median_set(script,listID):
'Returns the id in listID corresponding to median set'
#Compute the set median
N=len(listID)
map_id_to_index = {}
map_index_to_id = {}
for i in range(0,len(listID)):
map_id_to_index[listID[i]] = i
map_index_to_id[i] = listID[i]
distances = np.zeros((N,N))
for i in listID:
for j in listID:
script.PyRunMethod(i,j)
distances[map_id_to_index[i],map_id_to_index[j]] = script.PyGetUpperBound(i,j)
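# the set median is the dataset graph whose summed distance (SOD) to all other
# graphs is minimal.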
median_set_index = np.argmin(np.sum(distances,0))
sod = np.min(np.sum(distances,0))
return median_set_index, sod
def _convertGraph(G):
"""Convert a graph to the proper NetworkX format that can be
recognized by library gedlibpy.
"""
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
# G_new.add_node(str(nd), x=str(attrs['attributes'][0]),
# y=str(attrs['attributes'][1]))
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
# G_new.add_edge(str(nd1), str(nd2))
return G_new
if __name__ == "__main__":
#Load the dataset
gedlibpy.PyLoadGXLGraph('/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/', '/home/bgauzere/dev/gedlib/data/collections/Letter_Z.xml')
gedlibpy.PySetEditCost("LETTER")
gedlibpy.PyInitEnv()
gedlibpy.PySetMethod("IPFP", "")
gedlibpy.PyInitMethod()
dataset,my_y = gklearn.utils.graphfiles.loadDataset("/home/bgauzere/dev/gedlib/data/datasets/Letter/HIGH/Letter_Z.cxl")
listID = gedlibpy.PyGetAllGraphIds()
median, sod, sods, set_median = compute_median(gedlibpy,listID,dataset,verbose=True)
print(sod)
draw_Letter_graph(median)

+ 15
- 0
gklearn/preimage/median_preimage_generator.py View File

@@ -0,0 +1,15 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:27:22 2020

@author: ljia
"""
from gklearn.preimage.preimage_generator import PreimageGenerator
# from gklearn.utils.dataset import Dataset

class MedianPreimageGenerator(PreimageGenerator):
def __init__(self, mge, dataset):
self.__mge = mge
self.__dataset = dataset

+ 108
- 0
gklearn/preimage/misc.py View File

@@ -0,0 +1,108 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 19 18:13:56 2020

@author: ljia
"""

def options_string_to_options_map(options_string):
"""Transforms an options string into an options map.
Parameters
----------
options_string : string
Options string of the form "[--<option> <arg>] [...]".
Returns
-------
options_map : dict{string : string}
Map with one key-value pair (<option>, <arg>) for each option contained in the string.
"""
if options_string == '':
return {}
options_map = {}
words = []
tokenize(options_string, ' ', words)
expect_option_name = True
for word in words:
if expect_option_name:
is_opt_name, word = is_option_name(word)
if is_opt_name:
option_name = word
if option_name in options_map:
raise Exception('Multiple specification of option "' + option_name + '".')
options_map[option_name] = ''
else:
raise Exception('Invalid options "' + options_string + '". Usage: options = "[--<option> <arg>] [...]"')
else:
is_opt_name, word = is_option_name(word)
if is_opt_name:
raise Exception('Invalid options "' + options_string + '". Usage: options = "[--<option> <arg>] [...]"')
else:
options_map[option_name] = word
expect_option_name = not expect_option_name
return options_map

def tokenize(sentence, sep, words):
"""Separates a sentence into words separated by sep (unless contained in single quotes).
Parameters
----------
sentence : string
The sentence that should be tokenized.
sep : string
The separator. Must be different from "'".
words : list[string]
The obtained words.
"""
outside_quotes = True
word_length = 0
pos_word_start = 0
for pos in range(0, len(sentence)):
if sentence[pos] == '\'':
if not outside_quotes and pos < len(sentence) - 1:
if sentence[pos + 1] != sep:
raise Exception('Sentence contains closing single quote which is followed by a char different from ' + sep + '.')
word_length += 1
outside_quotes = not outside_quotes
elif outside_quotes and sentence[pos] == sep:
if word_length > 0:
words.append(sentence[pos_word_start:pos_word_start + word_length])
pos_word_start = pos + 1
word_length = 0
else:
word_length += 1
if not outside_quotes:
raise Exception('Sentence contains unbalanced single quotes.')
if word_length > 0:
words.append(sentence[pos_word_start:pos_word_start + word_length])


def is_option_name(word):
"""Checks whether a word is an option name and, if so, removes the leading dashes.
Parameters
----------
word : string
Word.
Returns
-------
is_opt_name : bool
True if `word` is of the form "--<option>".
word : string
The word without the leading dashes.
"""
if word[0] == '\'':
word = word[1:len(word) - 2]
return False, word
if len(word) < 3:
return False, word
if word[0] == '-' and word[1] == '-' and word[2] != '-':
word = word[2:]
return True, word
return False, word
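# A minimal usage sketch (the option names below are illustrative, not from this file):
# options_string_to_options_map('--max-itrs 100 --randomness PSEUDO')
# returns {'max-itrs': '100', 'randomness': 'PSEUDO'}.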

+ 201
- 0
gklearn/preimage/pathfrequency.py View File

@@ -0,0 +1,201 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 20 10:12:15 2019

Inferring a graph from path frequency.
@author: ljia
"""
#import numpy as np
import networkx as nx
from scipy.spatial.distance import hamming
import itertools

def SISF(K, v):
output = None # @todo: not implemented yet.
if output:
return output
else:
return 'no solution'

def SISF_M(K, v):
output = None # @todo: not implemented yet.
return output


def GIPF_tree(v_obj, K=1, alphabet=[0, 1]):
if K == 1:
n_graph = v_obj[0] + v_obj[1]
D_T, father_idx = getDynamicTable(n_graph, alphabet)
# get the vector the closest to v_obj.
if v_obj not in D_T:
print('no exact solution')
dis_lim = 1 / len(v_obj) # the possible shortest distance.
dis_min = 1.0 # minimum proportional distance
v_min = v_obj
for vc in D_T:
if vc[0] + vc[1] == n_graph:
# print(vc)
dis = hamming(vc, v_obj)
if dis < dis_min:
dis_min = dis
v_min = vc
if dis_min <= dis_lim:
break
v_obj = v_min
# obtain required graph by traceback procedure.
return getObjectGraph(v_obj, D_T, father_idx, alphabet), v_obj
def GIPF_M(K, v):
G = None # @todo: not implemented yet.
return G


def getDynamicTable(n_graph, alphabet=[0, 1]):
# init. When only one node exists.
D_T = {(1, 0, 0, 0, 0, 0): 1, (0, 1, 0, 0, 0, 0): 1, (0, 0, 1, 0, 0, 0): 0,
(0, 0, 0, 1, 0, 0): 0, (0, 0, 0, 0, 1, 0): 0, (0, 0, 0, 0, 0, 1): 0,}
D_T = [(1, 0, 0, 0, 0, 0), (0, 1, 0, 0, 0, 0)]
father_idx = [-1, -1] # index of each vector's father
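# Each vector apparently encodes (#nodes labeled 0, #nodes labeled 1, frequencies
# of ordered length-1 paths 0-0, 0-1, 1-0, 1-1); this reading is inferred from the
# four extension rules below, each of which attaches one new node to an existing one.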
# add possible vectors.
for idx, v in enumerate(D_T):
if v[0] + v[1] < n_graph:
D_T.append((v[0] + 1, v[1], v[2] + 2, v[3], v[4], v[5]))
D_T.append((v[0] + 1, v[1], v[2], v[3] + 1, v[4] + 1, v[5]))
D_T.append((v[0], v[1] + 1, v[2], v[3] + 1, v[4] + 1, v[5]))
D_T.append((v[0], v[1] + 1, v[2], v[3], v[4], v[5] + 2))
father_idx += [idx, idx, idx, idx]
# D_T = itertools.chain([(1, 0, 0, 0, 0, 0)], [(0, 1, 0, 0, 0, 0)])
# father_idx = itertools.chain([-1], [-1]) # index of each vector's father
# # add possible vectors.
# for idx, v in enumerate(D_T):
# if v[0] + v[1] < n_graph:
# D_T = itertools.chain(D_T, [(v[0] + 1, v[1], v[2] + 2, v[3], v[4], v[5])])
# D_T = itertools.chain(D_T, [(v[0] + 1, v[1], v[2], v[3] + 1, v[4] + 1, v[5])])
# D_T = itertools.chain(D_T, [(v[0], v[1] + 1, v[2], v[3] + 1, v[4] + 1, v[5])])
# D_T = itertools.chain(D_T, [(v[0], v[1] + 1, v[2], v[3], v[4], v[5] + 2)])
# father_idx = itertools.chain(father_idx, [idx, idx, idx, idx])
return D_T, father_idx


def getObjectGraph(v_obj, D_T, father_idx, alphabet=[0, 1]):
g_obj = nx.Graph()
# do vector traceback.
v_tb = [list(v_obj)] # traceback vectors.
v_tb_idx = [D_T.index(v_obj)] # indices of traceback vectors.
while v_tb_idx[-1] > 1:
idx_pre = father_idx[v_tb_idx[-1]]
v_tb_idx.append(idx_pre)
v_tb.append(list(D_T[idx_pre]))
v_tb = v_tb[::-1] # reverse
# v_tb_idx = v_tb_idx[::-1]

# construct tree.
v_c = v_tb[0] # current vector.
if v_c[0] == 1:
g_obj.add_node(0, node_label=alphabet[0])
else:
g_obj.add_node(0, node_label=alphabet[1])
for vct in v_tb[1:]:
if vct[0] - v_c[0] == 1:
if vct[2] - v_c[2] == 2: # transfer 1
label1 = alphabet[0]
label2 = alphabet[0]
else: # transfer 2
label1 = alphabet[1]
label2 = alphabet[0]
else:
if vct[3] - v_c[3] == 1: # transfer 3
label1 = alphabet[0]
label2 = alphabet[1]
else: # transfer 4
label1 = alphabet[1]
label2 = alphabet[1]
for nd, attr in g_obj.nodes(data=True):
if attr['node_label'] == label1:
nb_node = nx.number_of_nodes(g_obj)
g_obj.add_node(nb_node, node_label=label2)
g_obj.add_edge(nd, nb_node)
break
v_c = vct
return g_obj


import random
def hierarchy_pos(G, root=None, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5):

'''
From Joel's answer at https://stackoverflow.com/a/29597209/2966723.
Licensed under Creative Commons Attribution-Share Alike

If the graph is a tree this will return the positions to plot this in a
hierarchical layout.

G: the graph (must be a tree)

root: the root node of current branch
- if the tree is directed and this is not given,
the root will be found and used
- if the tree is directed and this is given, then
the positions will be just for the descendants of this node.
- if the tree is undirected and not given,
then a random choice will be used.

width: horizontal space allocated for this branch - avoids overlap with other branches

vert_gap: gap between levels of hierarchy

vert_loc: vertical location of root

xcenter: horizontal location of root
'''
if not nx.is_tree(G):
raise TypeError('cannot use hierarchy_pos on a graph that is not a tree')

if root is None:
if isinstance(G, nx.DiGraph):
root = next(iter(nx.topological_sort(G))) #allows back compatibility with nx version 1.11
else:
root = random.choice(list(G.nodes))

def _hierarchy_pos(G, root, width=1., vert_gap = 0.2, vert_loc = 0, xcenter = 0.5, pos = None, parent = None):
'''
see hierarchy_pos docstring for most arguments

pos: a dict saying where all nodes go if they have been assigned
parent: parent of this branch. - only affects it if non-directed

'''

if pos is None:
pos = {root:(xcenter,vert_loc)}
else:
pos[root] = (xcenter, vert_loc)
children = list(G.neighbors(root))
if not isinstance(G, nx.DiGraph) and parent is not None:
children.remove(parent)
if len(children)!=0:
dx = width/len(children)
nextx = xcenter - width/2 - dx/2
for child in children:
nextx += dx
pos = _hierarchy_pos(G,child, width = dx, vert_gap = vert_gap,
vert_loc = vert_loc-vert_gap, xcenter=nextx,
pos=pos, parent = root)
return pos


return _hierarchy_pos(G, root, width, vert_gap, vert_loc, xcenter)


if __name__ == '__main__':
v_obj = (6, 4, 10, 3, 3, 2)
# v_obj = (6, 5, 10, 3, 3, 2)
tree_obj, v_obj = GIPF_tree(v_obj)
print('One closest vector is', v_obj)
# plot
pos = hierarchy_pos(tree_obj, 0)
node_labels = nx.get_node_attributes(tree_obj, 'node_label')
nx.draw(tree_obj, pos=pos, labels=node_labels, with_labels=True)

+ 12
- 0
gklearn/preimage/preimage_generator.py View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:26:36 2020

@author: ljia
"""

class PreimageGenerator(object):
def __init__(self):
pass

+ 705
- 0
gklearn/preimage/preimage_iam.py View File

@@ -0,0 +1,705 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 17:07:43 2019

A graph pre-image method combining iterative pre-image method in reference [1]
and the iterative alternate minimizations (IAM) in reference [2].
@author: ljia
@references:
[1] Gökhan H Bakir, Alexander Zien, and Koji Tsuda. Learning to find graph
pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
[2] Generalized median graph via iterative alternate minimization.
"""
import sys
import numpy as np
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
import random

from iam import iam_upgraded
from utils import dis_gstar, compute_kernel


def preimage_iam(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
gkernel, epsilon=0.001, InitIAMWithAllDk=False,
params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
'ite_max': 50, 'epsilon': 0.001,
'removeNodes': True, 'connected': False},
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
'edit_cost_constant': [], 'stabilizer': 'min',
'repeat': 50}):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
Notes
-----
Every time a set of n better graphs is acquired, their distances in kernel space are
compared with the k nearest ones, and the k nearest distances from the k+n
distances will be used as the new ones.
"""
# compute k nearest neighbors of phi in DN.
dis_all = [] # distance between g_star and each graph.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_all.append(dtemp)
# sort
sort_idx = np.argsort(dis_all)
dis_k = [dis_all[idis] for idis in sort_idx[0:k]] # the k shortest distances
nb_best = len(np.argwhere(dis_k == dis_k[0]).flatten().tolist())
ghat_list = [Gn_init[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
if dis_k[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, ghat_list, 0, 0, 0
dhat = dis_k[0] # the nearest distance
# for g in ghat_list:
# draw_Letter_graph(g)
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
# for gi in Gk:
# nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True)
## nx.draw_networkx(gi)
# plt.show()
## draw_Letter_graph(g)
# print(gi.nodes(data=True))
# print(gi.edges(data=True))
# i = 1
r = 0
itr_total = 0
dis_of_each_itr = [dhat]
found = False
nb_updated = 0
nb_updated_k = 0
while r < r_max:# and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon:
print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-')
print('Current preimage iteration =', r)
print('Total preimage iteration =', itr_total, '\n')
found = False
Gn_nearest_median = [g.copy() for g in Gk]
if InitIAMWithAllDk: # each graph in D_k is used to initialize IAM.
ghat_new_list = []
for g_tmp in Gk:
Gn_nearest_init = [g_tmp.copy()]
ghat_new_list_tmp, _, _ = iam_upgraded(Gn_nearest_median,
Gn_nearest_init, params_ged=params_ged, **params_iam)
ghat_new_list += ghat_new_list_tmp
else: # only the best graph in D_k is used to initialize IAM.
Gn_nearest_init = [g.copy() for g in Gk]
ghat_new_list, _, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
params_ged=params_ged, **params_iam)

# for g in g_tmp_list:
# nx.draw_networkx(g)
# plt.show()
# draw_Letter_graph(g)
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
dhat_new_list = []
for idx, g_tmp in enumerate(ghat_new_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
len(ghat_new_list) + len(Gn_median) + 1),
alpha, knew, withterm3=False))
for idx_g, ghat_new in enumerate(ghat_new_list):
dhat_new = dhat_new_list[idx_g]
# if the new distance is smaller than the max of D_k.
if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon:
# check if the new distance is the same as one in D_k.
is_duplicate = False
for dis_tmp in dis_k[1:-1]:
if np.abs(dhat_new - dis_tmp) < epsilon:
is_duplicate = True
print('IAM: duplicate k nearest graph generated.')
break
if not is_duplicate:
if np.abs(dhat_new - dhat) < epsilon:
print('IAM: I am equal!')
# dhat = dhat_new
# ghat_list = [ghat_new.copy()]
else:
print('IAM: we got better k nearest neighbors!')
nb_updated_k += 1
print('the k nearest neighbors are updated',
nb_updated_k, 'times.')
dis_k = [dhat_new] + dis_k[0:k-1] # add the new nearest distance.
Gk = [ghat_new.copy()] + Gk[0:k-1] # add the corresponding graph.
sort_idx = np.argsort(dis_k)
dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
Gk = [Gk[idx] for idx in sort_idx[0:k]]
if dhat_new < dhat:
print('IAM: I have smaller distance!')
print(str(dhat) + '->' + str(dhat_new))
dhat = dhat_new
ghat_list = [Gk[0].copy()]
r = 0
nb_updated += 1
print('the graph is updated', nb_updated, 'times.')
nx.draw(Gk[0], labels=nx.get_node_attributes(Gk[0], 'atom'),
with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
plt.show()
found = True
if not found:
r += 1

dis_of_each_itr.append(dhat)
itr_total += 1
print('\nthe k shortest distances are', dis_k)
print('the shortest distances for previous iterations are', dis_of_each_itr)
print('\n\nthe graph is updated', nb_updated, 'times.')
print('\nthe k nearest neighbors are updated', nb_updated_k, 'times.')
print('distances in kernel space:', dis_of_each_itr, '\n')
return dhat, ghat_list, dis_of_each_itr[-1], nb_updated, nb_updated_k
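# A minimal call sketch (all values below are illustrative placeholders, not taken
# from this file): with a precomputed Gram matrix Kmatrix over the dataset Gn_init,
# median graphs Gn_median located at indices 0 and 6, and equal weights, one could call
# dhat, ghat_list, dis_last, nb_updated, nb_updated_k = preimage_iam(
# Gn_init, Gn_median, [0.5, 0.5], [0, 6], Kmatrix, 5, 10, 'untilhpathkernel')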




def preimage_iam_random_mix(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max,
l_max, gkernel, epsilon=0.001,
InitIAMWithAllDk=False, InitRandomWithAllDk=True,
params_iam={'c_ei': 1, 'c_er': 1, 'c_es': 1,
'ite_max': 50, 'epsilon': 0.001,
'removeNodes': True, 'connected': False},
params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1',
'method': 'IPFP', 'edit_cost_constant': [],
'stabilizer': 'min', 'repeat': 50}):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where new graphs are generated
randomly and by the IAM algorithm in reference [2].
Notes
-----
Every time a set of n better graphs is acquired, their distances in kernel space are
compared with the k nearest ones, and the k nearest distances from the k+n
distances will be used as the new ones.
"""
Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init]
# compute k nearest neighbors of phi in DN.
dis_all = [] # distance between g_star and each graph.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_all.append(dtemp)
# sort
sort_idx = np.argsort(dis_all)
dis_k = [dis_all[idis] for idis in sort_idx[0:k]] # the k shortest distances
nb_best = len(np.argwhere(dis_k == dis_k[0]).flatten().tolist())
ghat_list = [Gn_init[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of psi in DN
if dis_k[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, ghat_list, 0, 0, 0, 0, 0
dhat = dis_k[0] # the nearest distance
# for g in ghat_list:
# draw_Letter_graph(g)
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
# for gi in Gk:
# nx.draw(gi, labels=nx.get_node_attributes(gi, 'atom'), with_labels=True)
## nx.draw_networkx(gi)
# plt.show()
## draw_Letter_graph(g)
# print(gi.nodes(data=True))
# print(gi.edges(data=True))
r = 0
itr_total = 0
dis_of_each_itr = [dhat]
nb_updated_iam = 0
nb_updated_k_iam = 0
nb_updated_random = 0
nb_updated_k_random = 0
# is_iam_duplicate = False
while r < r_max: # and not found: # @todo: if not found?# and np.abs(old_dis - cur_dis) > epsilon:
print('\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-')
print('Current preimage iteration =', r)
print('Total preimage iteration =', itr_total, '\n')
found_iam = False

Gn_nearest_median = [g.copy() for g in Gk]
if InitIAMWithAllDk: # each graph in D_k is used to initialize IAM.
ghat_new_list = []
for g_tmp in Gk:
Gn_nearest_init = [g_tmp.copy()]
ghat_new_list_tmp, _ = iam_upgraded(Gn_nearest_median,
Gn_nearest_init, params_ged=params_ged, **params_iam)
ghat_new_list += ghat_new_list_tmp
else: # only the best graph in D_k is used to initialize IAM.
Gn_nearest_init = [g.copy() for g in Gk]
ghat_new_list, _ = iam_upgraded(Gn_nearest_median, Gn_nearest_init,
params_ged=params_ged, **params_iam)

# for g in g_tmp_list:
# nx.draw_networkx(g)
# plt.show()
# draw_Letter_graph(g)
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
dhat_new_list = []
for idx, g_tmp in enumerate(ghat_new_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
len(ghat_new_list) + len(Gn_median) + 1),
alpha, knew, withterm3=False))
# find the new k nearest graphs.
for idx_g, ghat_new in enumerate(ghat_new_list):
dhat_new = dhat_new_list[idx_g]
# if the new distance is smaller than the max of D_k.
if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon:
# check if the new distance is the same as one in D_k.
is_duplicate = False
for dis_tmp in dis_k[1:-1]:
if np.abs(dhat_new - dis_tmp) < epsilon:
is_duplicate = True
print('IAM: duplicate k nearest graph generated.')
break
if not is_duplicate:
if np.abs(dhat_new - dhat) < epsilon:
print('IAM: I am equal!')
# dhat = dhat_new
# ghat_list = [ghat_new.copy()]
else:
print('IAM: we got better k nearest neighbors!')
nb_updated_k_iam += 1
print('the k nearest neighbors are updated',
nb_updated_k_iam, 'times.')
dis_k = [dhat_new] + dis_k[0:k-1] # add the new nearest distance.
Gk = [ghat_new.copy()] + Gk[0:k-1] # add the corresponding graph.
sort_idx = np.argsort(dis_k)
dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
Gk = [Gk[idx] for idx in sort_idx[0:k]]
if dhat_new < dhat:
print('IAM: I have smaller distance!')
print(str(dhat) + '->' + str(dhat_new))
dhat = dhat_new
ghat_list = [Gk[0].copy()]
r = 0
nb_updated_iam += 1
print('the graph is updated by IAM', nb_updated_iam,
'times.')
nx.draw(Gk[0], labels=nx.get_node_attributes(Gk[0], 'atom'),
with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
plt.show()
found_iam = True
# when new distance is not smaller than the max of D_k, use random generation.
if not found_iam:
print('Distance not better, switching to random generation now.')
print(str(dhat) + '->' + str(dhat_new))
if InitRandomWithAllDk: # use all k nearest graphs as the initials.
init_list = [g_init.copy() for g_init in Gk]
else: # use just the nearest graph as the initial.
init_list = [Gk[0].copy()]
# number of edges to be changed.
if len(init_list) == 1:
# @todo what if the log is negative? how to choose alpha (scalar)? seems fdgs is always 1.
# fdgs = dhat_new
fdgs = nb_updated_random + 1
if fdgs < 1:
fdgs = 1
fdgs = int(np.ceil(np.log(fdgs)))
if fdgs < 1:
fdgs += 1
# fdgs = nb_updated_random + 1 # @todo:
fdgs_list = [fdgs]
else:
# @todo what if the log is negative? how to choose alpha (scalar)?
fdgs_list = np.array(dis_k[:])
if np.min(fdgs_list) < 1:
fdgs_list /= dis_k[0]
fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))]
if np.min(fdgs_list) < 1:
fdgs_list = np.array(fdgs_list) + 1
l = 0
found_random = False
while l < l_max and not found_random:
for idx_g, g_tmp in enumerate(init_list):
# add and delete edges.
ghat_new = nx.convert_node_labels_to_integers(g_tmp.copy())
# @todo: should we use just half of the adjacency matrix for undirected graphs?
nb_vpairs = nx.number_of_nodes(ghat_new) * (nx.number_of_nodes(ghat_new) - 1)
np.random.seed()
# which edges to change.
# @todo: what if fdgs is bigger than nb_vpairs?
idx_change = random.sample(range(nb_vpairs), fdgs_list[idx_g] if
fdgs_list[idx_g] < nb_vpairs else nb_vpairs)
# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
# (nx.number_of_nodes(gs) - 1), fdgs)
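# idx_change encodes ordered pairs of distinct nodes: with n nodes,
# node1 = item // (n - 1) and node2 = item % (n - 1); the "+ 1" below skips the
# diagonal so that node2 != node1.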
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(ghat_new) - 1))
node2 = (item - node1 * (nx.number_of_nodes(ghat_new) - 1))
if node2 >= node1: # skip the self pair.
node2 += 1
# @todo: is the randomness correct?
if not ghat_new.has_edge(node1, node2):
ghat_new.add_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(ghat_new)
# plt.show()
else:
ghat_new.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(ghat_new)
# plt.show()
# nx.draw_networkx(ghat_new)
# plt.show()
# compute distance between \psi and the new generated graph.
knew = compute_kernel([ghat_new] + Gn_median, gkernel, verbose=False)
dhat_new = dis_gstar(0, range(1, len(Gn_median) + 1),
alpha, knew, withterm3=False)
# @todo: the new distance is smaller or also equal?
if dhat_new < dis_k[-1] and np.abs(dhat_new - dis_k[-1]) >= epsilon:
# check if the new distance is the same as one in D_k.
is_duplicate = False
for dis_tmp in dis_k[1:-1]:
if np.abs(dhat_new - dis_tmp) < epsilon:
is_duplicate = True
print('Random: duplicate k nearest graph generated.')
break
if not is_duplicate:
if np.abs(dhat_new - dhat) < epsilon:
print('Random: I am equal!')
# dhat = dhat_new
# ghat_list = [ghat_new.copy()]
else:
print('Random: we got better k nearest neighbors!')
print('l =', str(l))
nb_updated_k_random += 1
print('the k nearest neighbors are updated by random generation',
nb_updated_k_random, 'times.')
dis_k = [dhat_new] + dis_k # add the new nearest distances.
Gk = [ghat_new.copy()] + Gk # add the corresponding graphs.
sort_idx = np.argsort(dis_k)
dis_k = [dis_k[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
Gk = [Gk[idx] for idx in sort_idx[0:k]]
if dhat_new < dhat:
print('\nRandom: I am smaller!')
print('l =', str(l))
print(dhat, '->', dhat_new)
dhat = dhat_new
ghat_list = [ghat_new.copy()]
r = 0
nb_updated_random += 1
print('the graph is updated by random generation',
nb_updated_random, 'times.')
nx.draw(ghat_new, labels=nx.get_node_attributes(ghat_new, 'atom'),
with_labels=True)
## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
plt.show()
found_random = True
break
l += 1
if not found_random: # l == l_max:
r += 1
dis_of_each_itr.append(dhat)
itr_total += 1
print('\nthe k shortest distances are', dis_k)
print('the shortest distances for previous iterations are', dis_of_each_itr)
print('\n\nthe graph is updated by IAM', nb_updated_iam, 'times, and by random generation',
nb_updated_random, 'times.')
print('\nthe k nearest neighbors are updated by IAM', nb_updated_k_iam,
'times, and by random generation', nb_updated_k_random, 'times.')
print('distances in kernel space:', dis_of_each_itr, '\n')
return dhat, ghat_list, dis_of_each_itr[-1], \
nb_updated_iam, nb_updated_random, nb_updated_k_iam, nb_updated_k_random


###############################################################################
# Old implementations.
#def gk_iam(Gn, alpha):
# """This function constructs graph pre-image by the iterative pre-image
# framework in reference [1], algorithm 1, where the step of generating new
# graphs randomly is replaced by the IAM algorithm in reference [2].
#
# notes
# -----
# Every time a better graph is acquired, the older one is replaced by it.
# """
# pass
# # compute k nearest neighbors of phi in DN.
# dis_list = [] # distance between g_star and each graph.
# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
# k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
# dis_list.append(dtemp)
#
# # sort
# sort_idx = np.argsort(dis_list)
# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
# g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
# if dis_gs[0] == 0: # the exact pre-image.
# print('The exact pre-image is found from the input dataset.')
# return 0, g0hat
# dhat = dis_gs[0] # the nearest distance
# Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
# gihat_list = []
#
## i = 1
# r = 1
# while r < r_max:
# print('r =', r)
## found = False
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
#
# # compute distance between \psi and the new generated graph.
# knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
# dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
# knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
# if dnew <= dhat: # the new distance is smaller
# print('I am smaller!')
# dhat = dnew
# g_new = g_tmp.copy() # found better graph.
# gihat_list = [g_new]
# dis_gs.append(dhat)
# r = 0
# else:
# r += 1
#
# ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
#
# return dhat, ghat


#def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
# """This function constructs graph pre-image by the iterative pre-image
# framework in reference [1], algorithm 1, where the step of generating new
# graphs randomly is replaced by the IAM algorithm in reference [2].
#
# notes
# -----
# Every time a better graph is acquired, its distance in kernel space is
# compared with the k nearest ones, and the k nearest distances from the k+1
# distances will be used as the new ones.
# """
# # compute k nearest neighbors of phi in DN.
# dis_list = [] # distance between g_star and each graph.
# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
# dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix)
## dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
## k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha *
## (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha *
## k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6])
# dis_list.append(dtemp)
#
# # sort
# sort_idx = np.argsort(dis_list)
# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
# g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
# if dis_gs[0] == 0: # the exact pre-image.
# print('The exact pre-image is found from the input dataset.')
# return 0, g0hat
# dhat = dis_gs[0] # the nearest distance
# ghat = g0hat.copy()
# Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
# for gi in Gk:
# nx.draw_networkx(gi)
# plt.show()
# print(gi.nodes(data=True))
# print(gi.edges(data=True))
# Gs_nearest = Gk.copy()
## gihat_list = []
#
## i = 1
# r = 1
# while r < r_max:
# print('r =', r)
## found = False
## Gs_nearest = Gk + gihat_list
## g_tmp = iam(Gs_nearest)
# g_tmp = test_iam_with_more_graphs_as_init(Gs_nearest, Gs_nearest, c_ei=1, c_er=1, c_es=1)
# nx.draw_networkx(g_tmp)
# plt.show()
# print(g_tmp.nodes(data=True))
# print(g_tmp.edges(data=True))
#
# # compute distance between \psi and the new generated graph.
# gi_list = [Gn[i] for i in idx_gi]
# knew = compute_kernel([g_tmp] + gi_list, 'untilhpathkernel', False)
# dnew = dis_gstar(0, range(1, len(gi_list) + 1), alpha, knew)
#
## dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
## knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
## alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
## k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
# if dnew <= dhat and g_tmp != ghat: # the new distance is smaller
# print('I am smaller!')
# print(str(dhat) + '->' + str(dnew))
## nx.draw_networkx(ghat)
## plt.show()
## print('->')
## nx.draw_networkx(g_tmp)
## plt.show()
#
# dhat = dnew
# g_new = g_tmp.copy() # found better graph.
# ghat = g_tmp.copy()
# dis_gs.append(dhat) # add the new nearest distance.
# Gs_nearest.append(g_new) # add the corresponding graph.
# sort_idx = np.argsort(dis_gs)
# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
# r = 0
# else:
# r += 1
#
# return dhat, ghat


#def gk_iam_nearest_multi(Gn, alpha, idx_gi, Kmatrix, k, r_max):
# """This function constructs graph pre-image by the iterative pre-image
# framework in reference [1], algorithm 1, where the step of generating new
# graphs randomly is replaced by the IAM algorithm in reference [2].
#
# notes
# -----
# Every time a set of n better graphs is acquired, their distances in kernel space are
# compared with the k nearest ones, and the k nearest distances from the k+n
# distances will be used as the new ones.
# """
# Gn_median = [Gn[idx].copy() for idx in idx_gi]
# # compute k nearest neighbors of phi in DN.
# dis_list = [] # distance between g_star and each graph.
# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
# dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix)
## dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
## k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha *
## (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha *
## k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6])
# dis_list.append(dtemp)
#
# # sort
# sort_idx = np.argsort(dis_list)
# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
# g0hat_list = [Gn[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
# if dis_gs[0] == 0: # the exact pre-image.
# print('The exact pre-image is found from the input dataset.')
# return 0, g0hat_list
# dhat = dis_gs[0] # the nearest distance
# ghat_list = [g.copy() for g in g0hat_list]
# for g in ghat_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
# for gi in Gk:
# nx.draw_networkx(gi)
# plt.show()
# print(gi.nodes(data=True))
# print(gi.edges(data=True))
# Gs_nearest = Gk.copy()
## gihat_list = []
#
## i = 1
# r = 1
# while r < r_max:
# print('r =', r)
## found = False
## Gs_nearest = Gk + gihat_list
## g_tmp = iam(Gs_nearest)
# g_tmp_list = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
# Gn_median, Gs_nearest, c_ei=1, c_er=1, c_es=1)
# for g in g_tmp_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
#
# # compute distance between \psi and the new generated graphs.
# gi_list = [Gn[i] for i in idx_gi]
# knew = compute_kernel(g_tmp_list + gi_list, 'marginalizedkernel', False)
# dnew_list = []
# for idx, g_tmp in enumerate(g_tmp_list):
# dnew_list.append(dis_gstar(idx, range(len(g_tmp_list),
# len(g_tmp_list) + len(gi_list) + 1), alpha, knew))
#
## dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
## knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
## alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
## k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
#
# # find the new k nearest graphs.
# dis_gs = dnew_list + dis_gs # add the new nearest distances.
# Gs_nearest = [g.copy() for g in g_tmp_list] + Gs_nearest # add the corresponding graphs.
# sort_idx = np.argsort(dis_gs)
# if len([i for i in sort_idx[0:k] if i < len(dnew_list)]) > 0:
# print('We got better k nearest neighbors! Hurray!')
# dis_gs = [dis_gs[idx] for idx in sort_idx[0:k]] # the new k nearest distances.
# print(dis_gs[-1])
# Gs_nearest = [Gs_nearest[idx] for idx in sort_idx[0:k]]
# nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
# if len([i for i in sort_idx[0:nb_best] if i < len(dnew_list)]) > 0:
# print('I have smaller or equal distance!')
# dhat = dis_gs[0]
# print(str(dhat) + '->' + str(dhat))
# idx_best_list = np.argwhere(dnew_list == dhat).flatten().tolist()
# ghat_list = [g_tmp_list[idx].copy() for idx in idx_best_list]
# for g in ghat_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# r = 0
# else:
# r += 1
#
# return dhat, ghat_list

+ 309
- 0
gklearn/preimage/preimage_random.py View File

@@ -0,0 +1,309 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 6 16:03:11 2019

pre-image
@author: ljia
"""

import sys
import numpy as np
import random
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

from gklearn.preimage.utils import compute_kernel, dis_gstar


def preimage_random(Gn_init, Gn_median, alpha, idx_gi, Kmatrix, k, r_max, l, gkernel):
Gn_init = [nx.convert_node_labels_to_integers(g) for g in Gn_init]
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
dis_list.append(dtemp)
# print(np.max(dis_list))
# print(np.min(dis_list))
# print(np.min([item for item in dis_list if item != 0]))
# print(np.mean(dis_list))
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] # the k shortest distances
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
g0hat_list = [Gn_init[idx] for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat_list[0], 0
dhat = dis_gs[0] # the nearest distance
# ghat_list = [g.copy() for g in g0hat_list]
# for g in ghat_list:
# draw_Letter_graph(g)
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
Gk = [Gn_init[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
# for gi in Gk:
## nx.draw_networkx(gi)
## plt.show()
# draw_Letter_graph(g)
# print(gi.nodes(data=True))
# print(gi.edges(data=True))
Gs_nearest = [g.copy() for g in Gk]
gihat_list = []
dihat_list = []
# i = 1
r = 0
# sod_list = [dhat]
# found = False
dis_of_each_itr = [dhat]
nb_updated = 0
g_best = []
while r < r_max:
print('\nr =', r)
print('itr for gk =', nb_updated, '\n')
found = False
dis_bests = dis_gs + dihat_list
# @todo what if the log is negative? how to choose alpha (scalar)?
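# heuristic for the number of edge flips per candidate: rescale the current best
# distances so the smallest is at least 1, then take ceil(log(.)), so the number
# of edits grows slowly with the distance in kernel space.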
fdgs_list = np.array(dis_bests)
if np.min(fdgs_list) < 1:
fdgs_list /= np.min(dis_bests)
fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))]
if np.min(fdgs_list) < 1:
fdgs_list = np.array(fdgs_list) + 1
for ig, gs in enumerate(Gs_nearest + gihat_list):
# nx.draw_networkx(gs)
# plt.show()
for trail in range(0, l):
# for trail in tqdm(range(0, l), desc='l loops', file=sys.stdout):
# add and delete edges.
gtemp = gs.copy()
np.random.seed()
# which edges to change.
# @todo: should we use just half of the adjacency matrix for undirected graphs?
nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
# @todo: what if fdgs is bigger than nb_vpairs?
idx_change = random.sample(range(nb_vpairs), fdgs_list[ig] if
fdgs_list[ig] < nb_vpairs else nb_vpairs)
# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
# (nx.number_of_nodes(gs) - 1), fdgs)
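# idx_change encodes ordered pairs of distinct nodes: with n nodes,
# node1 = item // (n - 1) and node2 = item % (n - 1); the "+ 1" below skips the
# diagonal so that node2 != node1.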
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(gs) - 1))
node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
if node2 >= node1: # skip the self pair.
node2 += 1
# @todo: is the randomness correct?
if not gtemp.has_edge(node1, node2):
gtemp.add_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
else:
gtemp.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# compute distance between \psi and the new generated graph.
# knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
knew = compute_kernel([gtemp] + Gn_median, gkernel, verbose=False)
dnew = dis_gstar(0, range(1, len(Gn_median) + 1), alpha, knew,
withterm3=False)
if dnew <= dhat: # @todo: the new distance is smaller or also equal?
if dnew < dhat:
print('\nI am smaller!')
print('ig =', str(ig), ', l =', str(trail))
print(dhat, '->', dnew)
nb_updated += 1
elif dnew == dhat:
print('I am equal!')
# nx.draw_networkx(gtemp)
# plt.show()
# print(gtemp.nodes(data=True))
# print(gtemp.edges(data=True))
dhat = dnew
gnew = gtemp.copy()
found = True # found better graph.
if found:
r = 0
gihat_list = [gnew]
dihat_list = [dhat]
else:
r += 1
dis_of_each_itr.append(dhat)
print('the shortest distances for previous iterations are', dis_of_each_itr)
# dis_best.append(dhat)
g_best = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0])
print('distances in kernel space:', dis_of_each_itr, '\n')
return dhat, g_best, nb_updated
# return 0, 0, 0


if __name__ == '__main__':
from gklearn.utils.graphfiles import loadDataset
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'extra_params': {}} # node symb
DN, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
#DN = DN[0:10]
lmbda = 0.03 # termination probability
r_max = 3 # 10 # iteration limit.
l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
#alpha_range = np.linspace(0.1, 0.9, 9)
k = 10 # 5 # k nearest neighbors
# randomly select two molecules
#np.random.seed(1)
#idx1, idx2 = np.random.randint(0, len(DN), 2)
#g1 = DN[idx1]
#g2 = DN[idx2]
idx1 = 0
idx2 = 6
g1 = DN[idx1]
g2 = DN[idx2]
# compute
k_list = [] # kernel between each graph and itself.
k_g1_list = [] # kernel between each graph and g1
k_g2_list = [] # kernel between each graph and g2
for ig, g in tqdm(enumerate(DN), desc='computing self kernels', file=sys.stdout):
# ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
ktemp = compute_kernel([g, g1, g2], 'untilhpathkernel', verbose=False)
k_list.append(ktemp[0, 0])
k_g1_list.append(ktemp[0, 1])
k_g2_list.append(ktemp[0, 2])
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(DN), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(np.sqrt(dtemp))
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
g0hat = DN[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
g_pimg = g0hat
break
dhat = dis_gs[0] # the nearest distance
Dk = [DN[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
gihat_list = []
i = 1
r = 1
while r < r_max:
print('r =', r)
found = False
for ig, gs in enumerate(Dk + gihat_list):
# nx.draw_networkx(gs)
# plt.show()
# @todo what if the log is negative?
fdgs = int(np.abs(np.ceil(np.log(alpha * dis_gs[ig]))))
for trail in tqdm(range(0, l), desc='l loop', file=sys.stdout):
# add and delete edges.
gtemp = gs.copy()
np.random.seed()
# which edges to change.
# @todo: should we use just half of the adjacency matrix for undirected graphs?
nb_vpairs = nx.number_of_nodes(gs) * (nx.number_of_nodes(gs) - 1)
# @todo: what if fdgs is bigger than nb_vpairs?
idx_change = random.sample(range(nb_vpairs), fdgs if fdgs < nb_vpairs else nb_vpairs)
# idx_change = np.random.randint(0, nx.number_of_nodes(gs) *
# (nx.number_of_nodes(gs) - 1), fdgs)
for item in idx_change:
node1 = int(item / (nx.number_of_nodes(gs) - 1))
node2 = (item - node1 * (nx.number_of_nodes(gs) - 1))
if node2 >= node1: # skip the self pair.
node2 += 1
# @todo: is the randomness correct?
if not gtemp.has_edge(node1, node2):
# @todo: how to update the bond_type? 0 or 1?
gtemp.add_edges_from([(node1, node2, {'bond_type': 1})])
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
else:
gtemp.remove_edge(node1, node2)
# nx.draw_networkx(gs)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# nx.draw_networkx(gtemp)
# plt.show()
# compute distance between phi and the new generated graph.
# knew = marginalizedkernel([gtemp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
knew = compute_kernel([gtemp, g1, g2], 'untilhpathkernel', verbose=False)
dnew = np.sqrt(knew[0, 0] - 2 * (alpha * knew[0, 1] + (1 - alpha) *
knew[0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]))
if dnew < dhat: # @todo: the new distance is smaller or also equal?
print('I am smaller!')
print(dhat, '->', dnew)
nx.draw_networkx(gtemp)
plt.show()
print(gtemp.nodes(data=True))
print(gtemp.edges(data=True))
dhat = dnew
gnew = gtemp.copy()
found = True # found better graph.
r = 0
elif dnew == dhat:
print('I am equal!')
if found:
gihat_list = [gnew]
dis_gs.append(dhat)
else:
r += 1
dis_best.append(dhat)
g_best += ([g0hat] if len(gihat_list) == 0 else gihat_list)
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-image is')
nx.draw_networkx(g_best[idx])
plt.show()

+ 122
- 0
gklearn/preimage/python_code.py View File

@@ -0,0 +1,122 @@
elif opt_name == 'random-inits':
try:
num_random_inits_ = std::stoul(opt_val)
desired_num_random_inits_ = num_random_inits_

except:
raise Error('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')

if num_random_inits_ <= 0:
raise Error('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits <convertible to int greater 0>]"')

}
elif opt_name == 'randomness':
if opt_val == 'PSEUDO':
use_real_randomness_ = False

elif opt_val == 'REAL':
use_real_randomness_ = True

else:
raise Error('Invalid argument "' + opt_val + '" for option randomness. Usage: options = "[--randomness REAL|PSEUDO] [...]"')

}
elif opt_name == 'stdout':
if opt_val == '0':
print_to_stdout_ = 0

elif opt_val == '1':
print_to_stdout_ = 1

elif opt_val == '2':
print_to_stdout_ = 2

else:
raise Error('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"')

}
elif opt_name == 'refine':
if opt_val == 'TRUE':
refine_ = True

elif opt_val == 'FALSE':
refine_ = False

else:
raise Error('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"')

}
elif opt_name == 'time-limit':
try:
time_limit_in_sec_ = std::stod(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit <convertible to double>] [...]')

}
elif opt_name == 'max-itrs':
try:
max_itrs_ = std::stoi(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs <convertible to int>] [...]')

}
elif opt_name == 'max-itrs-without-update':
try:
max_itrs_without_update_ = std::stoi(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update <convertible to int>] [...]')

}
elif opt_name == 'seed':
try:
seed_ = std::stoul(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed <convertible to int greater equal 0>] [...]')

}
elif opt_name == 'epsilon':
try:
epsilon_ = std::stod(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]')

if epsilon_ <= 0:
raise Error('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon <convertible to double greater 0>] [...]')

}
elif opt_name == 'inits-increase-order':
try:
num_inits_increase_order_ = std::stoul(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')

if num_inits_increase_order_ <= 0:
raise Error('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order <convertible to int greater 0>]"')

}
elif opt_name == 'init-type-increase-order':
init_type_increase_order_ = opt_val
if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++':
raise Exception(std::string('Invalid argument ') + opt_val + ' for option init-type-increase-order. Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"')

}
elif opt_name == 'max-itrs-increase-order':
try:
max_itrs_increase_order_ = std::stoi(opt_val)

except:
raise Error('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order <convertible to int>] [...]')

}
else:
std::string valid_options('[--init-type <arg>] [--random-inits <arg>] [--randomness <arg>] [--seed <arg>] [--stdout <arg>] ')
valid_options += '[--time-limit <arg>] [--max-itrs <arg>] [--epsilon <arg>] '
valid_options += '[--inits-increase-order <arg>] [--init-type-increase-order <arg>] [--max-itrs-increase-order <arg>]'
raise Error(std::string('Invalid option "') + opt_name + '". Usage: options = "' + valid_options + '"')


+ 83
- 0
gklearn/preimage/test.py View File

@@ -0,0 +1,83 @@
#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/gedlibpy/lib/fann/:/export/home/lambertn/Documents/gedlibpy/lib/libsvm.3.22:/export/home/lambertn/Documents/gedlibpy/lib/nomad

#Pour que "import script" trouve les librairies qu'a besoin GedLib
#Equivalent à définir la variable d'environnement LD_LIBRARY_PATH sur un bash
import gedlibpy.librariesImport
from gedlibpy import gedlibpy
import networkx as nx


def init() :
print("List of Edit Cost Options : ")
for i in gedlibpy.list_of_edit_cost_options :
print (i)
print("")

print("List of Method Options : ")
for j in gedlibpy.list_of_method_options :
print (j)
print("")

print("List of Init Options : ")
for k in gedlibpy.list_of_init_options :
print (k)
print("")
def test():
gedlibpy.load_GXL_graphs('include/gedlib-master/data/datasets/Mutagenicity/data/', 'collections/MUTA_10.xml')
listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost("CHEM_1")
gedlibpy.init()
gedlibpy.set_method("IPFP", "")
gedlibpy.init_method()
g = listID[0]
h = listID[1]
gedlibpy.run_method(g, h)
print("Node Map : ", gedlibpy.get_node_map(g,h))
print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h))
print("Assignment Matrix : ")
print(gedlibpy.get_assignment_matrix(g, h))
print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g,h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h)))


def convertGraph(G):
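"""Relabel a gklearn graph for gedlibpy: node attribute 'atom' becomes 'chem'
and edge attribute 'bond_type' becomes 'valence' (presumably the attribute
names expected by the CHEM_1 edit cost).
"""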
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])
return G_new


def testNxGrapĥ():
from gklearn.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gedlibpy.restart_env()
for graph in Gn:
g_new = convertGraph(graph)
gedlibpy.add_nx_graph(g_new, "")
listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost("CHEM_1")
gedlibpy.init()
gedlibpy.set_method("IPFP", "")
gedlibpy.init_method()

print(listID)
g = listID[0]
h = listID[1]

gedlibpy.run_method(g, h)

print("Node Map : ", gedlibpy.get_node_map(g, h))
print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h))
print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h)))

#test()
init()
#testNxGrapĥ()

+648 -0   gklearn/preimage/test_fitDistance.py

@@ -0,0 +1,648 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 24 11:50:56 2019

@author: ljia
"""
from matplotlib import pyplot as plt
import numpy as np
from tqdm import tqdm
import os
import sys

from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.utils import remove_edges
from gklearn.preimage.fitDistance import fit_GED_to_kernel_distance
from gklearn.preimage.utils import normalize_distance_matrix


def test_update_costs():
from preimage.fitDistance import update_costs
import cvxpy as cp
ds = np.load('results/xp_fit_method/fit_data_debug4.gm.npz')
nb_cost_mat = ds['nb_cost_mat']
dis_k_vec = ds['dis_k_vec']
n_edit_operations = ds['n_edit_operations']
ged_vec_init = ds['ged_vec_init']
ged_mat = ds['ged_mat']
nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
# constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
# constraints = [x >= [0.000 for i in range(nb_cost_mat_new.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
# np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
constraints = [x >= [0.00 for i in range(nb_cost_mat_new.shape[1])],
np.array([0.0, 1.0, -1.0]).T@x == 0.0]
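# x >= 0 keeps every fitted cost non-negative; the equality constraint
# [0, 1, -1] @ x == 0 forces the second and third retained costs to be equal.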
# constraints = [x >= [0.00000 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
print(x.value)
edit_costs_new = np.concatenate((x.value, np.array([0.0])))
residual = np.sqrt(prob.value)


def median_paper_clcpc_python_best():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
# ds = {'name': 'monoterpenoides',
# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# _, y_all = loadDataset(ds['dataset'])
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 6
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
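# GEDLIB IPFP options (assumed semantics): 40 initial solutions per run on
# 8 threads, with all initial solutions actually run (ratio 1).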
params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options, 'stabilizer': None}
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
fn_edit_costs_output = 'results/median_paper/edit_costs_output.python_init40.k10.txt'

for y in y_all:
for repeat in range(repeats):
edit_costs_output_file = open(fn_edit_costs_output, 'a')
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=True)
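# fit_GED_to_kernel_distance (see gklearn/preimage/fitDistance.py) presumably
# alternates between computing GEDs and re-fitting the edit costs for itr_max
# iterations; per-iteration costs and residuals come back in edit_cost_list
# and residual_list.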
total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/median_paper/fit_distance.clcpc.python_init40.monot.elabeled.uhpkernel.y'
+ y + '.repeat' + str(repeat) + '.k10..gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
for ec in edit_costs:
edit_costs_output_file.write(str(ec) + ' ')
edit_costs_output_file.write('\n')
edit_costs_output_file.close()
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
# plt.imshow(norm_dis_k_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_ged_mat = normalize_distance_matrix(ged_mat)
# plt.imshow(norm_ged_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_diff = norm_ged_mat - norm_dis_k_mat
# plt.imshow(norm_diff)
# plt.colorbar()
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_best.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
# # draw_count_bar(norm_diff)


def median_paper_clcpc_python_bash_cpp():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
# ds = {'name': 'monoterpenoides',
# 'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
# _, y_all = loadDataset(ds['dataset'])
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 20
algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options}
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
fn_edit_costs_output = 'results/median_paper/edit_costs_output.txt'

for y in y_all:
for repeat in range(repeats):
edit_costs_output_file = open(fn_edit_costs_output, 'a')
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=False)
total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/median_paper/fit_distance.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
+ y + '.repeat' + str(repeat) + '.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
for ec in edit_costs:
edit_costs_output_file.write(str(ec) + ' ')
edit_costs_output_file.write('\n')
edit_costs_output_file.close()
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
# plt.imshow(norm_dis_k_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_dis_k_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_ged_mat = normalize_distance_matrix(ged_mat)
# plt.imshow(norm_ged_mat)
# plt.colorbar()
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/norm_ged_mat.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
#
# norm_diff = norm_ged_mat - norm_dis_k_mat
# plt.imshow(norm_diff)
# plt.colorbar()
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.eps', format='eps', dpi=300)
# plt.savefig('results/median_paper/diff_mat_norm_ged_dis_k.clcpc.python_bash_cpp.monot.elabeled.uhpkernel.y'
# + y + '.repeat' + str(repeat) + '.png', format='png')
# # plt.show()
# plt.clf()
# # draw_count_bar(norm_diff)





def test_cs_leq_ci_plus_cr_python_bash_cpp():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er with ged computation with
python invoking the c++ code by bash command (with updated library).
"""
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:10]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 10
algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'
params_ged = {'lib': 'gedlib-bash', 'cost': 'CONSTANT', 'method': 'IPFP',
'algo_options': algo_options}
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max, params_ged=params_ged,
parallel=False)
total_time = np.sum(time_list)
print('\nedit_costs:', edit_costs)
print('\nresidual_list:', residual_list)
print('\nedit_cost_list:', edit_cost_list)
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
print('\ntotal time:', total_time)
print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/fit_distance.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
## Gn = Gn[0:10]
## remove_edges(Gn)
# gkernel = 'untilhpathkernel'
# node_label = 'atom'
# edge_label = 'bond_type'
# itr_max = 10
# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
# gkernel, itr_max)
# total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
# np.savez('results/fit_distance.cs_leq_ci_plus_cr.mutag.elabeled.uhpkernel.gm',
# edit_costs=edit_costs,
# residual_list=residual_list, edit_cost_list=edit_cost_list,
# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# dis_k_sub = pairwise_substitution(dis_k_mat)
# ged_sub = pairwise_substitution(ged_mat)
# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.gm',
# dis_k_sub=dis_k_sub, ged_sub=ged_sub)
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.python_bash_cpp.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
# draw_count_bar(norm_diff)


def test_anycosts():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:10]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
itr_max = 10
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
total_time = np.sum(time_list)
print('\nedit_costs:', edit_costs)
print('\nresidual_list:', residual_list)
print('\nedit_cost_list:', edit_cost_list)
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
print('\ntotal time:', total_time)
print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/fit_distance.any_costs.gm', edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.any_costs.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
## nb_cost_mat_list = gmfile['nb_cost_mat_list']
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.any_costs' + '.eps', format='eps', dpi=300)
# plt.savefig('results/norm_dis_k_mat.any_costs' + '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.any_costs' + '.eps', format='eps', dpi=300)
# plt.savefig('results/norm_ged_mat.any_costs' + '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.eps', format='eps', dpi=300)
# plt.savefig('results/diff_mat_norm_ged_dis_k.any_costs' + '.png', format='png')
# plt.show()
plt.clf()
# draw_count_bar(norm_diff)

def test_cs_leq_ci_plus_cr():
"""c_vs <= c_vi + c_vr, c_es <= c_ei + c_er
"""
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:10]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
itr_max = 10
edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
gkernel, itr_max,
fitkernel='gaussian')
total_time = np.sum(time_list)
print('\nedit_costs:', edit_costs)
print('\nresidual_list:', residual_list)
print('\nedit_cost_list:', edit_cost_list)
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
print('\ntotal time:', total_time)
print('\nnb_cost_mat:', nb_cost_mat_list[-1])
np.savez('results/fit_distance.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel.gm',
edit_costs=edit_costs,
residual_list=residual_list, edit_cost_list=edit_cost_list,
dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
total_time=total_time, nb_cost_mat_list=nb_cost_mat_list,
coef_dk=coef_dk)
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
## Gn = Gn[0:10]
## remove_edges(Gn)
# gkernel = 'untilhpathkernel'
# node_label = 'atom'
# edge_label = 'bond_type'
# itr_max = 10
# edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat, time_list, \
# nb_cost_mat_list, coef_dk = fit_GED_to_kernel_distance(Gn, node_label, edge_label,
# gkernel, itr_max)
# total_time = np.sum(time_list)
# print('\nedit_costs:', edit_costs)
# print('\nresidual_list:', residual_list)
# print('\nedit_cost_list:', edit_cost_list)
# print('\ndistance matrix in kernel space:', dis_k_mat)
# print('\nged matrix:', ged_mat)
# print('\ntotal time:', total_time)
# print('\nnb_cost_mat:', nb_cost_mat_list[-1])
# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.mutag.elabeled.uhpkernel.gm',
# edit_costs=edit_costs,
# residual_list=residual_list, edit_cost_list=edit_cost_list,
# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list, coef_dk)
# # normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.monot.elabeled.uhpkernel.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
# coef_dk = gmfile['coef_dk']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
# dis_k_sub = pairwise_substitution(dis_k_mat)
# ged_sub = pairwise_substitution(ged_mat)
# np.savez('results/sub_dis_mat.cs_leq_ci_plus_cr.cost_leq_1en2.gm',
# dis_k_sub=dis_k_sub, ged_sub=ged_sub)
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_dis_k_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/norm_ged_mat.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.eps', format='eps', dpi=300)
plt.savefig('results/diff_mat_norm_ged_dis_k.cs_leq_ci_plus_cr.gaussian.cost_leq_1en2.monot.elabeled.uhpkernel'
+ '.png', format='png')
# plt.show()
plt.clf()
# draw_count_bar(norm_diff)


def test_unfitted():
"""GEDs computed with unfitted (constant) edit costs, compared against the
kernel distance matrix.
"""
from fitDistance import compute_geds
from utils import kernel_distance_matrix
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:10]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'

# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
## Gn = Gn[0:10]
## remove_edges(Gn)
# gkernel = 'marginalizedkernel'

dis_k_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label, gkernel=gkernel)
ged_all, ged_mat, n_edit_operations = compute_geds(Gn, [3, 3, 1, 3, 3, 1],
[0, 1, 2, 3, 4, 5], parallel=True)
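# GEDs with unfitted constant costs [3, 3, 1, 3, 3, 1] (node insert/remove/
# substitute, edge insert/remove/substitute), to compare against dis_k_mat.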
print('\ndistance matrix in kernel space:', dis_k_mat)
print('\nged matrix:', ged_mat)
# np.savez('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en2.gm', edit_costs=edit_costs,
# residual_list=residual_list, edit_cost_list=edit_cost_list,
# dis_k_mat=dis_k_mat, ged_mat=ged_mat, time_list=time_list,
# total_time=total_time, nb_cost_mat_list=nb_cost_mat_list)
# normalized distance matrices.
# gmfile = np.load('results/fit_distance.cs_leq_ci_plus_cr.cost_leq_1en3.gm.npz')
# edit_costs = gmfile['edit_costs']
# residual_list = gmfile['residual_list']
# edit_cost_list = gmfile['edit_cost_list']
# dis_k_mat = gmfile['dis_k_mat']
# ged_mat = gmfile['ged_mat']
# total_time = gmfile['total_time']
# nb_cost_mat_list = gmfile['nb_cost_mat_list']
nb_consistent, nb_inconsistent, ratio_consistent = pairwise_substitution_consistence(dis_k_mat, ged_mat)
print(nb_consistent, nb_inconsistent, ratio_consistent)
norm_dis_k_mat = normalize_distance_matrix(dis_k_mat)
plt.imshow(norm_dis_k_mat)
plt.colorbar()
plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
plt.savefig('results/norm_dis_k_mat.unfitted.MUTAG' + '.png', format='png')
# plt.show()
plt.clf()
norm_ged_mat = normalize_distance_matrix(ged_mat)
plt.imshow(norm_ged_mat)
plt.colorbar()
plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
plt.savefig('results/norm_ged_mat.unfitted.MUTAG' + '.png', format='png')
# plt.show()
plt.clf()
norm_diff = norm_ged_mat - norm_dis_k_mat
plt.imshow(norm_diff)
plt.colorbar()
plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.eps', format='eps', dpi=300)
plt.savefig('results/diff_mat_norm_ged_dis_k.unfitted.MUTAG' + '.png', format='png')
# plt.show()
plt.clf()
draw_count_bar(norm_diff)


def pairwise_substitution_consistence(mat1, mat2):
"""Count how often the ordering of pairwise distances agrees between two
distance matrices. Returns (nb_consistent, nb_inconsistent, ratio_consistent).
"""
nb_consistent = 0
nb_inconsistent = 0
# both matrices are considered symmetric; take the upper triangle of each so
# that entries at the same index refer to the same graph pair.
upper_tri1 = mat1[np.triu_indices_from(mat1)]
upper_tri2 = mat2[np.triu_indices_from(mat2)]
for i in tqdm(range(len(upper_tri1)), desc='computing consistence', file=sys.stdout):
for j in range(i, len(upper_tri1)):
if np.sign(upper_tri1[i] - upper_tri1[j]) == np.sign(upper_tri2[i] - upper_tri2[j]):
nb_consistent += 1
else:
nb_inconsistent += 1
return nb_consistent, nb_inconsistent, nb_consistent / (nb_consistent + nb_inconsistent)


def pairwise_substitution(mat):
# the matrix is considered symmetric.
upper_tri = mat[np.triu_indices_from(mat)]
sub_list = []
for i in tqdm(range(len(upper_tri)), desc='computing', file=sys.stdout):
for j in range(i, len(upper_tri)):
sub_list.append(upper_tri[i] - upper_tri[j])
return sub_list


def draw_count_bar(norm_diff):
import pandas
from collections import Counter, OrderedDict
norm_diff_cnt = norm_diff.flatten()
norm_diff_cnt = norm_diff_cnt * 10
norm_diff_cnt = np.floor(norm_diff_cnt)
norm_diff_cnt = Counter(norm_diff_cnt)
norm_diff_cnt = OrderedDict(sorted(norm_diff_cnt.items()))
df = pandas.DataFrame.from_dict(norm_diff_cnt, orient='index')
df.plot(kind='bar')


if __name__ == '__main__':
# test_anycosts()
# test_cs_leq_ci_plus_cr()
# test_unfitted()
# test_cs_leq_ci_plus_cr_python_bash_cpp()
# median_paper_clcpc_python_bash_cpp()
# median_paper_clcpc_python_best()

# x = np.array([[1,2,3],[4,5,6],[7,8,9]])
# xx = pairwise_substitution(x)
test_update_costs()

+520 -0   gklearn/preimage/test_ged.py

@@ -0,0 +1,520 @@
#export LD_LIBRARY_PATH=.:/export/home/lambertn/Documents/gedlibpy/lib/fann/:/export/home/lambertn/Documents/gedlibpy/lib/libsvm.3.22:/export/home/lambertn/Documents/gedlibpy/lib/nomad

# So that "import script" finds the libraries GedLib needs
# Equivalent to setting the LD_LIBRARY_PATH environment variable in a bash shell
#import gedlibpy_linlin.librariesImport
#from gedlibpy_linlin import gedlibpy
from libs import *
import networkx as nx
import numpy as np
from tqdm import tqdm
import sys
import os  # used below to build dataset paths (may also be provided via libs)


def test_NON_SYMBOLIC_cost():
"""Test edit cost LETTER2.
"""
from gklearn.preimage.ged import GED, get_nb_edit_operations_nonsymbolic, get_nb_edit_operations_letter
from gklearn.preimage.test_k_closest_graphs import reform_attributes
from gklearn.utils.graphfiles import loadDataset

dataset = '../../datasets/Letter-high/Letter-high_A.txt'
Gn, y_all = loadDataset(dataset)

g1 = Gn[200]
g2 = Gn[1780]
reform_attributes(g1)
reform_attributes(g2)

c_vi = 0.675
c_vr = 0.675
c_vs = 0.75
c_ei = 0.425
c_er = 0.425
c_es = 0

edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
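# cost order above: node insertion, node removal, node substitution, edge
# insertion, edge removal, edge substitution (the 'vi/vr/vs' and 'ei/er/es' suffixes).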
dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
cost='NON_SYMBOLIC', method='IPFP', edit_cost_constant=edit_cost_constant,
algo_options='', stabilizer=None)
n_vi, n_vr, sod_vs, n_ei, n_er, sod_es = get_nb_edit_operations_nonsymbolic(g1, g2,
pi_forward, pi_backward)

print('# of operations:', n_vi, n_vr, sod_vs, n_ei, n_er, sod_es)
print('c_vi, c_vr, c_vs, c_ei, c_er, c_es:', c_vi, c_vr, c_vs, c_ei, c_er, c_es)
cost_computed = c_vi * n_vi + c_vr * n_vr + c_vs * sod_vs \
+ c_ei * n_ei + c_er * n_er + c_es * sod_es
print('dis (cost computed by GED):', dis)
print('cost computed by # of operations and edit cost constants:', cost_computed)


def test_LETTER2_cost():
"""Test edit cost LETTER2.
"""
from gklearn.preimage.ged import GED, get_nb_edit_operations_letter
from gklearn.preimage.test_k_closest_graphs import reform_attributes
from gklearn.utils.graphfiles import loadDataset

ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])

g1 = Gn[200]
g2 = Gn[1780]
reform_attributes(g1)
reform_attributes(g2)

c_vi = 0.675
c_vr = 0.675
c_vs = 0.75
c_ei = 0.425
c_er = 0.425

edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er]
dis, pi_forward, pi_backward = GED(g1, g2, dataset='letter', lib='gedlibpy',
cost='LETTER2', method='IPFP', edit_cost_constant=edit_cost_constant,
algo_options='', stabilizer=None)
n_vi, n_vr, n_vs, sod_vs, n_ei, n_er = get_nb_edit_operations_letter(g1, g2,
pi_forward, pi_backward)

print('# of operations:', n_vi, n_vr, n_vs, sod_vs, n_ei, n_er)
print('c_vi, c_vr, c_vs, c_ei, c_er:', c_vi, c_vr, c_vs, c_ei, c_er)
cost_computed = c_vi * n_vi + c_vr * n_vr + c_vs * sod_vs \
+ c_ei * n_ei + c_er * n_er
print('dis (cost computed by GED):', dis)
print('cost computed by # of operations and edit cost constants:', cost_computed)



def test_get_nb_edit_operations_letter():
"""Test whether function preimage.ged.get_nb_edit_operations_letter returns
correct numbers of edit operations. The distance/cost computed by GED
should be the same as the cost computed by number of operations and edit
cost constants.
"""
from gklearn.preimage.ged import GED, get_nb_edit_operations_letter
from gklearn.preimage.test_k_closest_graphs import reform_attributes
from gklearn.utils.graphfiles import loadDataset

ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])

g1 = Gn[200]
g2 = Gn[1780]
reform_attributes(g1)
reform_attributes(g2)

c_vir = 0.9
c_eir = 1.7
alpha = 0.75

edit_cost_constant = [c_vir, c_eir, alpha]
dis, pi_forward, pi_backward = GED(g1, g2, dataset='letter', lib='gedlibpy',
cost='LETTER', method='IPFP', edit_cost_constant=edit_cost_constant,
algo_options='', stabilizer=None)
n_vi, n_vr, n_vs, c_vs, n_ei, n_er = get_nb_edit_operations_letter(g1, g2,
pi_forward, pi_backward)

print('# of operations and costs:', n_vi, n_vr, n_vs, c_vs, n_ei, n_er)
print('c_vir, c_eir, alpha:', c_vir, c_eir, alpha)
cost_computed = alpha * c_vir * (n_vi + n_vr) \
+ alpha * c_vs \
+ (1 - alpha) * c_eir * (n_ei + n_er)
print('dis (cost computed by GED):', dis)
print('cost computed by # of operations and edit cost constants:', cost_computed)


def test_get_nb_edit_operations():
"""Test whether function preimage.ged.get_nb_edit_operations returns correct
numbers of edit operations. The distance/cost computed by GED should be the
same as the cost computed by number of operations and edit cost constants.
"""
from gklearn.preimage.ged import GED, get_nb_edit_operations
from gklearn.utils.graphfiles import loadDataset
import os

ds = {'dataset': '../../datasets/monoterpenoides/dataset_10+.ds',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])

g1 = Gn[20]
g2 = Gn[108]

c_vi = 3
c_vr = 3
c_vs = 1
c_ei = 3
c_er = 3
c_es = 1

edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
dis, pi_forward, pi_backward = GED(g1, g2, dataset='monoterpenoides', lib='gedlibpy',
cost='CONSTANT', method='IPFP', edit_cost_constant=edit_cost_constant,
algo_options='', stabilizer=None)
n_vi, n_vr, n_vs, n_ei, n_er, n_es = get_nb_edit_operations(g1, g2,
pi_forward, pi_backward)

print('# of operations and costs:', n_vi, n_vr, n_vs, n_ei, n_er, n_es)
print('edit costs:', c_vi, c_vr, c_vs, c_ei, c_er, c_es)
cost_computed = n_vi * c_vi + n_vr * c_vr + n_vs * c_vs \
+ n_ei * c_ei + n_er * c_er + n_es * c_es
print('dis (cost computed by GED):', dis)
print('cost computed by # of operations and edit cost constants:', cost_computed)


def test_ged_python_bash_cpp():
"""Test ged computation with python invoking the c++ code by bash command (with updated library).
"""
from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.ged import GED

data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
# collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/monoterpenoides_3_20.xml'
graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/'

Gn, y = loadDataset(collection_file, extra_params=graph_dir)

algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'

for repeat in range(0, 3):
# Generate the result file.
ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_bash_' + str(repeat) + '_init40.3_20.txt'
# runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_min_' + str(repeat) + '.txt'

ged_file = open(ged_filename, 'a')
# runtime_file = open(runtime_filename, 'a')

ged_mat = np.empty((len(Gn), len(Gn)))
# runtime_mat = np.empty((len(Gn), len(Gn)))

for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
for j in range(len(Gn)):
print(i, j)
g1 = Gn[i]
g2 = Gn[j]
upper_bound, _, _ = GED(g1, g2, lib='gedlib-bash', cost='CONSTANT',
method='IPFP',
edit_cost_constant=[3.0, 3.0, 1.0, 3.0, 3.0, 1.0],
algo_options=algo_options)
# runtime = gedlibpy.get_runtime(g1, g2)
ged_mat[i][j] = upper_bound
# runtime_mat[i][j] = runtime

# Write to files.
ged_file.write(str(int(upper_bound)) + ' ')
# runtime_file.write(str(runtime) + ' ')

ged_file.write('\n')
# runtime_file.write('\n')

ged_file.close()
# runtime_file.close()

print('ged_mat')
print(ged_mat)
# print('runtime_mat:')
# print(runtime_mat)

return



def test_ged_best_settings_updated():
"""Test ged computation with best settings the same as in the C++ code (with updated library).
"""

data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
# collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/monoterpenoides_3_20.xml'

graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/'

algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'

for repeat in range(0, 3):
# Generate the result file.
ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_updated_' + str(repeat) + '_init40.txt'
runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_updated_' + str(repeat) + '_init40.txt'

gedlibpy.restart_env()
gedlibpy.load_GXL_graphs(graph_dir, collection_file)
listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
gedlibpy.init()
gedlibpy.set_method("IPFP", algo_options)
gedlibpy.init_method()

ged_mat = np.empty((len(listID), len(listID)))
runtime_mat = np.empty((len(listID), len(listID)))

for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout):
ged_file = open(ged_filename, 'a')
runtime_file = open(runtime_filename, 'a')

for j in range(len(listID)):
g1 = listID[i]
g2 = listID[j]
gedlibpy.run_method(g1, g2)
upper_bound = gedlibpy.get_upper_bound(g1, g2)
runtime = gedlibpy.get_runtime(g1, g2)
ged_mat[i][j] = upper_bound
runtime_mat[i][j] = runtime

# Write to files.
ged_file.write(str(int(upper_bound)) + ' ')
runtime_file.write(str(runtime) + ' ')

ged_file.write('\n')
runtime_file.write('\n')

ged_file.close()
runtime_file.close()

print('ged_mat')
print(ged_mat)
print('runtime_mat:')
print(runtime_mat)

return


def test_ged_best_settings():
"""Test ged computation with best settings the same as in the C++ code.
"""

data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/'

algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'

for repeat in range(0, 3):
# Generate the result file.
ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_best_settings_' + str(repeat) + '.txt'
runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_best_settings_' + str(repeat) + '.txt'

ged_file = open(ged_filename, 'a')
runtime_file = open(runtime_filename, 'a')

gedlibpy.restart_env()
gedlibpy.load_GXL_graphs(graph_dir, collection_file)
listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
gedlibpy.init()
gedlibpy.set_method("IPFP", algo_options)
gedlibpy.init_method()

ged_mat = np.empty((len(listID), len(listID)))
runtime_mat = np.empty((len(listID), len(listID)))

for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout):
for j in range(len(listID)):
g1 = listID[i]
g2 = listID[j]
gedlibpy.run_method(g1, g2)
upper_bound = gedlibpy.get_upper_bound(g1, g2)
runtime = gedlibpy.get_runtime(g1, g2)
ged_mat[i][j] = upper_bound
runtime_mat[i][j] = runtime

# Write to files.
ged_file.write(str(int(upper_bound)) + ' ')
runtime_file.write(str(runtime) + ' ')

ged_file.write('\n')
runtime_file.write('\n')

ged_file.close()
runtime_file.close()

print('ged_mat')
print(ged_mat)
print('runtime_mat:')
print(runtime_mat)

return



def test_ged_default():
"""Test ged computation with default settings.
"""

data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/'

for repeat in range(3):
# Generate the result file.
ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_default_' + str(repeat) + '.txt'
runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_default_' + str(repeat) + '.txt'

ged_file = open(ged_filename, 'a')
runtime_file = open(runtime_filename, 'a')

gedlibpy.restart_env()
gedlibpy.load_GXL_graphs(graph_dir, collection_file)
listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost('CONSTANT', [3.0, 3.0, 1.0, 3.0, 3.0, 1.0])
gedlibpy.init()
gedlibpy.set_method("IPFP", "")
gedlibpy.init_method()

ged_mat = np.empty((len(listID), len(listID)))
runtime_mat = np.empty((len(listID), len(listID)))

for i in tqdm(range(len(listID)), desc='computing GEDs', file=sys.stdout):
for j in range(len(listID)):
g1 = listID[i]
g2 = listID[j]
gedlibpy.run_method(g1, g2)
upper_bound = gedlibpy.get_upper_bound(g1, g2)
runtime = gedlibpy.get_runtime(g1, g2)
ged_mat[i][j] = upper_bound
runtime_mat[i][j] = runtime

# Write to files.
ged_file.write(str(int(upper_bound)) + ' ')
runtime_file.write(str(runtime) + ' ')

ged_file.write('\n')
runtime_file.write('\n')

ged_file.close()
runtime_file.close()

print('ged_mat')
print(ged_mat)
print('runtime_mat:')
print(runtime_mat)

return


def test_ged_min():
"""Test ged computation with the "min" stabilizer.
"""
from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.ged import GED

data_dir_prefix = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/'
collection_file = data_dir_prefix + 'generated_datsets/monoterpenoides/gxl/monoterpenoides.xml'
graph_dir = data_dir_prefix +'generated_datsets/monoterpenoides/gxl/'

Gn, y = loadDataset(collection_file, extra_params=graph_dir)

# algo_options = '--threads 6 --initial-solutions 10 --ratio-runs-from-initial-solutions .5'

for repeat in range(0, 3):
# Generate the result file.
ged_filename = data_dir_prefix + 'output/test_ged/ged_mat_python_min_' + str(repeat) + '.txt'
# runtime_filename = data_dir_prefix + 'output/test_ged/runtime_mat_python_min_' + str(repeat) + '.txt'

ged_file = open(ged_filename, 'a')
# runtime_file = open(runtime_filename, 'a')

ged_mat = np.empty((len(Gn), len(Gn)))
# runtime_mat = np.empty((len(Gn), len(Gn)))

for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
for j in range(len(Gn)):
g1 = Gn[i]
g2 = Gn[j]
upper_bound, _, _ = GED(g1, g2, lib='gedlibpy', cost='CONSTANT',
method='IPFP',
edit_cost_constant=[3.0, 3.0, 1.0, 3.0, 3.0, 1.0],
stabilizer='min', repeat=10)
# runtime = gedlibpy.get_runtime(g1, g2)
ged_mat[i][j] = upper_bound
# runtime_mat[i][j] = runtime

# Write to files.
ged_file.write(str(int(upper_bound)) + ' ')
# runtime_file.write(str(runtime) + ' ')

ged_file.write('\n')
# runtime_file.write('\n')

ged_file.close()
# runtime_file.close()

print('ged_mat')
print(ged_mat)
# print('runtime_mat:')
# print(runtime_mat)

return


def init() :
print("List of Edit Cost Options : ")
for i in gedlibpy.list_of_edit_cost_options :
print (i)
print("")

print("List of Method Options : ")
for j in gedlibpy.list_of_method_options :
print (j)
print("")

print("List of Init Options : ")
for k in gedlibpy.list_of_init_options :
print (k)
print("")




def convertGraph(G):
G_new = nx.Graph()
for nd, attrs in G.nodes(data=True):
G_new.add_node(str(nd), chem=attrs['atom'])
for nd1, nd2, attrs in G.edges(data=True):
G_new.add_edge(str(nd1), str(nd2), valence=attrs['bond_type'])

return G_new


def testNxGrapĥ():
from gklearn.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])

gedlibpy.restart_env()
for graph in Gn:
g_new = convertGraph(graph)
gedlibpy.add_nx_graph(g_new, "")

listID = gedlibpy.get_all_graph_ids()
gedlibpy.set_edit_cost("CHEM_1")
gedlibpy.init()
gedlibpy.set_method("IPFP", "")
gedlibpy.init_method()

print(listID)
g = listID[0]
h = listID[1]

gedlibpy.run_method(g, h)

print("Node Map : ", gedlibpy.get_node_map(g, h))
print("Forward map : " , gedlibpy.get_forward_map(g, h), ", Backward map : ", gedlibpy.get_backward_map(g, h))
print ("Upper Bound = " + str(gedlibpy.get_upper_bound(g, h)) + ", Lower Bound = " + str(gedlibpy.get_lower_bound(g, h)) + ", Runtime = " + str(gedlibpy.get_runtime(g, h)))

if __name__ == '__main__':
# test_ged_default()
# test_ged_min()
# test_ged_best_settings()
# test_ged_best_settings_updated()
# test_ged_python_bash_cpp()
# test_get_nb_edit_operations()
# test_get_nb_edit_operations_letter()
# test_LETTER2_cost()
test_NON_SYMBOLIC_cost()


#init()
#testNxGrapĥ()

+964 -0   gklearn/preimage/test_iam.py

@@ -0,0 +1,964 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 5 15:59:00 2019

@author: ljia
"""

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
import random
import os  # used below to build the collection paths
#from tqdm import tqdm

from gklearn.utils.graphfiles import loadDataset
#from gklearn.utils.logger2file import *
from gklearn.preimage.iam import iam_upgraded
from gklearn.preimage.utils import remove_edges, compute_kernel, get_same_item_indices, dis_gstar
#from gklearn.preimage.ged import ged_median


def test_iam_monoterpenoides_with_init40():
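"""Run iam_upgraded on the pre-generated monoterpenoides median sets (one per
class and repeat), using unfitted constant edit costs and IPFP with 40
initial solutions.
"""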
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
# unfitted edit costs.
c_vi = 3
c_vr = 3
c_vs = 1
c_ei = 3
c_er = 3
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.0001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
ged_stabilizer = None
# ged_repeat = 50
algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'algo_options': algo_options,
'stabilizer': ged_stabilizer}
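# params_ged is passed straight through to the GED computations performed
# inside iam_upgraded.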

collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
# classify graphs according to classes.
time_list = []
dis_ks_min_list = []
dis_ks_set_median_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
sod_list_list = []
for y in y_all:
print('\n-------------------------------------------------------')
print('class of y:', y)
time_list.append([])
dis_ks_min_list.append([])
dis_ks_set_median_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(repeats):
# load median set.
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn_median, _ = loadDataset(collection_file, extra_params=graph_dir)
Gn_candidate = [g.copy() for g in Gn_median]
time0 = time.time()
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, node_label=node_label, edge_label=edge_label,
connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(G_gen_median_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_gen_median)
print('\nsmallest sod in graph space:', sod_gen_median)
sod_list_list.append(sod_list)
# # show the best graph and save it to file.
# print('one of the possible corresponding pre-images is')
# nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
# with_labels=True)
## plt.show()
# # plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
## plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
## '_repeat' + str(repeat) + '_' + str(time.time()) +
## '.png', format="PNG")
# plt.clf()
# # print(G_gen_median_list[0].nodes(data=True))
# # print(G_gen_median_list[0].edges(data=True))
print('\nsods of the set median for this class:', sod_set_median_list[-1])
print('\nsods in graph space for this class:', sod_gs_list[-1])
# print('\ndistance in kernel space of set median for this class:',
# dis_ks_set_median_list[-1])
# print('\nsmallest distances in kernel space for this class:',
# dis_ks_min_list[-1])
print('\ntimes for this class:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
# dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
# dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print()
print('\nmean sods of the set median for each class:', sod_set_median_list)
print('\nmean sods in graph space for each class:', sod_gs_list)
# print('\ndistances in kernel space of set median for each class:',
# dis_ks_set_median_list)
# print('\nmean smallest distances in kernel space for each class:',
# dis_ks_min_list)
print('\nmean times for each class:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
# print('\nmean distances in kernel space of set median of all:',
# np.mean(dis_ks_set_median_list))
# print('\nmean smallest distances in kernel space of all:',
# np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))




def test_iam_monoterpenoides():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
# parameters for GED function from the IAM paper.
# fitted edit costs (Gaussian).
c_vi = 0.03620133402089074
c_vr = 0.0417574590207099
c_vs = 0.009992282328587499
c_ei = 0.08293120042342755
c_er = 0.09512220476358019
c_es = 0.09222529696841467
# # fitted edit costs (linear combinations).
# c_vi = 0.1749684054238749
# c_vr = 0.0734054228711457
# c_vs = 0.05017781726016715
# c_ei = 0.1869431164806936
# c_er = 0.32055856948274
# c_es = 0.2569469379247611
# # unfitted edit costs.
# c_vi = 3
# c_vr = 3
# c_vs = 1
# c_ei = 3
# c_er = 3
# c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
# edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# classify graphs according to letters.
time_list = []
dis_ks_min_list = []
dis_ks_set_median_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
sod_list_list = []
idx_dict = get_same_item_indices(y_all)
for y_class in idx_dict:
print('\n-------------------------------------------------------')
print('class of y:', y_class)
Gn_class = [Gn[i].copy() for i in idx_dict[y_class]]
time_list.append([])
dis_ks_min_list.append([])
dis_ks_set_median_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(50):
idx_rdm = random.sample(range(len(Gn_class)), 10)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn_class[idx].copy() for idx in idx_rdm]
Gn_candidate = [g.copy() for g in Gn_median]
alpha_range = [1 / len(Gn_median)] * len(Gn_median)
time0 = time.time()
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(G_gen_median_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_gen_median)
print('\nsmallest sod in graph space:', sod_gen_median)
sod_list_list.append(sod_list)
# show the best graph and save it to file.
print('one of the possible corresponding pre-images is')
nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
with_labels=True)
# plt.show()
# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
# plt.savefig('results/iam/paper_compare/monoter_y' + str(y_class) +
# '_repeat' + str(repeat) + '_' + str(time.time()) +
# '.png', format="PNG")
plt.clf()
# print(G_gen_median_list[0].nodes(data=True))
# print(G_gen_median_list[0].edges(data=True))
# compute distance between \psi and the set median graph.
knew_set_median = compute_kernel(G_set_median_list + Gn_median,
gkernel, node_label, edge_label, False)
dhat_new_set_median_list = []
for idx, g_tmp in enumerate(G_set_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list),
len(G_set_median_list) + len(Gn_median) + 1),
alpha_range, knew_set_median, withterm3=False))
print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0])
dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0])
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
edge_label, False)
dhat_new_list = []
for idx, g_tmp in enumerate(G_gen_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
len(G_gen_median_list) + len(Gn_median) + 1),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list[-1].append(dhat_new_list[0])

print('\nsods of the set median for this class:', sod_set_median_list[-1])
print('\nsods in graph space for this class:', sod_gs_list[-1])
print('\ndistance in kernel space of set median for this class:',
dis_ks_set_median_list[-1])
print('\nsmallest distances in kernel space for this class:',
dis_ks_min_list[-1])
print('\ntimes for this class:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print()
print('\nmean sods of the set median for each class:', sod_set_median_list)
print('\nmean sods in graph space for each class:', sod_gs_list)
print('\ndistances in kernel space of set median for each class:',
dis_ks_set_median_list)
print('\nmean smallest distances in kernel space for each class:',
dis_ks_min_list)
print('\nmean times for each class:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
print('\nmean distances in kernel space of set median of all:',
np.mean(dis_ks_set_median_list))
print('\nmean smallest distances in kernel space of all:',
np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))
nb_better_sods = 0
nb_worse_sods = 0
nb_same_sods = 0
for sods in sod_list_list:
if sods[0] > sods[-1]:
nb_better_sods += 1
elif sods[0] < sods[-1]:
nb_worse_sods += 1
else:
nb_same_sods += 1
print('\n In', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
'are getting better,', str(nb_worse_sods), 'are getting worse,',
str(nb_same_sods), 'are not changed; ', str(nb_better_sods / len(sod_list_list)),
'sods are improved.')


def test_iam_mutag():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
# parameters for GED function from the IAM paper.
# fitted edit costs.
c_vi = 0.03523843108436513
c_vr = 0.03347339739350128
c_vs = 0.06871290673612238
c_ei = 0.08591999846720685
c_er = 0.07962086440894103
c_es = 0.08596855855478233
# unfitted edit costs.
# c_vi = 3
# c_vr = 3
# c_vs = 1
# c_ei = 3
# c_er = 3
# c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
# edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# classify graphs according to letters.
time_list = []
dis_ks_min_list = []
dis_ks_set_median_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
sod_list_list = []
idx_dict = get_same_item_indices(y_all)
for y_class in idx_dict:
print('\n-------------------------------------------------------')
print('class of y:', y_class)
Gn_class = [Gn[i].copy() for i in idx_dict[y_class]]
time_list.append([])
dis_ks_min_list.append([])
dis_ks_set_median_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(50):
idx_rdm = random.sample(range(len(Gn_class)), 10)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn_class[idx].copy() for idx in idx_rdm]
Gn_candidate = [g.copy() for g in Gn_median]
alpha_range = [1 / len(Gn_median)] * len(Gn_median)
time0 = time.time()
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(G_gen_median_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_gen_median)
print('\nsmallest sod in graph space:', sod_gen_median)
sod_list_list.append(sod_list)
# show the best graph and save it to file.
print('one of the possible corresponding pre-images is')
nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
with_labels=True)
# plt.show()
# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
# plt.savefig('results/iam/paper_compare/mutag_y' + str(y_class) +
# '_repeat' + str(repeat) + '_' + str(time.time()) +
# '.png', format="PNG")
plt.clf()
# print(G_gen_median_list[0].nodes(data=True))
# print(G_gen_median_list[0].edges(data=True))
# compute distance between \psi and the set median graph.
knew_set_median = compute_kernel(G_set_median_list + Gn_median,
gkernel, node_label, edge_label, False)
dhat_new_set_median_list = []
for idx, g_tmp in enumerate(G_set_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_set_median_list.append(dis_gstar(idx, range(len(G_set_median_list),
len(G_set_median_list) + len(Gn_median) + 1),
alpha_range, knew_set_median, withterm3=False))
print('\ndistance in kernel space of set median: ', dhat_new_set_median_list[0])
dis_ks_set_median_list[-1].append(dhat_new_set_median_list[0])
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
edge_label, False)
dhat_new_list = []
for idx, g_tmp in enumerate(G_gen_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
len(G_gen_median_list) + len(Gn_median) + 1),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list[-1].append(dhat_new_list[0])

print('\nsods of the set median for this class:', sod_set_median_list[-1])
print('\nsods in graph space for this class:', sod_gs_list[-1])
print('\ndistance in kernel space of set median for this class:',
dis_ks_set_median_list[-1])
print('\nsmallest distances in kernel space for this class:',
dis_ks_min_list[-1])
print('\ntimes for this class:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
dis_ks_set_median_list[-1] = np.mean(dis_ks_set_median_list[-1])
dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print()
print('\nmean sods of the set median for each class:', sod_set_median_list)
print('\nmean sods in graph space for each class:', sod_gs_list)
print('\ndistances in kernel space of set median for each class:',
dis_ks_set_median_list)
print('\nmean smallest distances in kernel space for each class:',
dis_ks_min_list)
print('\nmean times for each class:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
print('\nmean distances in kernel space of set median of all:',
np.mean(dis_ks_set_median_list))
print('\nmean smallest distances in kernel space of all:',
np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))
nb_better_sods = 0
nb_worse_sods = 0
nb_same_sods = 0
for sods in sod_list_list:
if sods[0] > sods[-1]:
nb_better_sods += 1
elif sods[0] < sods[-1]:
nb_worse_sods += 1
else:
nb_same_sods += 1
print('\n In', str(len(sod_list_list)), 'sod lists,', str(nb_better_sods),
'are getting better,', str(nb_worse_sods), 'are getting worse,',
str(nb_same_sods), 'are not changed; ', str(nb_better_sods / len(sod_list_list)),
'sods are improved.')

###############################################################################
# tests on different numbers of median-sets.

def test_iam_median_nb():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
# # parameters for GED function
# c_vi = 0.037
# c_vr = 0.038
# c_vs = 0.075
# c_ei = 0.001
# c_er = 0.001
# c_es = 0.0
# ite_max_iam = 50
# epsilon_iam = 0.001
# removeNodes = False
# connected_iam = False
# # parameters for IAM function
# ged_cost = 'CONSTANT'
# ged_method = 'IPFP'
# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
# ged_stabilizer = 'min'
# ged_repeat = 50
# params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
# 'edit_cost_constant': edit_cost_constant,
# 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# parameters for GED function
c_vi = 4
c_vr = 4
c_vs = 2
c_ei = 1
c_er = 1
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# parameters for IAM function
ged_cost = 'CHEM_1'
ged_method = 'IPFP'
edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# find all graphs classified into the positive class (label 1).
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# number of graphs whose median we want to compute.
# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
nb_median_range = [len(Gn)]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
sod_gs_list = []
# sod_gs_min_list = []
# nb_updated_list = []
# nb_updated_k_list = []
g_best = []
for nb_median in nb_median_range:
print('\n-------------------------------------------------------')
print('number of median graphs =', nb_median)
random.seed(1)
idx_rdm = random.sample(range(len(Gn)), nb_median)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn[idx].copy() for idx in idx_rdm]
Gn_candidate = [g.copy() for g in Gn]
# for g in Gn_median:
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
# plt.show()
# plt.clf()
###################################################################
# gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
# km_tmp = gmfile['gm']
# time_km = gmfile['gmtime']
# # modify mixed gram matrix.
# km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
# for i in range(len(Gn)):
# for j in range(i, len(Gn)):
# km[i, j] = km_tmp[i, j]
# km[j, i] = km[i, j]
# for i in range(len(Gn)):
# for j, idx in enumerate(idx_rdm):
# km[i, len(Gn) + j] = km[i, idx]
# km[len(Gn) + j, i] = km[i, idx]
# for i, idx1 in enumerate(idx_rdm):
# for j, idx2 in enumerate(idx_rdm):
# km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
###################################################################
alpha_range = [1 / nb_median] * nb_median
time0 = time.time()
ghat_new_list, sod_min = iam_upgraded(Gn_median, Gn_candidate,
c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list.append(time_total)
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
dhat_new_list = []
for idx, g_tmp in enumerate(ghat_new_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
len(ghat_new_list) + len(Gn_median) + 1),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list.append(dhat_new_list[0])
g_best.append(ghat_new_list[0])
# show the best graph and save it to file.
# print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(ghat_new_list[0], labels=nx.get_node_attributes(ghat_new_list[0], 'atom'),
with_labels=True)
plt.show()
# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) +
'.png', format="PNG")
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
sod_gs_list.append(sod_min)
# sod_gs_min_list.append(np.min(sod_min))
print('\nsmallest sod in graph space: ', sod_min)
print('\nsods in graph space: ', sod_gs_list)
# print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs: ',
dis_ks_min_list)
# print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
# nb_updated_list)
# print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
# nb_updated_k_list)
print('\ntimes:', time_list)

def test_iam_letter_h():
from median import draw_Letter_graph
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
# Gn = Gn[0:50]
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gkernel = 'structuralspkernel'
# parameters for GED function from the IAM paper.
c_vi = 3
c_vr = 3
c_vs = 1
c_ei = 3
c_er = 3
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# parameters for IAM function
# ged_cost = 'CONSTANT'
ged_cost = 'LETTER'
ged_method = 'IPFP'
# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# classify graphs according to letters.
time_list = []
dis_ks_min_list = []
sod_gs_list = []
g_best = []
sod_set_median_list = []
idx_dict = get_same_item_indices(y_all)
for letter in idx_dict:
print('\n-------------------------------------------------------')
print('letter', letter)
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
time_list.append([])
dis_ks_min_list.append([])
sod_gs_list.append([])
g_best.append([])
sod_set_median_list.append([])
for repeat in range(50):
idx_rdm = random.sample(range(len(Gn_let)), 50)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn_let[idx].copy() for idx in idx_rdm]
Gn_candidate = [g.copy() for g in Gn_median]
alpha_range = [1 / len(Gn_median)] * len(Gn_median)
time0 = time.time()
ghat_new_list, sod_min, sod_set_median = iam_upgraded(Gn_median,
Gn_candidate, c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list[-1].append(time_total)
g_best[-1].append(ghat_new_list[0])
sod_set_median_list[-1].append(sod_set_median)
print('\nsmallest sod of the set median:', sod_set_median)
sod_gs_list[-1].append(sod_min)
print('\nsmallest sod in graph space:', sod_min)
# show the best graph and save it to file.
print('one of the possible corresponding pre-images is')
draw_Letter_graph(ghat_new_list[0], savepath='results/iam/paper_compare/')
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(ghat_new_list + Gn_median, gkernel, False)
dhat_new_list = []
for idx, g_tmp in enumerate(ghat_new_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(ghat_new_list),
len(ghat_new_list) + len(Gn_median) + 1),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list[-1].append(dhat_new_list[0])
print('\nsods of the set median for this letter:', sod_set_median_list[-1])
print('\nsods in graph space for this letter:', sod_gs_list[-1])
print('\nsmallest distances in kernel space for this letter:',
dis_ks_min_list[-1])
print('\ntimes for this letter:', time_list[-1])
sod_set_median_list[-1] = np.mean(sod_set_median_list[-1])
sod_gs_list[-1] = np.mean(sod_gs_list[-1])
dis_ks_min_list[-1] = np.mean(dis_ks_min_list[-1])
time_list[-1] = np.mean(time_list[-1])
print('\nmean sods of the set median for each letter:', sod_set_median_list)
print('\nmean sods in graph space for each letter:', sod_gs_list)
print('\nmean smallest distances in kernel space for each letter:',
dis_ks_min_list)
print('\nmean times for each letter:', time_list)
print('\nmean sods of the set median of all:', np.mean(sod_set_median_list))
print('\nmean sods in graph space of all:', np.mean(sod_gs_list))
print('\nmean smallest distances in kernel space of all:',
np.mean(dis_ks_min_list))
print('\nmean times of all:', np.mean(time_list))




def test_iam_fitdistance():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
# remove_edges(Gn)
gkernel = 'marginalizedkernel'
node_label = 'atom'
edge_label = 'bond_type'
# lmbda = 0.03 # termination probability
# # parameters for GED function
# c_vi = 0.037
# c_vr = 0.038
# c_vs = 0.075
# c_ei = 0.001
# c_er = 0.001
# c_es = 0.0
# ite_max_iam = 50
# epsilon_iam = 0.001
# removeNodes = False
# connected_iam = False
# # parameters for IAM function
# ged_cost = 'CONSTANT'
# ged_method = 'IPFP'
# edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
# ged_stabilizer = 'min'
# ged_repeat = 50
# params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
# 'edit_cost_constant': edit_cost_constant,
# 'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# parameters for GED function
c_vi = 4
c_vr = 4
c_vs = 2
c_ei = 1
c_er = 1
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = False
connected_iam = False
# parameters for IAM function
ged_cost = 'CHEM_1'
ged_method = 'IPFP'
edit_cost_constant = []
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# find all graphs classified into the positive class (label 1).
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# number of graphs whose median we want to compute.
# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
nb_median_range = [10]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
dis_ks_gen_median_list = []
sod_gs_list = []
# sod_gs_min_list = []
# nb_updated_list = []
# nb_updated_k_list = []
g_best = []
for nb_median in nb_median_range:
print('\n-------------------------------------------------------')
print('number of median graphs =', nb_median)
random.seed(1)
idx_rdm = random.sample(range(len(Gn)), nb_median)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn[idx].copy() for idx in idx_rdm]
Gn_candidate = [g.copy() for g in Gn_median]
# for g in Gn_median:
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
# plt.show()
# plt.clf()
###################################################################
# gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
# km_tmp = gmfile['gm']
# time_km = gmfile['gmtime']
# # modify mixed gram matrix.
# km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
# for i in range(len(Gn)):
# for j in range(i, len(Gn)):
# km[i, j] = km_tmp[i, j]
# km[j, i] = km[i, j]
# for i in range(len(Gn)):
# for j, idx in enumerate(idx_rdm):
# km[i, len(Gn) + j] = km[i, idx]
# km[len(Gn) + j, i] = km[i, idx]
# for i, idx1 in enumerate(idx_rdm):
# for j, idx2 in enumerate(idx_rdm):
# km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
###################################################################
alpha_range = [1 / nb_median] * nb_median
time0 = time.time()
G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median \
= iam_upgraded(Gn_median, Gn_candidate,
c_ei=c_ei, c_er=c_er, c_es=c_es, ite_max=ite_max_iam,
epsilon=epsilon_iam, connected=connected_iam, removeNodes=removeNodes,
params_ged=params_ged)
time_total = time.time() - time0
print('\ntime: ', time_total)
time_list.append(time_total)
# compute distance between \psi and the new generated graphs.
knew = compute_kernel(G_gen_median_list + Gn_median, gkernel, node_label,
edge_label, False)
dhat_new_list = []
for idx, g_tmp in enumerate(G_gen_median_list):
# @todo: the term3 below could use the one at the beginning of the function.
dhat_new_list.append(dis_gstar(idx, range(len(G_gen_median_list),
len(G_gen_median_list) + len(Gn_median) + 1),
alpha_range, knew, withterm3=False))
print('\nsmallest distance in kernel space: ', dhat_new_list[0])
dis_ks_min_list.append(dhat_new_list[0])
g_best.append(G_gen_median_list[0])
# show the best graph and save it to file.
# print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(G_gen_median_list[0], labels=nx.get_node_attributes(G_gen_median_list[0], 'atom'),
with_labels=True)
plt.show()
# plt.savefig('results/iam/mutag_median.fit_costs2.001.nb' + str(nb_median) +
# plt.savefig('results/iam/mutag_median_unfit2.nb' + str(nb_median) +
# '.png', format="PNG")
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
sod_gs_list.append(sod_gen_median)
# sod_gs_min_list.append(np.min(sod_gen_median))
print('\nsmallest sod in graph space: ', sod_gen_median)
print('\nsmallest sod of set median in graph space: ', sod_set_median)
print('\nsods in graph space: ', sod_gs_list)
# print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs: ',
dis_ks_min_list)
# print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
# nb_updated_list)
# print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
# nb_updated_k_list)
print('\ntimes:', time_list)
###############################################################################

if __name__ == '__main__':
###############################################################################
# tests on different numbers of median-sets.
# test_iam_median_nb()
# test_iam_letter_h()
# test_iam_monoterpenoides()
# test_iam_mutag()
# test_iam_fitdistance()
# print("test log")
test_iam_monoterpenoides_with_init40()

+ 462
- 0
gklearn/preimage/test_k_closest_graphs.py View File

@@ -0,0 +1,462 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 16 11:53:54 2019

@author: ljia
"""
import os
import sys
import numpy as np
import math
import networkx as nx
import matplotlib.pyplot as plt
import time
import random
from tqdm import tqdm
from itertools import combinations, islice
import multiprocessing
from multiprocessing import Pool
from functools import partial

from gklearn.utils.graphfiles import loadDataset, loadGXL
#from gklearn.utils.logger2file import *
from gklearn.preimage.iam import iam_upgraded, iam_bash
from gklearn.preimage.utils import compute_kernel, dis_gstar, kernel_distance_matrix
from gklearn.preimage.fitDistance import fit_GED_to_kernel_distance
#from gklearn.preimage.ged import ged_median


def fit_edit_cost_constants(fit_method, edit_cost_name,
edit_cost_constants=None, initial_solutions=1,
Gn_median=None, node_label=None, edge_label=None,
gkernel=None, dataset=None, init_ecc=None,
Gn=None, Kmatrix_median=None):
"""fit edit cost constants.
"""
if fit_method == 'random': # random
if edit_cost_name == 'LETTER':
edit_cost_constants = random.sample(range(1, 10), 3)
edit_cost_constants = [item * 0.1 for item in edit_cost_constants]
elif edit_cost_name == 'LETTER2':
random.seed(time.time())
edit_cost_constants = random.sample(range(1, 10), 5)
# edit_cost_constants = [item * 0.1 for item in edit_cost_constants]
elif edit_cost_name == 'NON_SYMBOLIC':
edit_cost_constants = random.sample(range(1, 10), 6)
if Gn_median[0].graph['node_attrs'] == []:
edit_cost_constants[2] = 0
if Gn_median[0].graph['edge_attrs'] == []:
edit_cost_constants[5] = 0
else:
edit_cost_constants = random.sample(range(1, 10), 6)
print('edit cost constants used:', edit_cost_constants)
elif fit_method == 'expert': # expert
if init_ecc is None:
if edit_cost_name == 'LETTER':
edit_cost_constants = [0.9, 1.7, 0.75]
elif edit_cost_name == 'LETTER2':
edit_cost_constants = [0.675, 0.675, 0.75, 0.425, 0.425]
else:
edit_cost_constants = [3, 3, 1, 3, 3, 1]
else:
edit_cost_constants = init_ecc
elif fit_method == 'k-graphs':
itr_max = 6
if init_ecc is None:
if edit_cost_name == 'LETTER':
init_costs = [0.9, 1.7, 0.75]
elif edit_cost_name == 'LETTER2':
init_costs = [0.675, 0.675, 0.75, 0.425, 0.425]
elif edit_cost_name == 'NON_SYMBOLIC':
init_costs = [0, 0, 1, 1, 1, 0]
if Gn_median[0].graph['node_attrs'] == []:
init_costs[2] = 0
if Gn_median[0].graph['edge_attrs'] == []:
init_costs[5] = 0
else:
init_costs = [3, 3, 1, 3, 3, 1]
else:
init_costs = init_ecc
algo_options = '--threads 1 --initial-solutions ' \
+ str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': edit_cost_name, 'method': 'IPFP',
'algo_options': algo_options, 'stabilizer': None}
# fit on k-graph subset
edit_cost_constants, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn_median,
node_label, edge_label, gkernel, itr_max, params_ged=params_ged,
init_costs=init_costs, dataset=dataset, Kmatrix=Kmatrix_median,
parallel=True)
elif fit_method == 'whole-dataset':
itr_max = 6
if init_ecc is None:
if edit_cost_name == 'LETTER':
init_costs = [0.9, 1.7, 0.75]
elif edit_cost_name == 'LETTER2':
init_costs = [0.675, 0.675, 0.75, 0.425, 0.425]
else:
init_costs = [3, 3, 1, 3, 3, 1]
else:
init_costs = init_ecc
algo_options = '--threads 1 --initial-solutions ' \
+ str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1'
params_ged = {'lib': 'gedlibpy', 'cost': edit_cost_name, 'method': 'IPFP',
'algo_options': algo_options, 'stabilizer': None}
# fit on the whole dataset
edit_cost_constants, _, _, _, _, _, _ = fit_GED_to_kernel_distance(Gn,
node_label, edge_label, gkernel, itr_max, params_ged=params_ged,
init_costs=init_costs, dataset=dataset, parallel=True)
elif fit_method == 'precomputed':
pass
return edit_cost_constants
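
# Illustrative usage sketch (added for clarity, not part of the original file):
# the 'expert' branch simply returns hard-coded constants for the chosen cost
# model, so it needs no dataset; the 'k-graphs' / 'whole-dataset' branches run
# fit_GED_to_kernel_distance on the median set or on the full graph list.
def _example_fit_expert_costs():
    # For the LETTER cost model the expert constants defined above are
    # [0.9, 1.7, 0.75].
    costs = fit_edit_cost_constants('expert', 'LETTER')
    print('expert LETTER costs:', costs)
    return costs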


def compute_distances_to_true_median(Gn_median, fname_sm, fname_gm,
gkernel, edit_cost_name,
Kmatrix_median=None):
# reform graphs.
set_median = loadGXL(fname_sm)
gen_median = loadGXL(fname_gm)
# print(gen_median.nodes(data=True))
# print(gen_median.edges(data=True))
if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2' or edit_cost_name == 'NON_SYMBOLIC':
# dataset == 'Fingerprint':
# for g in Gn_median:
# reform_attributes(g)
reform_attributes(set_median, Gn_median[0].graph['node_attrs'],
Gn_median[0].graph['edge_attrs'])
reform_attributes(gen_median, Gn_median[0].graph['node_attrs'],
Gn_median[0].graph['edge_attrs'])
if edit_cost_name == 'LETTER' or edit_cost_name == 'LETTER2' or edit_cost_name == 'NON_SYMBOLIC':
node_label = None
edge_label = None
else:
node_label = 'chem'
edge_label = 'valence'
# compute Gram matrix for median set.
if Kmatrix_median is None:
Kmatrix_median = compute_kernel(Gn_median, gkernel, node_label, edge_label, False)
# compute distance in kernel space for set median.
kernel_sm = []
for G_median in Gn_median:
km_tmp = compute_kernel([set_median, G_median], gkernel, node_label, edge_label, False)
kernel_sm.append(km_tmp[0, 1])
Kmatrix_sm = np.concatenate((np.array([kernel_sm]), np.copy(Kmatrix_median)), axis=0)
Kmatrix_sm = np.concatenate((np.array([[km_tmp[0, 0]] + kernel_sm]).T, Kmatrix_sm), axis=1)
# Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel,
# node_label, edge_label, False)
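# Note (added for clarity): the two np.concatenate calls above border the
# precomputed Gram matrix of the median set with the kernel values of the
# candidate median; e.g. for two median graphs with
#   Kmatrix_median = [[k11, k12], [k12, k22]] and kernel_sm = [k_m1, k_m2],
# the augmented matrix becomes
#   [[k_mm, k_m1, k_m2],
#    [k_m1, k11,  k12 ],
#    [k_m2, k12,  k22 ]]
# so that index 0 refers to the set median inside dis_gstar below.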
dis_k_sm = dis_gstar(0, range(1, 1+len(Gn_median)),
[1 / len(Gn_median)] * len(Gn_median), Kmatrix_sm, withterm3=False)
# print(gen_median.nodes(data=True))
# print(gen_median.edges(data=True))
# print(set_median.nodes(data=True))
# print(set_median.edges(data=True))
# compute distance in kernel space for generalized median.
kernel_gm = []
for G_median in Gn_median:
km_tmp = compute_kernel([gen_median, G_median], gkernel, node_label, edge_label, False)
kernel_gm.append(km_tmp[0, 1])
Kmatrix_gm = np.concatenate((np.array([kernel_gm]), np.copy(Kmatrix_median)), axis=0)
Kmatrix_gm = np.concatenate((np.array([[km_tmp[0, 0]] + kernel_gm]).T, Kmatrix_gm), axis=1)
# Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel,
# node_label, edge_label, False)
dis_k_gm = dis_gstar(0, range(1, 1+len(Gn_median)),
[1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False)
# compute distance in kernel space for each graph in median set.
dis_k_gi = []
for idx in range(len(Gn_median)):
dis_k_gi.append(dis_gstar(idx+1, range(1, 1+len(Gn_median)),
[1 / len(Gn_median)] * len(Gn_median), Kmatrix_gm, withterm3=False))

print('dis_k_sm:', dis_k_sm)
print('dis_k_gm:', dis_k_gm)
print('dis_k_gi:', dis_k_gi)
idx_dis_k_gi_min = np.argmin(dis_k_gi)
dis_k_gi_min = dis_k_gi[idx_dis_k_gi_min]
print('min dis_k_gi:', dis_k_gi_min)
return dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min


def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method,
graph_dir=None, initial_solutions=1,
edit_cost_constants=None, group_min=None,
dataset=None, edit_cost_name=None, init_ecc=None,
Kmatrix=None, parallel=True):
# dataset = dataset.lower()
# # compute distances in kernel space.
# dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
# Kmatrix=None, gkernel=gkernel)
# # ged.
# gmfile = np.load('results/test_k_closest_graphs/ged_mat.fit_on_whole_dataset.with_medians.gm.npz')
# ged_mat = gmfile['ged_mat']
# dis_mat = ged_mat[0:len(Gn), 0:len(Gn)]
# # choose k closest graphs
# time0 = time.time()
# sod_ks_min, group_min = get_closest_k_graphs(dis_mat, k, parallel)
# time_spent = time.time() - time0
# print('closest graphs:', sod_ks_min, group_min)
# print('time spent:', time_spent)
# group_min = (12, 13, 22, 29) # closest w.r.t path kernel
# group_min = (77, 85, 160, 171) # closest w.r.t ged
# group_min = (0,1,2,3,4,5,6,7,8,9,10,11) # closest w.r.t treelet kernel
Gn_median = [Gn[g].copy() for g in group_min]
if Kmatrix is not None:
Kmatrix_median = np.copy(Kmatrix[group_min,:])
Kmatrix_median = Kmatrix_median[:,group_min]
else:
Kmatrix_median = None

# 1. fit edit cost constants.
time0 = time.time()
edit_cost_constants = fit_edit_cost_constants(fit_method, edit_cost_name,
edit_cost_constants=edit_cost_constants, initial_solutions=initial_solutions,
Gn_median=Gn_median, node_label=node_label, edge_label=edge_label,
gkernel=gkernel, dataset=dataset, init_ecc=init_ecc,
Gn=Gn, Kmatrix_median=Kmatrix_median)
time_fitting = time.time() - time0
# 2. compute set median and gen median using IAM (C++ through bash).
print('\nstart computing set median and gen median using IAM (C++ through bash)...\n')
group_fnames = [Gn[g].graph['filename'] for g in group_min]
time0 = time.time()
sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constants,
cost=edit_cost_name, initial_solutions=initial_solutions,
graph_dir=graph_dir, dataset=dataset)
time_generating = time.time() - time0
print('\nmedians computed.\n')
# 3. compute distances to real median.
print('\nstart computing distances to true median....\n')
Gn_median = [Gn[g].copy() for g in group_min]
dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min = \
compute_distances_to_true_median(Gn_median, fname_sm, fname_gm,
gkernel, edit_cost_name,
Kmatrix_median=Kmatrix_median)
idx_dis_k_gi_min = group_min[idx_dis_k_gi_min]
print('index min dis_k_gi:', idx_dis_k_gi_min)
print('sod_sm:', sod_sm)
print('sod_gm:', sod_gm)
# collect return values.
return (sod_sm, sod_gm), \
(dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \
(time_fitting, time_generating)


def reform_attributes(G, na_names=[], ea_names=[]):
if not na_names == []:
for node in G.nodes:
G.nodes[node]['attributes'] = [G.nodes[node][a_name] for a_name in na_names]
if not ea_names == []:
for edge in G.edges:
G.edges[edge]['attributes'] = [G.edges[edge][a_name] for a_name in ea_names]
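
# Illustrative sketch (added for clarity, not part of the original file):
# reform_attributes packs the named node/edge attributes into a single
# 'attributes' list, the format expected by the non-symbolic cost models.
def _example_reform_attributes():
    g = nx.Graph()
    g.add_node(0, x='1.5', y='2.0')
    reform_attributes(g, na_names=['x', 'y'])
    assert g.nodes[0]['attributes'] == ['1.5', '2.0']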


def get_closest_k_graphs(dis_mat, k, parallel):
k_graph_groups = combinations(range(0, len(dis_mat)), k)
sod_ks_min = np.inf
if parallel:
len_combination = get_combination_length(len(dis_mat), k)
len_itr_max = int(len_combination if len_combination < 1e7 else 1e7)
# pos_cur = 0
graph_groups_slices = split_iterable(k_graph_groups, len_itr_max, len_combination)
for graph_groups_cur in graph_groups_slices:
# while True:
# graph_groups_cur = islice(k_graph_groups, pos_cur, pos_cur + len_itr_max)
graph_groups_cur_list = list(graph_groups_cur)
print('current position:', graph_groups_cur_list[0])
len_itr_cur = len(graph_groups_cur_list)
# if len_itr_cur < len_itr_max:
# break

itr = zip(graph_groups_cur_list, range(0, len_itr_cur))
sod_k_list = np.empty(len_itr_cur)
graphs_list = [None] * len_itr_cur
n_jobs = multiprocessing.cpu_count()
chunksize = int(len_itr_max / n_jobs + 1)
n_jobs = multiprocessing.cpu_count()
def init_worker(dis_mat_toshare):
global G_dis_mat
G_dis_mat = dis_mat_toshare
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(dis_mat,))
# iterator = tqdm(pool.imap_unordered(_get_closest_k_graphs_parallel,
# itr, chunksize),
# desc='Choosing k closest graphs', file=sys.stdout)
iterator = pool.imap_unordered(_get_closest_k_graphs_parallel, itr, chunksize)
for graphs, i, sod_ks in iterator:
sod_k_list[i] = sod_ks
graphs_list[i] = graphs
pool.close()
pool.join()
arg_min = np.argmin(sod_k_list)
sod_ks_cur = sod_k_list[arg_min]
group_cur = graphs_list[arg_min]
if sod_ks_cur < sod_ks_min:
sod_ks_min = sod_ks_cur
group_min = group_cur
print('get closer graphs:', sod_ks_min, group_min)
else:
for items in tqdm(k_graph_groups, desc='Choosing k closest graphs', file=sys.stdout):
# if items[0] != itmp:
# itmp = items[0]
# print(items)
k_graph_pairs = combinations(items, 2)
sod_ks = 0
for i1, i2 in k_graph_pairs:
sod_ks += dis_mat[i1, i2]
if sod_ks < sod_ks_min:
sod_ks_min = sod_ks
group_min = items
print('get closer graphs:', sod_ks_min, group_min)
return sod_ks_min, group_min


def _get_closest_k_graphs_parallel(itr):
k_graph_pairs = combinations(itr[0], 2)
sod_ks = 0
for i1, i2 in k_graph_pairs:
sod_ks += G_dis_mat[i1, i2]

return itr[0], itr[1], sod_ks

def split_iterable(iterable, n, len_iter):
it = iter(iterable)
for i in range(0, len_iter, n):
piece = islice(it, n)
yield piece
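
# Illustrative sketch (added for clarity, not part of the original file):
# split_iterable lazily cuts an iterator into pieces of at most n items, which
# get_closest_k_graphs uses to keep only ~1e7 combinations in memory at a time.
def _example_split_iterable():
    chunks = [list(piece) for piece in split_iterable(range(7), 3, 7)]
    assert chunks == [[0, 1, 2], [3, 4, 5], [6]]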


def get_combination_length(n, k):
len_combination = 1
for i in range(n, n - k, -1):
len_combination *= i
return int(len_combination / math.factorial(k))
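
# Illustrative sanity check (added for clarity, not part of the original file):
# get_combination_length(n, k) computes the binomial coefficient C(n, k), i.e.
# the number of k-graph groups enumerated in get_closest_k_graphs.
def _example_combination_length():
    assert get_combination_length(10, 3) == 120  # 10 * 9 * 8 / 3! = 120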


###############################################################################

def test_k_closest_graphs():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
# gkernel = 'untilhpathkernel'
# gkernel = 'weisfeilerlehmankernel'
gkernel = 'treeletkernel'
node_label = 'atom'
edge_label = 'bond_type'
k = 5
edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
# sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
# = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
# 'precomputed', edit_costs=edit_costs,
## 'k-graphs',
# parallel=False)
#
# sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
# = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
# 'expert', parallel=False)
sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
= median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
'expert', parallel=False)
return


def test_k_closest_graphs_with_cv():
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
k = 4
y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
repeats = 50
collection_path = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/monoterpenoides/'
graph_dir = collection_path + 'gxl/'
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
for y in y_all:
print('\n-------------------------------------------------------')
print('class of y:', y)
sod_sm_list.append([])
sod_gm_list.append([])
dis_k_sm_list.append([])
dis_k_gm_list.append([])
dis_k_gi_min_list.append([])
for repeat in range(repeats):
print('\nrepeat ', repeat)
collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min \
= median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel,
k, 'whole-dataset', graph_dir=graph_dir,
parallel=False)
sod_sm_list[-1].append(sod_sm)
sod_gm_list[-1].append(sod_gm)
dis_k_sm_list[-1].append(dis_k_sm)
dis_k_gm_list[-1].append(dis_k_gm)
dis_k_gi_min_list[-1].append(dis_k_gi_min)
print('\nsods of the set median for this class:', sod_sm_list[-1])
print('\nsods of the gen median for this class:', sod_gm_list[-1])
print('\ndistances in kernel space of set median for this class:',
dis_k_sm_list[-1])
print('\ndistances in kernel space of gen median for this class:',
dis_k_gm_list[-1])
print('\ndistances in kernel space of min graph for this class:',
dis_k_gi_min_list[-1])
sod_sm_list[-1] = np.mean(sod_sm_list[-1])
sod_gm_list[-1] = np.mean(sod_gm_list[-1])
dis_k_sm_list[-1] = np.mean(dis_k_sm_list[-1])
dis_k_gm_list[-1] = np.mean(dis_k_gm_list[-1])
dis_k_gi_min_list[-1] = np.mean(dis_k_gi_min_list[-1])
print()
print('\nmean sods of the set median for each class:', sod_sm_list)
print('\nmean sods of the gen median for each class:', sod_gm_list)
print('\nmean distance in kernel space of set median for each class:',
dis_k_sm_list)
print('\nmean distances in kernel space of gen median for each class:',
dis_k_gm_list)
print('\nmean distances in kernel space of min graph for each class:',
dis_k_gi_min_list)
print('\nmean sods of the set median of all:', np.mean(sod_sm_list))
print('\nmean sods of the gen median of all:', np.mean(sod_gm_list))
print('\nmean distances in kernel space of set median of all:',
np.mean(dis_k_sm_list))
print('\nmean distances in kernel space of gen median of all:',
np.mean(dis_k_gm_list))
print('\nmean distances in kernel space of min graph of all:',
np.mean(dis_k_gi_min_list))
return

if __name__ == '__main__':
test_k_closest_graphs()
# test_k_closest_graphs_with_cv()

+ 91
- 0
gklearn/preimage/test_median_graph_estimator.py View File

@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 16 17:26:40 2020

@author: ljia
"""
def test_median_graph_estimator():
from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.median_graph_estimator import MedianGraphEstimator
from gklearn.gedlib import librariesImport, gedlibpy
from gklearn.preimage.utils import get_same_item_indices
from gklearn.preimage.ged import convertGraph
import multiprocessing

# estimator parameters.
init_type = 'MEDOID'
num_inits = 1
threads = multiprocessing.cpu_count()
time_limit = 60000
# algorithm parameters.
algo = 'IPFP'
initial_solutions = 40
algo_options_suffix = ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1'

edit_cost_name = 'LETTER2'
edit_cost_constants = [0.02987291, 0.0178211, 0.01431966, 0.001, 0.001]
ds_name = 'COIL-DEL'
# Load dataset.
# dataset = '../../datasets/COIL-DEL/COIL-DEL_A.txt'
dataset = '../../datasets/Letter-high/Letter-high_A.txt'
Gn, y_all = loadDataset(dataset)
y_idx = get_same_item_indices(y_all)
for i, (y, values) in enumerate(y_idx.items()):
Gn_i = [Gn[val] for val in values]
break
# Set up the environment.
ged_env = gedlibpy.GEDEnv()
# gedlibpy.restart_env()
ged_env.set_edit_cost(edit_cost_name, edit_cost_constant=edit_cost_constants)
for G in Gn_i:
ged_env.add_nx_graph(convertGraph(G, edit_cost_name), '')
graph_ids = ged_env.get_all_graph_ids()
set_median_id = ged_env.add_graph('set_median')
gen_median_id = ged_env.add_graph('gen_median')
ged_env.init(init_option='EAGER_WITHOUT_SHUFFLED_COPIES')
# Set up the estimator.
mge = MedianGraphEstimator(ged_env, constant_node_costs(edit_cost_name))
mge.set_refine_method(algo, '--threads ' + str(threads) + ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1')
mge_options = '--time-limit ' + str(time_limit) + ' --stdout 2 --init-type ' + init_type
mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --refine FALSE'# @todo: std::to_string(rng())
# Select the GED algorithm.
algo_options = '--threads ' + str(threads) + algo_options_suffix
mge.set_options(mge_options)
mge.set_init_method(algo, algo_options)
mge.set_descent_method(algo, algo_options)
# Run the estimator.
mge.run(graph_ids, set_median_id, gen_median_id)
# Get SODs.
sod_sm = mge.get_sum_of_distances('initialized')
sod_gm = mge.get_sum_of_distances('converged')
print('sod_sm, sod_gm: ', sod_sm, sod_gm)
# Get median graphs.
set_median = ged_env.get_nx_graph(set_median_id)
gen_median = ged_env.get_nx_graph(gen_median_id)
return set_median, gen_median


def constant_node_costs(edit_cost_name):
if edit_cost_name == 'NON_SYMBOLIC' or edit_cost_name == 'LETTER2' or edit_cost_name == 'LETTER':
return False
# elif edit_cost_name != '':
# # throw ged::Error("Invalid dataset " + dataset + ". Usage: ./median_tests <AIDS|Mutagenicity|Letter-high|Letter-med|Letter-low|monoterpenoides|SYNTHETICnew|Fingerprint|COIL-DEL>");
# return False
# return True
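
# Note (added for clarity): the LETTER / LETTER2 / NON_SYMBOLIC cost models use
# continuous node attributes, so node substitution costs depend on the attribute
# values and are not constant; for purely symbolic cost models the corresponding
# C++ test code presumably returns True instead (kept commented out above).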


if __name__ == '__main__':
set_median, gen_median = test_median_graph_estimator()

+ 686
- 0
gklearn/preimage/test_others.py View File

@@ -0,0 +1,686 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 4 12:20:16 2019

@author: ljia
"""
import sys
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.median import draw_Letter_graph
from gklearn.preimage.ged import GED, ged_median
from gklearn.preimage.utils import get_same_item_indices, compute_kernel, gram2distances, \
dis_gstar, remove_edges


# --------------------------- These are tests --------------------------------#
def test_who_is_the_closest_in_kernel_space(Gn):
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# create the "median" graph.
gnew = g2.copy()
gnew.remove_node(0)
nx.draw_networkx(gnew)
plt.show()
print(gnew.nodes(data=True))
Gn = [gnew] + Gn
# compute gram matrix
Kmatrix = compute_kernel(Gn, 'untilhpathkernel', True)
# the distance matrix
dmatrix = gram2distances(Kmatrix)
print(np.sort(dmatrix[idx_gi[0] + 1]))
print(np.argsort(dmatrix[idx_gi[0] + 1]))
print(np.sort(dmatrix[idx_gi[1] + 1]))
print(np.argsort(dmatrix[idx_gi[1] + 1]))
# for all g in Gn, compute (d(g1, g) + d(g2, g)) / 2
dis_median = [(dmatrix[i, idx_gi[0] + 1] + dmatrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
print(np.sort(dis_median))
print(np.argsort(dis_median))
return


def test_who_is_the_closest_in_GED_space(Gn):
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# create the "median" graph.
gnew = g2.copy()
gnew.remove_node(0)
nx.draw_networkx(gnew)
plt.show()
print(gnew.nodes(data=True))
Gn = [gnew] + Gn
# compute GEDs
ged_matrix = np.zeros((len(Gn), len(Gn)))
for i1 in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
for i2 in range(len(Gn)):
dis, _, _ = GED(Gn[i1], Gn[i2], lib='gedlib')
ged_matrix[i1, i2] = dis
print(np.sort(ged_matrix[idx_gi[0] + 1]))
print(np.argsort(ged_matrix[idx_gi[0] + 1]))
print(np.sort(ged_matrix[idx_gi[1] + 1]))
print(np.argsort(ged_matrix[idx_gi[1] + 1]))
# for all g in Gn, compute (GED(g1, g) + GED(g2, g)) / 2
dis_median = [(ged_matrix[i, idx_gi[0] + 1] + ged_matrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
print(np.sort(dis_median))
print(np.argsort(dis_median))
return


def test_will_IAM_give_the_median_graph_we_wanted(Gn):
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# del Gn[idx_gi[0]]
# del Gn[idx_gi[1] - 1]
g_median = test_iam_with_more_graphs_as_init([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
# g_median = test_iam_with_more_graphs_as_init(Gn, Gn, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_median)
plt.show()
print(g_median.nodes(data=True))
print(g_median.edges(data=True))

def test_new_IAM_allGraph_deleteNodes(Gn):
idx_gi = [0, 6]
# g1 = Gn[idx_gi[0]].copy()
# g2 = Gn[idx_gi[1]].copy()

# g1 = nx.Graph(name='haha')
# g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
# g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
# g2 = nx.Graph(name='hahaha')
# g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}),
# (3, {'atom': 'O'}), (4, {'atom': 'C'})])
# g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
# (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
g1 = nx.Graph(name='haha')
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
(3, {'atom': 'S'}), (4, {'atom': 'S'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
g2 = nx.Graph(name='hahaha')
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
(3, {'atom': 'O'}), (4, {'atom': 'O'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])

# g2 = g1.copy()
# g2.add_nodes_from([(3, {'atom': 'O'})])
# g2.add_nodes_from([(4, {'atom': 'C'})])
# g2.add_edges_from([(1, 3, {'bond_type': '1'})])
# g2.add_edges_from([(3, 4, {'bond_type': '1'})])

# del Gn[idx_gi[0]]
# del Gn[idx_gi[1] - 1]
nx.draw_networkx(g1)
plt.show()
print(g1.nodes(data=True))
print(g1.edges(data=True))
nx.draw_networkx(g2)
plt.show()
print(g2.nodes(data=True))
print(g2.edges(data=True))
g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
# g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(Gn, Gn, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_median)
plt.show()
print(g_median.nodes(data=True))
print(g_median.edges(data=True))

def test_the_simple_two(Gn, gkernel):
from gk_iam import gk_iam_nearest_multi
lmbda = 0.03 # termination probability
r_max = 10 # recursions
l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
k = 2 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
Gn_mix = [g.copy() for g in Gn]
Gn_mix.append(g1.copy())
Gn_mix.append(g2.copy())
# g_tmp = iam([g1, g2])
# nx.draw_networkx(g_tmp)
# plt.show()
# compute
# k_list = [] # kernel between each graph and itself.
# k_g1_list = [] # kernel between each graph and g1
# k_g2_list = [] # kernel between each graph and g2
# for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
# ktemp = compute_kernel([g, g1, g2], 'marginalizedkernel', False)
# k_list.append(ktemp[0][0, 0])
# k_g1_list.append(ktemp[0][0, 1])
# k_g2_list.append(ktemp[0][0, 2])
km = compute_kernel(Gn_mix, gkernel, True)
# k_list = np.diag(km) # kernel between each graph and itself.
# k_g1_list = km[idx_gi[0]] # kernel between each graph and g1
# k_g2_list = km[idx_gi[1]] # kernel between each graph and g2

g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat_list = gk_iam_nearest_multi(Gn, [g1, g2], [alpha, 1 - alpha],
range(len(Gn), len(Gn) + 2), km,
k, r_max,gkernel)
dis_best.append(dhat)
g_best.append(ghat_list)
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))

def test_remove_bests(Gn, gkernel):
from gk_iam import gk_iam_nearest_multi
lmbda = 0.03 # termination probability
r_max = 10 # recursions
l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# remove the best 2 graphs.
del Gn[idx_gi[0]]
del Gn[idx_gi[1] - 1]
# del Gn[8]
Gn_mix = [g.copy() for g in Gn]
Gn_mix.append(g1.copy())
Gn_mix.append(g2.copy())

# compute
km = compute_kernel(Gn_mix, gkernel, True)
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat_list = gk_iam_nearest_multi(Gn, [g1, g2], [alpha, 1 - alpha],
range(len(Gn), len(Gn) + 2), km,
k, r_max, gkernel)
dis_best.append(dhat)
g_best.append(ghat_list)
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
draw_Letter_graph(g)
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
###############################################################################
# Tests on dataset Letter-H.
def test_gkiam_letter_h():
from gk_iam import gk_iam_nearest_multi
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gkernel = 'structuralspkernel'
lmbda = 0.03 # termination probability
r_max = 3 # recursions
# alpha_range = np.linspace(0.5, 0.5, 1)
k = 10 # k nearest neighbors
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
for letter in idx_dict:
print('\n-------------------------------------------------------\n')
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
Gn_mix = Gn_let + [g.copy() for g in Gn_let]
alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let,
Gn_let, [alpha] * len(Gn_let), range(len(Gn_let), len(Gn_mix)),
km, k, r_max, gkernel, c_ei=1.7, c_er=1.7, c_es=1.7,
ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter')
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
draw_Letter_graph(g, savepath='results/gk_iam/')
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# compute the corresponding sod in graph space. (alpha range not considered.)
sod_tmp, _ = ged_median(g_best[0], Gn_let, ged_cost='LETTER',
ged_method='IPFP', saveGXL='gedlib-letter')
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
sod_ks_min_list.append(sod_ks)
nb_updated_list.append(nb_updated)
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list)
print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list)
print('\nnumber of updates for each letter: ', nb_updated_list)
print('\ntimes:', time_list)

#def compute_letter_median_by_average(Gn):
# return g_median

def test_iam_letter_h():
from iam import test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
lmbda = 0.03 # termination probability
# alpha_range = np.linspace(0.5, 0.5, 1)
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_list = []
sod_min_list = []
for letter in idx_dict:
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
# compute
g_best = []
dis_best = []
time0 = time.time()
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
ghat_list, dhat = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
Gn_let, Gn_let, c_ei=1.7, c_er=1.7, c_es=1.7,
ged_cost='LETTER', ged_method='IPFP', saveGXL='gedlib-letter')
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
draw_Letter_graph(g, savepath='results/iam/')
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# compute the corresponding sod in kernel space. (alpha range not considered.)
gkernel = 'structuralspkernel'
sod_tmp = []
Gn_mix = g_best[0] + Gn_let
km = compute_kernel(Gn_mix, gkernel, True)
for ig, g in tqdm(enumerate(g_best[0]), desc='computing kernel sod', file=sys.stdout):
dtemp = dis_gstar(ig, range(len(g_best[0]), len(Gn_mix)),
[alpha_range[0]] * len(Gn_let), km, withterm3=False)
sod_tmp.append(dtemp)
sod_list.append(sod_tmp)
sod_min_list.append(np.min(sod_tmp))
print('\nsods in kernel space: ', sod_list)
print('\nsmallest sod in kernel space for each letter: ', sod_min_list)
print('\ntimes:', time_list)

def test_random_preimage_letter_h():
from preimage_random import preimage_random
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'extra_params': {}} # node symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gkernel = 'structuralspkernel'
# lmbda = 0.03 # termination probability
r_max = 3 # 10 # recursions
l = 500
# alpha_range = np.linspace(0.5, 0.5, 1)
#alpha_range = np.linspace(0.1, 0.9, 9)
k = 10 # 5 # k nearest neighbors
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_list = []
sod_min_list = []
for letter in idx_dict:
print('\n-------------------------------------------------------\n')
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
Gn_mix = Gn_let + [g.copy() for g in Gn_let]
alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat_list = preimage_random(Gn_let, Gn_let, [alpha] * len(Gn_let),
range(len(Gn_let), len(Gn_mix)), km,
k, r_max, gkernel, c_ei=1.7,
c_er=1.7, c_es=1.7)
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
draw_Letter_graph(g, savepath='results/gk_iam/')
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# compute the corresponding sod in graph space. (alpha range not considered.)
sod_tmp, _ = ged_median(g_best[0], Gn_let)
sod_list.append(sod_tmp)
sod_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_list)
print('\nsmallest sod in graph space for each letter: ', sod_min_list)
print('\ntimes:', time_list)

def test_gkiam_mutag():
from gk_iam import gk_iam_nearest_multi
ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
'extra_params': {}} # node nsymb
# ds = {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt',
# 'extra_params': {}} # node nsymb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
gkernel = 'structuralspkernel'
lmbda = 0.03 # termination probability
r_max = 3 # recursions
# alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
# classify graphs according to letters.
idx_dict = get_same_item_indices(y_all)
time_list = []
sod_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
for letter in idx_dict:
print('\n-------------------------------------------------------\n')
Gn_let = [Gn[i].copy() for i in idx_dict[letter]]
Gn_mix = Gn_let + [g.copy() for g in Gn_let]
alpha_range = np.linspace(1 / len(Gn_let), 1 / len(Gn_let), 1)
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn_let, Gn_let, [alpha] * len(Gn_let),
range(len(Gn_let), len(Gn_mix)), km,
k, r_max, gkernel, c_ei=1.7,
c_er=1.7, c_es=1.7)
dis_best.append(dhat)
g_best.append(ghat_list)
time_list.append(time.time() - time0)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_best[idx])
print('the corresponding pre-images are')
for g in g_best[idx]:
draw_Letter_graph(g, savepath='results/gk_iam/')
# nx.draw_networkx(g)
# plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# compute the corresponding sod in graph space. (alpha range not considered.)
sod_tmp, _ = ged_median(g_best[0], Gn_let)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
sod_ks_min_list.append(sod_ks)
nb_updated_list.append(nb_updated)
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each letter: ', sod_gs_min_list)
print('\nsmallest sod in kernel space for each letter: ', sod_ks_min_list)
print('\nnumber of updates for each letter: ', nb_updated_list)
print('\ntimes:', time_list)
###############################################################################
# Re-test.
def retest_the_simple_two():
from gk_iam import gk_iam_nearest_multi
# The two simple graphs.
# g1 = nx.Graph(name='haha')
# g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
# g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
# g2 = nx.Graph(name='hahaha')
# g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}),
# (3, {'atom': 'O'}), (4, {'atom': 'C'})])
# g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
# (2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
g1 = nx.Graph(name='haha')
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
(3, {'atom': 'S'}), (4, {'atom': 'S'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
g2 = nx.Graph(name='hahaha')
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'C'}),
(3, {'atom': 'O'}), (4, {'atom': 'O'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (2, 4, {'bond_type': '1'})])
# # randomly select two molecules
# np.random.seed(1)
# idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
# g1 = Gn[idx_gi[0]]
# g2 = Gn[idx_gi[1]]
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
Gn = [g1.copy(), g2.copy()]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # recursions
# l = 500
alpha_range = np.linspace(0.5, 0.5, 1)
k = 2 # k nearest neighbors
epsilon = 1e-6
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
c_ei=1
c_er=1
c_es=1
Gn_mix = Gn + [g1.copy(), g2.copy()]
# compute
time0 = time.time()
km = compute_kernel(Gn_mix, gkernel, True)
time_km = time.time() - time0

time_list = []
sod_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon,
ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
sod_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', sod_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG")
plt.show()
print(g_best[idx][0].nodes(data=True))
print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = ged_median(g_best[0], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest sod in kernel space for each alpha: ', sod_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)

if __name__ == '__main__':
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
# 'extra_params': {}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
# ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
# 'extra_params': {}} # node symb
# Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:20]
# import networkx.algorithms.isomorphism as iso
# G1 = nx.MultiDiGraph()
# G2 = nx.MultiDiGraph()
# G1.add_nodes_from([1,2,3], fill='red')
# G2.add_nodes_from([10,20,30,40], fill='red')
# nx.add_path(G1, [1,2,3,4], weight=3, linewidth=2.5)
# nx.add_path(G2, [10,20,30,40], weight=3)
# nm = iso.categorical_node_match('fill', 'red')
# print(nx.is_isomorphic(G1, G2, node_match=nm))
#
# test_new_IAM_allGraph_deleteNodes(Gn)
# test_will_IAM_give_the_median_graph_we_wanted(Gn)
# test_who_is_the_closest_in_GED_space(Gn)
# test_who_is_the_closest_in_kernel_space(Gn)
# test_the_simple_two(Gn, 'untilhpathkernel')
# test_remove_bests(Gn, 'untilhpathkernel')
# test_gkiam_letter_h()
# test_iam_letter_h()
# test_random_preimage_letter_h()
###############################################################################
# retests.
retest_the_simple_two()

+ 620
- 0
gklearn/preimage/test_preimage_iam.py

@@ -0,0 +1,620 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 5 15:59:00 2019

@author: ljia
"""

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
import random
#from tqdm import tqdm

from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.utils import remove_edges, compute_kernel, get_same_item_indices
from gklearn.preimage.ged import ged_median

from gklearn.preimage.preimage_iam import preimage_iam


###############################################################################
# tests on different values on grid of median-sets and k.

def test_preimage_iam_grid_k_median_nb():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 5 # iteration limit for pre-image.
# alpha_range = np.linspace(0.5, 0.5, 1)
# k = 5 # k nearest neighbors
epsilon = 1e-6
InitIAMWithAllDk = True
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
# number of graphs; we want to compute the median of these graphs.
nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
# number of nearest neighbors.
k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100]
# find out all the graphs classified to positive group 1.
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
nb_updated_k_list = []
g_best = []
for idx_nb, nb_median in enumerate(nb_median_range):
print('\n-------------------------------------------------------')
print('number of median graphs =', nb_median)
random.seed(1)
idx_rdm = random.sample(range(len(Gn)), nb_median)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn[idx].copy() for idx in idx_rdm]
# for g in Gn_median:
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
# plt.show()
# plt.clf()
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
km_tmp = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
km[i, j] = km_tmp[i, j]
km[j, i] = km[i, j]
for i in range(len(Gn)):
for j, idx in enumerate(idx_rdm):
km[i, len(Gn) + j] = km[i, idx]
km[len(Gn) + j, i] = km[i, idx]
for i, idx1 in enumerate(idx_rdm):
for j, idx2 in enumerate(idx_rdm):
km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
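# The three loops above rebuild the mixed Gram matrix without recomputing any
# kernel values: the first block copies the precomputed matrix of the dataset,
# the second duplicates the rows/columns of the chosen median graphs into the
# appended positions, and the third fills the block between the median graphs
# themselves.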
###################################################################
alpha_range = [1 / nb_median] * nb_median
time_list.append([])
dis_ks_min_list.append([])
sod_gs_list.append([])
sod_gs_min_list.append([])
nb_updated_list.append([])
nb_updated_k_list.append([])
g_best.append([])
for k in k_range:
print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
print('k =', k)
time0 = time.time()
dhat, ghat_list, dis_of_each_itr, nb_updated, nb_updated_k = \
preimage_iam(Gn, Gn_median,
alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max,
gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list[idx_nb].append(time_total)
print('\nsmallest distance in kernel space: ', dhat)
dis_ks_min_list[idx_nb].append(dhat)
g_best[idx_nb].append(ghat_list)
print('\nnumber of updates of the best graph by IAM: ', nb_updated)
nb_updated_list[idx_nb].append(nb_updated)
print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k)
nb_updated_k_list[idx_nb].append(nb_updated_k)
# show the best graph and save it to file.
print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
with_labels=True)
plt.savefig('results/preimage_iam/mutag_median_nb' + str(nb_median) +
'_k' + str(k) + '.png', format="PNG")
# plt.show()
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
# compute the corresponding sod in graph space.
sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list[idx_nb].append(sod_tmp)
sod_gs_min_list[idx_nb].append(np.min(sod_tmp))
print('\nsmallest sod in graph space: ', np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each set of median graphs and k: ',
sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs and k: ',
dis_ks_min_list)
print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ',
nb_updated_list)
print('\nnumber of updates of k nearest graphs for each set of median graphs and k by IAM: ',
nb_updated_k_list)
print('\ntimes:', time_list)


###############################################################################
# tests on different numbers of median-sets.

def test_preimage_iam_median_nb():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 3 # iteration limit for pre-image.
# alpha_range = np.linspace(0.5, 0.5, 1)
k = 5 # k nearest neighbors
epsilon = 1e-6
InitIAMWithAllDk = True
# parameters for IAM function
# c_vi = 0.037
# c_vr = 0.038
# c_vs = 0.075
# c_ei = 0.001
# c_er = 0.001
# c_es = 0.0
c_vi = 4
c_vr = 4
c_vs = 2
c_ei = 1
c_er = 1
c_es = 1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
# parameters for GED function
# ged_cost='CHEM_1'
ged_cost = 'CONSTANT'
ged_method = 'IPFP'
edit_cost_constant = [c_vi, c_vr, c_vs, c_ei, c_er, c_es]
ged_stabilizer = 'min'
ged_repeat = 50
params_ged = {'lib': 'gedlibpy', 'cost': ged_cost, 'method': ged_method,
'edit_cost_constant': edit_cost_constant,
'stabilizer': ged_stabilizer, 'repeat': ged_repeat}
# number of graphs; we want to compute the median of these graphs.
# nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
nb_median_range = [2]
# find out all the graphs classified to positive group 1.
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
nb_updated_k_list = []
g_best = []
for nb_median in nb_median_range:
print('\n-------------------------------------------------------')
print('number of median graphs =', nb_median)
random.seed(1)
idx_rdm = random.sample(range(len(Gn)), nb_median)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn[idx].copy() for idx in idx_rdm]
# for g in Gn_median:
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
# plt.show()
# plt.clf()
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
km_tmp = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
km[i, j] = km_tmp[i, j]
km[j, i] = km[i, j]
for i in range(len(Gn)):
for j, idx in enumerate(idx_rdm):
km[i, len(Gn) + j] = km[i, idx]
km[len(Gn) + j, i] = km[i, idx]
for i, idx1 in enumerate(idx_rdm):
for j, idx2 in enumerate(idx_rdm):
km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
###################################################################
alpha_range = [1 / nb_median] * nb_median
time0 = time.time()
dhat, ghat_list, dis_of_each_itr, nb_updated, nb_updated_k = \
preimage_iam(Gn, Gn_median,
alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max,
gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged=params_ged)
time_total = time.time() - time0 + time_km
print('\ntime: ', time_total)
time_list.append(time_total)
print('\nsmallest distance in kernel space: ', dhat)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
print('\nnumber of updates of the best graph: ', nb_updated)
nb_updated_list.append(nb_updated)
print('\nnumber of updates of k nearest graphs: ', nb_updated_k)
nb_updated_k_list.append(nb_updated_k)
# show the best graph and save it to file.
print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
with_labels=True)
plt.show()
# plt.savefig('results/preimage_iam/mutag_median_cs.001_nb' + str(nb_median) +
# '.png', format="PNG")
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
# compute the corresponding sod in graph space.
sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, params_ged=params_ged)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsmallest sod in graph space: ', np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs: ',
dis_ks_min_list)
print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
nb_updated_list)
print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
nb_updated_k_list)
print('\ntimes:', time_list)

###############################################################################
# test on the combination of the two randomly chosen graphs. (the same as in the
# random pre-image paper.)

def test_gkiam_2combination_all_pairs():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
alpha_range = np.linspace(0.5, 0.5, 1)
k = 5 # k nearest neighbors
epsilon = 1e-6
InitIAMWithAllDk = False
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
nb_update_mat = np.full((len(Gn), len(Gn)), np.inf)
# test on each pair of graphs.
# for idx1 in range(len(Gn) - 1, -1, -1):
# for idx2 in range(idx1, -1, -1):
for idx1 in range(187, 188):
for idx2 in range(167, 168):
g1 = Gn[idx1].copy()
g2 = Gn[idx2].copy()
# Gn[10] = []
# Gn[10] = []
nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
plt.savefig("results/gk_iam/all_pairs/mutag187.png", format="PNG")
plt.show()
plt.clf()
nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
plt.savefig("results/gk_iam/all_pairs/mutag167.png", format="PNG")
plt.show()
plt.clf()

###################################################################
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
#
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
#
# # write Gram matrix to file and read it.
# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
for i in range(len(Gn)):
km[i, len(Gn)] = km[i, idx1]
km[i, len(Gn) + 1] = km[i, idx2]
km[len(Gn), i] = km[i, idx1]
km[len(Gn) + 1, i] = km[i, idx2]
km[len(Gn), len(Gn)] = km[idx1, idx1]
km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
###################################################################
# # use only the two graphs in median set as candidates.
# Gn = [g1.copy(), g2.copy()]
# Gn_mix = Gn + [g1.copy(), g2.copy()]
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
nb_updated_k_list = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated, nb_updated_k = \
preimage_iam(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list.append(nb_updated)
nb_updated_k_list.append(nb_updated_k)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/gk_iam/mutag' + str(idx1) + '_' + str(idx2)
+ '_alpha' + str(item) + '.png', format="PNG")
# plt.show()
plt.clf()
# print(g_best[idx][0].nodes(data=True))
# print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = ged_median(g_best[idx], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates of the best graph for each alpha: ',
nb_updated_list)
print('\nnumber of updates of the k nearest graphs for each alpha: ',
nb_updated_k_list)
print('\ntimes:', time_list)
nb_update_mat[idx1, idx2] = nb_updated_list[0]
str_fw = 'graphs %d and %d: %d.\n' % (idx1, idx2, nb_updated_list[0])
with open('results/gk_iam/all_pairs/nb_updates.txt', 'r+') as file:
content = file.read()
file.seek(0, 0)
file.write(str_fw + content)

def test_gkiam_2combination():
from gk_iam import gk_iam_nearest_multi
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
epsilon = 1e-6
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
c_ei=1
c_er=1
c_es=1
# randomly select two molecules
np.random.seed(1)
idx_gi = [10, 11] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# Gn[10] = []
# Gn[10] = []
# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag10.png", format="PNG")
# plt.show()
# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag11.png", format="PNG")
# plt.show()
Gn_mix = [g.copy() for g in Gn]
Gn_mix.append(g1.copy())
Gn_mix.append(g2.copy())
# compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
# write Gram matrix to file and read it.
# np.savez('results/gram_matrix.gm', gm=km, gmtime=time_km)
gmfile = np.load('results/gram_matrix.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, sod_ks, nb_updated = gk_iam_nearest_multi(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
gkernel, c_ei=c_ei, c_er=c_er, c_es=c_es, epsilon=epsilon,
ged_cost=ged_cost, ged_method=ged_method, saveGXL=saveGXL)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/gk_iam/mutag_alpha' + str(item) + '.png', format="PNG")
plt.show()
print(g_best[idx][0].nodes(data=True))
print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = ged_median(g_best[idx], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)
###############################################################################

if __name__ == '__main__':
###############################################################################
# test on the combination of the two randomly chosen graphs. (the same as in the
# random pre-image paper.)
# test_gkiam_2combination()
# test_gkiam_2combination_all_pairs()
###############################################################################
# tests on different numbers of median-sets.
test_preimage_iam_median_nb()
###############################################################################
# tests on different values on grid of median-sets and k.
# test_preimage_iam_grid_k_median_nb()

+ 539
- 0
gklearn/preimage/test_preimage_mix.py

@@ -0,0 +1,539 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 5 15:59:00 2019

@author: ljia
"""

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
import random
#from tqdm import tqdm

from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.ged import ged_median
from gklearn.preimage.utils import compute_kernel, get_same_item_indices, remove_edges
from gklearn.preimage.preimage_iam import preimage_iam_random_mix

###############################################################################
# tests on different values on grid of median-sets and k.

def test_preimage_mix_grid_k_median_nb():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 5 # iteration limit for pre-image.
l_max = 500 # update limit for random generation
# alpha_range = np.linspace(0.5, 0.5, 1)
# k = 5 # k nearest neighbors
epsilon = 1e-6
InitIAMWithAllDk = True
InitRandomWithAllDk = True
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
# number of graphs; we want to compute the median of these graphs.
nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
# number of nearest neighbors.
k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100]
# find out all the graphs classified to positive group 1.
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list_iam = []
nb_updated_list_random = []
nb_updated_k_list_iam = []
nb_updated_k_list_random = []
g_best = []
for idx_nb, nb_median in enumerate(nb_median_range):
print('\n-------------------------------------------------------')
print('number of median graphs =', nb_median)
random.seed(1)
idx_rdm = random.sample(range(len(Gn)), nb_median)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn[idx].copy() for idx in idx_rdm]
# for g in Gn_median:
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
# plt.show()
# plt.clf()
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
km_tmp = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
km[i, j] = km_tmp[i, j]
km[j, i] = km[i, j]
for i in range(len(Gn)):
for j, idx in enumerate(idx_rdm):
km[i, len(Gn) + j] = km[i, idx]
km[len(Gn) + j, i] = km[i, idx]
for i, idx1 in enumerate(idx_rdm):
for j, idx2 in enumerate(idx_rdm):
km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
###################################################################
alpha_range = [1 / nb_median] * nb_median
time_list.append([])
dis_ks_min_list.append([])
sod_gs_list.append([])
sod_gs_min_list.append([])
nb_updated_list_iam.append([])
nb_updated_list_random.append([])
nb_updated_k_list_iam.append([])
nb_updated_k_list_random.append([])
g_best.append([])
for k in k_range:
print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
print('k =', k)
time0 = time.time()
dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \
nb_updated_k_iam, nb_updated_k_random = \
preimage_iam_random_mix(Gn, Gn_median,
alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max,
l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
InitRandomWithAllDk=InitRandomWithAllDk,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list[idx_nb].append(time_total)
print('\nsmallest distance in kernel space: ', dhat)
dis_ks_min_list[idx_nb].append(dhat)
g_best[idx_nb].append(ghat_list)
print('\nnumber of updates of the best graph by IAM: ', nb_updated_iam)
nb_updated_list_iam[idx_nb].append(nb_updated_iam)
print('\nnumber of updates of the best graph by random generation: ',
nb_updated_random)
nb_updated_list_random[idx_nb].append(nb_updated_random)
print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k_iam)
nb_updated_k_list_iam[idx_nb].append(nb_updated_k_iam)
print('\nnumber of updates of k nearest graphs by random generation: ',
nb_updated_k_random)
nb_updated_k_list_random[idx_nb].append(nb_updated_k_random)
# show the best graph and save it to file.
print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
with_labels=True)
plt.savefig('results/preimage_mix/mutag_median_nb' + str(nb_median) +
'_k' + str(k) + '.png', format="PNG")
# plt.show()
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
# compute the corresponding sod in graph space.
sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list[idx_nb].append(sod_tmp)
sod_gs_min_list[idx_nb].append(np.min(sod_tmp))
print('\nsmallest sod in graph space: ', np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each set of median graphs and k: ',
sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs and k: ',
dis_ks_min_list)
print('\nnumber of updates of the best graph for each set of median graphs and k by IAM: ',
nb_updated_list_iam)
print('\nnumber of updates of the best graph for each set of median graphs and k by random generation: ',
nb_updated_list_random)
print('\nnumber of updates of k nearest graphs for each set of median graphs and k by IAM: ',
nb_updated_k_list_iam)
print('\nnumber of updates of k nearest graphs for each set of median graphs and k by random generation: ',
nb_updated_k_list_random)
print('\ntimes:', time_list)


###############################################################################
# tests on different numbers of median-sets.

def test_preimage_mix_median_nb():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 5 # iteration limit for pre-image.
l_max = 500 # update limit for random generation
# alpha_range = np.linspace(0.5, 0.5, 1)
k = 5 # k nearest neighbors
epsilon = 1e-6
InitIAMWithAllDk = True
InitRandomWithAllDk = True
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
# number of graphs; we want to compute the median of these graphs.
nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
# find out all the graphs classified to positive group 1.
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list_iam = []
nb_updated_list_random = []
nb_updated_k_list_iam = []
nb_updated_k_list_random = []
g_best = []
for nb_median in nb_median_range:
print('\n-------------------------------------------------------')
print('number of median graphs =', nb_median)
random.seed(1)
idx_rdm = random.sample(range(len(Gn)), nb_median)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn[idx].copy() for idx in idx_rdm]
# for g in Gn_median:
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
# plt.show()
# plt.clf()
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
km_tmp = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
km[i, j] = km_tmp[i, j]
km[j, i] = km[i, j]
for i in range(len(Gn)):
for j, idx in enumerate(idx_rdm):
km[i, len(Gn) + j] = km[i, idx]
km[len(Gn) + j, i] = km[i, idx]
for i, idx1 in enumerate(idx_rdm):
for j, idx2 in enumerate(idx_rdm):
km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
###################################################################
alpha_range = [1 / nb_median] * nb_median
time0 = time.time()
dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \
nb_updated_k_iam, nb_updated_k_random = \
preimage_iam_random_mix(Gn, Gn_median,
alpha_range, range(len(Gn), len(Gn) + nb_median), km, k, r_max,
l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
InitRandomWithAllDk=InitRandomWithAllDk,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
print('\nsmallest distance in kernel space: ', dhat)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
print('\nnumber of updates of the best graph by IAM: ', nb_updated_iam)
nb_updated_list_iam.append(nb_updated_iam)
print('\nnumber of updates of the best graph by random generation: ',
nb_updated_random)
nb_updated_list_random.append(nb_updated_random)
print('\nnumber of updates of k nearest graphs by IAM: ', nb_updated_k_iam)
nb_updated_k_list_iam.append(nb_updated_k_iam)
print('\nnumber of updates of k nearest graphs by random generation: ',
nb_updated_k_random)
nb_updated_k_list_random.append(nb_updated_k_random)
# show the best graph and save it to file.
print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(ghat_list[0], labels=nx.get_node_attributes(ghat_list[0], 'atom'),
with_labels=True)
plt.savefig('results/preimage_mix/mutag_median_nb' + str(nb_median) +
'.png', format="PNG")
# plt.show()
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
# compute the corresponding sod in graph space.
sod_tmp, _ = ged_median([ghat_list[0]], Gn_median, ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsmallest sod in graph space: ', np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs: ',
dis_ks_min_list)
print('\nnumber of updates of the best graph for each set of median graphs by IAM: ',
nb_updated_list_iam)
print('\nnumber of updates of the best graph for each set of median graphs by random generation: ',
nb_updated_list_random)
print('\nnumber of updates of k nearest graphs for each set of median graphs by IAM: ',
nb_updated_k_list_iam)
print('\nnumber of updates of k nearest graphs for each set of median graphs by random generation: ',
nb_updated_k_list_random)
print('\ntimes:', time_list)

###############################################################################
# test on the combination of the two randomly chosen graphs. (the same as in the
# random pre-image paper.)

def test_preimage_mix_2combination_all_pairs():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
l_max = 500 # update limit for random generation
alpha_range = np.linspace(0.5, 0.5, 1)
k = 5 # k nearest neighbors
epsilon = 1e-6
InitIAMWithAllDk = True
InitRandomWithAllDk = True
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# parameters for IAM function
c_ei=1
c_er=1
c_es=1
ite_max_iam = 50
epsilon_iam = 0.001
removeNodes = True
connected_iam = False
nb_update_mat_iam = np.full((len(Gn), len(Gn)), np.inf)
nb_update_mat_random = np.full((len(Gn), len(Gn)), np.inf)
# test on each pair of graphs.
# for idx1 in range(len(Gn) - 1, -1, -1):
# for idx2 in range(idx1, -1, -1):
for idx1 in range(187, 188):
for idx2 in range(167, 168):
g1 = Gn[idx1].copy()
g2 = Gn[idx2].copy()
# Gn[10] = []
# Gn[10] = []
nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
plt.savefig("results/preimage_mix/mutag187.png", format="PNG")
plt.show()
plt.clf()
nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
plt.savefig("results/preimage_mix/mutag167.png", format="PNG")
plt.show()
plt.clf()

###################################################################
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
#
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
#
# # write Gram matrix to file and read it.
# np.savez('results/gram_matrix_uhpath_itr7_pq0.8.gm', gm=km, gmtime=time_km)
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
for i in range(len(Gn)):
km[i, len(Gn)] = km[i, idx1]
km[i, len(Gn) + 1] = km[i, idx2]
km[len(Gn), i] = km[i, idx1]
km[len(Gn) + 1, i] = km[i, idx2]
km[len(Gn), len(Gn)] = km[idx1, idx1]
km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
###################################################################
# # use only the two graphs in median set as candidates.
# Gn = [g1.copy(), g2.copy()]
# Gn_mix = Gn + [g1.copy(), g2.copy()]
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list_iam = []
nb_updated_list_random = []
nb_updated_k_list_iam = []
nb_updated_k_list_random = []
g_best = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat_list, dis_of_each_itr, nb_updated_iam, nb_updated_random, \
nb_updated_k_iam, nb_updated_k_random = \
preimage_iam_random_mix(Gn, [g1, g2],
[alpha, 1 - alpha], range(len(Gn), len(Gn) + 2), km, k, r_max,
l_max, gkernel, epsilon=epsilon, InitIAMWithAllDk=InitIAMWithAllDk,
InitRandomWithAllDk=InitRandomWithAllDk,
params_iam={'c_ei': c_ei, 'c_er': c_er, 'c_es': c_es,
'ite_max': ite_max_iam, 'epsilon': epsilon_iam,
'removeNodes': removeNodes, 'connected': connected_iam},
params_ged={'ged_cost': ged_cost, 'ged_method': ged_method,
'saveGXL': saveGXL})
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat_list)
nb_updated_list_iam.append(nb_updated_iam)
nb_updated_list_random.append(nb_updated_random)
nb_updated_k_list_iam.append(nb_updated_k_iam)
nb_updated_k_list_random.append(nb_updated_k_random)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx][0], labels=nx.get_node_attributes(g_best[idx][0], 'atom'),
with_labels=True)
plt.savefig('results/preimage_mix/mutag' + str(idx1) + '_' + str(idx2)
+ '_alpha' + str(item) + '.png', format="PNG")
# plt.show()
plt.clf()
# print(g_best[idx][0].nodes(data=True))
# print(g_best[idx][0].edges(data=True))
# for g in g_best[idx]:
# draw_Letter_graph(g, savepath='results/gk_iam/')
## nx.draw_networkx(g)
## plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
# compute the corresponding sod in graph space.
for idx, item in enumerate(alpha_range):
sod_tmp, _ = ged_median(g_best[idx], [g1, g2], ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates of the best graph for each alpha by IAM: ', nb_updated_list_iam)
print('\nnumber of updates of the best graph for each alpha by random generation: ',
nb_updated_list_random)
print('\nnumber of updates of k nearest graphs for each alpha by IAM: ',
nb_updated_k_list_iam)
print('\nnumber of updates of k nearest graphs for each alpha by random generation: ',
nb_updated_k_list_random)
print('\ntimes:', time_list)
nb_update_mat_iam[idx1, idx2] = nb_updated_list_iam[0]
nb_update_mat_random[idx1, idx2] = nb_updated_list_random[0]
str_fw = 'graphs %d and %d: %d times by IAM, %d times by random generation.\n' \
% (idx1, idx2, nb_updated_list_iam[0], nb_updated_list_random[0])
with open('results/preimage_mix/nb_updates.txt', 'r+') as file:
content = file.read()
file.seek(0, 0)
file.write(str_fw + content)
###############################################################################

if __name__ == '__main__':
###############################################################################
# test on the combination of the two randomly chosen graphs. (the same as in the
# random pre-image paper.)
# test_preimage_mix_2combination_all_pairs()
###############################################################################
# tests on different numbers of median-sets.
# test_preimage_mix_median_nb()
###############################################################################
# tests on different values on grid of median-sets and k.
test_preimage_mix_grid_k_median_nb()

+ 398
- 0
gklearn/preimage/test_preimage_random.py

@@ -0,0 +1,398 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 5 15:59:00 2019

@author: ljia
"""

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
import random
#from tqdm import tqdm

from gklearn.utils.graphfiles import loadDataset
from gklearn.preimage.preimage_random import preimage_random
from gklearn.preimage.ged import ged_median
from gklearn.preimage.utils import compute_kernel, get_same_item_indices, remove_edges


###############################################################################
# tests on different values on grid of median-sets and k.

def test_preimage_random_grid_k_median_nb():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 5 # iteration limit for pre-image.
l = 500 # update limit for random generation
# alpha_range = np.linspace(0.5, 0.5, 1)
# k = 5 # k nearest neighbors
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# number of graphs; we want to compute the median of these graphs.
nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
# number of nearest neighbors.
k_range = [5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100]
# find out all the graphs classified to positive group 1.
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
for idx_nb, nb_median in enumerate(nb_median_range):
print('\n-------------------------------------------------------')
print('number of median graphs =', nb_median)
random.seed(1)
idx_rdm = random.sample(range(len(Gn)), nb_median)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn[idx].copy() for idx in idx_rdm]
# for g in Gn_median:
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
# plt.show()
# plt.clf()
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
km_tmp = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
km[i, j] = km_tmp[i, j]
km[j, i] = km[i, j]
for i in range(len(Gn)):
for j, idx in enumerate(idx_rdm):
km[i, len(Gn) + j] = km[i, idx]
km[len(Gn) + j, i] = km[i, idx]
for i, idx1 in enumerate(idx_rdm):
for j, idx2 in enumerate(idx_rdm):
km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
###################################################################
alpha_range = [1 / nb_median] * nb_median
time_list.append([])
dis_ks_min_list.append([])
sod_gs_list.append([])
sod_gs_min_list.append([])
nb_updated_list.append([])
g_best.append([])
for k in k_range:
print('\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
print('k =', k)
time0 = time.time()
dhat, ghat, nb_updated = preimage_random(Gn, Gn_median, alpha_range,
range(len(Gn), len(Gn) + nb_median), km, k, r_max, l, gkernel)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list[idx_nb].append(time_total)
print('\nsmallest distance in kernel space: ', dhat)
dis_ks_min_list[idx_nb].append(dhat)
g_best[idx_nb].append(ghat)
print('\nnumber of updates of the best graph: ', nb_updated)
nb_updated_list[idx_nb].append(nb_updated)
# show the best graph and save it to file.
print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(ghat, labels=nx.get_node_attributes(ghat, 'atom'),
with_labels=True)
plt.savefig('results/preimage_random/mutag_median_nb' + str(nb_median) +
'_k' + str(k) + '.png', format="PNG")
# plt.show()
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
# compute the corresponding sod in graph space.
sod_tmp, _ = ged_median([ghat], Gn_median, ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list[idx_nb].append(sod_tmp)
sod_gs_min_list[idx_nb].append(np.min(sod_tmp))
print('\nsmallest sod in graph space: ', np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each set of median graphs and k: ',
sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs and k: ',
dis_ks_min_list)
print('\nnumber of updates of the best graph for each set of median graphs and k: ',
nb_updated_list)
print('\ntimes:', time_list)



###############################################################################
# tests on different numbers of median-sets.

def test_preimage_random_median_nb():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:50]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
lmbda = 0.03 # termination probability
r_max = 5 # iteration limit for pre-image.
l = 500 # update limit for random generation
# alpha_range = np.linspace(0.5, 0.5, 1)
k = 5 # k nearest neighbors
# parameters for GED function
ged_cost='CHEM_1'
ged_method='IPFP'
saveGXL='gedlib'
# number of graphs; we want to compute the median of these graphs.
nb_median_range = [2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
# find out all the graphs classified to positive group 1.
idx_dict = get_same_item_indices(y_all)
Gn = [Gn[i] for i in idx_dict[1]]
# # compute Gram matrix.
# time0 = time.time()
# km = compute_kernel(Gn, gkernel, True)
# time_km = time.time() - time0
# # write Gram matrix to file.
# np.savez('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm', gm=km, gmtime=time_km)
time_list = []
dis_ks_min_list = []
sod_gs_list = []
sod_gs_min_list = []
nb_updated_list = []
g_best = []
for nb_median in nb_median_range:
print('\n-------------------------------------------------------')
print('number of median graphs =', nb_median)
random.seed(1)
idx_rdm = random.sample(range(len(Gn)), nb_median)
print('graphs chosen:', idx_rdm)
Gn_median = [Gn[idx].copy() for idx in idx_rdm]
# for g in Gn_median:
# nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
## plt.savefig("results/preimage_mix/mutag.png", format="PNG")
# plt.show()
# plt.clf()
###################################################################
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03_mutag_positive.gm.npz')
km_tmp = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
km = np.zeros((len(Gn) + nb_median, len(Gn) + nb_median))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
km[i, j] = km_tmp[i, j]
km[j, i] = km[i, j]
for i in range(len(Gn)):
for j, idx in enumerate(idx_rdm):
km[i, len(Gn) + j] = km[i, idx]
km[len(Gn) + j, i] = km[i, idx]
for i, idx1 in enumerate(idx_rdm):
for j, idx2 in enumerate(idx_rdm):
km[len(Gn) + i, len(Gn) + j] = km[idx1, idx2]
###################################################################
alpha_range = [1 / nb_median] * nb_median
time0 = time.time()
dhat, ghat, nb_updated = preimage_random(Gn, Gn_median, alpha_range,
range(len(Gn), len(Gn) + nb_median), km, k, r_max, l, gkernel)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
print('\nsmallest distance in kernel space: ', dhat)
dis_ks_min_list.append(dhat)
g_best.append(ghat)
print('\nnumber of updates of the best graph: ', nb_updated)
nb_updated_list.append(nb_updated)
# show the best graph and save it to file.
print('the shortest distance is', dhat)
print('one of the possible corresponding pre-images is')
nx.draw(ghat, labels=nx.get_node_attributes(ghat, 'atom'),
with_labels=True)
plt.savefig('results/preimage_random/mutag_median_nb' + str(nb_median) +
'.png', format="PNG")
# plt.show()
plt.clf()
# print(ghat_list[0].nodes(data=True))
# print(ghat_list[0].edges(data=True))
# compute the corresponding sod in graph space.
sod_tmp, _ = ged_median([ghat], Gn_median, ged_cost=ged_cost,
ged_method=ged_method, saveGXL=saveGXL)
sod_gs_list.append(sod_tmp)
sod_gs_min_list.append(np.min(sod_tmp))
print('\nsmallest sod in graph space: ', np.min(sod_tmp))
print('\nsods in graph space: ', sod_gs_list)
print('\nsmallest sod in graph space for each set of median graphs: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each set of median graphs: ',
dis_ks_min_list)
print('\nnumber of updates of the best graph for each set of median graphs: ',
nb_updated_list)
print('\ntimes:', time_list)

###############################################################################
# test on the combination of the two randomly chosen graphs. (the same as in the
# random pre-image paper.)
def test_random_preimage_2combination():
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG_A.txt',
'extra_params': {}} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:12]
remove_edges(Gn)
gkernel = 'marginalizedkernel'
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, gkernel=gkernel)
# print(dis_max, dis_min, dis_mean)
lmbda = 0.03 # termination probability
r_max = 10 # iteration limit for pre-image.
l = 500
alpha_range = np.linspace(0, 1, 11)
k = 5 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx_gi = [187, 167] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# nx.draw(g1, labels=nx.get_node_attributes(g1, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag10.png", format="PNG")
# plt.show()
# nx.draw(g2, labels=nx.get_node_attributes(g2, 'atom'), with_labels=True)
# plt.savefig("results/random_preimage/mutag11.png", format="PNG")
# plt.show()
######################################################################
# Gn_mix = [g.copy() for g in Gn]
# Gn_mix.append(g1.copy())
# Gn_mix.append(g2.copy())
#
## g_tmp = iam([g1, g2])
## nx.draw_networkx(g_tmp)
## plt.show()
#
# # compute
# time0 = time.time()
# km = compute_kernel(Gn_mix, gkernel, True)
# time_km = time.time() - time0
###################################################################
idx1 = idx_gi[0]
idx2 = idx_gi[1]
gmfile = np.load('results/gram_matrix_marg_itr10_pq0.03.gm.npz')
km = gmfile['gm']
time_km = gmfile['gmtime']
# modify mixed gram matrix.
for i in range(len(Gn)):
km[i, len(Gn)] = km[i, idx1]
km[i, len(Gn) + 1] = km[i, idx2]
km[len(Gn), i] = km[i, idx1]
km[len(Gn) + 1, i] = km[i, idx2]
km[len(Gn), len(Gn)] = km[idx1, idx1]
km[len(Gn), len(Gn) + 1] = km[idx1, idx2]
km[len(Gn) + 1, len(Gn)] = km[idx2, idx1]
km[len(Gn) + 1, len(Gn) + 1] = km[idx2, idx2]
###################################################################

time_list = []
nb_updated_list = []
g_best = []
dis_ks_min_list = []
# for each alpha
for alpha in alpha_range:
print('\n-------------------------------------------------------\n')
print('alpha =', alpha)
time0 = time.time()
dhat, ghat, nb_updated = preimage_random(Gn, [g1, g2], [alpha, 1 - alpha],
range(len(Gn), len(Gn) + 2), km,
k, r_max, l, gkernel)
time_total = time.time() - time0 + time_km
print('time: ', time_total)
time_list.append(time_total)
dis_ks_min_list.append(dhat)
g_best.append(ghat)
nb_updated_list.append(nb_updated)
# show best graphs and save them to file.
for idx, item in enumerate(alpha_range):
print('when alpha is', item, 'the shortest distance is', dis_ks_min_list[idx])
print('one of the possible corresponding pre-images is')
nx.draw(g_best[idx], labels=nx.get_node_attributes(g_best[idx], 'atom'),
with_labels=True)
plt.savefig('results/random_preimage/mutag_alpha' + str(item) + '.png', format="PNG")
plt.show()
plt.clf()
print(g_best[idx].nodes(data=True))
print(g_best[idx].edges(data=True))
# # compute the corresponding sod in graph space. (alpha range not considered.)
# sod_tmp, _ = median_distance(g_best[0], Gn_let)
# sod_gs_list.append(sod_tmp)
# sod_gs_min_list.append(np.min(sod_tmp))
# sod_ks_min_list.append(sod_ks)
# nb_updated_list.append(nb_updated)
# print('\nsmallest sod in graph space for each alpha: ', sod_gs_min_list)
print('\nsmallest distance in kernel space for each alpha: ', dis_ks_min_list)
print('\nnumber of updates for each alpha: ', nb_updated_list)
print('\ntimes:', time_list)
###############################################################################

if __name__ == '__main__':
###############################################################################
# test on the combination of the two randomly chosen graphs. (the same as in the
# random pre-image paper.)
# test_random_preimage_2combination()
###############################################################################
# tests on different numbers of median-sets.
test_preimage_random_median_nb()
###############################################################################
# tests on different values on grid of median-sets and k.
# test_preimage_random_grid_k_median_nb()

+ 40
- 0
gklearn/preimage/timer.py

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 23 09:52:50 2020

@author: ljia
"""
import time

class Timer(object):
"""A timer class that can be used by methods that support time limits.
Note
----
This is the Python implementation of `the C++ code in GEDLIB <https://github.com/dbblumenthal/gedlib/blob/master/src/env/timer.hpp>`__.
"""
def __init__(self, time_limit_in_sec):
"""Constructs a timer for a given time limit.
Parameters
----------
time_limit_in_sec : float
The time limit in seconds.
"""
self.__time_limit_in_sec = time_limit_in_sec
self.__start_time = time.time()
def expired(self):
"""Checks if the time limit has expired.
Return
------
Boolean true if the time limit has expired and false otherwise.
"""
if self.__time_limit_in_sec > 0:
runtime = time.time() - self.__start_time
return runtime >= self.__time_limit_in_sec
return False
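# A minimal usage sketch (not part of the original module): a method that
# supports a time limit can poll expired() inside its main loop, e.g.
#
#     timer = Timer(60)  # allow at most 60 seconds
#     while not timer.expired():
#         pass  # perform one unit of work, then check the limit again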

+ 151
- 0
gklearn/preimage/utils.py

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 17 19:05:07 2019

Useful functions.
@author: ljia
"""
#import networkx as nx

import multiprocessing
import numpy as np

from gklearn.kernels.marginalizedKernel import marginalizedkernel
from gklearn.kernels.untilHPathKernel import untilhpathkernel
from gklearn.kernels.spKernel import spkernel
import functools
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel
from gklearn.kernels.structuralspKernel import structuralspkernel
from gklearn.kernels.treeletKernel import treeletkernel
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel


def remove_edges(Gn):
for G in Gn:
for _, _, attrs in G.edges(data=True):
attrs.clear()
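
# dis_gstar computes the distance in kernel space between graph idx_g and the
# weighted combination g* of the graphs indexed by idx_gi:
#     d(g, g*) = sqrt( k(g, g) - 2 * sum_i alpha_i * k(g, g_i)
#                      + sum_{i,j} alpha_i * alpha_j * k(g_i, g_j) )
# The last term can be supplied precomputed through `term3` (withterm3=True) so
# that it is not recomputed for every candidate graph.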
def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
term1 = Kmatrix[idx_g, idx_g]
term2 = 0
for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2
if withterm3 == False:
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3)

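# Illustrative sketch (not part of the original file): dis_gstar computes the
# kernel-space distance between graph g and the weighted combination
# g* = sum_i alpha_i * phi(g_i), i.e.
#     d(g, g*)^2 = K[g, g] - 2 * sum_i alpha_i * K[g, g_i]
#                  + sum_i sum_j alpha_i * alpha_j * K[g_i, g_j].
# A hand-built 2x2 Gram matrix gives a quick sanity check of that formula.
def _dis_gstar_sanity_check():
    K = np.array([[1.0, 0.5],
                  [0.5, 1.0]])
    # distance from graph 0 to the unweighted mean embedding of graphs {0, 1};
    # expected: sqrt(1 - 2 * (0.5 + 0.25) + 0.25 * (1 + 0.5 + 0.5 + 1)) = 0.5.
    d = dis_gstar(0, [0, 1], [0.5, 0.5], K, withterm3=False)
    assert abs(d - 0.5) < 1e-12
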

def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
p_quit=0.03, n_iteration=10, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
depth=7, k_func='MinMax', compute_method='trie',
parallel=parallel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix = np.empty((len(Gn), len(Gn)))
# Kmatrix[:] = np.nan
Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# for i, row in enumerate(idx):
# for j, col in enumerate(idx):
# Kmatrix[row, col] = Kmatrix_tmp[i, j]
elif graph_kernel == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
Kmatrix, _ = structuralspkernel(Gn, node_label=node_label,
edge_label=edge_label, node_kernels=sub_kernels,
edge_kernels=sub_kernels,
parallel=parallel, n_jobs=multiprocessing.cpu_count(),
verbose=verbose)
elif graph_kernel == 'treeletkernel':
pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
# pkernel = functools.partial(gaussiankernel, gamma=1e-6)
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
sub_kernel=pkernel, parallel=parallel,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'weisfeilerlehmankernel':
Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
height=4, base_kernel='subtree', parallel=None,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix

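# Illustrative usage sketch (not part of the original file): compute_kernel
# returns the normalized Gram matrix of a list of graphs for the requested
# kernel. The toy molecules below use the 'atom'/'bond_type' label names
# assumed throughout the pre-image experiments; they only demonstrate the
# call signature, not a meaningful dataset.
def _compute_kernel_example():
    import networkx as nx
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g1.add_edge(0, 1, bond_type='1')
    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'})])
    g2.add_edge(0, 1, bond_type='1')
    K = compute_kernel([g1, g2], 'untilhpathkernel', 'atom', 'bond_type', False)
    print(K)  # 2x2 matrix with ones on the diagonal after normalization
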
def gram2distances(Kmatrix):
dmatrix = np.zeros((len(Kmatrix), len(Kmatrix)))
for i1 in range(len(Kmatrix)):
for i2 in range(len(Kmatrix)):
dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
dmatrix = np.sqrt(dmatrix)
return dmatrix


def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None,
gkernel=None, verbose=True):
dis_mat = np.empty((len(Gn), len(Gn)))
if Kmatrix is None:
Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose)
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
if dis < 0:
if dis > -1e-10:
dis = 0
else:
raise ValueError('The distance is negative.')
dis_mat[i, j] = np.sqrt(dis)
dis_mat[j, i] = dis_mat[i, j]
dis_max = np.max(np.max(dis_mat))
dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
dis_mean = np.mean(np.mean(dis_mat))
return dis_mat, dis_max, dis_min, dis_mean

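# Illustrative sketch (not part of the original file): each entry of the
# returned distance matrix is the kernel-induced metric
#     d(i, j) = sqrt(K[i, i] + K[j, j] - 2 * K[i, j]),
# which is also what gram2distances computes, so the two agree on any valid
# Gram matrix.
def _kernel_distance_sanity_check():
    K = np.array([[1.0, 0.8, 0.3],
                  [0.8, 1.0, 0.5],
                  [0.3, 0.5, 1.0]])
    dis_mat, _, _, _ = kernel_distance_matrix([None] * 3, None, None, Kmatrix=K)
    assert np.allclose(dis_mat, gram2distances(K))
    assert abs(dis_mat[0, 1] - np.sqrt(1.0 + 1.0 - 2 * 0.8)) < 1e-12
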

def get_same_item_indices(ls):
"""Get the indices of the same items in a list. Return a dict keyed by items.
"""
idx_dict = {}
for idx, item in enumerate(ls):
if item in idx_dict:
idx_dict[item].append(idx)
else:
idx_dict[item] = [idx]
return idx_dict

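# Illustrative sketch (not part of the original file): get_same_item_indices
# groups positions by value, which is how the experiment scripts collect the
# graph indices belonging to each class label.
def _get_same_item_indices_example():
    y = ['A', 'B', 'A', 'C', 'B']
    assert get_same_item_indices(y) == {'A': [0, 2], 'B': [1, 4], 'C': [3]}
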

def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
                                                  node_label=None, edge_label=None):
    dis_k_all = []  # distance between the median g_star and each graph in Gn.
    alpha = [1 / len(Gn)] * len(Gn)
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
    idx_gi = list(range(len(Gn)))
    term3 = 0
    for i1, a1 in enumerate(alpha):
        for i2, a2 in enumerate(alpha):
            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
        dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
        dis_k_all.append(dtemp)
    # the k nearest neighbors can then be obtained by sorting dis_k_all.
    return dis_k_all


def normalize_distance_matrix(D):
max_value = np.amax(D)
min_value = np.amin(D)
return (D - min_value) / (max_value - min_value)
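
# Illustrative sketch (not part of the original file): min-max scaling of a
# distance matrix into [0, 1].
def _normalize_distance_matrix_example():
    D = np.array([[0.0, 2.0], [2.0, 4.0]])
    assert np.allclose(normalize_distance_matrix(D), [[0.0, 0.5], [0.5, 1.0]])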

+ 585
- 0
gklearn/preimage/visualization.py View File

@@ -0,0 +1,585 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 19 17:16:23 2019

@author: ljia
"""
import numpy as np
from sklearn.manifold import TSNE, Isomap
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset
from tqdm import tqdm

from gklearn.utils.graphfiles import loadDataset, loadGXL
from gklearn.preimage.utils import kernel_distance_matrix, compute_kernel, dis_gstar, get_same_item_indices


def visualize_graph_dataset(dis_measure, visual_method, draw_figure,
draw_params={}, dis_mat=None, Gn=None,
median_set=None):
def draw_zoomed_axes(Gn_embedded, ax):
margin = 0.01
if dis_measure == 'graph-kernel':
index = -2
elif dis_measure == 'ged':
index = -1
x1 = np.min(Gn_embedded[median_set + [index], 0]) - margin * np.max(Gn_embedded)
x2 = np.max(Gn_embedded[median_set + [index], 0]) + margin * np.max(Gn_embedded)
y1 = np.min(Gn_embedded[median_set + [index], 1]) - margin * np.max(Gn_embedded)
y2 = np.max(Gn_embedded[median_set + [index], 1]) + margin * np.max(Gn_embedded)
if (x1 < 0 and y1 < 0) or ((x1 > 0 and y1 > 0)):
loc = 2
else:
loc = 3
axins = zoomed_inset_axes(ax, 4, loc=loc) # zoom factor: 4; location chosen above from the quadrant
draw_figure(axins, Gn_embedded, dis_measure=dis_measure,
median_set=median_set, **draw_params)
axins.set_xlim(x1, x2) # apply the x-limits
axins.set_ylim(y1, y2) # apply the y-limits
plt.yticks(visible=False)
plt.xticks(visible=False)
loc1 = 1 if loc == 2 else 3
mark_inset(ax, axins, loc1=loc1, loc2=4, fc="none", ec="0.5")
if dis_mat is None:
if dis_measure == 'graph-kernel':
gkernel = 'untilhpathkernel'
node_label = 'atom'
edge_label = 'bond_type'
dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
Kmatrix=None, gkernel=gkernel)
elif dis_measure == 'ged':
pass
if visual_method == 'tsne':
Gn_embedded = TSNE(n_components=2, metric='precomputed').fit_transform(dis_mat)
elif visual_method == 'isomap':
Gn_embedded = Isomap(n_components=2, metric='precomputed').fit_transform(dis_mat)
print(Gn_embedded.shape)
fig, ax = plt.subplots()
draw_figure(plt, Gn_embedded, dis_measure=dis_measure, legend=True,
median_set=median_set, **draw_params)
# draw_zoomed_axes(Gn_embedded, ax)
plt.show()
plt.clf()
return


def draw_figure(ax, Gn_embedded, dis_measure=None, y_idx=None, legend=False,
median_set=None):
from matplotlib import colors as mcolors
colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS))
# colors = ['#08306b', '#08519c', '#2171b5', '#4292c6', '#6baed6', '#9ecae1',
# '#c6dbef', '#deebf7']
# for i, values in enumerate(y_idx.values()):
# for item in values:
## ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c=colors[i]) # , c='b')
# ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c='b')
# ax.scatter(Gn_embedded[:,0], Gn_embedded[:,1], c='b')
h1 = ax.scatter(Gn_embedded[median_set, 0], Gn_embedded[median_set, 1], c='b')
if dis_measure == 'graph-kernel':
h2 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='darkorchid') # \psi
h3 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='gold') # gen median
h4 = ax.scatter(Gn_embedded[-3, 0], Gn_embedded[-3, 1], c='r') #c='g', marker='+') # set median
elif dis_measure == 'ged':
h3 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='gold') # gen median
h4 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='r') #c='g', marker='+') # set median
if legend:
# fig.subplots_adjust(bottom=0.17)
if dis_measure == 'graph-kernel':
ax.legend([h1, h2, h3, h4],
['k closest graphs', 'true median', 'gen median', 'set median'])
elif dis_measure == 'ged':
ax.legend([h1, h3, h4], ['k closest graphs', 'gen median', 'set median'])
# fig.legend(handles, labels, loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6)
# plt.savefig('symbolic_and_non_comparison_vertical_short.eps', format='eps', dpi=300, transparent=True,
# bbox_inches='tight')
# plt.show()
###############################################################################
def visualize_distances_in_kernel():
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
fname_medians = 'expert.treelet'
# add set median.
fname_sm = 'results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl'
set_median = loadGXL(fname_sm)
Gn.append(set_median)
# add generalized median (estimated pre-image.)
fname_gm = 'results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl'
gen_median = loadGXL(fname_gm)
Gn.append(gen_median)
# compute distance matrix
median_set = [22, 29, 54, 74]
gkernel = 'treeletkernel'
node_label = 'atom'
edge_label = 'bond_type'
Gn_median_set = [Gn[i].copy() for i in median_set]
Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label,
edge_label, True)
Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
Kmatrix=Kmatrix, gkernel=gkernel)
print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
print('min distances: ', np.min(np.min(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
print('max distances: ', np.max(np.max(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))

# add distances for the image of exact median \psi.
dis_k_median_list = []
for idx, g in enumerate(Gn):
dis_k_median_list.append(dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)),
[1 / len(Gn_median_set)] * len(Gn_median_set),
Kmatrix_median, withterm3=False))
dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis_mat_median[i, j] = dis_mat[i, j]
dis_mat_median[j, i] = dis_mat_median[i, j]
for i in range(len(Gn)):
dis_mat_median[i, -1] = dis_k_median_list[i]
dis_mat_median[-1, i] = dis_k_median_list[i]
# get indices by classes.
y_idx = get_same_item_indices(y_all)
# visualization.
# visualize_graph_dataset('graph-kernel', 'tsne', Gn)
# visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
# draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median,
median_set=median_set)
def visualize_distances_in_ged():
from gklearn.preimage.fitDistance import compute_geds
from gklearn.preimage.ged import GED
ds = {'name': 'monoterpenoides',
'dataset': '../datasets/monoterpenoides/dataset_10+.ds'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
# add set median.
fname_medians = 'expert.treelet'
fname_sm = 'preimage/results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl'
set_median = loadGXL(fname_sm)
Gn.append(set_median)
# add generalized median (estimated pre-image.)
fname_gm = 'preimage/results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl'
gen_median = loadGXL(fname_gm)
Gn.append(gen_median)
# compute/load ged matrix.
# # compute.
## k = 4
## edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
# edit_costs = [3, 3, 1, 3, 3, 1]
## edit_costs = [7, 3, 5, 9, 2, 6]
# algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
# params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
# 'algo_options': algo_options, 'stabilizer': None,
# 'edit_cost_constant': edit_costs}
# _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
# np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm', ged_mat=ged_mat)
# load from file.
gmfile = np.load('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm.npz')
ged_mat = gmfile['ged_mat']
# # change medians.
# edit_costs = [3, 3, 1, 3, 3, 1]
# algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
# params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
# 'algo_options': algo_options, 'stabilizer': None,
# 'edit_cost_constant': edit_costs}
# for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
# dis, _, _ = GED(Gn[idx], set_median, **params_ged)
# ged_mat[idx, -2] = dis
# ged_mat[-2, idx] = dis
# dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
# ged_mat[idx, -1] = dis
# ged_mat[-1, idx] = dis
# np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm',
# ged_mat=ged_mat)

# get indices by classes.
y_idx = get_same_item_indices(y_all)

# visualization.
median_set = [22, 29, 54, 74]
visualize_graph_dataset('ged', 'tsne', draw_figure,
draw_params={'y_idx': y_idx}, dis_mat=ged_mat,
median_set=median_set)
###############################################################################
def visualize_distances_in_kernel_monoterpenoides():
import os

ds = {'dataset': '../datasets/monoterpenoides/dataset_10+.ds',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb
Gn_original, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
# compute distance matrix
# median_set = [22, 29, 54, 74]
gkernel = 'treeletkernel'
fit_method = 'expert'
node_label = 'atom'
edge_label = 'bond_type'
ds_name = 'monoterpenoides'
fname_medians = fit_method + '.' + gkernel
dir_output = 'results/xp_monoterpenoides/'
repeat = 0
# get indices by classes.
y_idx = get_same_item_indices(y_all)
for i, (y, values) in enumerate(y_idx.items()):
print('\ny =', y)
k = len(values)
Gn = [Gn_original[g].copy() for g in values]
# add set median.
fname_sm = dir_output + 'medians/' + str(int(y)) + '/set_median.k' + str(int(k)) \
+ '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
set_median = loadGXL(fname_sm)
Gn.append(set_median)
# add generalized median (estimated pre-image.)
fname_gm = dir_output + 'medians/' + str(int(y)) + '/gen_median.k' + str(int(k)) \
+ '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
gen_median = loadGXL(fname_gm)
Gn.append(gen_median)
# compute distance matrix
median_set = range(0, len(values))
Gn_median_set = [Gn[i].copy() for i in median_set]
Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label,
edge_label, False)
Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
Kmatrix=Kmatrix, gkernel=gkernel)
print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
print('min distances: ', np.min(np.min(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
print('max distances: ', np.max(np.max(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))

# add distances for the image of exact median \psi.
dis_k_median_list = []
for idx, g in enumerate(Gn):
dis_k_median_list.append(dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)),
[1 / len(Gn_median_set)] * len(Gn_median_set),
Kmatrix_median, withterm3=False))
dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis_mat_median[i, j] = dis_mat[i, j]
dis_mat_median[j, i] = dis_mat_median[i, j]
for i in range(len(Gn)):
dis_mat_median[i, -1] = dis_k_median_list[i]
dis_mat_median[-1, i] = dis_k_median_list[i]
# visualization.
# visualize_graph_dataset('graph-kernel', 'tsne', Gn)
# visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
# draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median,
median_set=median_set)
def visualize_distances_in_ged_monoterpenoides():
from gklearn.preimage.fitDistance import compute_geds
from gklearn.preimage.ged import GED
import os
ds = {'dataset': '../datasets/monoterpenoides/dataset_10+.ds',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb
Gn_original, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
# compute distance matrix
# median_set = [22, 29, 54, 74]
gkernel = 'treeletkernel'
fit_method = 'expert'
ds_name = 'monoterpenoides'
fname_medians = fit_method + '.' + gkernel
dir_output = 'results/xp_monoterpenoides/'
repeat = 0
# edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
edit_costs = [3, 3, 1, 3, 3, 1]
# edit_costs = [7, 3, 5, 9, 2, 6]
# get indices by classes.
y_idx = get_same_item_indices(y_all)
for i, (y, values) in enumerate(y_idx.items()):
print('\ny =', y)
k = len(values)
Gn = [Gn_original[g].copy() for g in values]
# add set median.
fname_sm = dir_output + 'medians/' + str(int(y)) + '/set_median.k' + str(int(k)) \
+ '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
set_median = loadGXL(fname_sm)
Gn.append(set_median)
# add generalized median (estimated pre-image.)
fname_gm = dir_output + 'medians/' + str(int(y)) + '/gen_median.k' + str(int(k)) \
+ '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
gen_median = loadGXL(fname_gm)
Gn.append(gen_median)
# compute/load ged matrix.
# compute.
algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'dataset': ds_name, 'lib': 'gedlibpy', 'cost': 'CONSTANT',
'method': 'IPFP', 'algo_options': algo_options,
'stabilizer': None, 'edit_cost_constant': edit_costs}
_, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + str(int(y)) \
+ '.with_medians.gm', ged_mat=ged_mat)
# # load from file.
# gmfile = np.load(dir_output + 'ged_mat.' + fname_medians + '.y' + str(int(y)) + '.with_medians.gm.npz')
# ged_mat = gmfile['ged_mat']
# # change medians.
# algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
# params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
# 'algo_options': algo_options, 'stabilizer': None,
# 'edit_cost_constant': edit_costs}
# for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
# dis, _, _ = GED(Gn[idx], set_median, **params_ged)
# ged_mat[idx, -2] = dis
# ged_mat[-2, idx] = dis
# dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
# ged_mat[idx, -1] = dis
# ged_mat[-1, idx] = dis
# np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + str(int(y)) + '.with_medians.gm',
# ged_mat=ged_mat)

# visualization.
median_set = range(0, len(values))
visualize_graph_dataset('ged', 'tsne', draw_figure,
draw_params={'y_idx': y_idx}, dis_mat=ged_mat,
median_set=median_set)
###############################################################################
def visualize_distances_in_kernel_letter_h():
import os
ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
Gn_original, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
# Gn = Gn[0:50]
# compute distance matrix
# median_set = [22, 29, 54, 74]
gkernel = 'structuralspkernel'
fit_method = 'expert'
node_label = None
edge_label = None
ds_name = 'letter-h'
fname_medians = fit_method + '.' + gkernel
dir_output = 'results/xp_letter_h/'
k = 150
repeat = 0
# get indices by classes.
y_idx = get_same_item_indices(y_all)
for i, (y, values) in enumerate(y_idx.items()):
print('\ny =', y)
Gn = [Gn_original[g].copy() for g in values]
# add set median.
fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
+ '.y' + y + '.repeat' + str(repeat) + '.gxl'
set_median = loadGXL(fname_sm)
Gn.append(set_median)
# add generalized median (estimated pre-image.)
fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
+ '.y' + y + '.repeat' + str(repeat) + '.gxl'
gen_median = loadGXL(fname_gm)
Gn.append(gen_median)
# compute distance matrix
median_set = range(0, len(values))
Gn_median_set = [Gn[i].copy() for i in median_set]
Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label,
edge_label, False)
Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
Kmatrix=Kmatrix, gkernel=gkernel)
print('average distances: ', np.mean(np.mean(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
print('min distances: ', np.min(np.min(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))
print('max distances: ', np.max(np.max(dis_mat[0:len(Gn)-2, 0:len(Gn)-2])))

# add distances for the image of exact median \psi.
dis_k_median_list = []
for idx, g in enumerate(Gn):
dis_k_median_list.append(dis_gstar(idx, range(len(Gn), len(Gn) + len(Gn_median_set)),
[1 / len(Gn_median_set)] * len(Gn_median_set),
Kmatrix_median, withterm3=False))
dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
for i in range(len(Gn)):
for j in range(i, len(Gn)):
dis_mat_median[i, j] = dis_mat[i, j]
dis_mat_median[j, i] = dis_mat_median[i, j]
for i in range(len(Gn)):
dis_mat_median[i, -1] = dis_k_median_list[i]
dis_mat_median[-1, i] = dis_k_median_list[i]
# visualization.
# visualize_graph_dataset('graph-kernel', 'tsne', Gn)
# visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
# draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median,
median_set=median_set)
def visualize_distances_in_ged_letter_h():
from gklearn.preimage.fitDistance import compute_geds
from gklearn.preimage.test_k_closest_graphs import reform_attributes
import os
ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
Gn_original, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
# Gn = Gn[0:50]
# compute distance matrix
# median_set = [22, 29, 54, 74]
gkernel = 'structuralspkernel'
fit_method = 'expert'
ds_name = 'letter-h'
fname_medians = fit_method + '.' + gkernel
dir_output = 'results/xp_letter_h/'
k = 150
repeat = 0
# edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
edit_costs = [3, 3, 1, 3, 3, 1]
# edit_costs = [7, 3, 5, 9, 2, 6]
# get indices by classes.
y_idx = get_same_item_indices(y_all)
for i, (y, values) in enumerate(y_idx.items()):
print('\ny =', y)
Gn = [Gn_original[g].copy() for g in values]
# add set median.
fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
+ '.y' + y + '.repeat' + str(repeat) + '.gxl'
set_median = loadGXL(fname_sm)
Gn.append(set_median)
# add generalized median (estimated pre-image.)
fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
+ '.y' + y + '.repeat' + str(repeat) + '.gxl'
gen_median = loadGXL(fname_gm)
Gn.append(gen_median)
# compute/load ged matrix.
# compute.
algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
params_ged = {'dataset': 'Letter', 'lib': 'gedlibpy', 'cost': 'CONSTANT',
'method': 'IPFP', 'algo_options': algo_options,
'stabilizer': None, 'edit_cost_constant': edit_costs}
for g in Gn:
reform_attributes(g)
_, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm', ged_mat=ged_mat)
# # load from file.
# gmfile = np.load(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm.npz')
# ged_mat = gmfile['ged_mat']
# # change medians.
# algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
# params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
# 'algo_options': algo_options, 'stabilizer': None,
# 'edit_cost_constant': edit_costs}
# for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
# dis, _, _ = GED(Gn[idx], set_median, **params_ged)
# ged_mat[idx, -2] = dis
# ged_mat[-2, idx] = dis
# dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
# ged_mat[idx, -1] = dis
# ged_mat[-1, idx] = dis
# np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm',
# ged_mat=ged_mat)

# visualization.
median_set = range(0, len(values))
visualize_graph_dataset('ged', 'tsne', draw_figure,
draw_params={'y_idx': y_idx}, dis_mat=ged_mat,
median_set=median_set)


if __name__ == '__main__':
visualize_distances_in_kernel_letter_h()
# visualize_distances_in_ged_letter_h()
# visualize_distances_in_kernel_monoterpenoides()
# visualize_distances_in_ged_monoterpenoides()
# visualize_distances_in_kernel()
# visualize_distances_in_ged()
#def draw_figure_dis_k(ax, Gn_embedded, y_idx=None, legend=False):
# from matplotlib import colors as mcolors
# colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS))
## colors = ['#08306b', '#08519c', '#2171b5', '#4292c6', '#6baed6', '#9ecae1',
## '#c6dbef', '#deebf7']
# for i, values in enumerate(y_idx.values()):
# for item in values:
## ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c=colors[i]) # , c='b')
# ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c='b')
# h1 = ax.scatter(Gn_embedded[[12, 13, 22, 29], 0], Gn_embedded[[12, 13, 22, 29], 1], c='r')
# h2 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='darkorchid') # \psi
# h3 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='gold') # gen median
# h4 = ax.scatter(Gn_embedded[-3, 0], Gn_embedded[-3, 1], c='r', marker='+') # set median
# if legend:
## fig.subplots_adjust(bottom=0.17)
# ax.legend([h1, h2, h3, h4], ['k closest graphs', 'true median', 'gen median', 'set median'])
## fig.legend(handles, labels, loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6)
## plt.savefig('symbolic_and_non_comparison_vertical_short.eps', format='eps', dpi=300, transparent=True,
## bbox_inches='tight')
## plt.show()
#def draw_figure_ged(ax, Gn_embedded, y_idx=None, legend=False):
# from matplotlib import colors as mcolors
# colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS))
## colors = ['#08306b', '#08519c', '#2171b5', '#4292c6', '#6baed6', '#9ecae1',
## '#c6dbef', '#deebf7']
# for i, values in enumerate(y_idx.values()):
# for item in values:
## ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c=colors[i]) # , c='b')
# ax.scatter(Gn_embedded[item,0], Gn_embedded[item,1], c='b')
# h1 = ax.scatter(Gn_embedded[[12, 13, 22, 29], 0], Gn_embedded[[12, 13, 22, 29], 1], c='r')
## h2 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='darkorchid') # \psi
# h3 = ax.scatter(Gn_embedded[-1, 0], Gn_embedded[-1, 1], c='gold') # gen median
# h4 = ax.scatter(Gn_embedded[-2, 0], Gn_embedded[-2, 1], c='r', marker='+') # set median
# if legend:
## fig.subplots_adjust(bottom=0.17)
# ax.legend([h1, h3, h4], ['k closest graphs', 'gen median', 'set median'])
## fig.legend(handles, labels, loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6)
## plt.savefig('symbolic_and_non_comparison_vertical_short.eps', format='eps', dpi=300, transparent=True,
## bbox_inches='tight')
## plt.show()

+ 935
- 0
gklearn/preimage/xp_fit_method.py View File

@@ -0,0 +1,935 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 14 15:39:29 2020

@author: ljia
"""
import numpy as np
import random
import csv
from shutil import copyfile
import networkx as nx
import matplotlib.pyplot as plt
import os
import time

from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL
from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
from gklearn.preimage.utils import get_same_item_indices, kernel_distance_matrix, compute_kernel
from gklearn.preimage.find_best_k import getRelations


def get_dataset(ds_name):
if ds_name == 'Letter-high': # node non-symb
dataset = 'cpp_ext/data/collections/Letter.xml'
graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'
Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
for G in Gn:
reform_attributes(G, na_names=['x', 'y'])
G.graph['node_labels'] = []
G.graph['edge_labels'] = []
G.graph['node_attrs'] = ['x', 'y']
G.graph['edge_attrs'] = []
elif ds_name == 'Letter-med': # node non-symb
dataset = 'cpp_ext/data/collections/Letter.xml'
graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/MED/'
Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
for G in Gn:
reform_attributes(G, na_names=['x', 'y'])
G.graph['node_labels'] = []
G.graph['edge_labels'] = []
G.graph['node_attrs'] = ['x', 'y']
G.graph['edge_attrs'] = []
elif ds_name == 'Letter-low': # node non-symb
dataset = 'cpp_ext/data/collections/Letter.xml'
graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/LOW/'
Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
for G in Gn:
reform_attributes(G, na_names=['x', 'y'])
G.graph['node_labels'] = []
G.graph['edge_labels'] = []
G.graph['node_attrs'] = ['x', 'y']
G.graph['edge_attrs'] = []
elif ds_name == 'Fingerprint':
# dataset = 'cpp_ext/data/collections/Fingerprint.xml'
# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/Fingerprint/node_attrs/'
# Gn, y_all = loadDataset(dataset, extra_params=graph_dir)
# for G in Gn:
# reform_attributes(G)
dataset = '../../datasets/Fingerprint/Fingerprint_A.txt'
graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/Fingerprint/node_attrs/'
Gn, y_all = loadDataset(dataset)
elif ds_name == 'SYNTHETIC':
pass
elif ds_name == 'SYNTHETICnew':
dataset = '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/SYNTHETICnew'
# dataset = '../../datasets/Letter-high/Letter-high_A.txt'
# graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'
Gn, y_all = loadDataset(dataset)
elif ds_name == 'Synthie':
pass
elif ds_name == 'COIL-DEL':
dataset = '../../datasets/COIL-DEL/COIL-DEL_A.txt'
graph_dir = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/generated_datsets/COIL-DEL/'
Gn, y_all = loadDataset(dataset)
elif ds_name == 'COIL-RAG':
pass
elif ds_name == 'COLORS-3':
pass
elif ds_name == 'FRANKENSTEIN':
pass
return Gn, y_all, graph_dir


def init_output_file(ds_name, gkernel, fit_method, dir_output):
# fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'k',
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'fitting time', 'generating time', 'total time',
'median set'])
f_detail.close()
# fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'k',
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'fitting time', 'generating time', 'total time',
'# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
'repeats better dis_k gi -> GM'])
f_summary.close()
return fn_output_detail, fn_output_summary


def xp_fit_method_for_non_symbolic(parameters, save_results=True, initial_solutions=1,
Gn_data=None, k_dis_data=None, Kmatrix=None,
is_separate=False):
# 1. set parameters.
print('1. setting parameters...')
ds_name = parameters['ds_name']
gkernel = parameters['gkernel']
edit_cost_name = parameters['edit_cost_name']
ged_method = parameters['ged_method']
attr_distance = parameters['attr_distance']
fit_method = parameters['fit_method']
init_ecc = parameters['init_ecc']

node_label = None
edge_label = None
dir_output = 'results/xp_fit_method/'
# 2. get dataset.
print('2. getting dataset...')
if Gn_data is None:
Gn, y_all, graph_dir = get_dataset(ds_name)
else:
Gn = Gn_data[0]
y_all = Gn_data[1]
graph_dir = Gn_data[2]
# 3. compute kernel distance matrix.
print('3. computing kernel distance matrix...')
if k_dis_data is None:
dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None,
None, Kmatrix=Kmatrix, gkernel=gkernel)
else:
# dis_mat = k_dis_data[0]
# dis_max = k_dis_data[1]
# dis_min = k_dis_data[2]
# dis_mean = k_dis_data[3]
# print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min, dis_mean)
pass


if save_results:
# create result files.
print('creating output files...')
fn_output_detail, fn_output_summary = init_output_file(ds_name, gkernel,
fit_method, dir_output)

# start repeats.
repeats = 1
# k_list = range(2, 11)
k_list = [0]
# get indices by classes.
y_idx = get_same_item_indices(y_all)
random.seed(1)
rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
for k in k_list:
# print('\n--------- k =', k, '----------')
sod_sm_mean_list = []
sod_gm_mean_list = []
dis_k_sm_mean_list = []
dis_k_gm_mean_list = []
dis_k_gi_min_mean_list = []
time_fitting_mean_list = []
time_generating_mean_list = []
time_total_mean_list = []
# 4. start generating and computing over targets.
print('4. generating and computing over targets...')
for i, (y, values) in enumerate(y_idx.items()):
# y = 'I'
# values = y_idx[y]
# values = values[0:10]
print('\ny =', y)
# if y.strip() == 'A':
# continue
k = len(values)
print('\n--------- k =', k, '----------')
if k < 2:
print('\nk = ', k, ', skip.\n')
continue
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
time_fitting_list = []
time_generating_list = []
time_total_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
nb_dis_k_gi2gm = [0, 0, 0]
repeats_better_sod_sm2gm = []
repeats_better_dis_k_sm2gm = []
repeats_better_dis_k_gi2sm = []
repeats_better_dis_k_gi2gm = []
# get Gram matrix for this part of data.
if Kmatrix is not None:
if is_separate:
Kmatrix_sub = Kmatrix[i].copy()
else:
Kmatrix_sub = Kmatrix[values,:]
Kmatrix_sub = Kmatrix_sub[:,values]
else:
Kmatrix_sub = None
for repeat in range(repeats):
print('\nrepeat =', repeat)
random.seed(rdn_seed_list[repeat])
median_set_idx_idx = random.sample(range(0, len(values)), k)
median_set_idx = [values[idx] for idx in median_set_idx_idx]
print('median set: ', median_set_idx)
Gn_median = [Gn[g] for g in values]
# from notebooks.utils.plot_all_graphs import draw_Fingerprint_graph
# for Gn in Gn_median:
# draw_Fingerprint_graph(Gn, save=None)
# GENERATING & COMPUTING!!
res_sods, res_dis_ks, res_times = median_on_k_closest_graphs(Gn_median,
node_label, edge_label,
gkernel, k, fit_method=fit_method, graph_dir=graph_dir,
edit_cost_constants=None, group_min=median_set_idx_idx,
dataset=ds_name, initial_solutions=initial_solutions,
edit_cost_name=edit_cost_name, init_ecc=init_ecc,
Kmatrix=Kmatrix_sub, parallel=False)
sod_sm = res_sods[0]
sod_gm = res_sods[1]
dis_k_sm = res_dis_ks[0]
dis_k_gm = res_dis_ks[1]
dis_k_gi = res_dis_ks[2]
dis_k_gi_min = res_dis_ks[3]
idx_dis_k_gi_min = res_dis_ks[4]
time_fitting = res_times[0]
time_generating = res_times[1]
# write result detail.
sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
if save_results:
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, gkernel,
edit_cost_name, ged_method, attr_distance,
fit_method, k, y, repeat,
sod_sm, sod_gm, dis_k_sm, dis_k_gm,
dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
dis_k_gi2gm, time_fitting, time_generating,
time_fitting + time_generating, median_set_idx])
f_detail.close()
# compute result summary.
sod_sm_list.append(sod_sm)
sod_gm_list.append(sod_gm)
dis_k_sm_list.append(dis_k_sm)
dis_k_gm_list.append(dis_k_gm)
dis_k_gi_min_list.append(dis_k_gi_min)
time_fitting_list.append(time_fitting)
time_generating_list.append(time_generating)
time_total_list.append(time_fitting + time_generating)
# # SOD SM -> GM
if sod_sm > sod_gm:
nb_sod_sm2gm[0] += 1
repeats_better_sod_sm2gm.append(repeat)
elif sod_sm == sod_gm:
nb_sod_sm2gm[1] += 1
elif sod_sm < sod_gm:
nb_sod_sm2gm[2] += 1
# # dis_k SM -> GM
if dis_k_sm > dis_k_gm:
nb_dis_k_sm2gm[0] += 1
repeats_better_dis_k_sm2gm.append(repeat)
elif dis_k_sm == dis_k_gm:
nb_dis_k_sm2gm[1] += 1
elif dis_k_sm < dis_k_gm:
nb_dis_k_sm2gm[2] += 1
# # dis_k gi -> SM
if dis_k_gi_min > dis_k_sm:
nb_dis_k_gi2sm[0] += 1
repeats_better_dis_k_gi2sm.append(repeat)
elif dis_k_gi_min == dis_k_sm:
nb_dis_k_gi2sm[1] += 1
elif dis_k_gi_min < dis_k_sm:
nb_dis_k_gi2sm[2] += 1
# # dis_k gi -> GM
if dis_k_gi_min > dis_k_gm:
nb_dis_k_gi2gm[0] += 1
repeats_better_dis_k_gi2gm.append(repeat)
elif dis_k_gi_min == dis_k_gm:
nb_dis_k_gi2gm[1] += 1
elif dis_k_gi_min < dis_k_gm:
nb_dis_k_gi2gm[2] += 1
# save median graphs.
fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
+ '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat)
copyfile(fname_sm, fn_pre_sm_new + '.gxl')
fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
+ '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat)
copyfile(fname_gm, fn_pre_gm_new + '.gxl')
G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
# reform_attributes(G_best_kernel)
fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
+ '.k' + str(int(k)) + '.y' + str(y) + '.repeat' + str(repeat)
saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='default')
# plot median graphs.
if ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low':
set_median = loadGXL(fn_pre_sm_new + '.gxl')
gen_median = loadGXL(fn_pre_gm_new + '.gxl')
draw_Letter_graph(set_median, fn_pre_sm_new)
draw_Letter_graph(gen_median, fn_pre_gm_new)
draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
# write result summary for each letter.
sod_sm_mean_list.append(np.mean(sod_sm_list))
sod_gm_mean_list.append(np.mean(sod_gm_list))
dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
time_fitting_mean_list.append(np.mean(time_fitting_list))
time_generating_mean_list.append(np.mean(time_generating_list))
time_total_mean_list.append(np.mean(time_total_list))
sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
if save_results:
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, gkernel,
edit_cost_name, ged_method, attr_distance,
fit_method, k, y,
sod_sm_mean_list[-1], sod_gm_mean_list[-1],
dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean,
time_fitting_mean_list[-1], time_generating_mean_list[-1],
time_total_mean_list[-1], nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
f_summary.close()

# write the overall result summary over all classes.
sod_sm_mean = np.mean(sod_sm_mean_list)
sod_gm_mean = np.mean(sod_gm_mean_list)
dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
time_fitting_mean = np.mean(time_fitting_mean_list)
time_generating_mean = np.mean(time_generating_mean_list)
time_total_mean = np.mean(time_total_mean_list)
sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
if save_results:
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, gkernel,
edit_cost_name, ged_method, attr_distance,
fit_method, k, 'all',
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean,
time_fitting_mean, time_generating_mean, time_total_mean])
f_summary.close()
print('\ncomplete.')
# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
nx.draw_networkx(graph, pos)
plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
# plt.show()
plt.clf()
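
# Illustrative sketch (not part of the original file): draw_Letter_graph
# expects the Letter graphs' 'x'/'y' node attributes and writes an .eps file
# named after the given prefix; the output path below is only an example.
def _draw_letter_example():
    g = nx.Graph()
    g.add_nodes_from([(0, {'x': 0.0, 'y': 0.0}), (1, {'x': 1.0, 'y': 2.0})])
    g.add_edge(0, 1)
    draw_Letter_graph(g, 'letter_example')  # writes letter_example.eps
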
def compute_gm_for_each_class(Gn, y_all, gkernel, parallel='imap_unordered', is_separate=True):
if is_separate:
print('the Gram matrix is computed for each class.')
y_idx = get_same_item_indices(y_all)
Kmatrix = []
run_time = []
k_dis_data = []
for i, (y, values) in enumerate(y_idx.items()):
print('Class ' + str(i) + ' (y = ' + str(y) + '):')
Gn_i = [Gn[val] for val in values]
time0 = time.time()
Kmatrix.append(compute_kernel(Gn_i, gkernel, None, None, True, parallel=parallel))
run_time.append(time.time() - time0)
k_dis_data.append(kernel_distance_matrix(Gn_i, None, None,
Kmatrix=Kmatrix[i], gkernel=gkernel, verbose=True))
np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate)
dis_max = np.max([item[1] for item in k_dis_data])
dis_min = np.min([item[2] for item in k_dis_data])
dis_mean = np.mean([item[3] for item in k_dis_data])
print('pair distances - dis_max, dis_min, dis_mean:', dis_max, dis_min,
dis_mean)

else:
time0 = time.time()
Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel=parallel)
run_time = time.time() - time0
np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
Kmatrix=Kmatrix, run_time=run_time, is_separate=is_separate)
k_dis_data = kernel_distance_matrix(Gn, None, None,
Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
print('the Gram matrix is computed for the whole dataset.')
print('pair distances - dis_max, dis_min, dis_mean:', k_dis_data[1],
k_dis_data[2], k_dis_data[3])
print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean]
return Kmatrix, run_time, k_dis_data

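# Illustrative usage sketch (not part of the original file). Note that
# compute_gm_for_each_class reads ``ds_name`` from the enclosing script scope
# when naming the saved .gm file, so ds_name must be defined at module level
# before the call, as in the experiments below (datasets and paths assumed to
# exist):
#
# ds_name = 'Letter-high'
# gkernel = 'structuralspkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
# Kmatrix, run_time, k_dis_data = compute_gm_for_each_class(
#     Gn, y_all, gkernel, parallel='imap_unordered', is_separate=True)
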
if __name__ == "__main__":
# #### xp 1: Letter-high, spkernel.
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'Letter-high'
# gkernel = 'spkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
# # remove graphs without edges.
# Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
# idx = [G[0] for G in Gn]
# Gn = [G[1] for G in Gn]
# y_all = [y_all[i] for i in idx]
## Gn = Gn[0:50]
## y_all = y_all[0:50]
# # compute pair distances.
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=None, gkernel=gkernel, verbose=True)
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
# # fitting and computing.
# fit_methods = ['random', 'expert', 'k-graphs']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'LETTER2',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method}
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=40,
# Gn_data = [Gn, y_all, graph_dir],
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean])
# #### xp 2: Letter-high, sspkernel.
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'Letter-high'
# gkernel = 'structuralspkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
## Gn = Gn[0:50]
## y_all = y_all[0:50]
# # compute pair distances.
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=None, gkernel=gkernel, verbose=True)
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
# # fitting and computing.
# fit_methods = ['random', 'expert', 'k-graphs']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'LETTER2',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method}
# print('parameters: ', parameters)
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=40,
# Gn_data = [Gn, y_all, graph_dir],
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean])
# #### xp 3: SYNTHETICnew, sspkernel, using NON_SYMBOLIC.
# gmfile = np.load('results/xp_fit_method/Kmatrix.SYNTHETICnew.structuralspkernel.gm.npz')
# Kmatrix = gmfile['Kmatrix']
# run_time = gmfile['run_time']
# # normalization
# Kmatrix_diag = Kmatrix.diagonal().copy()
# for i in range(len(Kmatrix)):
# for j in range(i, len(Kmatrix)):
# Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
# Kmatrix[j][i] = Kmatrix[i][j]
## np.savez('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm',
## Kmatrix=Kmatrix, run_time=run_time)
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'SYNTHETICnew'
# gkernel = 'structuralspkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
# # remove graphs without nodes and edges.
# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0
# and nx.number_of_edges(G) != 0)]
# idx = [G[0] for G in Gn]
# Gn = [G[1] for G in Gn]
# y_all = [y_all[i] for i in idx]
## Gn = Gn[0:10]
## y_all = y_all[0:10]
# for G in Gn:
# G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
# # compute pair distances.
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
# # fitting and computing.
# fit_methods = ['k-graphs', 'random', 'random', 'random']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'NON_SYMBOLIC',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method}
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=1,
# Gn_data = [Gn, y_all, graph_dir],
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
# Kmatrix=Kmatrix)
# ### xp 4: SYNTHETICnew, spkernel, using NON_SYMBOLIC.
# gmfile = np.load('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm.npz')
# Kmatrix = gmfile['Kmatrix']
# # normalization
# Kmatrix_diag = Kmatrix.diagonal().copy()
# for i in range(len(Kmatrix)):
# for j in range(i, len(Kmatrix)):
# Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
# Kmatrix[j][i] = Kmatrix[i][j]
# run_time = 21821.35
# np.savez('results/xp_fit_method/Kmatrix.SYNTHETICnew.spkernel.gm',
# Kmatrix=Kmatrix, run_time=run_time)
#
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'SYNTHETICnew'
# gkernel = 'spkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
## # remove graphs without nodes and edges.
## Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0
## and nx.number_of_edges(G) != 0)]
## idx = [G[0] for G in Gn]
## Gn = [G[1] for G in Gn]
## y_all = [y_all[i] for i in idx]
## Gn = Gn[0:5]
## y_all = y_all[0:5]
# for G in Gn:
# G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
#
# # compute/read Gram matrix and pair distances.
## Kmatrix = compute_kernel(Gn, gkernel, None, None, True)
## np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
## Kmatrix=Kmatrix)
# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
# Kmatrix = gmfile['Kmatrix']
# run_time = gmfile['run_time']
## Kmatrix = Kmatrix[[0,1,2,3,4],:]
## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
# print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
## Kmatrix = np.zeros((len(Gn), len(Gn)))
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
#
# # fitting and computing.
# fit_methods = ['k-graphs', 'random', 'random', 'random']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'NON_SYMBOLIC',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method}
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=1,
# Gn_data=[Gn, y_all, graph_dir],
# k_dis_data=[dis_mat, dis_max, dis_min, dis_mean],
# Kmatrix=Kmatrix)
# #### xp 5: Fingerprint, sspkernel, using LETTER2, only node attrs.
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'Fingerprint'
# gkernel = 'structuralspkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
# # remove graphs without nodes and edges.
# Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 0]
## and nx.number_of_edges(G) != 0)]
# idx = [G[0] for G in Gn]
# Gn = [G[1] for G in Gn]
# y_all = [y_all[i] for i in idx]
# y_idx = get_same_item_indices(y_all)
# # remove unused labels.
# for G in Gn:
# G.graph['edge_attrs'] = []
# for edge in G.edges:
# del G.edges[edge]['attributes']
# del G.edges[edge]['orient']
# del G.edges[edge]['angle']
## Gn = Gn[805:815]
## y_all = y_all[805:815]
# for G in Gn:
# G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
#
# # compute/read Gram matrix and pair distances.
## Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
## np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
## Kmatrix=Kmatrix)
# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
# Kmatrix = gmfile['Kmatrix']
## run_time = gmfile['run_time']
## Kmatrix = Kmatrix[[0,1,2,3,4],:]
## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
## Kmatrix = np.zeros((len(Gn), len(Gn)))
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
#
# # fitting and computing.
# fit_methods = ['k-graphs', 'random', 'random', 'random']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'LETTER2',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method,
# 'init_ecc': [1,1,1,1,1]} # [0.525, 0.525, 0.001, 0.125, 0.125]}
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=40,
# Gn_data = [Gn, y_all, graph_dir],
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
# Kmatrix=Kmatrix)
# #### xp 6: Letter-med, sspkernel.
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'Letter-med'
# gkernel = 'structuralspkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
## Gn = Gn[0:50]
## y_all = y_all[0:50]
#
# # compute/read Gram matrix and pair distances.
# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
# Kmatrix=Kmatrix)
## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
## Kmatrix = gmfile['Kmatrix']
## run_time = gmfile['run_time']
## Kmatrix = Kmatrix[[0,1,2,3,4],:]
## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
## Kmatrix = np.zeros((len(Gn), len(Gn)))
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
#
# # fitting and computing.
# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'LETTER2',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method,
# 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475]}
# print('parameters: ', parameters)
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=40,
# Gn_data = [Gn, y_all, graph_dir],
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
# Kmatrix=Kmatrix)
# #### xp 7: Letter-low, sspkernel.
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'Letter-low'
# gkernel = 'structuralspkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
## Gn = Gn[0:50]
## y_all = y_all[0:50]
#
# # compute/read Gram matrix and pair distances.
# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
# Kmatrix=Kmatrix)
## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
## Kmatrix = gmfile['Kmatrix']
## run_time = gmfile['run_time']
## Kmatrix = Kmatrix[[0,1,2,3,4],:]
## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
## Kmatrix = np.zeros((len(Gn), len(Gn)))
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
#
# # fitting and computing.
# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'LETTER2',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method,
# 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]}
# print('parameters: ', parameters)
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=40,
# Gn_data = [Gn, y_all, graph_dir],
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
# Kmatrix=Kmatrix)
# #### xp 8: Letter-med, spkernel.
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'Letter-med'
# gkernel = 'spkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
# # remove graphs without nodes and edges.
# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0
# and nx.number_of_edges(G) != 0)]
# idx = [G[0] for G in Gn]
# Gn = [G[1] for G in Gn]
# y_all = [y_all[i] for i in idx]
## Gn = Gn[0:50]
## y_all = y_all[0:50]
#
# # compute/read Gram matrix and pair distances.
# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
# Kmatrix=Kmatrix)
## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
## Kmatrix = gmfile['Kmatrix']
## run_time = gmfile['run_time']
## Kmatrix = Kmatrix[[0,1,2,3,4],:]
## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
## Kmatrix = np.zeros((len(Gn), len(Gn)))
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
#
# # fitting and computing.
# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'LETTER2',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method,
# 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475]}
# print('parameters: ', parameters)
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=40,
# Gn_data = [Gn, y_all, graph_dir],
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
# Kmatrix=Kmatrix)

# #### xp 9: Letter-low, spkernel.
# # load dataset.
# print('getting dataset and computing kernel distance matrix first...')
# ds_name = 'Letter-low'
# gkernel = 'spkernel'
# Gn, y_all, graph_dir = get_dataset(ds_name)
# # remove graphs without nodes and edges.
# Gn = [(idx, G) for idx, G in enumerate(Gn) if (nx.number_of_nodes(G) != 0
# and nx.number_of_edges(G) != 0)]
# idx = [G[0] for G in Gn]
# Gn = [G[1] for G in Gn]
# y_all = [y_all[i] for i in idx]
## Gn = Gn[0:50]
## y_all = y_all[0:50]
#
# # compute/read Gram matrix and pair distances.
# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
# Kmatrix=Kmatrix)
## gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
## Kmatrix = gmfile['Kmatrix']
## run_time = gmfile['run_time']
## Kmatrix = Kmatrix[[0,1,2,3,4],:]
## Kmatrix = Kmatrix[:,[0,1,2,3,4]]
## print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
## Kmatrix = np.zeros((len(Gn), len(Gn)))
## dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
#
# # fitting and computing.
# fit_methods = ['k-graphs', 'expert', 'random', 'random', 'random']
# for fit_method in fit_methods:
# print('\n-------------------------------------')
# print('fit method:', fit_method)
# parameters = {'ds_name': ds_name,
# 'gkernel': gkernel,
# 'edit_cost_name': 'LETTER2',
# 'ged_method': 'mIPFP',
# 'attr_distance': 'euclidean',
# 'fit_method': fit_method,
# 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075]}
# print('parameters: ', parameters)
# xp_fit_method_for_non_symbolic(parameters, save_results=True,
# initial_solutions=40,
# Gn_data = [Gn, y_all, graph_dir],
# k_dis_data = [dis_mat, dis_max, dis_min, dis_mean],
# Kmatrix=Kmatrix)
#### xp 5: COIL-DEL, sspkernel, using LETTER2, only node attrs.
# load dataset.
print('getting dataset and computing kernel distance matrix first...')
ds_name = 'COIL-DEL'
gkernel = 'structuralspkernel'
Gn, y_all, graph_dir = get_dataset(ds_name)
# remove graphs without nodes and edges.
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 0]
# and nx.number_of_edges(G) != 0)]
idx = [G[0] for G in Gn]
Gn = [G[1] for G in Gn]
y_all = [y_all[i] for i in idx]
# remove unused labels.
for G in Gn:
G.graph['edge_labels'] = []
for edge in G.edges:
del G.edges[edge]['bond_type']
del G.edges[edge]['valence']
# Gn = Gn[805:815]
# y_all = y_all[805:815]
for G in Gn:
G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
# compute/read Gram matrix and pair distances.
is_separate = True
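# Assumption based on the function name: with is_separate=True, compute_gm_for_each_class
# builds a Gram matrix and distance statistics per target class rather than one global matrix.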
Kmatrix, run_time, k_dis_data = compute_gm_for_each_class(Gn,
y_all,
gkernel,
parallel='imap_unordered',
is_separate=is_separate)
# Kmatrix = compute_kernel(Gn, gkernel, None, None, True, parallel='imap_unordered')
# np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
# Kmatrix=Kmatrix)
# gmfile = np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')
# Kmatrix = gmfile['Kmatrix']
# run_time = gmfile['run_time']
# Kmatrix = Kmatrix[[0,1,2,3,4],:]
# Kmatrix = Kmatrix[:,[0,1,2,3,4]]
# print('\nTime to compute Gram matrix for the whole dataset: ', run_time)
# dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None,
# Kmatrix=Kmatrix, gkernel=gkernel, verbose=True)
# Kmatrix = np.zeros((len(Gn), len(Gn)))
# dis_mat, dis_max, dis_min, dis_mean = 0, 0, 0, 0
# fitting and computing.
fit_methods = ['k-graphs', 'random', 'random', 'random']
for fit_method in fit_methods:
print('\n-------------------------------------')
print('fit method:', fit_method)
parameters = {'ds_name': ds_name,
'gkernel': gkernel,
'edit_cost_name': 'LETTER2',
'ged_method': 'mIPFP',
'attr_distance': 'euclidean',
'fit_method': fit_method,
'init_ecc': [3,3,1,3,3]} # [0.525, 0.525, 0.001, 0.125, 0.125]}
xp_fit_method_for_non_symbolic(parameters, save_results=True,
initial_solutions=40,
Gn_data=[Gn, y_all, graph_dir],
k_dis_data=k_dis_data,
Kmatrix=Kmatrix,
is_separate=is_separate)

+ 476
- 0
gklearn/preimage/xp_letter_h.py View File

@@ -0,0 +1,476 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 14 15:39:29 2020

@author: ljia
"""
import os
import numpy as np
import random
import csv
from shutil import copyfile
import networkx as nx
import matplotlib.pyplot as plt

from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL
from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
from gklearn.preimage.utils import get_same_item_indices, kernel_distance_matrix
from gklearn.preimage.find_best_k import getRelations


def xp_letter_h_LETTER2_cost():
ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
dis_mat, dis_max, dis_min, dis_mean = kernel_distance_matrix(Gn, None, None, Kmatrix=None, gkernel='structuralspkernel')
for G in Gn:
reform_attributes(G)
# ds = {'name': 'Letter-high',
# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
gkernel = 'structuralspkernel'
node_label = None
edge_label = None
ds_name = 'letter-h'
dir_output = 'results/xp_letter_h/'
save_results = True
cost = 'LETTER2'
repeats = 1
# k_list = range(2, 11)
k_list = [150]
fit_method = 'k-graphs'
# get indices by classes.
y_idx = get_same_item_indices(y_all)
if save_results:
# create result files.
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'median set'])
f_detail.close()
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
'repeats better dis_k gi -> GM'])
f_summary.close()
random.seed(1)
rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
for k in k_list:
print('\n--------- k =', k, '----------')
sod_sm_mean_list = []
sod_gm_mean_list = []
dis_k_sm_mean_list = []
dis_k_gm_mean_list = []
dis_k_gi_min_mean_list = []
# nb_sod_sm2gm = [0, 0, 0]
# nb_dis_k_sm2gm = [0, 0, 0]
# nb_dis_k_gi2sm = [0, 0, 0]
# nb_dis_k_gi2gm = [0, 0, 0]
# repeats_better_sod_sm2gm = []
# repeats_better_dis_k_sm2gm = []
# repeats_better_dis_k_gi2sm = []
# repeats_better_dis_k_gi2gm = []
for i, (y, values) in enumerate(y_idx.items()):
print('\ny =', y)
# y = 'F'
# values = y_idx[y]
# values = values[0:10]
k = len(values)
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
nb_dis_k_gi2gm = [0, 0, 0]
repeats_better_sod_sm2gm = []
repeats_better_dis_k_sm2gm = []
repeats_better_dis_k_gi2sm = []
repeats_better_dis_k_gi2gm = []
for repeat in range(repeats):
print('\nrepeat =', repeat)
random.seed(rdn_seed_list[repeat])
median_set_idx_idx = random.sample(range(0, len(values)), k)
median_set_idx = [values[idx] for idx in median_set_idx_idx]
print('median set: ', median_set_idx)
Gn_median = [Gn[g] for g in values]
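# Per the unpacked names, median_on_k_closest_graphs is expected to return the SODs of the
# set and generalized medians, their kernel distances, the distances of the k closest graphs,
# and the index of the graph minimizing the kernel distance.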
sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
= median_on_k_closest_graphs(Gn_median, node_label, edge_label,
gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
edit_costs=None, group_min=median_set_idx_idx,
dataset='Letter', cost=cost, parallel=False)
# write result detail.
sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
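# getRelations presumably maps the sign of the difference (-1, 0, +1) to a textual relation
# written into the result tables.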
if save_results:
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
y, repeat,
sod_sm, sod_gm, dis_k_sm, dis_k_gm,
dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
dis_k_gi2gm, median_set_idx])
f_detail.close()
# compute result summary.
sod_sm_list.append(sod_sm)
sod_gm_list.append(sod_gm)
dis_k_sm_list.append(dis_k_sm)
dis_k_gm_list.append(dis_k_gm)
dis_k_gi_min_list.append(dis_k_gi_min)
# # SOD SM -> GM
if sod_sm > sod_gm:
nb_sod_sm2gm[0] += 1
repeats_better_sod_sm2gm.append(repeat)
elif sod_sm == sod_gm:
nb_sod_sm2gm[1] += 1
elif sod_sm < sod_gm:
nb_sod_sm2gm[2] += 1
# # dis_k SM -> GM
if dis_k_sm > dis_k_gm:
nb_dis_k_sm2gm[0] += 1
repeats_better_dis_k_sm2gm.append(repeat)
elif dis_k_sm == dis_k_gm:
nb_dis_k_sm2gm[1] += 1
elif dis_k_sm < dis_k_gm:
nb_dis_k_sm2gm[2] += 1
# # dis_k gi -> SM
if dis_k_gi_min > dis_k_sm:
nb_dis_k_gi2sm[0] += 1
repeats_better_dis_k_gi2sm.append(repeat)
elif dis_k_gi_min == dis_k_sm:
nb_dis_k_gi2sm[1] += 1
elif dis_k_gi_min < dis_k_sm:
nb_dis_k_gi2sm[2] += 1
# # dis_k gi -> GM
if dis_k_gi_min > dis_k_gm:
nb_dis_k_gi2gm[0] += 1
repeats_better_dis_k_gi2gm.append(repeat)
elif dis_k_gi_min == dis_k_gm:
nb_dis_k_gi2gm[1] += 1
elif dis_k_gi_min < dis_k_gm:
nb_dis_k_gi2gm[2] += 1
# save median graphs.
fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
+ '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
copyfile(fname_sm, fn_pre_sm_new + '.gxl')
fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
+ '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
copyfile(fname_gm, fn_pre_gm_new + '.gxl')
G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
reform_attributes(G_best_kernel)
fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
+ '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')
# plot median graphs.
set_median = loadGXL(fn_pre_sm_new + '.gxl')
gen_median = loadGXL(fn_pre_gm_new + '.gxl')
draw_Letter_graph(set_median, fn_pre_sm_new)
draw_Letter_graph(gen_median, fn_pre_gm_new)
draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
# write result summary for each letter.
sod_sm_mean_list.append(np.mean(sod_sm_list))
sod_gm_mean_list.append(np.mean(sod_gm_list))
dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
if save_results:
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
sod_sm_mean_list[-1], sod_gm_mean_list[-1],
dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
f_summary.close()

# write overall result summary across all letters.
sod_sm_mean = np.mean(sod_sm_mean_list)
sod_gm_mean = np.mean(sod_gm_mean_list)
dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
if save_results:
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean])
f_summary.close()
print('\ncomplete.')


def xp_letter_h():
ds = {'dataset': 'cpp_ext/data/collections/Letter.xml',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/data/datasets/Letter/HIGH/'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['graph_dir'])
for G in Gn:
reform_attributes(G)
# ds = {'name': 'Letter-high',
# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
gkernel = 'structuralspkernel'
node_label = None
edge_label = None
ds_name = 'letter-h'
dir_output = 'results/xp_letter_h/'
save_results = False
repeats = 1
# k_list = range(2, 11)
k_list = [150]
fit_method = 'k-graphs'
# get indices by classes.
y_idx = get_same_item_indices(y_all)
if save_results:
# create result files.
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'median set'])
f_detail.close()
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
'repeats better dis_k gi -> GM'])
f_summary.close()
random.seed(1)
rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
for k in k_list:
print('\n--------- k =', k, '----------')
sod_sm_mean_list = []
sod_gm_mean_list = []
dis_k_sm_mean_list = []
dis_k_gm_mean_list = []
dis_k_gi_min_mean_list = []
# nb_sod_sm2gm = [0, 0, 0]
# nb_dis_k_sm2gm = [0, 0, 0]
# nb_dis_k_gi2sm = [0, 0, 0]
# nb_dis_k_gi2gm = [0, 0, 0]
# repeats_better_sod_sm2gm = []
# repeats_better_dis_k_sm2gm = []
# repeats_better_dis_k_gi2sm = []
# repeats_better_dis_k_gi2gm = []
for i, (y, values) in enumerate(y_idx.items()):
print('\ny =', y)
# y = 'N'
# values = y_idx[y]
# values = values[0:10]
k = len(values)
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
nb_dis_k_gi2gm = [0, 0, 0]
repeats_better_sod_sm2gm = []
repeats_better_dis_k_sm2gm = []
repeats_better_dis_k_gi2sm = []
repeats_better_dis_k_gi2gm = []
for repeat in range(repeats):
print('\nrepeat =', repeat)
random.seed(rdn_seed_list[repeat])
median_set_idx_idx = random.sample(range(0, len(values)), k)
median_set_idx = [values[idx] for idx in median_set_idx_idx]
print('median set: ', median_set_idx)
Gn_median = [Gn[g] for g in values]
sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
= median_on_k_closest_graphs(Gn_median, node_label, edge_label,
gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
edit_costs=None, group_min=median_set_idx_idx,
dataset='Letter', parallel=False)
# write result detail.
sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
if save_results:
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
y, repeat,
sod_sm, sod_gm, dis_k_sm, dis_k_gm,
dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
dis_k_gi2gm, median_set_idx])
f_detail.close()
# compute result summary.
sod_sm_list.append(sod_sm)
sod_gm_list.append(sod_gm)
dis_k_sm_list.append(dis_k_sm)
dis_k_gm_list.append(dis_k_gm)
dis_k_gi_min_list.append(dis_k_gi_min)
# # SOD SM -> GM
if sod_sm > sod_gm:
nb_sod_sm2gm[0] += 1
repeats_better_sod_sm2gm.append(repeat)
elif sod_sm == sod_gm:
nb_sod_sm2gm[1] += 1
elif sod_sm < sod_gm:
nb_sod_sm2gm[2] += 1
# # dis_k SM -> GM
if dis_k_sm > dis_k_gm:
nb_dis_k_sm2gm[0] += 1
repeats_better_dis_k_sm2gm.append(repeat)
elif dis_k_sm == dis_k_gm:
nb_dis_k_sm2gm[1] += 1
elif dis_k_sm < dis_k_gm:
nb_dis_k_sm2gm[2] += 1
# # dis_k gi -> SM
if dis_k_gi_min > dis_k_sm:
nb_dis_k_gi2sm[0] += 1
repeats_better_dis_k_gi2sm.append(repeat)
elif dis_k_gi_min == dis_k_sm:
nb_dis_k_gi2sm[1] += 1
elif dis_k_gi_min < dis_k_sm:
nb_dis_k_gi2sm[2] += 1
# # dis_k gi -> GM
if dis_k_gi_min > dis_k_gm:
nb_dis_k_gi2gm[0] += 1
repeats_better_dis_k_gi2gm.append(repeat)
elif dis_k_gi_min == dis_k_gm:
nb_dis_k_gi2gm[1] += 1
elif dis_k_gi_min < dis_k_gm:
nb_dis_k_gi2gm[2] += 1
# save median graphs.
fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
+ '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
copyfile(fname_sm, fn_pre_sm_new + '.gxl')
fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
+ '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
copyfile(fname_gm, fn_pre_gm_new + '.gxl')
G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
reform_attributes(G_best_kernel)
fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
+ '.k' + str(int(k)) + '.y' + y + '.repeat' + str(repeat)
saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib-letter')
# plot median graphs.
set_median = loadGXL(fn_pre_sm_new + '.gxl')
gen_median = loadGXL(fn_pre_gm_new + '.gxl')
draw_Letter_graph(set_median, fn_pre_sm_new)
draw_Letter_graph(gen_median, fn_pre_gm_new)
draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
# write result summary for each letter.
sod_sm_mean_list.append(np.mean(sod_sm_list))
sod_gm_mean_list.append(np.mean(sod_gm_list))
dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
if save_results:
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
sod_sm_mean_list[-1], sod_gm_mean_list[-1],
dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
f_summary.close()

# write overall result summary across all letters.
sod_sm_mean = np.mean(sod_sm_mean_list)
sod_gm_mean = np.mean(sod_gm_mean_list)
dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
if save_results:
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean])
f_summary.close()
print('\ncomplete.')
# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
nx.draw_networkx(graph, pos)
plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
# plt.show()
plt.clf()

if __name__ == "__main__":
# xp_letter_h()
xp_letter_h_LETTER2_cost()

+ 249
- 0
gklearn/preimage/xp_monoterpenoides.py View File

@@ -0,0 +1,249 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 16 11:03:11 2020

@author: ljia
"""

import numpy as np
import random
import csv
from shutil import copyfile
import networkx as nx
import matplotlib.pyplot as plt

from gklearn.utils.graphfiles import loadDataset, loadGXL, saveGXL
from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs, reform_attributes
from gklearn.preimage.utils import get_same_item_indices
from gklearn.preimage.find_best_k import getRelations

def xp_monoterpenoides():
import os

ds = {'dataset': '../../datasets/monoterpenoides/dataset_10+.ds',
'graph_dir': os.path.dirname(os.path.realpath(__file__)) + '/../../datasets/monoterpenoides/'} # node/edge symb
Gn, y_all = loadDataset(ds['dataset'])
# ds = {'name': 'Letter-high',
# 'dataset': '../datasets/Letter-high/Letter-high_A.txt'} # node/edge symb
# Gn, y_all = loadDataset(ds['dataset'])
# Gn = Gn[0:50]
gkernel = 'treeletkernel'
node_label = 'atom'
edge_label = 'bond_type'
ds_name = 'monoterpenoides'
dir_output = 'results/xp_monoterpenoides/'
repeats = 1
# k_list = range(2, 11)
k_list = [0]
fit_method = 'k-graphs'
# get indices by classes.
y_idx = get_same_item_indices(y_all)
# create result files.
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'fit method', 'k',
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'median set'])
f_detail.close()
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'fit method', 'k',
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
'repeats better dis_k gi -> GM'])
f_summary.close()
random.seed(1)
rdn_seed_list = random.sample(range(0, repeats * 100), repeats)
for k in k_list:
print('\n--------- k =', k, '----------')
sod_sm_mean_list = []
sod_gm_mean_list = []
dis_k_sm_mean_list = []
dis_k_gm_mean_list = []
dis_k_gi_min_mean_list = []
# nb_sod_sm2gm = [0, 0, 0]
# nb_dis_k_sm2gm = [0, 0, 0]
# nb_dis_k_gi2sm = [0, 0, 0]
# nb_dis_k_gi2gm = [0, 0, 0]
# repeats_better_sod_sm2gm = []
# repeats_better_dis_k_sm2gm = []
# repeats_better_dis_k_gi2sm = []
# repeats_better_dis_k_gi2gm = []
for i, (y, values) in enumerate(y_idx.items()):
print('\ny =', y)
# y = 'I'
# values = y_idx[y]
k = len(values)
# k = kkk
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
nb_dis_k_gi2gm = [0, 0, 0]
repeats_better_sod_sm2gm = []
repeats_better_dis_k_sm2gm = []
repeats_better_dis_k_gi2sm = []
repeats_better_dis_k_gi2gm = []
for repeat in range(repeats):
print('\nrepeat =', repeat)
random.seed(rdn_seed_list[repeat])
median_set_idx_idx = random.sample(range(0, len(values)), k)
median_set_idx = [values[idx] for idx in median_set_idx_idx]
print('median set: ', median_set_idx)
Gn_median = [Gn[g] for g in values]
sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min \
= median_on_k_closest_graphs(Gn_median, node_label, edge_label,
gkernel, k, fit_method=fit_method, graph_dir=ds['graph_dir'],
edit_costs=None, group_min=median_set_idx_idx,
dataset=ds_name, parallel=False)
# write result detail.
sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, gkernel, fit_method, k,
y, repeat,
sod_sm, sod_gm, dis_k_sm, dis_k_gm,
dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
dis_k_gi2gm, median_set_idx])
f_detail.close()
# compute result summary.
sod_sm_list.append(sod_sm)
sod_gm_list.append(sod_gm)
dis_k_sm_list.append(dis_k_sm)
dis_k_gm_list.append(dis_k_gm)
dis_k_gi_min_list.append(dis_k_gi_min)
# # SOD SM -> GM
if sod_sm > sod_gm:
nb_sod_sm2gm[0] += 1
repeats_better_sod_sm2gm.append(repeat)
elif sod_sm == sod_gm:
nb_sod_sm2gm[1] += 1
elif sod_sm < sod_gm:
nb_sod_sm2gm[2] += 1
# # dis_k SM -> GM
if dis_k_sm > dis_k_gm:
nb_dis_k_sm2gm[0] += 1
repeats_better_dis_k_sm2gm.append(repeat)
elif dis_k_sm == dis_k_gm:
nb_dis_k_sm2gm[1] += 1
elif dis_k_sm < dis_k_gm:
nb_dis_k_sm2gm[2] += 1
# # dis_k gi -> SM
if dis_k_gi_min > dis_k_sm:
nb_dis_k_gi2sm[0] += 1
repeats_better_dis_k_gi2sm.append(repeat)
elif dis_k_gi_min == dis_k_sm:
nb_dis_k_gi2sm[1] += 1
elif dis_k_gi_min < dis_k_sm:
nb_dis_k_gi2sm[2] += 1
# # dis_k gi -> GM
if dis_k_gi_min > dis_k_gm:
nb_dis_k_gi2gm[0] += 1
repeats_better_dis_k_gi2gm.append(repeat)
elif dis_k_gi_min == dis_k_gm:
nb_dis_k_gi2gm[1] += 1
elif dis_k_gi_min < dis_k_gm:
nb_dis_k_gi2gm[2] += 1
# save median graphs.
fname_sm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/set_median.gxl'
fn_pre_sm_new = dir_output + 'medians/set_median.' + fit_method \
+ '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat)
copyfile(fname_sm, fn_pre_sm_new + '.gxl')
fname_gm = os.path.dirname(os.path.realpath(__file__)) + '/cpp_ext/output/tmp_ged/gen_median.gxl'
fn_pre_gm_new = dir_output + 'medians/gen_median.' + fit_method \
+ '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat)
copyfile(fname_gm, fn_pre_gm_new + '.gxl')
G_best_kernel = Gn_median[idx_dis_k_gi_min].copy()
# reform_attributes(G_best_kernel)
fn_pre_g_best_kernel = dir_output + 'medians/g_best_kernel.' + fit_method \
+ '.k' + str(int(k)) + '.y' + str(int(y)) + '.repeat' + str(repeat)
saveGXL(G_best_kernel, fn_pre_g_best_kernel + '.gxl', method='gedlib')
# # plot median graphs.
# set_median = loadGXL(fn_pre_sm_new + '.gxl')
# gen_median = loadGXL(fn_pre_gm_new + '.gxl')
# draw_Letter_graph(set_median, fn_pre_sm_new)
# draw_Letter_graph(gen_median, fn_pre_gm_new)
# draw_Letter_graph(G_best_kernel, fn_pre_g_best_kernel)
# write result summary for each class.
sod_sm_mean_list.append(np.mean(sod_sm_list))
sod_gm_mean_list.append(np.mean(sod_gm_list))
dis_k_sm_mean_list.append(np.mean(dis_k_sm_list))
dis_k_gm_mean_list.append(np.mean(dis_k_gm_list))
dis_k_gi_min_mean_list.append(np.mean(dis_k_gi_min_list))
sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean_list[-1] - sod_sm_mean_list[-1]))
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_sm_mean_list[-1]))
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean_list[-1] - dis_k_gi_min_mean_list[-1]))
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, y,
sod_sm_mean_list[-1], sod_gm_mean_list[-1],
dis_k_sm_mean_list[-1], dis_k_gm_mean_list[-1],
dis_k_gi_min_mean_list[-1], sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])
f_summary.close()

# write overall result summary across all classes.
sod_sm_mean = np.mean(sod_sm_mean_list)
sod_gm_mean = np.mean(sod_gm_mean_list)
dis_k_sm_mean = np.mean(dis_k_sm_mean_list)
dis_k_gm_mean = np.mean(dis_k_gm_mean_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_mean_list)
sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = getRelations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
dis_k_gi2gm_mean = getRelations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, gkernel, fit_method, k, 'all',
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean])
f_summary.close()
print('\ncomplete.')
# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.node[n]['x']),float(graph.node[n]['y'])])
nx.draw_networkx(graph, pos)
plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
# plt.show()
plt.clf()

if __name__ == "__main__":
xp_monoterpenoides()

+ 16
- 0
gklearn/utils/isNotebook.py View File

@@ -0,0 +1,16 @@
""" Functions for python system.
"""

def isNotebook():
"""check if code is executed in the IPython notebook.
"""
try:
shell = get_ipython().__class__.__name__
if shell == 'ZMQInteractiveShell':
return True # Jupyter notebook or qtconsole
elif shell == 'TerminalInteractiveShell':
return False # Terminal running IPython
else:
return False # Other type (?)
except NameError:
return False # Probably standard Python interpreter
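# A hypothetical usage sketch (names below are illustrative): pick a progress-bar
# front-end depending on the environment, e.g.
#   from tqdm import tqdm, tqdm_notebook
#   progress = tqdm_notebook if isNotebook() else tqdm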

+ 27
- 0
gklearn/utils/logger2file.py View File

@@ -0,0 +1,27 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 8 14:21:25 2019

@author: ljia
"""

import sys
import time

class Logger(object):
def __init__(self):
self.terminal = sys.stdout
self.log = open("log." + str(time.time()) + ".log", "a")

def write(self, message):
self.terminal.write(message)
self.log.write(message)

def flush(self):
#this flush method is needed for python 3 compatibility.
#this handles the flush command by doing nothing.
#you might want to specify some extra behavior here.
pass

sys.stdout = Logger()
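# Importing this module redirects stdout: everything printed afterwards is echoed to the
# terminal and appended to a 'log.<timestamp>.log' file in the working directory.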

+ 52
- 0
notebooks/else/compute_spkernel_for_syntheticnew.py View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:40:52 2018

@author: ljia
"""
import sys
import numpy as np
import networkx as nx

sys.path.insert(0, "../")
from gklearn.utils.graphfiles import loadDataset
from gklearn.utils.model_selection_precomputed import compute_gram_matrices
from gklearn.kernels.spKernel import spkernel
from sklearn.model_selection import ParameterGrid

from libs import *
import multiprocessing
import functools
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct


if __name__ == "__main__":
# load dataset.
print('getting dataset and computing kernel distance matrix first...')
ds_name = 'SYNTHETICnew'
gkernel = 'spkernel'
dataset = '../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
Gn, y_all = loadDataset(dataset)

for G in Gn:
G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
# compute/read Gram matrix and pair distances.
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
Kmatrix = np.empty((len(Gn), len(Gn)))
Kmatrix, run_time, idx = spkernel(Gn, node_label=None, node_kernels=
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
n_jobs=multiprocessing.cpu_count(), verbose=True)
# normalization
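# (cosine normalization: each entry is divided by sqrt(K[i][i] * K[j][j]), giving a Gram matrix with unit diagonal)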
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
Kmatrix=Kmatrix, run_time=run_time)
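# np.savez appends '.npz', so the matrix can be reloaded later with, e.g.,
# np.load('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm.npz')['Kmatrix'].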
print('complete!')

+ 54
- 0
notebooks/else/compute_sspkernel_for_syntheticnew.py View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:40:52 2018

@author: ljia
"""
import sys
import numpy as np
import networkx as nx

sys.path.insert(0, "../")
from gklearn.utils.graphfiles import loadDataset
from gklearn.utils.model_selection_precomputed import compute_gram_matrices
from gklearn.kernels.structuralspKernel import structuralspkernel
from sklearn.model_selection import ParameterGrid

from libs import *
import multiprocessing
import functools
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct


if __name__ == "__main__":
# load dataset.
print('getting dataset and computing kernel distance matrix first...')
ds_name = 'SYNTHETICnew'
gkernel = 'structuralspkernel'
dataset = '../datasets/SYNTHETICnew/SYNTHETICnew_A.txt'
Gn, y_all = loadDataset(dataset)

for G in Gn:
G.graph['filename'] = 'graph' + str(G.graph['name']) + '.gxl'
# compute/read Gram matrix and pair distances.
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
Kmatrix, run_time = structuralspkernel(Gn, node_label=None, edge_label=None,
node_kernels=sub_kernels, edge_kernels=sub_kernels,
parallel=None, # parallel='imap_unordered',
n_jobs=multiprocessing.cpu_count(),
verbose=True)
# normalization
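# (same cosine normalization as in the spkernel script: divide by sqrt(K[i][i] * K[j][j]) so the diagonal becomes 1)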
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
np.savez('results/xp_fit_method/Kmatrix.' + ds_name + '.' + gkernel + '.gm',
Kmatrix=Kmatrix, run_time=run_time)
print('complete!')

+ 19
- 0
notebooks/else/job_graphkernels.sl View File

@@ -0,0 +1,19 @@
#!/bin/bash

#SBATCH --exclusive
#SBATCH --job-name="graphkernels"
#SBATCH --partition=tcourt
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output=output_graphkernels.txt
#SBATCH --error=error_graphkernels.txt
#
#SBATCH --ntasks=1
#SBATCH --nodes=2
#SBATCH --cpus-per-task=56
#SBATCH --time=24:00:00
#SBATCH --mem-per-cpu=4000

srun hostname
# cd must run in the batch shell itself; launching it via srun would not change the working directory of the following step.
cd /home/2017018/ljia01/graphkit-learn/notebooks
srun python3 run_spkernel.py

+ 12
- 0
notebooks/else/job_test.sl View File

@@ -0,0 +1,12 @@
#!/bin/bash
#
#SBATCH --job-name=test
#SBATCH --output=res.txt
#SBATCH --partition=long
#
#SBATCH --ntasks=1
#SBATCH --time=10:00
#SBATCH --mem-per-cpu=100

srun hostname
srun sleep 60

+ 70
- 0
notebooks/else/run_rwalk_symonly.py View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:56:44 2018

@author: ljia
"""

import functools
from libs import *
import multiprocessing

from gklearn.kernels.rwalk_sym import randomwalkkernel
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct

import numpy as np


dslist = [
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
]
estimator = randomwalkkernel
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
print()
print(ds['name'])
for compute_method in ['conjugate', 'fp']:
if compute_method == 'sylvester':
param_grid_precomputed = {'compute_method': ['sylvester'],
# 'weight': np.linspace(0.01, 0.10, 10)}
'weight': np.logspace(-1, -10, num=10, base=10)}
elif compute_method == 'conjugate':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'compute_method': ['conjugate'],
'node_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'weight': np.logspace(-1, -10, num=10, base=10)}
elif compute_method == 'fp':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'compute_method': ['fp'],
'node_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'weight': np.logspace(-3, -10, num=8, base=10)}
elif compute_method == 'spectral':
param_grid_precomputed = {'compute_method': ['spectral'],
'weight': np.logspace(-1, -10, num=10, base=10),
'sub_kernel': ['geo', 'exp']}
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task']
== 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=False)
print()

+ 61
- 0
notebooks/else/run_sp_symonly.py View File

@@ -0,0 +1,61 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 21 17:59:28 2018

@author: ljia
"""

import functools
from libs import *
import multiprocessing

from gklearn.kernels.sp_sym import spkernel
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
#from gklearn.utils.model_selection_precomputed import trial_do

dslist = [
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb

# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
]
estimator = spkernel
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels': [
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task']
== 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=False)
print()

+ 47
- 0
notebooks/else/run_ssp_symonly.py View File

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:40:52 2018

@author: ljia
"""

import functools
from libs import *
import multiprocessing

from gklearn.kernels.ssp_sym import structuralspkernel
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct

dslist = [
{'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
]
estimator = structuralspkernel
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task']
== 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=False)
print()

+ 821
- 0
notebooks/tests/memory_profile.ipynb View File

@@ -0,0 +1,821 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Acyclic\n",
"\n",
"--- This is a regression problem ---\n",
"\n",
"\n",
"1. Loading dataset from file...\n",
"\n",
"2. Calculating gram matrices. This could take a while...\n",
"\n",
" None edge weight specified. Set all weight to 1.\n",
"\n",
"getting sp graphs: 183it [00:00, 1871.37it/s]\n",
"calculating kernels: 16836it [00:16, 1014.42it/s]\n",
"\n",
" --- shortest path kernel matrix of size 183 built in 16.947543382644653 seconds ---\n",
"\n",
"the gram matrix with parameters {'node_kernels': {'symb': <function deltakernel at 0x7f3a99093950>, 'nsymb': <function gaussiankernel at 0x7f3a990931e0>, 'mix': functools.partial(<function kernelproduct at 0x7f3a99088ae8>, <function deltakernel at 0x7f3a99093950>, <function gaussiankernel at 0x7f3a990931e0>)}, 'n_jobs': 8} is: \n",
"\n",
"\n",
"\n",
"1 gram matrices are calculated, 0 of which are ignored.\n",
"\n",
"3. Fitting and predicting using nested cross validation. This could really take a while...\n",
"cross validation: 30it [00:12, 2.03it/s]\n",
"\n",
"4. Getting final performance...\n",
"best_params_out: [{'node_kernels': {'symb': <function deltakernel at 0x7f3a99093950>, 'nsymb': <function gaussiankernel at 0x7f3a990931e0>, 'mix': functools.partial(<function kernelproduct at 0x7f3a99088ae8>, <function deltakernel at 0x7f3a99093950>, <function gaussiankernel at 0x7f3a990931e0>)}, 'n_jobs': 8}]\n",
"best_params_in: [{'alpha': 1e-06}]\n",
"\n",
"best_val_perf: 9.55244065682399\n",
"best_val_std: 0.5574811966683159\n",
"final_performance: [9.724426192585643]\n",
"final_confidence: [2.999822095078807]\n",
"train_performance: [6.141755071354953]\n",
"train_std: [0.2732168016478284]\n",
"\n",
"time to calculate gram matrix with different hyper-params: 16.95±nans\n",
"time to calculate best gram matrix: 16.95±nans\n",
"total training time with all hyper-param choices: 32.74s\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/numpy/core/_methods.py:140: RuntimeWarning: Degrees of freedom <= 0 for slice\n",
" keepdims=keepdims)\n",
"/usr/local/lib/python3.6/dist-packages/numpy/core/_methods.py:132: RuntimeWarning: invalid value encountered in double_scalars\n",
" ret = ret.dtype.type(ret / rcount)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filename: ../../gklearn/utils/model_selection_precomputed.py\n",
"\n",
"Line # Mem usage Increment Line Contents\n",
"================================================\n",
" 24 115.2 MiB 115.2 MiB @profile\n",
" 25 def model_selection_for_precomputed_kernel(datafile,\n",
" 26 estimator,\n",
" 27 param_grid_precomputed,\n",
" 28 param_grid,\n",
" 29 model_type,\n",
" 30 NUM_TRIALS=30,\n",
" 31 datafile_y=None,\n",
" 32 extra_params=None,\n",
" 33 ds_name='ds-unknown',\n",
" 34 n_jobs=1,\n",
" 35 read_gm_from_file=False):\n",
" 36 \"\"\"Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results.\n",
" 37 \n",
" 38 Parameters\n",
" 39 ----------\n",
" 40 datafile : string\n",
" 41 Path of dataset file.\n",
" 42 estimator : function\n",
" 43 kernel function used to estimate. This function needs to return a gram matrix.\n",
" 44 param_grid_precomputed : dictionary\n",
" 45 Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.\n",
" 46 param_grid : dictionary\n",
" 47 Dictionary with names (string) of parameters used as penelties as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.\n",
" 48 model_type : string\n",
" 49 Typr of the problem, can be regression or classification.\n",
" 50 NUM_TRIALS : integer\n",
" 51 Number of random trials of outer cv loop. The default is 30.\n",
" 52 datafile_y : string\n",
" 53 Path of file storing y data. This parameter is optional depending on the given dataset file.\n",
" 54 read_gm_from_file : boolean\n",
" 55 Whether gram matrices are loaded from file.\n",
" 56 \n",
" 57 Examples\n",
" 58 --------\n",
" 59 >>> import numpy as np\n",
" 60 >>> import sys\n",
" 61 >>> sys.path.insert(0, \"../\")\n",
" 62 >>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel\n",
" 63 >>> from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel\n",
" 64 >>>\n",
" 65 >>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n",
" 66 >>> estimator = weisfeilerlehmankernel\n",
" 67 >>> param_grid_precomputed = {'height': [0,1,2,3,4,5,6,7,8,9,10], 'base_kernel': ['subtree']}\n",
" 68 >>> param_grid = {\"alpha\": np.logspace(-2, 2, num = 10, base = 10)}\n",
" 69 >>>\n",
" 70 >>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression')\n",
" 71 \"\"\"\n",
" 72 115.2 MiB 0.0 MiB tqdm.monitor_interval = 0\n",
" 73 \n",
" 74 115.2 MiB 0.0 MiB results_dir = '../notebooks/results/' + estimator.__name__\n",
" 75 115.2 MiB 0.0 MiB if not os.path.exists(results_dir):\n",
" 76 os.makedirs(results_dir)\n",
" 77 # a string to save all the results.\n",
" 78 115.2 MiB 0.0 MiB str_fw = '###################### log time: ' + datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\") + '. ######################\\n\\n'\n",
" 79 115.2 MiB 0.0 MiB str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\\n# including gram matrices, serial numbers for gram matrix figures and performance.\\n\\n'\n",
" 80 \n",
" 81 # setup the model type\n",
" 82 115.2 MiB 0.0 MiB model_type = model_type.lower()\n",
" 83 115.2 MiB 0.0 MiB if model_type != 'regression' and model_type != 'classification':\n",
" 84 raise Exception(\n",
" 85 'The model type is incorrect! Please choose from regression or classification.'\n",
" 86 )\n",
" 87 115.2 MiB 0.0 MiB print()\n",
" 88 115.2 MiB 0.0 MiB print('--- This is a %s problem ---' % model_type)\n",
" 89 115.2 MiB 0.0 MiB str_fw += 'This is a %s problem.\\n' % model_type\n",
" 90 \n",
" 91 # calculate gram matrices rather than read them from file.\n",
" 92 115.2 MiB 0.0 MiB if read_gm_from_file == False:\n",
" 93 # Load the dataset\n",
" 94 115.2 MiB 0.0 MiB print()\n",
" 95 115.2 MiB 0.0 MiB print('\\n1. Loading dataset from file...')\n",
" 96 115.2 MiB 0.0 MiB if isinstance(datafile, str):\n",
" 97 115.2 MiB 0.0 MiB dataset, y_all = loadDataset(\n",
" 98 116.3 MiB 1.1 MiB datafile, filename_y=datafile_y, extra_params=extra_params)\n",
" 99 else: # load data directly from variable.\n",
" 100 dataset = datafile\n",
" 101 y_all = datafile_y \n",
" 102 \n",
" 103 # import matplotlib.pyplot as plt\n",
" 104 # import networkx as nx\n",
" 105 # nx.draw_networkx(dataset[30])\n",
" 106 # plt.show()\n",
" 107 \n",
" 108 # Grid of parameters with a discrete number of values for each.\n",
" 109 116.3 MiB 0.0 MiB param_list_precomputed = list(ParameterGrid(param_grid_precomputed))\n",
" 110 116.3 MiB 0.0 MiB param_list = list(ParameterGrid(param_grid))\n",
" 111 \n",
" 112 116.3 MiB 0.0 MiB gram_matrices = [\n",
" 113 ] # a list to store gram matrices for all param_grid_precomputed\n",
" 114 116.3 MiB 0.0 MiB gram_matrix_time = [\n",
" 115 ] # a list to store time to calculate gram matrices\n",
" 116 116.3 MiB 0.0 MiB param_list_pre_revised = [\n",
" 117 ] # list to store param grids precomputed ignoring the useless ones\n",
" 118 \n",
" 119 # calculate all gram matrices\n",
" 120 116.3 MiB 0.0 MiB print()\n",
" 121 116.3 MiB 0.0 MiB print('2. Calculating gram matrices. This could take a while...')\n",
" 122 116.3 MiB 0.0 MiB str_fw += '\\nII. Gram matrices.\\n\\n'\n",
" 123 116.3 MiB 0.0 MiB tts = time.time() # start training time\n",
" 124 116.3 MiB 0.0 MiB nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)\n",
" 125 145.3 MiB 0.0 MiB for idx, params_out in enumerate(param_list_precomputed):\n",
" 126 116.3 MiB 0.0 MiB y = y_all[:]\n",
" 127 116.3 MiB 0.0 MiB params_out['n_jobs'] = n_jobs\n",
" 128 # print(dataset)\n",
" 129 # import networkx as nx\n",
" 130 # nx.draw_networkx(dataset[1])\n",
" 131 # plt.show()\n",
" 132 119.5 MiB 3.1 MiB rtn_data = estimator(dataset[:], **params_out)\n",
" 133 119.5 MiB 0.0 MiB Kmatrix = rtn_data[0]\n",
" 134 119.5 MiB 0.0 MiB current_run_time = rtn_data[1]\n",
" 135 # for some kernels, some graphs in datasets may not meet the \n",
" 136 # kernels' requirements for graph structure. These graphs are trimmed. \n",
" 137 119.5 MiB 0.0 MiB if len(rtn_data) == 3:\n",
" 138 119.5 MiB 0.0 MiB idx_trim = rtn_data[2] # the index of trimmed graph list\n",
" 139 119.5 MiB 0.0 MiB y = [y[idxt] for idxt in idx_trim] # trim y accordingly\n",
" 140 # Kmatrix = np.random.rand(2250, 2250)\n",
" 141 # current_run_time = 0.1\n",
" 142 \n",
" 143 # remove graphs whose kernels with themselves are zeros\n",
" 144 119.5 MiB 0.0 MiB Kmatrix_diag = Kmatrix.diagonal().copy()\n",
" 145 119.5 MiB 0.0 MiB nb_g_ignore = 0\n",
" 146 119.5 MiB 0.0 MiB for idxk, diag in enumerate(Kmatrix_diag):\n",
" 147 119.5 MiB 0.0 MiB if diag == 0:\n",
" 148 Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)\n",
" 149 Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)\n",
" 150 nb_g_ignore += 1\n",
" 151 # normalization\n",
" 152 119.5 MiB 0.0 MiB Kmatrix_diag = Kmatrix.diagonal().copy()\n",
" 153 119.5 MiB 0.0 MiB for i in range(len(Kmatrix)):\n",
" 154 119.5 MiB 0.0 MiB for j in range(i, len(Kmatrix)):\n",
" 155 119.5 MiB 0.0 MiB Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])\n",
" 156 119.5 MiB 0.0 MiB Kmatrix[j][i] = Kmatrix[i][j]\n",
" 157 \n",
" 158 119.5 MiB 0.0 MiB print()\n",
" 159 119.5 MiB 0.0 MiB if params_out == {}:\n",
" 160 print('the gram matrix is: ')\n",
" 161 str_fw += 'the gram matrix is:\\n\\n'\n",
" 162 else:\n",
" 163 119.5 MiB 0.0 MiB print('the gram matrix with parameters', params_out, 'is: \\n\\n')\n",
" 164 119.5 MiB 0.0 MiB str_fw += 'the gram matrix with parameters %s is:\\n\\n' % params_out\n",
" 165 119.5 MiB 0.0 MiB if len(Kmatrix) < 2:\n",
" 166 nb_gm_ignore += 1\n",
" 167 print('ignored, as at most only one of all its diagonal value is non-zero.')\n",
" 168 str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\\n\\n'\n",
" 169 else: \n",
" 170 119.5 MiB 0.0 MiB if np.isnan(Kmatrix).any(\n",
" 171 ): # if the matrix contains elements that are not numbers\n",
" 172 nb_gm_ignore += 1\n",
" 173 print('ignored, as it contains elements that are not numbers.')\n",
" 174 str_fw += 'ignored, as it contains elements that are not numbers.\\n\\n'\n",
" 175 else:\n",
" 176 # print(Kmatrix)\n",
" 177 119.5 MiB 0.0 MiB str_fw += np.array2string(\n",
" 178 119.5 MiB 0.0 MiB Kmatrix,\n",
" 179 119.5 MiB 0.0 MiB separator=',') + '\\n\\n'\n",
" 180 # separator=',',\n",
" 181 # threshold=np.inf,\n",
" 182 # floatmode='unique') + '\\n\\n'\n",
" 183 \n",
" 184 119.5 MiB 0.0 MiB fig_file_name = results_dir + '/GM[ds]' + ds_name\n",
" 185 119.5 MiB 0.0 MiB if params_out != {}:\n",
" 186 119.5 MiB 0.0 MiB fig_file_name += '[params]' + str(idx)\n",
" 187 120.3 MiB 0.7 MiB plt.imshow(Kmatrix)\n",
" 188 120.4 MiB 0.1 MiB plt.colorbar()\n",
" 189 145.3 MiB 24.9 MiB plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)\n",
" 190 # plt.show()\n",
" 191 145.3 MiB 0.0 MiB plt.clf()\n",
" 192 145.3 MiB 0.0 MiB gram_matrices.append(Kmatrix)\n",
" 193 145.3 MiB 0.0 MiB gram_matrix_time.append(current_run_time)\n",
" 194 145.3 MiB 0.0 MiB param_list_pre_revised.append(params_out)\n",
" 195 145.3 MiB 0.0 MiB if nb_g_ignore > 0:\n",
" 196 print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)\n",
" 197 str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore\n",
" 198 145.3 MiB 0.0 MiB print()\n",
" 199 145.3 MiB 0.0 MiB print(\n",
" 200 145.3 MiB 0.0 MiB '{} gram matrices are calculated, {} of which are ignored.'.format(\n",
" 201 145.3 MiB 0.0 MiB len(param_list_precomputed), nb_gm_ignore))\n",
" 202 145.3 MiB 0.0 MiB str_fw += '{} gram matrices are calculated, {} of which are ignored.\\n\\n'.format(len(param_list_precomputed), nb_gm_ignore)\n",
" 203 145.3 MiB 0.0 MiB str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\\n\\n'\n",
" 204 145.3 MiB 0.0 MiB str_fw += ''.join([\n",
" 205 145.3 MiB 0.0 MiB '{}: {}\\n'.format(idx, params_out)\n",
" 206 145.3 MiB 0.0 MiB for idx, params_out in enumerate(param_list_precomputed)\n",
" 207 ])\n",
" 208 \n",
" 209 145.3 MiB 0.0 MiB print()\n",
" 210 145.3 MiB 0.0 MiB if len(gram_matrices) == 0:\n",
" 211 print('all gram matrices are ignored, no results obtained.')\n",
" 212 str_fw += '\\nall gram matrices are ignored, no results obtained.\\n\\n'\n",
" 213 else:\n",
" 214 # save gram matrices to file.\n",
" 215 145.4 MiB 0.1 MiB np.savez(results_dir + '/' + ds_name + '.gm', \n",
" 216 145.4 MiB 0.0 MiB gms=gram_matrices, params=param_list_pre_revised, y=y, \n",
" 217 145.4 MiB 0.0 MiB gmtime=gram_matrix_time)\n",
" 218 \n",
" 219 145.4 MiB 0.0 MiB print(\n",
" 220 145.4 MiB 0.0 MiB '3. Fitting and predicting using nested cross validation. This could really take a while...'\n",
" 221 )\n",
" 222 \n",
" 223 # ---- use pool.imap_unordered to parallel and track progress. ----\n",
" 224 # train_pref = []\n",
" 225 # val_pref = []\n",
" 226 # test_pref = []\n",
" 227 # def func_assign(result, var_to_assign):\n",
" 228 # for idx, itm in enumerate(var_to_assign):\n",
" 229 # itm.append(result[idx]) \n",
" 230 # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)\n",
" 231 # \n",
" 232 # parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, \n",
" 233 # [train_pref, val_pref, test_pref], glbv=gram_matrices,\n",
" 234 # method='imap_unordered', n_jobs=n_jobs, chunksize=1,\n",
" 235 # itr_desc='cross validation')\n",
" 236 \n",
" 237 145.4 MiB 0.0 MiB def init_worker(gms_toshare):\n",
" 238 global G_gms\n",
" 239 G_gms = gms_toshare\n",
" 240 \n",
" 241 # gram_matrices = np.array(gram_matrices)\n",
" 242 # gms_shape = gram_matrices.shape\n",
" 243 # gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))\n",
" 244 # pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))\n",
" 245 145.4 MiB 0.0 MiB pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))\n",
" 246 145.4 MiB 0.0 MiB trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)\n",
" 247 145.4 MiB 0.0 MiB train_pref = []\n",
" 248 145.4 MiB 0.0 MiB val_pref = []\n",
" 249 145.4 MiB 0.0 MiB test_pref = []\n",
" 250 # if NUM_TRIALS < 1000 * n_jobs:\n",
" 251 # chunksize = int(NUM_TRIALS / n_jobs) + 1\n",
" 252 # else:\n",
" 253 # chunksize = 1000\n",
" 254 145.4 MiB 0.0 MiB chunksize = 1\n",
" 255 145.4 MiB 0.0 MiB for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):\n",
" 256 145.4 MiB 0.0 MiB train_pref.append(o1)\n",
" 257 145.4 MiB 0.0 MiB val_pref.append(o2)\n",
" 258 145.4 MiB 0.0 MiB test_pref.append(o3)\n",
" 259 145.4 MiB 0.0 MiB pool.close()\n",
" 260 145.4 MiB 0.0 MiB pool.join()\n",
" 261 \n",
" 262 # # ---- use pool.map to parallel. ----\n",
" 263 # pool = Pool(n_jobs)\n",
" 264 # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type)\n",
" 265 # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))\n",
" 266 # train_pref = [item[0] for item in result_perf]\n",
" 267 # val_pref = [item[1] for item in result_perf]\n",
" 268 # test_pref = [item[2] for item in result_perf]\n",
" 269 \n",
" 270 # # ---- direct running, normally use a single CPU core. ----\n",
" 271 # train_pref = []\n",
" 272 # val_pref = []\n",
" 273 # test_pref = []\n",
" 274 # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):\n",
" 275 # o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)\n",
" 276 # train_pref.append(o1)\n",
" 277 # val_pref.append(o2)\n",
" 278 # test_pref.append(o3)\n",
" 279 # print()\n",
" 280 \n",
" 281 145.4 MiB 0.0 MiB print()\n",
" 282 145.4 MiB 0.0 MiB print('4. Getting final performance...')\n",
" 283 145.4 MiB 0.0 MiB str_fw += '\\nIII. Performance.\\n\\n'\n",
" 284 # averages and confidences of performances on outer trials for each combination of parameters\n",
" 285 145.4 MiB 0.0 MiB average_train_scores = np.mean(train_pref, axis=0)\n",
" 286 # print('val_pref: ', val_pref[0][0])\n",
" 287 145.4 MiB 0.0 MiB average_val_scores = np.mean(val_pref, axis=0)\n",
" 288 # print('test_pref: ', test_pref[0][0])\n",
" 289 145.4 MiB 0.0 MiB average_perf_scores = np.mean(test_pref, axis=0)\n",
" 290 # sample std is used here\n",
" 291 145.4 MiB 0.0 MiB std_train_scores = np.std(train_pref, axis=0, ddof=1)\n",
" 292 145.4 MiB 0.0 MiB std_val_scores = np.std(val_pref, axis=0, ddof=1)\n",
" 293 145.4 MiB 0.0 MiB std_perf_scores = np.std(test_pref, axis=0, ddof=1)\n",
" 294 \n",
" 295 145.4 MiB 0.0 MiB if model_type == 'regression':\n",
" 296 145.4 MiB 0.0 MiB best_val_perf = np.amin(average_val_scores)\n",
" 297 else:\n",
" 298 best_val_perf = np.amax(average_val_scores)\n",
" 299 # print('average_val_scores: ', average_val_scores)\n",
" 300 # print('best_val_perf: ', best_val_perf)\n",
" 301 # print()\n",
" 302 145.4 MiB 0.0 MiB best_params_index = np.where(average_val_scores == best_val_perf)\n",
" 303 # find smallest val std with best val perf.\n",
" 304 best_val_stds = [\n",
" 305 145.4 MiB 0.0 MiB std_val_scores[value][best_params_index[1][idx]]\n",
" 306 145.4 MiB 0.0 MiB for idx, value in enumerate(best_params_index[0])\n",
" 307 ]\n",
" 308 145.4 MiB 0.0 MiB min_val_std = np.amin(best_val_stds)\n",
" 309 145.4 MiB 0.0 MiB best_params_index = np.where(std_val_scores == min_val_std)\n",
" 310 best_params_out = [\n",
" 311 145.4 MiB 0.0 MiB param_list_pre_revised[i] for i in best_params_index[0]\n",
" 312 ]\n",
" 313 145.4 MiB 0.0 MiB best_params_in = [param_list[i] for i in best_params_index[1]]\n",
" 314 145.4 MiB 0.0 MiB print('best_params_out: ', best_params_out)\n",
" 315 145.4 MiB 0.0 MiB print('best_params_in: ', best_params_in)\n",
" 316 145.4 MiB 0.0 MiB print()\n",
" 317 145.4 MiB 0.0 MiB print('best_val_perf: ', best_val_perf)\n",
" 318 145.4 MiB 0.0 MiB print('best_val_std: ', min_val_std)\n",
" 319 145.4 MiB 0.0 MiB str_fw += 'best settings of hyper-params to build gram matrix: %s\\n' % best_params_out\n",
" 320 145.4 MiB 0.0 MiB str_fw += 'best settings of other hyper-params: %s\\n\\n' % best_params_in\n",
" 321 145.4 MiB 0.0 MiB str_fw += 'best_val_perf: %s\\n' % best_val_perf\n",
" 322 145.4 MiB 0.0 MiB str_fw += 'best_val_std: %s\\n' % min_val_std\n",
" 323 \n",
" 324 # print(best_params_index)\n",
" 325 # print(best_params_index[0])\n",
" 326 # print(average_perf_scores)\n",
" 327 final_performance = [\n",
" 328 145.4 MiB 0.0 MiB average_perf_scores[value][best_params_index[1][idx]]\n",
" 329 145.4 MiB 0.0 MiB for idx, value in enumerate(best_params_index[0])\n",
" 330 ]\n",
" 331 final_confidence = [\n",
" 332 145.4 MiB 0.0 MiB std_perf_scores[value][best_params_index[1][idx]]\n",
" 333 145.4 MiB 0.0 MiB for idx, value in enumerate(best_params_index[0])\n",
" 334 ]\n",
" 335 145.4 MiB 0.0 MiB print('final_performance: ', final_performance)\n",
" 336 145.4 MiB 0.0 MiB print('final_confidence: ', final_confidence)\n",
" 337 145.4 MiB 0.0 MiB str_fw += 'final_performance: %s\\n' % final_performance\n",
" 338 145.4 MiB 0.0 MiB str_fw += 'final_confidence: %s\\n' % final_confidence\n",
" 339 train_performance = [\n",
" 340 145.4 MiB 0.0 MiB average_train_scores[value][best_params_index[1][idx]]\n",
" 341 145.4 MiB 0.0 MiB for idx, value in enumerate(best_params_index[0])\n",
" 342 ]\n",
" 343 train_std = [\n",
" 344 145.4 MiB 0.0 MiB std_train_scores[value][best_params_index[1][idx]]\n",
" 345 145.4 MiB 0.0 MiB for idx, value in enumerate(best_params_index[0])\n",
" 346 ]\n",
" 347 145.4 MiB 0.0 MiB print('train_performance: %s' % train_performance)\n",
" 348 145.4 MiB 0.0 MiB print('train_std: ', train_std)\n",
" 349 145.4 MiB 0.0 MiB str_fw += 'train_performance: %s\\n' % train_performance\n",
" 350 145.4 MiB 0.0 MiB str_fw += 'train_std: %s\\n\\n' % train_std\n",
" 351 \n",
" 352 145.4 MiB 0.0 MiB print()\n",
" 353 145.4 MiB 0.0 MiB tt_total = time.time() - tts # training time for all hyper-parameters\n",
" 354 145.4 MiB 0.0 MiB average_gram_matrix_time = np.mean(gram_matrix_time)\n",
" 355 145.4 MiB 0.0 MiB std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)\n",
" 356 best_gram_matrix_time = [\n",
" 357 145.4 MiB 0.0 MiB gram_matrix_time[i] for i in best_params_index[0]\n",
" 358 ]\n",
" 359 145.4 MiB 0.0 MiB ave_bgmt = np.mean(best_gram_matrix_time)\n",
" 360 145.4 MiB 0.0 MiB std_bgmt = np.std(best_gram_matrix_time, ddof=1)\n",
" 361 145.4 MiB 0.0 MiB print(\n",
" 362 145.4 MiB 0.0 MiB 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'\n",
" 363 145.4 MiB 0.0 MiB .format(average_gram_matrix_time, std_gram_matrix_time))\n",
" 364 145.4 MiB 0.0 MiB print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(\n",
" 365 145.4 MiB 0.0 MiB ave_bgmt, std_bgmt))\n",
" 366 145.4 MiB 0.0 MiB print(\n",
" 367 145.4 MiB 0.0 MiB 'total training time with all hyper-param choices: {:.2f}s'.format(\n",
" 368 145.4 MiB 0.0 MiB tt_total))\n",
" 369 145.4 MiB 0.0 MiB str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\\n'.format(average_gram_matrix_time, std_gram_matrix_time)\n",
" 370 145.4 MiB 0.0 MiB str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\\n'.format(ave_bgmt, std_bgmt)\n",
" 371 145.4 MiB 0.0 MiB str_fw += 'total training time with all hyper-param choices: {:.2f}s\\n\\n'.format(tt_total)\n",
" 372 \n",
" 373 # # save results to file\n",
" 374 # np.savetxt(results_name_pre + 'average_train_scores.dt',\n",
" 375 # average_train_scores)\n",
" 376 # np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)\n",
" 377 # np.savetxt(results_name_pre + 'average_perf_scores.dt',\n",
" 378 # average_perf_scores)\n",
" 379 # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)\n",
" 380 # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)\n",
" 381 # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)\n",
" 382 \n",
" 383 # np.save(results_name_pre + 'best_params_index', best_params_index)\n",
" 384 # np.save(results_name_pre + 'best_params_pre.dt', best_params_out)\n",
" 385 # np.save(results_name_pre + 'best_params_in.dt', best_params_in)\n",
" 386 # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)\n",
" 387 # np.save(results_name_pre + 'best_val_std.dt', best_val_std)\n",
" 388 # np.save(results_name_pre + 'final_performance.dt', final_performance)\n",
" 389 # np.save(results_name_pre + 'final_confidence.dt', final_confidence)\n",
" 390 # np.save(results_name_pre + 'train_performance.dt', train_performance)\n",
" 391 # np.save(results_name_pre + 'train_std.dt', train_std)\n",
" 392 \n",
" 393 # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)\n",
" 394 # np.save(results_name_pre + 'average_gram_matrix_time.dt',\n",
" 395 # average_gram_matrix_time)\n",
" 396 # np.save(results_name_pre + 'std_gram_matrix_time.dt',\n",
" 397 # std_gram_matrix_time)\n",
" 398 # np.save(results_name_pre + 'best_gram_matrix_time.dt',\n",
" 399 # best_gram_matrix_time)\n",
" 400 \n",
" 401 # print out as table.\n",
" 402 145.4 MiB 0.0 MiB from collections import OrderedDict\n",
" 403 145.4 MiB 0.0 MiB from tabulate import tabulate\n",
" 404 145.4 MiB 0.0 MiB table_dict = {}\n",
" 405 145.4 MiB 0.0 MiB if model_type == 'regression':\n",
" 406 145.6 MiB 0.0 MiB for param_in in param_list:\n",
" 407 145.6 MiB 0.2 MiB param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])\n",
" 408 else:\n",
" 409 for param_in in param_list:\n",
" 410 param_in['C'] = '{:.2e}'.format(param_in['C'])\n",
" 411 145.6 MiB 0.0 MiB table_dict['params'] = [{**param_out, **param_in}\n",
" 412 145.6 MiB 0.0 MiB for param_in in param_list for param_out in param_list_pre_revised]\n",
" 413 table_dict['gram_matrix_time'] = [\n",
" 414 145.6 MiB 0.0 MiB '{:.2f}'.format(gram_matrix_time[index_out])\n",
" 415 145.6 MiB 0.0 MiB for param_in in param_list\n",
" 416 145.6 MiB 0.0 MiB for index_out, _ in enumerate(param_list_pre_revised)\n",
" 417 ]\n",
" 418 table_dict['valid_perf'] = [\n",
" 419 145.6 MiB 0.0 MiB '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],\n",
" 420 std_val_scores[index_out][index_in])\n",
" 421 145.6 MiB 0.0 MiB for index_in, _ in enumerate(param_list)\n",
" 422 145.6 MiB 0.0 MiB for index_out, _ in enumerate(param_list_pre_revised)\n",
" 423 ]\n",
" 424 table_dict['test_perf'] = [\n",
" 425 145.6 MiB 0.0 MiB '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],\n",
" 426 std_perf_scores[index_out][index_in])\n",
" 427 145.6 MiB 0.0 MiB for index_in, _ in enumerate(param_list)\n",
" 428 145.6 MiB 0.0 MiB for index_out, _ in enumerate(param_list_pre_revised)\n",
" 429 ]\n",
" 430 table_dict['train_perf'] = [\n",
" 431 145.6 MiB 0.0 MiB '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],\n",
" 432 std_train_scores[index_out][index_in])\n",
" 433 145.6 MiB 0.0 MiB for index_in, _ in enumerate(param_list)\n",
" 434 145.6 MiB 0.0 MiB for index_out, _ in enumerate(param_list_pre_revised)\n",
" 435 ]\n",
" 436 keyorder = [\n",
" 437 145.6 MiB 0.0 MiB 'params', 'train_perf', 'valid_perf', 'test_perf',\n",
" 438 145.6 MiB 0.0 MiB 'gram_matrix_time'\n",
" 439 ]\n",
" 440 145.6 MiB 0.0 MiB print()\n",
" 441 145.6 MiB 0.0 MiB tb_print = tabulate(\n",
" 442 145.6 MiB 0.0 MiB OrderedDict(\n",
" 443 145.6 MiB 0.0 MiB sorted(table_dict.items(),\n",
" 444 145.6 MiB 0.0 MiB key=lambda i: keyorder.index(i[0]))),\n",
" 445 145.6 MiB 0.0 MiB headers='keys')\n",
" 446 # print(tb_print)\n",
" 447 145.6 MiB 0.0 MiB str_fw += 'table of performance v.s. hyper-params:\\n\\n%s\\n\\n' % tb_print\n",
" 448 \n",
" 449 # read gram matrices from file.\n",
" 450 else: \n",
" 451 # Grid of parameters with a discrete number of values for each.\n",
" 452 # param_list_precomputed = list(ParameterGrid(param_grid_precomputed))\n",
" 453 param_list = list(ParameterGrid(param_grid))\n",
" 454 \n",
" 455 # read gram matrices from file.\n",
" 456 print()\n",
" 457 print('2. Reading gram matrices from file...')\n",
" 458 str_fw += '\\nII. Gram matrices.\\n\\nGram matrices are read from file, see last log for detail.\\n'\n",
" 459 gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')\n",
" 460 gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed\n",
" 461 gram_matrix_time = gmfile['gmtime'] # time used to compute the gram matrices\n",
" 462 param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones\n",
" 463 y = gmfile['y'].tolist()\n",
" 464 \n",
" 465 tts = time.time() # start training time\n",
" 466 # nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) \n",
" 467 print(\n",
" 468 '3. Fitting and predicting using nested cross validation. This could really take a while...'\n",
" 469 )\n",
" 470 \n",
" 471 # ---- use pool.imap_unordered to parallel and track progress. ----\n",
" 472 def init_worker(gms_toshare):\n",
" 473 global G_gms\n",
" 474 G_gms = gms_toshare\n",
" 475 \n",
" 476 pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))\n",
" 477 trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)\n",
" 478 train_pref = []\n",
" 479 val_pref = []\n",
" 480 test_pref = []\n",
" 481 chunksize = 1\n",
" 482 for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):\n",
" 483 train_pref.append(o1)\n",
" 484 val_pref.append(o2)\n",
" 485 test_pref.append(o3)\n",
" 486 pool.close()\n",
" 487 pool.join()\n",
" 488 \n",
" 489 # # ---- use pool.map to parallel. ----\n",
" 490 # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))\n",
" 491 # train_pref = [item[0] for item in result_perf]\n",
" 492 # val_pref = [item[1] for item in result_perf]\n",
" 493 # test_pref = [item[2] for item in result_perf]\n",
" 494 \n",
" 495 # # ---- use joblib.Parallel to parallel and track progress. ----\n",
" 496 # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)\n",
" 497 # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))\n",
" 498 # train_pref = [item[0] for item in result_perf]\n",
" 499 # val_pref = [item[1] for item in result_perf]\n",
" 500 # test_pref = [item[2] for item in result_perf]\n",
" 501 \n",
" 502 # # ---- direct running, normally use a single CPU core. ----\n",
" 503 # train_pref = []\n",
" 504 # val_pref = []\n",
" 505 # test_pref = []\n",
" 506 # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):\n",
" 507 # o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)\n",
" 508 # train_pref.append(o1)\n",
" 509 # val_pref.append(o2)\n",
" 510 # test_pref.append(o3)\n",
" 511 \n",
" 512 print()\n",
" 513 print('4. Getting final performance...')\n",
" 514 str_fw += '\\nIII. Performance.\\n\\n'\n",
" 515 # averages and confidences of performances on outer trials for each combination of parameters\n",
" 516 average_train_scores = np.mean(train_pref, axis=0)\n",
" 517 average_val_scores = np.mean(val_pref, axis=0)\n",
" 518 average_perf_scores = np.mean(test_pref, axis=0)\n",
" 519 # sample std is used here\n",
" 520 std_train_scores = np.std(train_pref, axis=0, ddof=1)\n",
" 521 std_val_scores = np.std(val_pref, axis=0, ddof=1)\n",
" 522 std_perf_scores = np.std(test_pref, axis=0, ddof=1)\n",
" 523 \n",
" 524 if model_type == 'regression':\n",
" 525 best_val_perf = np.amin(average_val_scores)\n",
" 526 else:\n",
" 527 best_val_perf = np.amax(average_val_scores)\n",
" 528 best_params_index = np.where(average_val_scores == best_val_perf)\n",
" 529 # find smallest val std with best val perf.\n",
" 530 best_val_stds = [\n",
" 531 std_val_scores[value][best_params_index[1][idx]]\n",
" 532 for idx, value in enumerate(best_params_index[0])\n",
" 533 ]\n",
" 534 min_val_std = np.amin(best_val_stds)\n",
" 535 best_params_index = np.where(std_val_scores == min_val_std)\n",
" 536 best_params_out = [\n",
" 537 param_list_pre_revised[i] for i in best_params_index[0]\n",
" 538 ]\n",
" 539 best_params_in = [param_list[i] for i in best_params_index[1]]\n",
" 540 print('best_params_out: ', best_params_out)\n",
" 541 print('best_params_in: ', best_params_in)\n",
" 542 print()\n",
" 543 print('best_val_perf: ', best_val_perf)\n",
" 544 print('best_val_std: ', min_val_std)\n",
" 545 str_fw += 'best settings of hyper-params to build gram matrix: %s\\n' % best_params_out\n",
" 546 str_fw += 'best settings of other hyper-params: %s\\n\\n' % best_params_in\n",
" 547 str_fw += 'best_val_perf: %s\\n' % best_val_perf\n",
" 548 str_fw += 'best_val_std: %s\\n' % min_val_std\n",
" 549 \n",
" 550 final_performance = [\n",
" 551 average_perf_scores[value][best_params_index[1][idx]]\n",
" 552 for idx, value in enumerate(best_params_index[0])\n",
" 553 ]\n",
" 554 final_confidence = [\n",
" 555 std_perf_scores[value][best_params_index[1][idx]]\n",
" 556 for idx, value in enumerate(best_params_index[0])\n",
" 557 ]\n",
" 558 print('final_performance: ', final_performance)\n",
" 559 print('final_confidence: ', final_confidence)\n",
" 560 str_fw += 'final_performance: %s\\n' % final_performance\n",
" 561 str_fw += 'final_confidence: %s\\n' % final_confidence\n",
" 562 train_performance = [\n",
" 563 average_train_scores[value][best_params_index[1][idx]]\n",
" 564 for idx, value in enumerate(best_params_index[0])\n",
" 565 ]\n",
" 566 train_std = [\n",
" 567 std_train_scores[value][best_params_index[1][idx]]\n",
" 568 for idx, value in enumerate(best_params_index[0])\n",
" 569 ]\n",
" 570 print('train_performance: %s' % train_performance)\n",
" 571 print('train_std: ', train_std)\n",
" 572 str_fw += 'train_performance: %s\\n' % train_performance\n",
" 573 str_fw += 'train_std: %s\\n\\n' % train_std\n",
" 574 \n",
" 575 print()\n",
" 576 average_gram_matrix_time = np.mean(gram_matrix_time)\n",
" 577 std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)\n",
" 578 best_gram_matrix_time = [\n",
" 579 gram_matrix_time[i] for i in best_params_index[0]\n",
" 580 ]\n",
" 581 ave_bgmt = np.mean(best_gram_matrix_time)\n",
" 582 std_bgmt = np.std(best_gram_matrix_time, ddof=1)\n",
" 583 print(\n",
" 584 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'\n",
" 585 .format(average_gram_matrix_time, std_gram_matrix_time))\n",
" 586 print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(\n",
" 587 ave_bgmt, std_bgmt))\n",
" 588 tt_poster = time.time() - tts # training time with hyper-param choices who did not participate in calculation of gram matrices\n",
" 589 print(\n",
" 590 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s'.format(\n",
" 591 tt_poster))\n",
" 592 print('total training time with all hyper-param choices: {:.2f}s'.format(\n",
" 593 tt_poster + np.sum(gram_matrix_time)))\n",
" 594 # str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\\n'.format(average_gram_matrix_time, std_gram_matrix_time)\n",
" 595 # str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\\n'.format(ave_bgmt, std_bgmt)\n",
" 596 str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\\n\\n'.format(tt_poster)\n",
" 597 \n",
" 598 # print out as table.\n",
" 599 from collections import OrderedDict\n",
" 600 from tabulate import tabulate\n",
" 601 table_dict = {}\n",
" 602 if model_type == 'regression':\n",
" 603 for param_in in param_list:\n",
" 604 param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])\n",
" 605 else:\n",
" 606 for param_in in param_list:\n",
" 607 param_in['C'] = '{:.2e}'.format(param_in['C'])\n",
" 608 table_dict['params'] = [{**param_out, **param_in}\n",
" 609 for param_in in param_list for param_out in param_list_pre_revised]\n",
" 610 # table_dict['gram_matrix_time'] = [\n",
" 611 # '{:.2f}'.format(gram_matrix_time[index_out])\n",
" 612 # for param_in in param_list\n",
" 613 # for index_out, _ in enumerate(param_list_pre_revised)\n",
" 614 # ]\n",
" 615 table_dict['valid_perf'] = [\n",
" 616 '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],\n",
" 617 std_val_scores[index_out][index_in])\n",
" 618 for index_in, _ in enumerate(param_list)\n",
" 619 for index_out, _ in enumerate(param_list_pre_revised)\n",
" 620 ]\n",
" 621 table_dict['test_perf'] = [\n",
" 622 '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],\n",
" 623 std_perf_scores[index_out][index_in])\n",
" 624 for index_in, _ in enumerate(param_list)\n",
" 625 for index_out, _ in enumerate(param_list_pre_revised)\n",
" 626 ]\n",
" 627 table_dict['train_perf'] = [\n",
" 628 '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],\n",
" 629 std_train_scores[index_out][index_in])\n",
" 630 for index_in, _ in enumerate(param_list)\n",
" 631 for index_out, _ in enumerate(param_list_pre_revised)\n",
" 632 ]\n",
" 633 keyorder = [\n",
" 634 'params', 'train_perf', 'valid_perf', 'test_perf'\n",
" 635 ]\n",
" 636 print()\n",
" 637 tb_print = tabulate(\n",
" 638 OrderedDict(\n",
" 639 sorted(table_dict.items(),\n",
" 640 key=lambda i: keyorder.index(i[0]))),\n",
" 641 headers='keys')\n",
" 642 # print(tb_print)\n",
" 643 str_fw += 'table of performance v.s. hyper-params:\\n\\n%s\\n\\n' % tb_print\n",
" 644 \n",
" 645 # open file to save all results for this dataset.\n",
" 646 if not os.path.exists(results_dir):\n",
" 647 os.makedirs(results_dir)\n",
" 648 \n",
" 649 # open file to save all results for this dataset.\n",
" 650 145.6 MiB 0.0 MiB if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):\n",
" 651 with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:\n",
" 652 f.write(str_fw)\n",
" 653 else:\n",
" 654 145.6 MiB 0.0 MiB with open(results_dir + '/' + ds_name + '.output.txt', 'r+') as f:\n",
" 655 145.6 MiB 0.0 MiB content = f.read()\n",
" 656 145.6 MiB 0.0 MiB f.seek(0, 0)\n",
" 657 145.6 MiB 0.0 MiB f.write(str_fw + '\\n\\n\\n' + content)\n",
"\n",
"\n",
"\n"
]
}
],
"source": [
"import functools\n",
"import sys\n",
"sys.path.insert(0, \"../\")\n",
"sys.path.insert(0, \"../../\")\n",
"from libs import *\n",
"import multiprocessing\n",
"\n",
"from gklearn.kernels.spKernel import spkernel\n",
"from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct\n",
"#from gklearn.utils.model_selection_precomputed import trial_do\n",
"\n",
"dslist = [\n",
" {'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds',\n",
" 'task': 'regression'}, # node symb\n",
"# {'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
"# 'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
"# # contains single node graph, node symb\n",
"# {'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds', }, # node/edge symb\n",
"# {'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds', }, # unlabeled\n",
"# {'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
"# {'name': 'Letter-med', 'dataset': '../../datasets/Letter-med/Letter-med_A.txt'},\n",
"# # node nsymb\n",
"# {'name': 'ENZYMES', 'dataset': '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
"# # node symb/nsymb\n",
"# {'name': 'Mutagenicity', 'dataset': '../../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
"# # node/edge symb\n",
"# {'name': 'D&D', 'dataset': '../../datasets/D&D/DD.mat',\n",
"# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
"\n",
" # {'name': 'COIL-DEL', 'dataset': '../../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
" # # # {'name': 'BZR', 'dataset': '../../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
" # # # {'name': 'COX2', 'dataset': '../../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
" # {'name': 'Fingerprint', 'dataset': '../../datasets/Fingerprint/Fingerprint_A.txt'},\n",
" #\n",
" # # {'name': 'DHFR', 'dataset': '../../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'SYNTHETIC', 'dataset': '../../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'MSRC9', 'dataset': '../../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
" # # {'name': 'MSRC21', 'dataset': '../../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
" # # {'name': 'FIRSTMM_DB', 'dataset': '../../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
"\n",
" # # {'name': 'PROTEINS', 'dataset': '../../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'PROTEINS_full', 'dataset': '../../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
" # # {'name': 'AIDS', 'dataset': '../../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
" # {'name': 'NCI1', 'dataset': '../../datasets/NCI1/NCI1.mat',\n",
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
" # {'name': 'NCI109', 'dataset': '../../datasets/NCI109/NCI109.mat',\n",
" # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
" # {'name': 'NCI-HIV', 'dataset': '../../datasets/NCI-HIV/AIDO99SD.sdf',\n",
" # 'dataset_y': '../../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
"\n",
" # # not working below\n",
" # {'name': 'PTC_FM', 'dataset': '../../datasets/PTC/Train/FM.ds',},\n",
" # {'name': 'PTC_FR', 'dataset': '../../datasets/PTC/Train/FR.ds',},\n",
" # {'name': 'PTC_MM', 'dataset': '../../datasets/PTC/Train/MM.ds',},\n",
" # {'name': 'PTC_MR', 'dataset': '../../datasets/PTC/Train/MR.ds',},\n",
"]\n",
"estimator = spkernel\n",
"mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)\n",
"param_grid_precomputed = {'node_kernels': [\n",
" {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}\n",
"param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},\n",
" {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n",
"\n",
"for ds in dslist:\n",
" print()\n",
" print(ds['name'])\n",
" model_selection_for_precomputed_kernel(\n",
" ds['dataset'],\n",
" estimator,\n",
" param_grid_precomputed,\n",
" (param_grid[1] if ('task' in ds and ds['task']\n",
" == 'regression') else param_grid[0]),\n",
" (ds['task'] if 'task' in ds else 'classification'),\n",
" NUM_TRIALS=30,\n",
" datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
" extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n",
" ds_name=ds['name'],\n",
" n_jobs=multiprocessing.cpu_count(),\n",
" read_gm_from_file=False)\n",
" print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

+ 200   - 0    notebooks/tests/test_lib.ipynb             (file diff suppressed because it is too large)

+ 1574  - 0    notebooks/tests/test_modelselection.ipynb  (file diff suppressed because it is too large)

+ 363   - 0    notebooks/tests/test_networkx.ipynb        (file diff suppressed because it is too large)

+ 689   - 0    notebooks/tests/test_parallel_chunksize.py

@@ -0,0 +1,689 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test of parallelization: find the best parallel chunksize and iteration separation scheme.
Created on Wed Sep 26 12:09:34 2018

@author: ljia
"""

import sys
import time
from itertools import combinations_with_replacement, product, combinations
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
import networkx as nx
import numpy as np
import functools
#import multiprocessing
from matplotlib import pyplot as plt
from sklearn.model_selection import ParameterGrid

sys.path.insert(0, "../")
sys.path.insert(0, "../../")
from libs import *
from gklearn.utils.utils import getSPGraph, direct_product
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.graphfiles import loadDataset
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct


def spkernel(*args,
node_label='atom',
edge_weight=None,
node_kernels=None,
n_jobs=None,
chunksize=1):
"""Calculate shortest-path kernels between graphs.
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
weight = None
if edge_weight is None:
print('\n No edge weight specified. Setting all weights to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, (float, int)):
weight = edge_weight
else:
print(
'\n Edge weight with name %s is not float or integer. Setting all weights to 1.\n'
% edge_weight)
except:
print(
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
% edge_weight)
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
node_label=node_label)

# remove graphs with no edges, as no sp can be found in their structures,
# so the kernel between such a graph and itself will be zero.
len_gn = len(Gn)
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
idx = [G[0] for G in Gn]
Gn = [G[1] for G in Gn]
if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn)))

start_time = time.time()

pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrapper_getSPGraph, weight)
itr = zip(Gn, range(0, len(Gn)))
for i, g in tqdm(
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting sp graphs', file=sys.stdout):
Gn[i] = g
pool.close()
pool.join()

Kmatrix = np.zeros((len(Gn), len(Gn)))

# ---- use pool.imap_unordered to parallel and track progress. ----
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
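# init_worker runs once in each worker process: the list of shortest-path
# graphs is shipped to every worker a single time via initargs and kept in a
# module-level global, instead of being pickled again with every task.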
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
with Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) as pool:
for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='calculating kernels', file=sys.stdout):
Kmatrix[i][j] = kernel
Kmatrix[j][i] = kernel
# # ---- direct running, normally use single CPU core. ----
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time, idx


def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
kernel = 0

# compute shortest path matrices first, method borrowed from FCSP.
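# (FCSP idea: pre-compute the node kernel for every pair of nodes of the two
# shortest-path graphs into vk_dict once; the edge loops below then only match
# edges of equal 'cost', i.e. equal shortest-path length, and look the node
# kernels up instead of recomputing them for each edge pair.)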
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
vk_dict = {} # node kernel values keyed by node pairs
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
vk_dict = {} # node kernel values keyed by node pairs
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
vk_dict = {} # node kernel values keyed by node pairs
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kernel += 1
return kernel

# compute graph kernels
if ds_attrs['is_directed']:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kernel += kn1
else:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
# each edge walk is counted twice, starting from both its extreme nodes.
nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
e1[0], e2[1])], vk_dict[(e1[1],
e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kn2 = nk12 * nk21
kernel += kn1 + kn2

return kernel


def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
i = itr[0]
j = itr[1]
return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels)


def wrapper_getSPGraph(weight, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, getSPGraph(g, edge_weight=weight)
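
# A minimal usage sketch of spkernel (illustration only; the n_jobs and
# chunksize values below are arbitrary, not part of this benchmark):
#
#   Gn, y = loadDataset('../../datasets/acyclic/dataset_bps.ds')
#   mix = functools.partial(kernelproduct, deltakernel, gaussiankernel)
#   K, run_time, kept_idx = spkernel(
#       Gn, node_label='atom',
#       node_kernels={'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mix},
#       n_jobs=4, chunksize=100)
#   # K is the gram matrix over the graphs kept in kept_idx (edgeless graphs
#   # are dropped); run_time is the wall-clock build time in seconds.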



#
#
#def commonwalkkernel(*args,
# node_label='atom',
# edge_label='bond_type',
# n=None,
# weight=1,
# compute_method=None,
# n_jobs=None,
# chunksize=1):
# """Calculate common walk graph kernels between graphs.
# """
# compute_method = compute_method.lower()
# # arrange all graphs in a list
# Gn = args[0] if len(args) == 1 else [args[0], args[1]]
# Kmatrix = np.zeros((len(Gn), len(Gn)))
# ds_attrs = get_dataset_attributes(
# Gn,
# attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
# node_label=node_label,
# edge_label=edge_label)
# if not ds_attrs['node_labeled']:
# for G in Gn:
# nx.set_node_attributes(G, '0', 'atom')
# if not ds_attrs['edge_labeled']:
# for G in Gn:
# nx.set_edge_attributes(G, '0', 'bond_type')
# if not ds_attrs['is_directed']: # convert
# Gn = [G.to_directed() for G in Gn]
#
# start_time = time.time()
#
# # ---- use pool.imap_unordered to parallel and track progress. ----
# pool = Pool(n_jobs)
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
## len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
## if len_itr < 100:
## chunksize, extra = divmod(len_itr, n_jobs * 4)
## if extra:
## chunksize += 1
## else:
## chunksize = 100
#
# # direct product graph method - exponential
# if compute_method == 'exp':
# do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label,
# weight)
# # direct product graph method - geometric
# elif compute_method == 'geo':
# do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label,
# weight)
#
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, chunksize),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
#
# run_time = time.time() - start_time
# print(
# "\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
# % (len(Gn), run_time))
#
# return Kmatrix, run_time
#
#
#def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
# """Calculate walk graph kernels up to n between 2 graphs using exponential
# series.
# """
# i = ij[0]
# j = ij[1]
# g1 = Gn[i]
# g2 = Gn[j]
#
# # get tensor product / direct product
# gp = direct_product(g1, g2, node_label, edge_label)
# A = nx.adjacency_matrix(gp).todense()
#
# ew, ev = np.linalg.eig(A)
# D = np.zeros((len(ew), len(ew)))
# for i in range(len(ew)):
# D[i][i] = np.exp(beta * ew[i])
# exp_D = ev * D * ev.T
#
# return i, j, exp_D.sum()
#
#
#def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
# """Calculate common walk graph kernels up to n between 2 graphs using
# geometric series.
# """
# i = ij[0]
# j = ij[1]
# g1 = Gn[i]
# g2 = Gn[j]
#
# # get tensor product / direct product
# gp = direct_product(g1, g2, node_label, edge_label)
# A = nx.adjacency_matrix(gp).todense()
# mat = np.identity(len(A)) - gamma * A
# try:
# return i, j, mat.I.sum()
# except np.linalg.LinAlgError:
# return i, j, np.nan


#def structuralspkernel(*args,
# node_label='atom',
# edge_weight=None,
# edge_label='bond_type',
# node_kernels=None,
# edge_kernels=None,
# n_jobs=None,
# chunksize=1):
# """Calculate mean average structural shortest path kernels between graphs.
# """
# # pre-process
# Gn = args[0] if len(args) == 1 else [args[0], args[1]]
#
# weight = None
# if edge_weight is None:
# print('\n None edge weight specified. Set all weight to 1.\n')
# else:
# try:
# some_weight = list(
# nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
# if isinstance(some_weight, (float, int)):
# weight = edge_weight
# else:
# print(
# '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
# % edge_weight)
# except:
# print(
# '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
# % edge_weight)
# ds_attrs = get_dataset_attributes(
# Gn,
# attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
# 'edge_attr_dim', 'is_directed'],
# node_label=node_label, edge_label=edge_label)
#
# start_time = time.time()
#
# # get shortest paths of each graph in Gn
# splist = [[] for _ in range(len(Gn))]
# pool = Pool(n_jobs)
# # get shortest path graphs of Gn
# getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
## if len(Gn) < 100:
## # use default chunksize as pool.map when iterable is less than 100
## chunksize, extra = divmod(len(Gn), n_jobs * 4)
## if extra:
## chunksize += 1
## else:
## chunksize = 100
# # chunksize = 300 # int(len(list(itr)) / n_jobs)
# for i, sp in tqdm(
# pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
# desc='getting shortest paths',
# file=sys.stdout):
# splist[i] = sp
#
# Kmatrix = np.zeros((len(Gn), len(Gn)))
#
# # ---- use pool.imap_unordered to parallel and track progress. ----
# do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
# node_label, edge_label, node_kernels, edge_kernels)
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
## len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
## if len_itr < 100:
## chunksize, extra = divmod(len_itr, n_jobs * 4)
## if extra:
## chunksize += 1
## else:
## chunksize = 100
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, chunksize),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
#
# run_time = time.time() - start_time
# print(
# "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
# % (len(Gn), run_time))
#
# return Kmatrix, run_time
#
#
#def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels, ij):
#
# iglobal = ij[0]
# jglobal = ij[1]
# g1 = Gn[iglobal]
# g2 = Gn[jglobal]
# spl1 = splist[iglobal]
# spl2 = splist[jglobal]
# kernel = 0
#
# try:
# # First, compute shortest path matrices, method borrowed from FCSP.
# if ds_attrs['node_labeled']:
# # node symb and non-symb labeled
# if ds_attrs['node_attr_dim'] > 0:
# kn = node_kernels['mix']
# vk_dict = {} # shortest path matrices dict
# for n1, n2 in product(
# g1.nodes(data=True), g2.nodes(data=True)):
# vk_dict[(n1[0], n2[0])] = kn(
# n1[1][node_label], n2[1][node_label],
# [n1[1]['attributes']], [n2[1]['attributes']])
# # node symb labeled
# else:
# kn = node_kernels['symb']
# vk_dict = {} # shortest path matrices dict
# for n1 in g1.nodes(data=True):
# for n2 in g2.nodes(data=True):
# vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
# n2[1][node_label])
# else:
# # node non-symb labeled
# if ds_attrs['node_attr_dim'] > 0:
# kn = node_kernels['nsymb']
# vk_dict = {} # shortest path matrices dict
# for n1 in g1.nodes(data=True):
# for n2 in g2.nodes(data=True):
# vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
# [n2[1]['attributes']])
# # node unlabeled
# else:
# vk_dict = {}
#
# # Then, compute kernels between all pairs of edges, an idea that extends
# # FCSP. It suits sparse graphs, which is the most common case we encounter.
# # For dense graphs it would be slow.
# if ds_attrs['edge_labeled']:
# # edge symb and non-symb labeled
# if ds_attrs['edge_attr_dim'] > 0:
# ke = edge_kernels['mix']
# ek_dict = {} # dict of edge kernels
# for e1, e2 in product(
# g1.edges(data=True), g2.edges(data=True)):
# ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
# e1[2][edge_label], e2[2][edge_label],
# [e1[2]['attributes']], [e2[2]['attributes']])
# # edge symb labeled
# else:
# ke = edge_kernels['symb']
# ek_dict = {}
# for e1 in g1.edges(data=True):
# for e2 in g2.edges(data=True):
# ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
# e1[2][edge_label], e2[2][edge_label])
# else:
# # edge non-symb labeled
# if ds_attrs['edge_attr_dim'] > 0:
# ke = edge_kernels['nsymb']
# ek_dict = {}
# for e1 in g1.edges(data=True):
# for e2 in g2.edges(data=True):
# ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
# [e1[2]['attributes']], [e2[2]['attributes']])
# # edge unlabeled
# else:
# ek_dict = {}
#
# # compute graph kernels
# if vk_dict:
# if ek_dict:
# for p1, p2 in product(spl1, spl2):
# if len(p1) == len(p2):
# kpath = vk_dict[(p1[0], p2[0])]
# if kpath:
# for idx in range(1, len(p1)):
# kpath *= vk_dict[(p1[idx], p2[idx])] * \
# ek_dict[((p1[idx-1], p1[idx]),
# (p2[idx-1], p2[idx]))]
# if not kpath:
# break
# kernel += kpath # add up kernels of all paths
# else:
# for p1, p2 in product(spl1, spl2):
# if len(p1) == len(p2):
# kpath = vk_dict[(p1[0], p2[0])]
# if kpath:
# for idx in range(1, len(p1)):
# kpath *= vk_dict[(p1[idx], p2[idx])]
# if not kpath:
# break
# kernel += kpath # add up kernels of all paths
# else:
# if ek_dict:
# for p1, p2 in product(spl1, spl2):
# if len(p1) == len(p2):
# if len(p1) == 0:
# kernel += 1
# else:
# kpath = 1
# for idx in range(0, len(p1) - 1):
# kpath *= ek_dict[((p1[idx], p1[idx+1]),
# (p2[idx], p2[idx+1]))]
# if not kpath:
# break
# kernel += kpath # add up kernels of all paths
# else:
# for p1, p2 in product(spl1, spl2):
# if len(p1) == len(p2):
# kernel += 1
#
# kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average
# except KeyError: # missing labels or attributes
# pass
#
# return iglobal, jglobal, kernel
#
#
#def get_shortest_paths(G, weight, directed):
# """Get all shortest paths of a graph.
# """
# sp = []
# for n1, n2 in combinations(G.nodes(), 2):
# try:
# sptemp = nx.shortest_path(G, n1, n2, weight=weight)
# sp.append(sptemp)
# # each edge walk is counted twice, starting from both its extreme nodes.
# if not directed:
# sp.append(sptemp[::-1])
# except nx.NetworkXNoPath: # nodes not connected
# # sp.append([])
# pass
# # add single nodes as length 0 paths.
# sp += [[n] for n in G.nodes()]
# return sp
#
#
#def wrap_getSP(Gn, weight, directed, i):
# return i, get_shortest_paths(Gn[i], weight, directed)


def compute_gram_matrices(datafile,
estimator,
param_grid_precomputed,
datafile_y=None,
extra_params=None,
ds_name='ds-unknown',
n_jobs=1,
chunksize=1):
"""

Parameters
----------
datafile : string
Path of dataset file.
estimator : function
kernel function used to estimate. This function needs to return a gram matrix.
param_grid_precomputed : dictionary
Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
datafile_y : string
Path of file storing y data. This parameter is optional depending on the given dataset file.
"""
tqdm.monitor_interval = 0

# Load the dataset
dataset, y_all = loadDataset(
datafile, filename_y=datafile_y, extra_params=extra_params)

# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))

gram_matrix_time = [
] # a list to store time to calculate gram matrices

# calculate all gram matrices
for idx, params_out in enumerate(param_list_precomputed):
y = y_all[:]
params_out['n_jobs'] = n_jobs
params_out['chunksize'] = chunksize
rtn_data = estimator(dataset[:], **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
# for some kernels, some graphs in datasets may not meet the
# kernels' requirements for graph structure. These graphs are trimmed.
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim] # trim y accordingly

Kmatrix_diag = Kmatrix.diagonal().copy()
# remove graphs whose kernels with themselves are zeros
nb_g_ignore = 0
for idx, diag in enumerate(Kmatrix_diag):
if diag == 0:
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
nb_g_ignore += 1
# normalization
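# (cosine-style normalization: Kmatrix[i][j] /= sqrt(Kmatrix[i][i] * Kmatrix[j][j]),
# using the diagonal captured before any zero-diagonal rows/columns were deleted,
# so it assumes the remaining indices still line up with Kmatrix_diag)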
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]

gram_matrix_time.append(current_run_time)

average_gram_matrix_time = np.mean(gram_matrix_time)

return average_gram_matrix_time


dslist = [
{'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds',
'task': 'regression'}, # node symb
{'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', 'task': 'regression',
'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt', }, # contains single node graph, node symb
{'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds', }, # node/edge symb
{'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds', }, # unlabeled
{'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
{'name': 'Letter-med', 'dataset': '../../datasets/Letter-med/Letter-med_A.txt'},
# node symb/nsymb
{'name': 'ENZYMES', 'dataset': '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node/edge symb
# {'name': 'Mutagenicity', 'dataset': '../../datasets/Mutagenicity/Mutagenicity_A.txt'},
# {'name': 'D&D', 'dataset': '../../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
]

fig, ax = plt.subplots()
ax.set_xscale('log', nonposx='clip')
ax.set_yscale('log', nonposy='clip')
ax.set_xlabel('parallel chunksize')
ax.set_ylabel('runtime($s$)')
ax.set_title('28 cpus')
ax.grid(axis='both')

estimator = spkernel
if estimator.__name__ == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels': [
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}

elif estimator.__name__ == 'commonwalkkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'compute_method': ['geo'],
'weight': [1]}
elif estimator.__name__ == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}

#list(range(10, 100, 20)) +
#chunklist = list(range(10, 100, 20)) + list(range(100, 1000, 200)) + \
# list(range(1000, 10000, 2000)) + list(range(10000, 100000, 20000))
# chunklist = list(range(300, 1000, 200)) + list(range(1000, 10000, 2000)) + list(range(10000, 100000, 20000))
chunklist = list(range(10, 100, 10)) + list(range(100, 1000, 100)) + \
list(range(1000, 10000, 1000)) + list(range(10000, 100000, 10000))
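# i.e. 36 candidate chunk sizes: 10..90 (step 10), 100..900 (step 100),
# 1000..9000 (step 1000) and 10000..90000 (step 10000)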
#chunklist = list(range(1000, 10000, 1000))
gmtmat = np.zeros((len(dslist), len(chunklist)))
cpus = 28

for idx1, ds in enumerate(dslist):
print()
print(ds['name'])

for idx2, cs in enumerate(chunklist):
print(ds['name'], idx2, cs)
gmtmat[idx1][idx2] = compute_gram_matrices(
ds['dataset'],
estimator,
param_grid_precomputed,

datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params']
if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=cpus,
chunksize=cs)

print()
print(gmtmat[idx1, :])
np.save('../test_parallel/' + estimator.__name__ + '.' + ds['name'] + '_' +
str(idx1), gmtmat[idx1, :])

p = ax.plot(chunklist, gmtmat[idx1, :], '.-', label=ds['name'], zorder=3)
ax.legend(loc='upper right', ncol=3, labelspacing=0.1, handletextpad=0.4,
columnspacing=0.6)
plt.savefig('../test_parallel/' + estimator.__name__ + str(idx1) + '_' +
str(cpus) + '.eps', format='eps', dpi=300)
# plt.show()

+ 690   - 0    notebooks/tests/test_parallel_chunksize_2.py

@@ -0,0 +1,690 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test of parallelization: find the best parallel chunksize and iteration separation scheme.
Created on Wed Sep 26 12:09:34 2018

@author: ljia
"""

import sys
import time
from itertools import combinations_with_replacement, product, combinations
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
import networkx as nx
import numpy as np
import functools
#import multiprocessing
from matplotlib import pyplot as plt
from sklearn.model_selection import ParameterGrid

sys.path.insert(0, "../")
sys.path.insert(0, "../../")
from libs import *
from gklearn.utils.utils import getSPGraph, direct_product
from gklearn.utils.graphdataset import get_dataset_attributes
from gklearn.utils.graphfiles import loadDataset
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct


def spkernel(*args,
node_label='atom',
edge_weight=None,
node_kernels=None,
n_jobs=None,
chunksize=1):
"""Calculate shortest-path kernels between graphs.
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
Gn = [g.copy() for g in Gn]
weight = None
if edge_weight is None:
print('\n No edge weight specified. Setting all weights to 1.\n')
else:
try:
some_weight = list(
nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
if isinstance(some_weight, (float, int)):
weight = edge_weight
else:
print(
'\n Edge weight with name %s is not float or integer. Setting all weights to 1.\n'
% edge_weight)
except:
print(
'\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
% edge_weight)
ds_attrs = get_dataset_attributes(
Gn,
attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
node_label=node_label)

# remove graphs with no edges, as no sp can be found in their structures,
# so the kernel between such a graph and itself will be zero.
len_gn = len(Gn)
Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
idx = [G[0] for G in Gn]
Gn = [G[1] for G in Gn]
if len(Gn) != len_gn:
print('\n %d graphs are removed as they don\'t contain edges.\n' %
(len_gn - len(Gn)))

start_time = time.time()

pool = Pool(n_jobs)
# get shortest path graphs of Gn
getsp_partial = partial(wrapper_getSPGraph, weight)
itr = zip(Gn, range(0, len(Gn)))
for i, g in tqdm(
pool.imap_unordered(getsp_partial, itr, chunksize),
desc='getting sp graphs', file=sys.stdout):
Gn[i] = g
pool.close()
pool.join()

Kmatrix = np.zeros((len(Gn), len(Gn)))

# ---- use pool.imap_unordered to parallel and track progress. ----
def init_worker(gn_toshare):
global G_gn
G_gn = gn_toshare
do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
itr = combinations_with_replacement(range(0, len(Gn)), 2)
with Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) as pool:
for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='calculating kernels', file=sys.stdout):
Kmatrix[i][j] = kernel
Kmatrix[j][i] = kernel
# # ---- direct running, normally use single CPU core. ----
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
# kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel

run_time = time.time() - start_time
print(
"\n --- shortest path kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))

return Kmatrix, run_time, idx


def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
kernel = 0

# compute shortest path matrices first, method borrowed from FCSP.
vk_dict = {} # node kernel values keyed by node pairs
if ds_attrs['node_labeled']:
# node symb and non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['mix']
for n1, n2 in product(
g1.nodes(data=True), g2.nodes(data=True)):
vk_dict[(n1[0], n2[0])] = kn(
n1[1][node_label], n2[1][node_label],
n1[1]['attributes'], n2[1]['attributes'])
# node symb labeled
else:
kn = node_kernels['symb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
n2[1][node_label])
else:
# node non-symb labeled
if ds_attrs['node_attr_dim'] > 0:
kn = node_kernels['nsymb']
for n1 in g1.nodes(data=True):
for n2 in g2.nodes(data=True):
vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
n2[1]['attributes'])
# node unlabeled
else:
for e1, e2 in product(
g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
kernel += 1
return kernel

# compute graph kernels
if ds_attrs['is_directed']:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kernel += kn1
else:
for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
if e1[2]['cost'] == e2[2]['cost']:
# each edge walk is counted twice, starting from both its extreme nodes.
nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
e1[0], e2[1])], vk_dict[(e1[1],
e2[0])], vk_dict[(e1[1],
e2[1])]
kn1 = nk11 * nk22
kn2 = nk12 * nk21
kernel += kn1 + kn2

return kernel


def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
i = itr[0]
j = itr[1]
return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels)


def wrapper_getSPGraph(weight, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, getSPGraph(g, edge_weight=weight)



#
#
#def commonwalkkernel(*args,
# node_label='atom',
# edge_label='bond_type',
# n=None,
# weight=1,
# compute_method=None,
# n_jobs=None,
# chunksize=1):
# """Calculate common walk graph kernels between graphs.
# """
# compute_method = compute_method.lower()
# # arrange all graphs in a list
# Gn = args[0] if len(args) == 1 else [args[0], args[1]]
# Kmatrix = np.zeros((len(Gn), len(Gn)))
# ds_attrs = get_dataset_attributes(
# Gn,
# attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
# node_label=node_label,
# edge_label=edge_label)
# if not ds_attrs['node_labeled']:
# for G in Gn:
# nx.set_node_attributes(G, '0', 'atom')
# if not ds_attrs['edge_labeled']:
# for G in Gn:
# nx.set_edge_attributes(G, '0', 'bond_type')
# if not ds_attrs['is_directed']: # convert
# Gn = [G.to_directed() for G in Gn]
#
# start_time = time.time()
#
# # ---- use pool.imap_unordered to parallel and track progress. ----
# pool = Pool(n_jobs)
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
## len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
## if len_itr < 100:
## chunksize, extra = divmod(len_itr, n_jobs * 4)
## if extra:
## chunksize += 1
## else:
## chunksize = 100
#
# # direct product graph method - exponential
# if compute_method == 'exp':
# do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label,
# weight)
# # direct product graph method - geometric
# elif compute_method == 'geo':
# do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label,
# weight)
#
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, chunksize),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
#
# run_time = time.time() - start_time
# print(
# "\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
# % (len(Gn), run_time))
#
# return Kmatrix, run_time
#
#
#def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
# """Calculate walk graph kernels up to n between 2 graphs using exponential
# series.
# """
# i = ij[0]
# j = ij[1]
# g1 = Gn[i]
# g2 = Gn[j]
#
# # get tensor product / direct product
# gp = direct_product(g1, g2, node_label, edge_label)
# A = nx.adjacency_matrix(gp).todense()
#
# ew, ev = np.linalg.eig(A)
# D = np.zeros((len(ew), len(ew)))
# for i in range(len(ew)):
# D[i][i] = np.exp(beta * ew[i])
# exp_D = ev * D * ev.T
#
# return i, j, exp_D.sum()
#
#
#def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
# """Calculate common walk graph kernels up to n between 2 graphs using
# geometric series.
# """
# i = ij[0]
# j = ij[1]
# g1 = Gn[i]
# g2 = Gn[j]
#
# # get tensor product / direct product
# gp = direct_product(g1, g2, node_label, edge_label)
# A = nx.adjacency_matrix(gp).todense()
# mat = np.identity(len(A)) - gamma * A
# try:
# return i, j, mat.I.sum()
# except np.linalg.LinAlgError:
# return i, j, np.nan

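# A standalone sketch of the chunksize heuristic that appears (commented out) in the
# parallel code above: for small iterables use roughly ceil(len_itr / (n_jobs * 4)),
# otherwise fall back to a fixed chunk of 100. This is distilled from the commented
# lines, not an existing library function.
def _suggest_chunksize(len_itr, n_jobs):
    """Heuristic chunksize for pool.imap_unordered over len_itr items."""
    if len_itr < 100:
        chunksize, extra = divmod(len_itr, n_jobs * 4)
        if extra:
            chunksize += 1
    else:
        chunksize = 100
    return chunksize

# e.g. _suggest_chunksize(55, n_jobs=4) == 4; _suggest_chunksize(10000, n_jobs=4) == 100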

#def structuralspkernel(*args,
# node_label='atom',
# edge_weight=None,
# edge_label='bond_type',
# node_kernels=None,
# edge_kernels=None,
# n_jobs=None,
# chunksize=1):
# """Calculate mean average structural shortest path kernels between graphs.
# """
# # pre-process
# Gn = args[0] if len(args) == 1 else [args[0], args[1]]
#
# weight = None
# if edge_weight is None:
#        print('\n No edge weight specified. Set all weights to 1.\n')
# else:
# try:
# some_weight = list(
# nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
# if isinstance(some_weight, (float, int)):
# weight = edge_weight
# else:
#                print(
#                    '\n Edge weight with name "%s" is not a float or an integer. Set all weights to 1.\n'
#                    % edge_weight)
#        except Exception:
#            print(
#                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
#                % edge_weight)
# ds_attrs = get_dataset_attributes(
# Gn,
# attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
# 'edge_attr_dim', 'is_directed'],
# node_label=node_label, edge_label=edge_label)
#
# start_time = time.time()
#
# # get shortest paths of each graph in Gn
# splist = [[] for _ in range(len(Gn))]
# pool = Pool(n_jobs)
# # get shortest path graphs of Gn
# getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
## if len(Gn) < 100:
## # use default chunksize as pool.map when iterable is less than 100
## chunksize, extra = divmod(len(Gn), n_jobs * 4)
## if extra:
## chunksize += 1
## else:
## chunksize = 100
# # chunksize = 300 # int(len(list(itr)) / n_jobs)
# for i, sp in tqdm(
# pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
# desc='getting shortest paths',
# file=sys.stdout):
# splist[i] = sp
#
# Kmatrix = np.zeros((len(Gn), len(Gn)))
#
# # ---- use pool.imap_unordered to parallel and track progress. ----
# do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
# node_label, edge_label, node_kernels, edge_kernels)
# itr = combinations_with_replacement(range(0, len(Gn)), 2)
## len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
## if len_itr < 100:
## chunksize, extra = divmod(len_itr, n_jobs * 4)
## if extra:
## chunksize += 1
## else:
## chunksize = 100
# for i, j, kernel in tqdm(
# pool.imap_unordered(do_partial, itr, chunksize),
# desc='calculating kernels',
# file=sys.stdout):
# Kmatrix[i][j] = kernel
# Kmatrix[j][i] = kernel
# pool.close()
# pool.join()
#
# run_time = time.time() - start_time
# print(
# "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
# % (len(Gn), run_time))
#
# return Kmatrix, run_time
#
#
#def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels, ij):
#
# iglobal = ij[0]
# jglobal = ij[1]
# g1 = Gn[iglobal]
# g2 = Gn[jglobal]
# spl1 = splist[iglobal]
# spl2 = splist[jglobal]
# kernel = 0
#
# try:
# # First, compute shortest path matrices, method borrowed from FCSP.
# if ds_attrs['node_labeled']:
#            # node symb and non-symb labeled
# if ds_attrs['node_attr_dim'] > 0:
# kn = node_kernels['mix']
#                vk_dict = {}  # dict of node kernel values
# for n1, n2 in product(
# g1.nodes(data=True), g2.nodes(data=True)):
# vk_dict[(n1[0], n2[0])] = kn(
# n1[1][node_label], n2[1][node_label],
# [n1[1]['attributes']], [n2[1]['attributes']])
# # node symb labeled
# else:
# kn = node_kernels['symb']
#                vk_dict = {}  # dict of node kernel values
# for n1 in g1.nodes(data=True):
# for n2 in g2.nodes(data=True):
# vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
# n2[1][node_label])
# else:
#            # node non-symb labeled
# if ds_attrs['node_attr_dim'] > 0:
# kn = node_kernels['nsymb']
#                vk_dict = {}  # dict of node kernel values
# for n1 in g1.nodes(data=True):
# for n2 in g2.nodes(data=True):
# vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
# [n2[1]['attributes']])
# # node unlabeled
# else:
# vk_dict = {}
#
#        # Then, compute kernels between all pairs of edges; the idea is an
#        # extension of FCSP. It suits sparse graphs, which is the most common
#        # case we deal with. For dense graphs it would be slow.
# if ds_attrs['edge_labeled']:
#            # edge symb and non-symb labeled
# if ds_attrs['edge_attr_dim'] > 0:
# ke = edge_kernels['mix']
# ek_dict = {} # dict of edge kernels
# for e1, e2 in product(
# g1.edges(data=True), g2.edges(data=True)):
# ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
# e1[2][edge_label], e2[2][edge_label],
# [e1[2]['attributes']], [e2[2]['attributes']])
# # edge symb labeled
# else:
# ke = edge_kernels['symb']
# ek_dict = {}
# for e1 in g1.edges(data=True):
# for e2 in g2.edges(data=True):
# ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
# e1[2][edge_label], e2[2][edge_label])
# else:
#            # edge non-symb labeled
# if ds_attrs['edge_attr_dim'] > 0:
# ke = edge_kernels['nsymb']
# ek_dict = {}
# for e1 in g1.edges(data=True):
# for e2 in g2.edges(data=True):
#                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
#                            [e1[2]['attributes']], [e2[2]['attributes']])
# # edge unlabeled
# else:
# ek_dict = {}
#
# # compute graph kernels
# if vk_dict:
# if ek_dict:
# for p1, p2 in product(spl1, spl2):
# if len(p1) == len(p2):
# kpath = vk_dict[(p1[0], p2[0])]
# if kpath:
# for idx in range(1, len(p1)):
# kpath *= vk_dict[(p1[idx], p2[idx])] * \
# ek_dict[((p1[idx-1], p1[idx]),
# (p2[idx-1], p2[idx]))]
# if not kpath:
# break
# kernel += kpath # add up kernels of all paths
# else:
# for p1, p2 in product(spl1, spl2):
# if len(p1) == len(p2):
# kpath = vk_dict[(p1[0], p2[0])]
# if kpath:
# for idx in range(1, len(p1)):
# kpath *= vk_dict[(p1[idx], p2[idx])]
# if not kpath:
# break
# kernel += kpath # add up kernels of all paths
# else:
# if ek_dict:
# for p1, p2 in product(spl1, spl2):
# if len(p1) == len(p2):
# if len(p1) == 0:
# kernel += 1
# else:
# kpath = 1
# for idx in range(0, len(p1) - 1):
# kpath *= ek_dict[((p1[idx], p1[idx+1]),
# (p2[idx], p2[idx+1]))]
# if not kpath:
# break
# kernel += kpath # add up kernels of all paths
# else:
# for p1, p2 in product(spl1, spl2):
# if len(p1) == len(p2):
# kernel += 1
#
# kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average
# except KeyError: # missing labels or attributes
# pass
#
# return iglobal, jglobal, kernel
#
#
#def get_shortest_paths(G, weight, directed):
# """Get all shortest paths of a graph.
# """
# sp = []
# for n1, n2 in combinations(G.nodes(), 2):
# try:
# sptemp = nx.shortest_path(G, n1, n2, weight=weight)
# sp.append(sptemp)
# # each edge walk is counted twice, starting from both its extreme nodes.
# if not directed:
# sp.append(sptemp[::-1])
# except nx.NetworkXNoPath: # nodes not connected
# # sp.append([])
# pass
# # add single nodes as length 0 paths.
# sp += [[n] for n in G.nodes()]
# return sp
#
#
#def wrap_getSP(Gn, weight, directed, i):
# return i, get_shortest_paths(Gn[i], weight, directed)


def compute_gram_matrices(datafile,
estimator,
param_grid_precomputed,
datafile_y=None,
extra_params=None,
ds_name='ds-unknown',
n_jobs=1,
chunksize=1):
"""

Parameters
----------
datafile : string
Path of dataset file.
estimator : function
kernel function used to estimate. This function needs to return a gram matrix.
param_grid_precomputed : dictionary
Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
datafile_y : string
Path of file storing y data. This parameter is optional depending on the given dataset file.
"""
tqdm.monitor_interval = 0

# Load the dataset
dataset, y_all = loadDataset(
datafile, filename_y=datafile_y, extra_params=extra_params)

# Grid of parameters with a discrete number of values for each.
param_list_precomputed = list(ParameterGrid(param_grid_precomputed))

    gram_matrix_time = []  # a list to store times to calculate gram matrices

# calculate all gram matrices
for idx, params_out in enumerate(param_list_precomputed):
y = y_all[:]
params_out['n_jobs'] = n_jobs
params_out['chunksize'] = chunksize
rtn_data = estimator(dataset[:], **params_out)
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
# for some kernels, some graphs in datasets may not meet the
# kernels' requirements for graph structure. These graphs are trimmed.
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idx] for idx in idx_trim] # trim y accordingly

Kmatrix_diag = Kmatrix.diagonal().copy()
# remove graphs whose kernels with themselves are zeros
nb_g_ignore = 0
for idx, diag in enumerate(Kmatrix_diag):
if diag == 0:
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
nb_g_ignore += 1
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]

gram_matrix_time.append(current_run_time)

average_gram_matrix_time = np.mean(gram_matrix_time)

return average_gram_matrix_time
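
# A minimal usage sketch for compute_gram_matrices (hypothetical call with a single
# parameter setting; the dataset path is taken from dslist below). Kept commented so
# it does not interfere with the experiment that follows:
# mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
# avg_time = compute_gram_matrices(
#     '../../datasets/MAO/dataset.ds', spkernel,
#     {'node_kernels': [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]},
#     ds_name='MAO', n_jobs=4, chunksize=100)
# print('average gram matrix time: %.2f s' % avg_time)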


dslist = [
{'name': 'Alkane', 'dataset': '../../datasets/Alkane/dataset.ds', 'task': 'regression',
'dataset_y': '../../datasets/Alkane/dataset_boiling_point_names.txt'},
# contains single node graph, node symb
{'name': 'Acyclic', 'dataset': '../../datasets/acyclic/dataset_bps.ds',
'task': 'regression'}, # node symb
{'name': 'MAO', 'dataset': '../../datasets/MAO/dataset.ds'}, # node/edge symb
{'name': 'PAH', 'dataset': '../../datasets/PAH/dataset.ds'}, # unlabeled
{'name': 'MUTAG', 'dataset': '../../datasets/MUTAG/MUTAG_A.txt'}, # node/edge symb
{'name': 'Letter-med', 'dataset': '../../datasets/Letter-med/Letter-med_A.txt'},
# node nsymb
{'name': 'ENZYMES', 'dataset': '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
{'name': 'AIDS', 'dataset': '../../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'Mutagenicity', 'dataset': '../../datasets/Mutagenicity/Mutagenicity_A.txt'},
# {'name': 'D&D', 'dataset': '../../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb
]

fig, ax = plt.subplots()
ax.set_xscale('log', nonposx='clip')
ax.set_yscale('log', nonposy='clip')
ax.set_xlabel('parallel chunksize')
ax.set_ylabel('runtime($s$)')
ax.set_title('28 cpus')
ax.grid(axis='both')

estimator = spkernel
if estimator.__name__ == 'spkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels': [
{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}

elif estimator.__name__ == 'commonwalkkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'compute_method': ['geo'],
'weight': [1]}
elif estimator.__name__ == 'structuralspkernel':
mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
param_grid_precomputed = {'node_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
'edge_kernels':
[{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}

#list(range(10, 100, 20)) +
#chunklist = list(range(10, 100, 20)) + list(range(100, 1000, 200)) + \
# list(range(1000, 10000, 2000)) + list(range(10000, 100000, 20000))
# chunklist = list(range(300, 1000, 200)) + list(range(1000, 10000, 2000)) + list(range(10000, 100000, 20000))
chunklist = list(range(10, 100, 10)) + list(range(100, 1000, 100)) + \
list(range(1000, 10000, 1000)) + list(range(10000, 100000, 10000))
#chunklist = list(range(1000, 10000, 1000))
gmtmat = np.zeros((len(dslist), len(chunklist)))
cpus = 28

for idx1, ds in enumerate(dslist):
print()
print(ds['name'])

for idx2, cs in enumerate(chunklist):
print(ds['name'], idx2, cs)
        gmtmat[idx1][idx2] = compute_gram_matrices(
            ds['dataset'],
            estimator,
            param_grid_precomputed,
            datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
            extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
            ds_name=ds['name'],
            n_jobs=cpus,
            chunksize=cs)

print()
print(gmtmat[idx1, :])
np.save('../test_parallel/' + estimator.__name__ + '.' + ds['name'] + '_' +
str(idx1), gmtmat[idx1, :])

p = ax.plot(chunklist, gmtmat[idx1, :], '.-', label=ds['name'], zorder=3)
ax.legend(loc='upper right', ncol=3, labelspacing=0.1, handletextpad=0.4,
columnspacing=0.6)
plt.savefig('../test_parallel/' + estimator.__name__ + str(idx1) + '_' +
str(cpus) + '.eps', format='eps', dpi=300)
# plt.show()
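
# A small reload sketch using the same file-name convention as the np.save call above
# (hypothetical dataset index); useful for re-plotting a saved runtime curve later:
# import numpy as np
# runtimes = np.load('../test_parallel/' + estimator.__name__ + '.MAO_2.npy')
# print(runtimes)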

+389 -0   notebooks/tests/test_scikit_ksvm.ipynb   (file diff suppressed because it is too large)


+22 -0   notebooks/tests/test_sp_methods.py

@@ -0,0 +1,22 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test networkx shortest paths methods.
Created on Tue Oct 9 14:49:09 2018

@author: ljia
"""

import networkx as nx

g = nx.Graph()
g.add_edge(1, 2)
g.add_edge(3, 2)
g.add_edge(1, 4)
g.add_edge(3, 4)
p1 = nx.shortest_path(g, 1, 3)  # a single shortest path from node 1 to node 3
p1 = [p1]
p2 = list(nx.all_shortest_paths(g, 1, 3))  # all shortest paths between the two nodes
p1 += p2
pr = [sp[::-1] for sp in p1]  # the same paths traversed in the reverse direction
nx.draw(g)  # requires matplotlib; call matplotlib.pyplot.show() to display outside notebooks

+779 -0   notebooks/tests/test_spkernel.ipynb   (file diff suppressed because it is too large)

+1329 -0   notebooks/unfinished/run_cyclicpatternkernel.ipynb   (file diff suppressed because it is too large)

+786 -0   notebooks/unfinished/run_treeletkernel_acyclic.ipynb   (file diff suppressed because it is too large)

+7966 -0   notebooks/unfinished/run_treepatternkernel.ipynb   (file diff suppressed because it is too large)

+3812 -0   notebooks/unfinished/run_weisfeilerLehmankernel.ipynb   (file diff suppressed because it is too large)


+47 -0   notebooks/unfinished/test_mpi.py

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test Message Passing Interface for cluster paralleling.
Created on Wed Nov 7 17:26:40 2018

@author: ljia
"""

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

import numpy as np
import time
size = comm.Get_size()
numDataPerRank = 10
data = None
if rank == 0:
data = np.linspace(1, size * numDataPerRank, size * numDataPerRank)
recvbuf = np.empty(numDataPerRank, dtype='d')
comm.Scatter(data, recvbuf, root=0)
recvbuf += 1
print('Rank: ', rank, ', recvbuf received: ', recvbuf, ', size: ', size, ', time: ', time.time())

#if rank == 0:
# data = {'key1' : [1,2, 3],
# 'key2' : ( 'abc', 'xyz')}
#else:
# data = None
#
#data = comm.bcast(data, root=0)
#print('Rank: ',rank,', data: ' ,data)

#if rank == 0:
# data = {'a': 7, 'b': 3.14}
# comm.send(data, dest=1)
#elif rank == 1:
# data = comm.recv(source=0)
# print('On process 1, data is ', data)

#print('My rank is ', rank)

#for i in range(0, 100000000):
# print(i)
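
#---- a small follow-up sketch (not in the file above, assumption only): gather the
#---- incremented chunks back to rank 0 with the mirror call to Scatter.
#recvbuf_all = None
#if rank == 0:
#    recvbuf_all = np.empty(size * numDataPerRank, dtype='d')
#comm.Gather(recvbuf, recvbuf_all, root=0)
#if rank == 0:
#    print('Rank 0 gathered: ', recvbuf_all)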
