From ee6c79603d7381312bb141e07b1d077115802377 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Tue, 22 Oct 2019 17:05:29 +0200
Subject: [PATCH] 1. update fitDistance.py.
 1.1 add parallel computation of GEDs.
 1.2 initialize edit costs randomly instead of uniformly.
 1.3 remove the constraint that the sum of the edit costs has to equal 1,
 avoiding sparsity.

---
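Notes (free text here, between the --- marker and the diffstat, is ignored by
git am):

Change 1.1 replaces the serial double loop over graph pairs with a
multiprocessing.Pool whose workers receive the graph list once, through an
initializer, instead of having it pickled into every task. The sketch below
is a reviewer illustration of that pattern, not code from the patch:
pairwise_cost and the integer "graphs" are hypothetical stand-ins for the
real GED call and for Gn.

    import multiprocessing
    from multiprocessing import Pool
    from itertools import combinations_with_replacement
    from functools import partial

    def init_worker(gn_toshare):
        # runs once per worker: publish the shared list as a global instead
        # of shipping it with every task
        global G_gn
        G_gn = gn_toshare

    def pairwise_cost(g1, g2):
        # hypothetical stand-in for GED(g1, g2, ...)
        return abs(g1 - g2)

    def _wrapper(scale, pair):
        i, j = pair
        return i, j, scale * pairwise_cost(G_gn[i], G_gn[j])

    if __name__ == '__main__':
        Gn = list(range(6))             # stand-in for a list of graphs
        n = len(Gn)
        len_itr = n * (n + 1) // 2      # number of pairs, diagonal included
        ged_all = [0] * len_itr
        itr = combinations_with_replacement(range(n), 2)
        n_jobs = multiprocessing.cpu_count()
        chunksize = int(len_itr / n_jobs) + 1 if len_itr < 100 * n_jobs else 100
        do_partial = partial(_wrapper, 1.0)  # bind constants, as the patch
                                             # does with edit_cost_constant
        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
        for i, j, dis in pool.imap_unordered(do_partial, itr, chunksize):
            # map the pair (i, j), j >= i, to its flat upper-triangle index
            idx_itr = int(n * i + j - i * (i + 1) / 2)
            ged_all[idx_itr] = dis
        pool.close()
        pool.join()
        print(ged_all)

The explicit flat index matters because imap_unordered yields results in
completion order, not submission order; writing into ged_all[idx_itr] keeps
the output aligned with the pair enumeration regardless.
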
 preimage/fitDistance.py   | 135 +++++++++++++++++++++++++++++++++++++---------
 pygraph/utils/parallel.py |  11 ++--
 2 files changed, 114 insertions(+), 32 deletions(-)

diff --git a/preimage/fitDistance.py b/preimage/fitDistance.py
index d0d2357..42b9889 100644
--- a/preimage/fitDistance.py
+++ b/preimage/fitDistance.py
@@ -7,6 +7,12 @@ Created on Wed Oct 16 14:20:06 2019
 """
 import numpy as np
 from tqdm import tqdm
+from itertools import combinations_with_replacement
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
+import time
+import random
 from scipy import optimize
 import cvxpy as cp
 
@@ -19,7 +25,12 @@ from utils import kernel_distance_matrix
 
 def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
     # c_vi, c_vr, c_vs, c_ei, c_er, c_es or parts of them.
-    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
+    random.seed(1)
+    cost_rdm = random.sample(range(1, 10), 5)
+    edit_costs = cost_rdm + [0]
+#    edit_costs = [0.2, 0.2, 0.2, 0.2, 0.2, 0]
+#    edit_costs = [0, 0, 0.9544, 0.026, 0.0196, 0]
+#    edit_costs = [0.008429912251810438, 0.025461055985319694, 0.2047320869225948, 0.004148727085832133, 0.0, 0]
     idx_nonzeros = [i for i, item in enumerate(edit_costs) if item != 0]
     
     # compute distances in feature space.
@@ -34,24 +45,13 @@ def fit_GED_to_kernel_distance(Gn, gkernel, itr_max):
     edit_cost_list = []
     for itr in range(itr_max):
-        print('iteration', itr)
-        ged_all = []
-        n_edit_operations = [[] for i in range(len(idx_nonzeros))]
+        print('\niteration', itr)
 
         # compute GEDs and numbers of edit operations.
         edit_cost_constant = [i for i in edit_costs]
         edit_cost_list.append(edit_cost_constant)
-        for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
-#        for i in range(len(Gn)):
-            for j in range(i, len(Gn)):
-                dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
-                            cost='CONSTANT', method='IPFP',
-                            edit_cost_constant=edit_cost_constant, stabilizer='min',
-                            repeat=30)
-                ged_all.append(dis)
-                n_eo_tmp = get_nb_edit_operations(Gn[i],
-                            Gn[j], pi_forward, pi_backward)
-                for idx, item in enumerate(idx_nonzeros):
-                    n_edit_operations[idx].append(n_eo_tmp[item])
+
+        ged_all, ged_mat, n_edit_operations = compute_geds(Gn, edit_cost_constant,
+                                                           idx_nonzeros, parallel=True)
 
         residual = np.sqrt(np.sum(np.square(np.array(ged_all) - dis_k_vec)))
         residual_list.append(residual)
@@ -59,23 +59,105 @@
         # "fit" geds to distances in feature space by tuning edit costs using the
         # Least Squares Method.
         nb_cost_mat = np.array(n_edit_operations).T
-        edit_costs_new, residual = get_better_costs(nb_cost_mat, dis_k_vec)
+        edit_costs_new, residual = compute_better_costs(nb_cost_mat, dis_k_vec)
 
-        print(residual)
+        print('pseudo residual:', residual)
         for i in range(len(edit_costs_new)):
             if edit_costs_new[i] < 0:
-                if edit_costs_new[i] > -1e-6:
+                if edit_costs_new[i] > -1e-9:
                     edit_costs_new[i] = 0
                 else:
                     raise ValueError('The edit cost is negative.')
         
         for idx, item in enumerate(idx_nonzeros):
             edit_costs[item] = edit_costs_new[idx]
+        
+        print('edit_costs:', edit_costs)
+        print('residual_list:', residual_list)
     
-    return edit_costs, residual_list, edit_cost_list
+    return edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat
+
+
+def compute_geds(Gn, edit_cost_constant, idx_nonzeros, parallel=False):
+    ged_mat = np.zeros((len(Gn), len(Gn)))
+    if parallel:
+#        print('parallel')
+        len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
+        ged_all = [0 for i in range(len_itr)]
+        n_edit_operations = [[0 for i in range(len_itr)] for j in
+                             range(len(idx_nonzeros))]
+        
+        itr = combinations_with_replacement(range(0, len(Gn)), 2)
+        n_jobs = multiprocessing.cpu_count()
+        if len_itr < 100 * n_jobs:
+            chunksize = int(len_itr / n_jobs) + 1
+        else:
+            chunksize = 100
+        def init_worker(gn_toshare):
+            global G_gn
+            G_gn = gn_toshare
+        do_partial = partial(_wrapper_compute_ged_parallel, edit_cost_constant,
+                             idx_nonzeros)
+        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,))
+        iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
+                        desc='computing GEDs', file=sys.stdout)
+#        iterator = pool.imap_unordered(do_partial, itr, chunksize)
+        for i, j, dis, n_eo_tmp in iterator:
+            idx_itr = int(len(Gn) * i + j - i * (i + 1) / 2)
+            ged_all[idx_itr] = dis
+            ged_mat[i][j] = dis
+            ged_mat[j][i] = dis
+            for idx, item in enumerate(idx_nonzeros):
+                n_edit_operations[idx][idx_itr] = n_eo_tmp[item]
+#            print('\n-------------------------------------------')
+#            print(i, j, idx_itr, dis)
+        pool.close()
+        pool.join()
+        
+    else:
+        ged_all = []
+        n_edit_operations = [[] for i in range(len(idx_nonzeros))]
+        for i in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
+#        for i in range(len(Gn)):
+            for j in range(i, len(Gn)):
+#                time0 = time.time()
+                dis, pi_forward, pi_backward = GED(Gn[i], Gn[j], lib='gedlibpy',
+                            cost='CONSTANT', method='IPFP',
+                            edit_cost_constant=edit_cost_constant, stabilizer='min',
+                            repeat=50)
+#                time1 = time.time() - time0
+#                time0 = time.time()
+                ged_all.append(dis)
+                ged_mat[i][j] = dis
+                ged_mat[j][i] = dis
+                n_eo_tmp = get_nb_edit_operations(Gn[i], Gn[j], pi_forward, pi_backward)
+                for idx, item in enumerate(idx_nonzeros):
+                    n_edit_operations[idx].append(n_eo_tmp[item])
+#                time2 = time.time() - time0
+#                print(time1, time2, time1 / time2)
    
+    return ged_all, ged_mat, n_edit_operations
+
+
+def _wrapper_compute_ged_parallel(edit_cost_constant, idx_nonzeros, itr):
+    i = itr[0]
+    j = itr[1]
+    dis, n_eo_tmp = _compute_ged_parallel(G_gn[i], G_gn[j], edit_cost_constant,
+                                          idx_nonzeros)
+    return i, j, dis, n_eo_tmp
+
+
+def _compute_ged_parallel(g1, g2, edit_cost_constant, idx_nonzeros):
+    dis, pi_forward, pi_backward = GED(g1, g2, lib='gedlibpy',
+                cost='CONSTANT', method='IPFP',
+                edit_cost_constant=edit_cost_constant, stabilizer='min',
+                repeat=50)
+    n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward)
    
+    return dis, n_eo_tmp
 
 
-def get_better_costs(nb_cost_mat, dis_k_vec):
+def compute_better_costs(nb_cost_mat, dis_k_vec):
 #    # method 1: simple least square method.
 #    edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
 #                                                     rcond=None)
@@ -92,13 +174,11 @@
     b = 1
     x = cp.Variable(nb_cost_mat.shape[1])
     prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x),
-                      [G@x <= h,
-                       A@x == b])
+                      [G@x <= h])
     prob.solve()
     edit_costs_new = x.value
     residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec)
-#    p = program(minimize(norm2(nb_cost_mat*x-dis_k_vec)),[equals(sum(x),1),geq(x,0)])
     
     return edit_costs_new, residual
@@ -111,5 +191,8 @@ if __name__ == '__main__':
     remove_edges(Gn)
     gkernel = 'marginalizedkernel'
     itr_max = 10
-    edit_costs, residual_list, edit_cost_list = \
-        fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
\ No newline at end of file
+    time0 = time.time()
+    edit_costs, residual_list, edit_cost_list, dis_k_mat, ged_mat = \
+        fit_GED_to_kernel_distance(Gn, gkernel, itr_max)
+    total_time = time.time() - time0
+    print('total time:', total_time)
\ No newline at end of file
diff --git a/pygraph/utils/parallel.py b/pygraph/utils/parallel.py
index fdf2244..603164a 100644
--- a/pygraph/utils/parallel.py
+++ b/pygraph/utils/parallel.py
@@ -20,11 +20,10 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
 #        def init_worker(v_share):
 #            global G_var
 #            G_var = v_share
-        
+        if n_jobs == None:
+            n_jobs = multiprocessing.cpu_count()
         with Pool(processes=n_jobs, initializer=init_worker,
-                  initargs=glbv) as pool:
-            if n_jobs == None:
-                n_jobs = multiprocessing.cpu_count()
+                  initargs=glbv) as pool:
             if chunksize == None:
                 if len_itr < 100 * n_jobs:
                     chunksize = int(len_itr / n_jobs) + 1
@@ -35,9 +34,9 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker
                 pool.imap_unordered(func, itr, chunksize)):
             func_assign(result, var_to_assign)
     else:
+        if n_jobs == None:
+            n_jobs = multiprocessing.cpu_count()
         with Pool(processes=n_jobs) as pool:
-            if n_jobs == None:
-                n_jobs = multiprocessing.cpu_count()
             if chunksize == None:
                 if len_itr < 100 * n_jobs:
                     chunksize = int(len_itr / n_jobs) + 1
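
Reviewer note on change 1.3 (appended after the patch body; not applied by
git am): with the equality constraint A@x == b, i.e. sum of the edit costs
== 1, removed, compute_better_costs() reduces to a non-negative least-squares
fit posed as a QP. The standalone sketch below reproduces that shape. The
definitions of P, q_T, G and h sit in lines the hunk does not show, so the
standard least-squares rewrite used here is an assumption, and nb_cost_mat /
dis_k_vec are toy inputs.

    import numpy as np
    import cvxpy as cp

    np.random.seed(0)
    nb_cost_mat = np.random.rand(10, 4)  # toy: rows = graph pairs, cols = edit ops
    dis_k_vec = np.random.rand(10)       # toy: target distances in feature space

    # ||C x - d||^2 = x^T (C^T C) x - 2 (C^T d)^T x + d^T d, so minimise the
    # quadratic part (assumed reconstruction of the unshown definitions)
    P = np.dot(nb_cost_mat.T, nb_cost_mat)
    q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat)
    G = -1 * np.identity(nb_cost_mat.shape[1])  # G @ x <= h encodes x >= 0
    h = np.zeros(nb_cost_mat.shape[1])

    x = cp.Variable(nb_cost_mat.shape[1])
    # no A @ x == b (sum-to-one) constraint any more: that is change 1.3
    prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T @ x), [G @ x <= h])
    prob.solve()
    print('fitted edit costs:', x.value)

Without a fixed budget of 1 to share, the fitted costs no longer compete for
mass on a simplex, which is what the commit message means by "avoiding
sparsity".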