diff --git a/gklearn/kernels/common_walk.py b/gklearn/kernels/common_walk.py
index a58fb86..ac3363e 100644
--- a/gklearn/kernels/common_walk.py
+++ b/gklearn/kernels/common_walk.py
@@ -5,15 +5,15 @@ Created on Tue Aug 18 11:21:31 2020
 @author: ljia
 
-@references: 
+@references:
 
-    [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: 
+    [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels:
     Hardness results and efficient alternatives. Learning Theory and Kernel
     Machines, pages 129–143, 2003.
 """
 
 import sys
-from tqdm import tqdm
+from gklearn.utils import get_iters
 import numpy as np
 import networkx as nx
 from gklearn.utils import SpecialLabel
@@ -23,7 +23,7 @@ from gklearn.kernels import GraphKernel
 
 
 class CommonWalk(GraphKernel):
-    
+
     def __init__(self, **kwargs):
         GraphKernel.__init__(self)
         self._node_labels = kwargs.get('node_labels', [])
@@ -39,17 +39,16 @@ class CommonWalk(GraphKernel):
         self._add_dummy_labels(self._graphs)
         if not self._ds_infos['directed']:  # convert
             self._graphs = [G.to_directed() for G in self._graphs]
-        
+
         # compute Gram matrix.
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
-        
+
         from itertools import combinations_with_replacement
         itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
-        if self._verbose >= 2:
-            iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
-        else:
-            iterator = itr
-        
+        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
+        iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
+                             length=len_itr, verbose=(self._verbose >= 2))
+
         # direct product graph method - exponential
         if self._compute_method == 'exp':
             for i, j in iterator:
@@ -62,50 +61,51 @@ class CommonWalk(GraphKernel):
                 kernel = self._kernel_do_geo(self._graphs[i], self._graphs[j], self._weight)
                 gram_matrix[i][j] = kernel
                 gram_matrix[j][i] = kernel
-        
+
         return gram_matrix
-    
-    
+
+
     def _compute_gm_imap_unordered(self):
         self._check_graphs(self._graphs)
         self._add_dummy_labels(self._graphs)
         if not self._ds_infos['directed']:  # convert
             self._graphs = [G.to_directed() for G in self._graphs]
-        
+
         # compute Gram matrix.
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
-        
+
 #        def init_worker(gn_toshare):
 #            global G_gn
 #            G_gn = gn_toshare
-        
+
         # direct product graph method - exponential
         if self._compute_method == 'exp':
             do_fun = self._wrapper_kernel_do_exp
         # direct product graph method - geometric
         elif self._compute_method == 'geo':
             do_fun = self._wrapper_kernel_do_geo
-        
-        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm, 
+
+        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm,
                     glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
-        
+
         return gram_matrix
-    
-    
+
+
     def _compute_kernel_list_series(self, g1, g_list):
         self._check_graphs(g_list + [g1])
         self._add_dummy_labels(g_list + [g1])
         if not self._ds_infos['directed']:  # convert
             g1 = g1.to_directed()
             g_list = [G.to_directed() for G in g_list]
-        
+
         # compute kernel list.
         kernel_list = [None] * len(g_list)
         if self._verbose >= 2:
-            iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
+            iterator = get_iters(range(len(g_list)), desc='Computing kernels',
+                                 file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
         else:
             iterator = range(len(g_list))
-        
+
         # direct product graph method - exponential
         if self._compute_method == 'exp':
             for i in iterator:
@@ -116,17 +116,17 @@ class CommonWalk(GraphKernel):
             for i in iterator:
                 kernel = self._kernel_do_geo(g1, g_list[i], self._weight)
                 kernel_list[i] = kernel
-        
+
         return kernel_list
-    
-    
+
+
     def _compute_kernel_list_imap_unordered(self, g1, g_list):
         self._check_graphs(g_list + [g1])
         self._add_dummy_labels(g_list + [g1])
         if not self._ds_infos['directed']:  # convert
             g1 = g1.to_directed()
             g_list = [G.to_directed() for G in g_list]
-        
+
         # compute kernel list.
         kernel_list = [None] * len(g_list)
@@ -134,61 +134,61 @@ class CommonWalk(GraphKernel):
 #            global G_g1, G_g_list
 #            G_g1 = g1_toshare
 #            G_g_list = g_list_toshare
-        
+
         # direct product graph method - exponential
         if self._compute_method == 'exp':
             do_fun = self._wrapper_kernel_list_do_exp
         # direct product graph method - geometric
         elif self._compute_method == 'geo':
             do_fun = self._wrapper_kernel_list_do_geo
-        
-        def func_assign(result, var_to_assign): 
+
+        def func_assign(result, var_to_assign):
             var_to_assign[result[0]] = result[1]
         itr = range(len(g_list))
         len_itr = len(g_list)
         parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
-                    init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered', 
+                    init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered',
                     n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
-        
+
         return kernel_list
-    
-    
+
+
     def _wrapper_kernel_list_do_exp(self, itr):
         return itr, self._kernel_do_exp(G_g1, G_g_list[itr], self._weight)
 
     def _wrapper_kernel_list_do_geo(self, itr):
         return itr, self._kernel_do_geo(G_g1, G_g_list[itr], self._weight)
-    
-    
+
+
     def _compute_single_kernel_series(self, g1, g2):
         self._check_graphs([g1] + [g2])
         self._add_dummy_labels([g1] + [g2])
         if not self._ds_infos['directed']:  # convert
             g1 = g1.to_directed()
             g2 = g2.to_directed()
-        
+
         # direct product graph method - exponential
         if self._compute_method == 'exp':
-            kernel = self._kernel_do_exp(g1, g2, self._weight) 
+            kernel = self._kernel_do_exp(g1, g2, self._weight)
         # direct product graph method - geometric
         elif self._compute_method == 'geo':
-            kernel = self._kernel_do_geo(g1, g2, self._weight) 
+            kernel = self._kernel_do_geo(g1, g2, self._weight)
+
+        return kernel
+
 
-        return kernel
-    
-
     def _kernel_do_exp(self, g1, g2, beta):
-        """Compute common walk graph kernel between 2 graphs using exponential 
+        """Compute common walk graph kernel between 2 graphs using exponential
         series.
-        
+
         Parameters
         ----------
         g1, g2 : NetworkX graphs
             Graphs between which the kernels are computed.
         beta : integer
             Weight.
-        
+
         Return
         ------
         kernel : float
@@ -200,9 +200,9 @@ class CommonWalk(GraphKernel):
         if nx.number_of_nodes(gp) < 2:
             return 0
         A = nx.adjacency_matrix(gp).todense()
-        
+
         ew, ev = np.linalg.eig(A)
-#        # remove imaginary part if possible. 
+#        # remove imaginary part if possible.
 #        # @todo: don't know if it is necessary.
 #        for i in range(len(ew)):
 #            if np.abs(ew[i].imag) < 1e-9:
@@ -220,27 +220,27 @@ class CommonWalk(GraphKernel):
         kernel = exp_D.sum()
         if (kernel.real == 0 and np.abs(kernel.imag) < 1e-9) or np.abs(kernel.imag / kernel.real) < 1e-9:
             kernel = kernel.real
-        
+
         return kernel
-    
-    
+
+
     def _wrapper_kernel_do_exp(self, itr):
         i = itr[0]
         j = itr[1]
         return i, j, self._kernel_do_exp(G_gn[i], G_gn[j], self._weight)
-    
-    
+
+
     def _kernel_do_geo(self, g1, g2, gamma):
-        """Compute common walk graph kernel between 2 graphs using geometric 
+        """Compute common walk graph kernel between 2 graphs using geometric
         series.
-        
+
         Parameters
         ----------
         g1, g2 : NetworkX graphs
             Graphs between which the kernels are computed.
         gamma : integer
             Weight.
-        
+
         Return
         ------
         kernel : float
@@ -258,19 +258,19 @@ class CommonWalk(GraphKernel):
 #        except np.linalg.LinAlgError:
 #            return np.nan
 
-    
+
     def _wrapper_kernel_do_geo(self, itr):
         i = itr[0]
         j = itr[1]
         return i, j, self._kernel_do_geo(G_gn[i], G_gn[j], self._weight)
-    
-    
+
+
     def _check_graphs(self, Gn):
         for g in Gn:
             if nx.number_of_nodes(g) == 1:
                 raise Exception('Graphs must contain more than 1 node to construct adjacency matrices.')
-    
-    
+
+
     def _add_dummy_labels(self, Gn):
         if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
             for i in range(len(Gn)):
@@ -280,13 +280,13 @@ class CommonWalk(GraphKernel):
             for i in range(len(Gn)):
                 nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
             self._edge_labels = [SpecialLabel.DUMMY]
-    
-    
+
+
 def _init_worker_gm(gn_toshare):
     global G_gn
     G_gn = gn_toshare
-    
-    
+
+
 def _init_worker_list(g1_toshare, g_list_toshare):
     global G_g1, G_g_list
     G_g1 = g1_toshare
diff --git a/gklearn/kernels/conjugate_gradient.py b/gklearn/kernels/conjugate_gradient.py
index 68c75a8..b162f20 100644
--- a/gklearn/kernels/conjugate_gradient.py
+++ b/gklearn/kernels/conjugate_gradient.py
@@ -5,13 +5,13 @@ Created on Thu Aug 20 16:09:51 2020
 @author: ljia
 
-@references: 
+@references:
 
     [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten
     M Borgwardt. Graph kernels. Journal of Machine Learning Research,
     11(Apr):1201–1242, 2010.
 """
 
 import sys
-from tqdm import tqdm
+from gklearn.utils import get_iters
 import numpy as np
 import networkx as nx
 from scipy.sparse import identity
@@ -22,8 +22,8 @@ from gklearn.utils.utils import compute_vertex_kernels
 
 
 class ConjugateGradient(RandomWalkMeta):
-    
-    
+
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self._node_kernels = kwargs.get('node_kernels', None)
@@ -32,33 +32,28 @@ class ConjugateGradient(RandomWalkMeta):
         self._edge_labels = kwargs.get('edge_labels', [])
         self._node_attrs = kwargs.get('node_attrs', [])
         self._edge_attrs = kwargs.get('edge_attrs', [])
-    
+
     def _compute_gm_series(self):
         self._check_edge_weight(self._graphs, self._verbose)
         self._check_graphs(self._graphs)
-        
+
         lmda = self._weight
-        
+
         # Compute Gram matrix.
-        gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) 
-        
+        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
+
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
-        if self._verbose >= 2:
-            iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout)
-        else:
-            iterator = self._graphs
+        iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
         self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
-            
+
             from itertools import combinations_with_replacement
             itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
-            if self._verbose >= 2:
-                iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
-            else:
-                iterator = itr
-            
+            len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
+            iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))
+
             for i, j in iterator:
                 kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda)
                 gram_matrix[i][j] = kernel
@@ -66,92 +61,79 @@ class ConjugateGradient(RandomWalkMeta):
         else:  # @todo
             pass
-        
+
         return gram_matrix
-    
-    
+
+
     def _compute_gm_imap_unordered(self):
         self._check_edge_weight(self._graphs, self._verbose)
         self._check_graphs(self._graphs)
-        
+
         # Compute Gram matrix.
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
-        
+
         # @todo: parallel this.
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
-        if self._verbose >= 2:
-            iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout)
-        else:
-            iterator = self._graphs
+        iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
         self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
             def init_worker(gn_toshare):
                 global G_gn
                 G_gn = gn_toshare
-            
+
             do_fun = self._wrapper_kernel_do
-        
-            parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, 
+
+            parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                         glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
         else:  # @todo
             pass
-        
+
         return gram_matrix
-    
-    
+
+
     def _compute_kernel_list_series(self, g1, g_list):
         self._check_edge_weight(g_list + [g1], self._verbose)
         self._check_graphs(g_list + [g1])
-        
+
         lmda = self._weight
-        
+
         # compute kernel list.
         kernel_list = [None] * len(g_list)
 
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
         g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
-        if self._verbose >= 2:
-            iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout)
-        else:
-            iterator = g_list
+        iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
         g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
+            iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
-            if self._verbose >= 2:
-                iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
-            else:
-                iterator = range(len(g_list))
-            
             for i in iterator:
                 kernel = self._kernel_do(g1, g_list[i], lmda)
                 kernel_list[i] = kernel
         else:  # @todo
             pass
-        
+
         return kernel_list
-    
-    
+
+
     def _compute_kernel_list_imap_unordered(self, g1, g_list):
         self._check_edge_weight(g_list + [g1], self._verbose)
         self._check_graphs(g_list + [g1])
-        
+
         # compute kernel list.
         kernel_list = [None] * len(g_list)
-        
+
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
         g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
         # @todo: parallel this.
-        if self._verbose >= 2:
-            iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout)
-        else:
-            iterator = g_list
+        iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
         g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
             def init_worker(g1_toshare, g_list_toshare):
@@ -159,56 +141,56 @@ class ConjugateGradient(RandomWalkMeta):
                 G_g1 = g1_toshare
                 G_g_list = g_list_toshare
 
-            do_fun = self._wrapper_kernel_list_do
-        
-            def func_assign(result, var_to_assign): 
+            do_fun = self._wrapper_kernel_list_do
+
+            def func_assign(result, var_to_assign):
                 var_to_assign[result[0]] = result[1]
             itr = range(len(g_list))
             len_itr = len(g_list)
             parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
-                        init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', 
+                        init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
                         n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
-            
+
         else:  # @todo
             pass
-        
+
         return kernel_list
 
     def _wrapper_kernel_list_do(self, itr):
         return itr, self._kernel_do(G_g1, G_g_list[itr], self._weight)
-    
-    
+
+
     def _compute_single_kernel_series(self, g1, g2):
         self._check_edge_weight([g1] + [g2], self._verbose)
         self._check_graphs([g1] + [g2])
-        
+
         lmda = self._weight
-        
+
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
         g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
         g2 = nx.convert_node_labels_to_integers(g2, first_label=0, label_attribute='label_orignal')
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
             kernel = self._kernel_do(g1, g2, lmda)
         else:  # @todo
             pass
-        
-        return kernel
-    
-    
+
+        return kernel
+
+
     def _kernel_do(self, g1, g2, lmda):
-        
+
         # First, compute kernels between all pairs of nodes using the method borrowed
-        # from FCSP. It is faster than directly computing all edge kernels 
+        # from FCSP. It is faster than directly computing all edge kernels
         # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
-        # graphs compared, which is the most case we went though. For very 
+        # graphs compared, which is the most common case we encountered. For very
         # sparse graphs, this would be slow.
         vk_dict = self._compute_vertex_kernels(g1, g2)
-        
-        # Compute the weight matrix of the direct product graph. 
-        w_times, w_dim = self._compute_weight_matrix(g1, g2, vk_dict)
+
+        # Compute the weight matrix of the direct product graph.
+        w_times, w_dim = self._compute_weight_matrix(g1, g2, vk_dict)
         # use uniform distribution if there is no prior knowledge.
         p_times_uni = 1 / w_dim
         A = identity(w_times.shape[0]) - w_times * lmda
@@ -217,27 +199,27 @@ class ConjugateGradient(RandomWalkMeta):
         # use uniform distribution if there is no prior knowledge.
         q_times = np.full((1, w_dim), p_times_uni)
         return np.dot(q_times, x)
-    
-    
+
+
     def _wrapper_kernel_do(self, itr):
         i = itr[0]
         j = itr[1]
         return i, j, self._kernel_do(G_gn[i], G_gn[j], self._weight)
-    
-    
+
+
     def _func_fp(x, p_times, lmda, w_times):
         haha = w_times * x
         haha = lmda * haha
         haha = p_times + haha
         return p_times + lmda * np.dot(w_times, x)
-    
-    
+
+
     def _compute_vertex_kernels(self, g1, g2):
         """Compute vertex kernels between vertices of two graphs.
         """
         return compute_vertex_kernels(g1, g2, self._node_kernels, node_labels=self._node_labels, node_attrs=self._node_attrs)
-    
-    
+
+
     # @todo: move if out to make it faster.
     # @todo: node/edge kernels use direct function rather than dicts.
     def _compute_weight_matrix(self, g1, g2, vk_dict):
@@ -250,20 +232,20 @@ class ConjugateGradient(RandomWalkMeta):
             e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
             e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
             return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
-        
+
         def compute_ek_10(e1, e2, ke):
             e1_labels = [e1[2][el] for el in self._edge_labels]
             e2_labels = [e2[2][el] for el in self._edge_labels]
             return ke(e1_labels, e2_labels)
-        
+
         def compute_ek_01(e1, e2, ke):
             e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
             e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
             return ke(e1_attrs, e2_attrs)
-        
+
         def compute_ek_00(e1, e2, ke):
             return 1
-        
+
         # Select the proper edge kernel.
         if len(self._edge_labels) > 0:
             # edge symb and non-symb labeled
@@ -283,11 +265,11 @@ class ConjugateGradient(RandomWalkMeta):
         else:
             ke = None
             ek_temp = compute_ek_00  # @todo: check how much slower is this.
-        
+
         # Compute the weight matrix.
         w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
         w_times = np.zeros((w_dim, w_dim))
-        
+
         if vk_dict:  # node labeled
             if self._ds_infos['directed']:
                 for e1 in g1.edges(data=True):
diff --git a/gklearn/kernels/fixed_point.py b/gklearn/kernels/fixed_point.py
index 249bf9c..12d8fe7 100644
--- a/gklearn/kernels/fixed_point.py
+++ b/gklearn/kernels/fixed_point.py
@@ -5,13 +5,13 @@ Created on Thu Aug 20 16:09:51 2020
 @author: ljia
 
-@references: 
+@references:
 
     [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten
     M Borgwardt. Graph kernels. Journal of Machine Learning Research,
     11(Apr):1201–1242, 2010.
 """
 
 import sys
-from tqdm import tqdm
+from gklearn.utils import get_iters
 import numpy as np
 import networkx as nx
 from scipy import optimize
@@ -22,8 +22,8 @@ from gklearn.utils.utils import compute_vertex_kernels
 
 
 class FixedPoint(RandomWalkMeta):
-    
-    
+
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self._node_kernels = kwargs.get('node_kernels', None)
@@ -32,33 +32,28 @@ class FixedPoint(RandomWalkMeta):
         self._edge_labels = kwargs.get('edge_labels', [])
         self._node_attrs = kwargs.get('node_attrs', [])
         self._edge_attrs = kwargs.get('edge_attrs', [])
-    
+
     def _compute_gm_series(self):
         self._check_edge_weight(self._graphs, self._verbose)
         self._check_graphs(self._graphs)
-        
+
         lmda = self._weight
-        
+
         # Compute Gram matrix.
-        gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) 
-        
+        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
+
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
-        if self._verbose >= 2:
-            iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout)
-        else:
-            iterator = self._graphs
+        iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
         self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
-            
+
             from itertools import combinations_with_replacement
             itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
-            if self._verbose >= 2:
-                iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout)
-            else:
-                iterator = itr
-            
+            len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
+            iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))
+
             for i, j in iterator:
                 kernel = self._kernel_do(self._graphs[i], self._graphs[j], lmda)
                 gram_matrix[i][j] = kernel
@@ -66,92 +61,80 @@ class FixedPoint(RandomWalkMeta):
         else:  # @todo
             pass
-        
+
         return gram_matrix
-    
-    
+
+
     def _compute_gm_imap_unordered(self):
         self._check_edge_weight(self._graphs, self._verbose)
         self._check_graphs(self._graphs)
-        
+
         # Compute Gram matrix.
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
-        
+
         # @todo: parallel this.
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
-        if self._verbose >= 2:
-            iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout)
-        else:
-            iterator = self._graphs
+        iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
         self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
             def init_worker(gn_toshare):
                 global G_gn
                 G_gn = gn_toshare
-            
+
             do_fun = self._wrapper_kernel_do
-        
-            parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, 
+
+            parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                         glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)
         else:  # @todo
             pass
-        
+
         return gram_matrix
-    
-    
+
+
     def _compute_kernel_list_series(self, g1, g_list):
         self._check_edge_weight(g_list + [g1], self._verbose)
         self._check_graphs(g_list + [g1])
-        
+
         lmda = self._weight
-        
+
         # compute kernel list.
         kernel_list = [None] * len(g_list)
 
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
         g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
-        if self._verbose >= 2:
-            iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout)
-        else:
-            iterator = g_list
+        iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
         g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
-            if self._verbose >= 2:
-                iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
-            else:
-                iterator = range(len(g_list))
-            
+            iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
+
             for i in iterator:
                 kernel = self._kernel_do(g1, g_list[i], lmda)
                 kernel_list[i] = kernel
         else:  # @todo
             pass
-        
+
         return kernel_list
-    
-    
+
+
     def _compute_kernel_list_imap_unordered(self, g1, g_list):
         self._check_edge_weight(g_list + [g1], self._verbose)
         self._check_graphs(g_list + [g1])
-        
+
         # compute kernel list.
         kernel_list = [None] * len(g_list)
-        
+
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
         g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
         # @todo: parallel this.
-        if self._verbose >= 2:
-            iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout)
-        else:
-            iterator = g_list
+        iterator = get_iters(g_list, desc='Reindex vertices', file=sys.stdout, verbose=(self._verbose >= 2))
         g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator]
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
             def init_worker(g1_toshare, g_list_toshare):
@@ -159,56 +142,56 @@ class FixedPoint(RandomWalkMeta):
                 G_g1 = g1_toshare
                 G_g_list = g_list_toshare
 
-            do_fun = self._wrapper_kernel_list_do
-        
-            def func_assign(result, var_to_assign): 
+            do_fun = self._wrapper_kernel_list_do
+
+            def func_assign(result, var_to_assign):
                 var_to_assign[result[0]] = result[1]
             itr = range(len(g_list))
             len_itr = len(g_list)
             parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
-                        init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', 
+                        init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
                         n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
-            
+
         else:  # @todo
             pass
-        
+
         return kernel_list
 
     def _wrapper_kernel_list_do(self, itr):
         return itr, self._kernel_do(G_g1, G_g_list[itr], self._weight)
-    
-    
+
+
     def _compute_single_kernel_series(self, g1, g2):
         self._check_edge_weight([g1] + [g2], self._verbose)
         self._check_graphs([g1] + [g2])
-        
+
         lmda = self._weight
-        
+
         # Reindex nodes using consecutive integers for the convenience of kernel computation.
         g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal')
         g2 = nx.convert_node_labels_to_integers(g2, first_label=0, label_attribute='label_orignal')
-        
+
         if self._p is None and self._q is None:  # p and q are uniform distributions as default.
             kernel = self._kernel_do(g1, g2, lmda)
         else:  # @todo
             pass
-        
-        return kernel
-    
-    
+
+        return kernel
+
+
     def _kernel_do(self, g1, g2, lmda):
-        
+
         # First, compute kernels between all pairs of nodes using the method borrowed
-        # from FCSP. It is faster than directly computing all edge kernels 
+        # from FCSP. It is faster than directly computing all edge kernels
         # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
-        # graphs compared, which is the most case we went though. For very 
+        # graphs compared, which is the most common case we encountered. For very
         # sparse graphs, this would be slow.
         vk_dict = self._compute_vertex_kernels(g1, g2)
-        
-        # Compute the weight matrix of the direct product graph. 
-        w_times, w_dim = self._compute_weight_matrix(g1, g2, vk_dict)
+
+        # Compute the weight matrix of the direct product graph.
+        w_times, w_dim = self._compute_weight_matrix(g1, g2, vk_dict)
         # use uniform distribution if there is no prior knowledge.
         p_times_uni = 1 / w_dim
         p_times = np.full((w_dim, 1), p_times_uni)
@@ -216,27 +199,27 @@ class FixedPoint(RandomWalkMeta):
         # use uniform distribution if there is no prior knowledge.
         q_times = np.full((1, w_dim), p_times_uni)
         return np.dot(q_times, x)
-    
-    
+
+
     def _wrapper_kernel_do(self, itr):
         i = itr[0]
         j = itr[1]
         return i, j, self._kernel_do(G_gn[i], G_gn[j], self._weight)
-    
-    
+
+
     def _func_fp(self, x, p_times, lmda, w_times):
         haha = w_times * x
         haha = lmda * haha
         haha = p_times + haha
         return p_times + lmda * np.dot(w_times, x)
-    
-    
+
+
     def _compute_vertex_kernels(self, g1, g2):
         """Compute vertex kernels between vertices of two graphs.
         """
         return compute_vertex_kernels(g1, g2, self._node_kernels, node_labels=self._node_labels, node_attrs=self._node_attrs)
-    
-    
+
+
     # @todo: move if out to make it faster.
     # @todo: node/edge kernels use direct function rather than dicts.
     def _compute_weight_matrix(self, g1, g2, vk_dict):
@@ -249,20 +232,20 @@ class FixedPoint(RandomWalkMeta):
             e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
             e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
             return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
-        
+
         def compute_ek_10(e1, e2, ke):
             e1_labels = [e1[2][el] for el in self._edge_labels]
             e2_labels = [e2[2][el] for el in self._edge_labels]
             return ke(e1_labels, e2_labels)
-        
+
         def compute_ek_01(e1, e2, ke):
             e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
             e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
             return ke(e1_attrs, e2_attrs)
-        
+
         def compute_ek_00(e1, e2, ke):
             return 1
-        
+
         # Select the proper edge kernel.
         if len(self._edge_labels) > 0:
             # edge symb and non-symb labeled
@@ -282,11 +265,11 @@ class FixedPoint(RandomWalkMeta):
         else:
             ke = None
             ek_temp = compute_ek_00  # @todo: check how much slower is this.
-        
+
         # Compute the weight matrix.
         w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
         w_times = np.zeros((w_dim, w_dim))
-        
+
         if vk_dict:  # node labeled
             if self._ds_infos['directed']:
                 for e1 in g1.edges(data=True):
diff --git a/gklearn/kernels/marginalized.py b/gklearn/kernels/marginalized.py
index 75355b1..e3d70c6 100644
--- a/gklearn/kernels/marginalized.py
+++ b/gklearn/kernels/marginalized.py
@@ -7,19 +7,19 @@ Created on Wed Jun 3 22:22:57 2020
 
 @references:
 
-    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between 
-    labeled graphs. In Proceedings of the 20th International Conference on 
+    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
+    labeled graphs. In Proceedings of the 20th International Conference on
     Machine Learning, Washington, DC, United States, 2003.
 
-    [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and 
-    Jean-Philippe Vert. Extensions of marginalized graph kernels. In 
-    Proceedings of the twenty-first international conference on Machine 
+    [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
+    Jean-Philippe Vert. Extensions of marginalized graph kernels. In
+    Proceedings of the twenty-first international conference on Machine
     learning, page 70. ACM, 2004.
""" import sys from multiprocessing import Pool -from tqdm import tqdm +from gklearn.utils import get_iters import numpy as np import networkx as nx from gklearn.utils import SpecialLabel @@ -30,7 +30,7 @@ from gklearn.kernels import GraphKernel class Marginalized(GraphKernel): - + def __init__(self, **kwargs): GraphKernel.__init__(self) self._node_labels = kwargs.get('node_labels', []) @@ -44,35 +44,31 @@ class Marginalized(GraphKernel): def _compute_gm_series(self): self._add_dummy_labels(self._graphs) - + if self._remove_totters: - if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='removing tottering', file=sys.stdout) - else: - iterator = self._graphs + iterator = get_iters(self._graphs, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) # @todo: this may not work. self._graphs = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] - + # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) - if self._verbose >= 2: - iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) - else: - iterator = itr + len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, + length=len_itr, verbose=(self._verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(self._graphs[i], self._graphs[j]) gram_matrix[i][j] = kernel gram_matrix[j][i] = kernel # @todo: no directed graph considered? - + return gram_matrix - - + + def _compute_gm_imap_unordered(self): self._add_dummy_labels(self._graphs) - + if self._remove_totters: pool = Pool(self._n_jobs) itr = range(0, len(self._graphs)) @@ -81,57 +77,49 @@ class Marginalized(GraphKernel): else: chunksize = 100 remove_fun = self._wrapper_untotter - if self._verbose >= 2: - iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize), - desc='removing tottering', file=sys.stdout) - else: - iterator = pool.imap_unordered(remove_fun, itr, chunksize) + iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize), + desc='removing tottering', file=sys.stdout, + length=len(self._graphs), verbose=(self._verbose >= 2)) for i, g in iterator: self._graphs[i] = g pool.close() pool.join() - + # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + def init_worker(gn_toshare): global G_gn G_gn = gn_toshare do_fun = self._wrapper_kernel_do - parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) - + return gram_matrix - - + + def _compute_kernel_list_series(self, g1, g_list): self._add_dummy_labels(g_list + [g1]) - + if self._remove_totters: g1 = untotterTransformation(g1, self._node_labels, self._edge_labels) # @todo: this may not work. - if self._verbose >= 2: - iterator = tqdm(g_list, desc='removing tottering', file=sys.stdout) - else: - iterator = g_list + iterator = get_iters(g_list, desc='removing tottering', file=sys.stdout, verbose=(self._verbose >= 2)) # @todo: this may not work. g_list = [untotterTransformation(G, self._node_labels, self._edge_labels) for G in iterator] - + # compute kernel list. 
         kernel_list = [None] * len(g_list)
-        if self._verbose >= 2:
-            iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
-        else:
-            iterator = range(len(g_list))
+        iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
         for i in iterator:
             kernel = self._kernel_do(g1, g_list[i])
             kernel_list[i] = kernel
-        
+
         return kernel_list
-    
-    
+
+
     def _compute_kernel_list_imap_unordered(self, g1, g_list):
         self._add_dummy_labels(g_list + [g1])
-        
+
         if self._remove_totters:
             g1 = untotterTransformation(g1, self._node_labels, self._edge_labels)  # @todo: this may not work.
             pool = Pool(self._n_jobs)
@@ -141,16 +129,14 @@ class Marginalized(GraphKernel):
             else:
                 chunksize = 100
             remove_fun = self._wrapper_untotter
-            if self._verbose >= 2:
-                iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize),
-                                desc='removing tottering', file=sys.stdout)
-            else:
-                iterator = pool.imap_unordered(remove_fun, itr, chunksize)
+            iterator = get_iters(pool.imap_unordered(remove_fun, itr, chunksize),
+                                 desc='removing tottering', file=sys.stdout,
+                                 length=len(g_list), verbose=(self._verbose >= 2))
             for i, g in iterator:
                 g_list[i] = g
             pool.close()
             pool.join()
-        
+
         # compute kernel list.
         kernel_list = [None] * len(g_list)
 
@@ -159,38 +145,38 @@ class Marginalized(GraphKernel):
             G_g1 = g1_toshare
             G_g_list = g_list_toshare
         do_fun = self._wrapper_kernel_list_do
-        def func_assign(result, var_to_assign): 
+        def func_assign(result, var_to_assign):
             var_to_assign[result[0]] = result[1]
         itr = range(len(g_list))
         len_itr = len(g_list)
         parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
-                    init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', 
+                    init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
                     n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
-        
+
         return kernel_list
-    
-    
+
+
     def _wrapper_kernel_list_do(self, itr):
         return itr, self._kernel_do(G_g1, G_g_list[itr])
-    
-    
+
+
     def _compute_single_kernel_series(self, g1, g2):
         self._add_dummy_labels([g1] + [g2])
         if self._remove_totters:
             g1 = untotterTransformation(g1, self._node_labels, self._edge_labels)  # @todo: this may not work.
             g2 = untotterTransformation(g2, self._node_labels, self._edge_labels)
         kernel = self._kernel_do(g1, g2)
-        return kernel            
-    
-    
+        return kernel
+
+
     def _kernel_do(self, g1, g2):
         """Compute marginalized graph kernel between 2 graphs.
-        
+
         Parameters
         ----------
         g1, g2 : NetworkX graphs
             2 graphs between which the kernel is computed.
-        
+
         Return
         ------
         kernel : float
@@ -204,10 +190,10 @@ class Marginalized(GraphKernel):
         # (uniform distribution over |G|)
         p_init_G1 = 1 / num_nodes_G1
         p_init_G2 = 1 / num_nodes_G2
-        
+
         q = self._p_quit * self._p_quit
         r1 = q
-        
+
 #        # initial R_inf
 #        # matrix to save all the R_inf for all pairs of nodes
 #        R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
@@ -229,7 +215,7 @@ class Marginalized(GraphKernel):
 #                    neighbor_n2 = g2[node2[0]]
 #                    if len(neighbor_n2) > 0:
 #                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
-#                        
+#
 #                        for neighbor1 in neighbor_n1:
 #                            for neighbor2 in neighbor_n2:
 #                                t = p_trans_n1 * p_trans_n2 * \
@@ -238,7 +224,7 @@ class Marginalized(GraphKernel):
 #                                    deltakernel(
 #                                        neighbor_n1[neighbor1][edge_label],
 #                                        neighbor_n2[neighbor2][edge_label])
-#                                
+#
 #                                R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][
 #                                    neighbor2]  # ref [1] equation (8)
 #            R_inf[:] = R_inf_new
@@ -249,8 +235,8 @@ class Marginalized(GraphKernel):
 #            s = p_init_G1 * p_init_G2 * deltakernel(
 #                node1[1][node_label], node2[1][node_label])
 #            kernel += s * R_inf[node1[0]][node2[0]]  # ref [1] equation (6)
-        
-        
+
+
         R_inf = {}  # dict to save all the R_inf for all pairs of nodes
         # initial R_inf, the 1st iteration.
         for node1 in g1.nodes():
@@ -266,7 +252,7 @@ class Marginalized(GraphKernel):
                     R_inf[(node1, node2)] = self._p_quit
                 else:
                     R_inf[(node1, node2)] = 1
-        
+
         # compute all transition probability first.
         t_dict = {}
         if self._n_iteration > 1:
@@ -287,11 +273,11 @@ class Marginalized(GraphKernel):
                                 p_trans_n1 * p_trans_n2 * \
                                 deltakernel(tuple(g1.nodes[neighbor1][nl] for nl in self._node_labels), tuple(g2.nodes[neighbor2][nl] for nl in self._node_labels)) * \
                                 deltakernel(tuple(neighbor_n1[neighbor1][el] for el in self._edge_labels), tuple(neighbor_n2[neighbor2][el] for el in self._edge_labels))
-        
+
         # Compute R_inf with a simple iterative method
         for i in range(2, self._n_iteration + 1):
             R_inf_old = R_inf.copy()
-            
+
             # Compute R_inf for each pair of nodes
             for node1 in g1.nodes():
                 neighbor_n1 = g1[node1]
@@ -301,32 +287,32 @@ class Marginalized(GraphKernel):
                 if len(neighbor_n1) > 0:
                     for node2 in g2.nodes():
                         neighbor_n2 = g2[node2]
-                        if len(neighbor_n2) > 0: 
+                        if len(neighbor_n2) > 0:
                             R_inf[(node1, node2)] = r1
                             for neighbor1 in neighbor_n1:
                                 for neighbor2 in neighbor_n2:
                                     R_inf[(node1, node2)] += \
                                         (t_dict[(node1, node2, neighbor1, neighbor2)] * \
                                         R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)
-        
+
         # add elements of R_inf up and compute kernel.
         for (n1, n2), value in R_inf.items():
             s = p_init_G1 * p_init_G2 * deltakernel(tuple(g1.nodes[n1][nl] for nl in self._node_labels), tuple(g2.nodes[n2][nl] for nl in self._node_labels))
             kernel += s * value  # ref [1] equation (6)
-        
+
         return kernel
-    
-    
+
+
     def _wrapper_kernel_do(self, itr):
         i = itr[0]
         j = itr[1]
         return i, j, self._kernel_do(G_gn[i], G_gn[j])
-    
+
     def _wrapper_untotter(self, i):
         return i, untotterTransformation(self._graphs[i], self._node_labels, self._edge_labels)  # @todo: this may not work.
-    
-    
+
+
     def _add_dummy_labels(self, Gn):
         if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY):
             for i in range(len(Gn)):
diff --git a/gklearn/kernels/path_up_to_h.py b/gklearn/kernels/path_up_to_h.py
index e9869ea..afe3859 100644
--- a/gklearn/kernels/path_up_to_h.py
+++ b/gklearn/kernels/path_up_to_h.py
@@ -5,15 +5,15 @@ Created on Fri Apr 10 18:33:13 2020
 @author: ljia
 
-@references: 
+@references:
 
-    [1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre 
-    Baldi. Graph kernels for chemical informatics. Neural networks,
+    [1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre
+    Baldi. Graph kernels for chemical informatics. Neural networks,
     18(8):1093–1110, 2005.
 """
 
 import sys
 from multiprocessing import Pool
-from tqdm import tqdm
+from gklearn.utils import get_iters
 import numpy as np
 import networkx as nx
 from collections import Counter
@@ -25,7 +25,7 @@ from gklearn.utils import Trie
 
 
 class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
-    
+
     def __init__(self, **kwargs):
         GraphKernel.__init__(self)
         self._node_labels = kwargs.get('node_labels', [])
@@ -38,16 +38,14 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
 
     def _compute_gm_series(self):
         self._add_dummy_labels(self._graphs)
-        
+
         from itertools import combinations_with_replacement
-        itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2)
-        if self._verbose >= 2:
-            iterator_ps = tqdm(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout)
-            iterator_kernel = tqdm(itr_kernel, desc='Computing kernels', file=sys.stdout)
-        else:
-            iterator_ps = range(0, len(self._graphs))
-            iterator_kernel = itr_kernel
-        
+        itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2)
+        iterator_ps = get_iters(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout, length=len(self._graphs), verbose=(self._verbose >= 2))
+        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
+        iterator_kernel = get_iters(itr_kernel, desc='Computing kernels',
+                                    file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2))
+
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
 
         if self._compute_method == 'trie':
@@ -62,13 +60,13 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
                 kernel = self._kernel_do_naive(all_paths[i], all_paths[j])
                 gram_matrix[i][j] = kernel
                 gram_matrix[j][i] = kernel
-        
+
         return gram_matrix
-    
-    
+
+
     def _compute_gm_imap_unordered(self):
         self._add_dummy_labels(self._graphs)
-        
+
         # get all paths of all graphs before computing kernels to save time,
         # but this may cost a lot of memory for large datasets.
         pool = Pool(self._n_jobs)
@@ -80,23 +78,21 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
         all_paths = [[] for _ in range(len(self._graphs))]
         if self._compute_method == 'trie' and self._k_func is not None:
             get_ps_fun = self._wrapper_find_all_path_as_trie
-        elif self._compute_method != 'trie' and self._k_func is not None:
-            get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)
-        else:
-            get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
-        if self._verbose >= 2:
-            iterator = tqdm(pool.imap_unordered(get_ps_fun, itr, chunksize),
-                            desc='getting paths', file=sys.stdout)
+        elif self._compute_method != 'trie' and self._k_func is not None:
+            get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)
         else:
-            iterator = pool.imap_unordered(get_ps_fun, itr, chunksize)
+            get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
+        iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize),
+                             desc='getting paths', file=sys.stdout,
+                             length=len(self._graphs), verbose=(self._verbose >= 2))
         for i, ps in iterator:
             all_paths[i] = ps
         pool.close()
         pool.join()
-        
+
         # compute Gram matrix.
         gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
-        
+
         if self._compute_method == 'trie' and self._k_func is not None:
             def init_worker(trie_toshare):
                 global G_trie
@@ -106,28 +102,24 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
             def init_worker(plist_toshare):
                 global G_plist
                 G_plist = plist_toshare
-            do_fun = self._wrapper_kernel_do_naive    
+            do_fun = self._wrapper_kernel_do_naive
         else:
             def init_worker(plist_toshare):
                 global G_plist
                 G_plist = plist_toshare
-            do_fun = self._wrapper_kernel_do_kernelless  # @todo: what is this?
-        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, 
-                    glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose) 
-        
+            do_fun = self._wrapper_kernel_do_kernelless  # @todo: what is this?
+        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
+                    glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose)
+
         return gram_matrix
-    
-    
+
+
     def _compute_kernel_list_series(self, g1, g_list):
         self._add_dummy_labels(g_list + [g1])
-        
-        if self._verbose >= 2:
-            iterator_ps = tqdm(g_list, desc='getting paths', file=sys.stdout)
-            iterator_kernel = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout)
-        else:
-            iterator_ps = g_list
-            iterator_kernel = range(len(g_list))
-        
+
+        iterator_ps = get_iters(g_list, desc='getting paths', file=sys.stdout, verbose=(self._verbose >= 2))
+        iterator_kernel = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2))
+
         kernel_list = [None] * len(g_list)
 
         if self._compute_method == 'trie':
@@ -142,13 +134,13 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
             for i in iterator_kernel:
                 kernel = self._kernel_do_naive(paths_g1, paths_g_list[i])
                 kernel_list[i] = kernel
-        
+
         return kernel_list
-    
-    
+
+
     def _compute_kernel_list_imap_unordered(self, g1, g_list):
         self._add_dummy_labels(g_list + [g1])
-        
+
         # get all paths of all graphs before computing kernels to save time,
         # but this may cost a lot of memory for large datasets.
         pool = Pool(self._n_jobs)
@@ -162,48 +154,46 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
             paths_g1 = self._find_all_path_as_trie(g1)
             get_ps_fun = self._wrapper_find_all_path_as_trie
         elif self._compute_method != 'trie' and self._k_func is not None:
-            paths_g1 = self._find_all_paths_until_length(g1)        
-            get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)    
+            paths_g1 = self._find_all_paths_until_length(g1)
+            get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)
         else:
-            paths_g1 = self._find_all_paths_until_length(g1)    
+            paths_g1 = self._find_all_paths_until_length(g1)
             get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
-        if self._verbose >= 2:
-            iterator = tqdm(pool.imap_unordered(get_ps_fun, itr, chunksize),
-                            desc='getting paths', file=sys.stdout)
-        else:
-            iterator = pool.imap_unordered(get_ps_fun, itr, chunksize)
+        iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize),
+                             desc='getting paths', file=sys.stdout,
+                             length=len(g_list), verbose=(self._verbose >= 2))
         for i, ps in iterator:
             paths_g_list[i] = ps
         pool.close()
         pool.join()
-        
+
        # compute kernel list.
         kernel_list = [None] * len(g_list)
-        
+
         def init_worker(p1_toshare, plist_toshare):
             global G_p1, G_plist
             G_p1 = p1_toshare
             G_plist = plist_toshare
         do_fun = self._wrapper_kernel_list_do
-        def func_assign(result, var_to_assign): 
+        def func_assign(result, var_to_assign):
             var_to_assign[result[0]] = result[1]
         itr = range(len(g_list))
         len_itr = len(g_list)
         parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                     init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose)
-        
+
         return kernel_list
-    
-    
+
+
     def _wrapper_kernel_list_do(self, itr):
         if self._compute_method == 'trie' and self._k_func is not None:
             return itr, self._kernel_do_trie(G_p1, G_plist[itr])
         elif self._compute_method != 'trie' and self._k_func is not None:
-            return itr, self._kernel_do_naive(G_p1, G_plist[itr])    
+            return itr, self._kernel_do_naive(G_p1, G_plist[itr])
         else:
             return itr, self._kernel_do_kernelless(G_p1, G_plist[itr])
-    
-    
+
+
     def _compute_single_kernel_series(self, g1, g2):
         self._add_dummy_labels([g1] + [g2])
         if self._compute_method == 'trie':
@@ -214,32 +204,32 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
             paths_g1 = self._find_all_paths_until_length(g1)
             paths_g2 = self._find_all_paths_until_length(g2)
             kernel = self._kernel_do_naive(paths_g1, paths_g2)
-        return kernel    
+        return kernel
+
 
-    
     def _kernel_do_trie(self, trie1, trie2):
         """Compute path graph kernels up to depth d between 2 graphs using trie.
-    
+
         Parameters
         ----------
         trie1, trie2 : list
             Tries that contain all paths in 2 graphs.
         k_func : function
-            A kernel function applied using different notions of fingerprint 
+            A kernel function applied using different notions of fingerprint
             similarity.
-    
+
         Return
         ------
         kernel : float
             Path kernel up to h between 2 graphs.
         """
-        if self._k_func == 'tanimoto':    
-            # traverse all paths in graph1 and search them in graph2. Deep-first 
+        if self._k_func == 'tanimoto':
+            # traverse all paths in graph1 and search them in graph2. Depth-first
             # search is applied.
-            def traverseTrie1t(root, trie2, setlist, pcurrent=[]): 
+            def traverseTrie1t(root, trie2, setlist, pcurrent=[]):
                 # @todo: no need to use value (# of occurrence of paths) in this case.
                 for key, node in root['children'].items():
                     pcurrent.append(key)
-                    if node['isEndOfWord']: 
+                    if node['isEndOfWord']:
                         setlist[1] += 1
                         count2 = trie2.searchWord(pcurrent)
                         if count2 != 0:
@@ -250,17 +240,17 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
                     del pcurrent[-1]
                 if pcurrent != []:
                     del pcurrent[-1]
-        
-        
-            # traverse all paths in graph2 and find out those that are not in 
-            # graph1. Deep-first search is applied. 
+
+
+            # traverse all paths in graph2 and find out those that are not in
+            # graph1. Depth-first search is applied.
             def traverseTrie2t(root, trie1, setlist, pcurrent=[]):
                 for key, node in root['children'].items():
                     pcurrent.append(key)
                     if node['isEndOfWord']:
 #                        print(node['count'])
                         count1 = trie1.searchWord(pcurrent)
-                        if count1 == 0:    
+                        if count1 == 0:
                             setlist[1] += 1
                     if node['children'] != {}:
                         traverseTrie2t(node, trie1, setlist, pcurrent)
@@ -268,7 +258,7 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
                     del pcurrent[-1]
                 if pcurrent != []:
                     del pcurrent[-1]
-        
+
             setlist = [0, 0]  # intersection and union of path sets of g1, g2.
 #            print(trie1.root)
 #            print(trie2.root)
@@ -277,9 +267,9 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
             traverseTrie2t(trie2.root, trie1, setlist)
 #            print(setlist)
             kernel = setlist[0] / setlist[1]
-        
-        elif self._k_func == 'MinMax':  # MinMax kernel
-            # traverse all paths in graph1 and search them in graph2. Deep-first 
+
+        elif self._k_func == 'MinMax':  # MinMax kernel
+            # traverse all paths in graph1 and search them in graph2. Depth-first
             # search is applied.
             def traverseTrie1m(root, trie2, sumlist, pcurrent=[]):
                 for key, node in root['children'].items():
@@ -296,16 +286,16 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
                     del pcurrent[-1]
                 if pcurrent != []:
                     del pcurrent[-1]
-    
-            # traverse all paths in graph2 and find out those that are not in 
-            # graph1. Deep-first search is applied. 
+
+            # traverse all paths in graph2 and find out those that are not in
+            # graph1. Depth-first search is applied.
             def traverseTrie2m(root, trie1, sumlist, pcurrent=[]):
                 for key, node in root['children'].items():
                     pcurrent.append(key)
-                    if node['isEndOfWord']: 
+                    if node['isEndOfWord']:
 #                        print(node['count'])
                         count1 = trie1.searchWord(pcurrent)
-                        if count1 == 0: 
+                        if count1 == 0:
                             sumlist[1] += node['count']
                     if node['children'] != {}:
                         traverseTrie2m(node, trie1, sumlist, pcurrent)
@@ -313,7 +303,7 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
                     del pcurrent[-1]
                 if pcurrent != []:
                     del pcurrent[-1]
-        
+
             sumlist = [0, 0]  # sum of mins and sum of maxs
 #            print(trie1.root)
 #            print(trie2.root)
@@ -324,37 +314,37 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
             kernel = sumlist[0] / sumlist[1]
         else:
             raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax".')
-        
+
         return kernel
-    
-    
+
+
     def _wrapper_kernel_do_trie(self, itr):
         i = itr[0]
         j = itr[1]
         return i, j, self._kernel_do_trie(G_trie[i], G_trie[j])
-    
-    
+
+
     def _kernel_do_naive(self, paths1, paths2):
         """Compute path graph kernels up to depth d between 2 graphs naively.
-    
+
         Parameters
         ----------
         paths_list : list of list
-            List of list of paths in all graphs, where for unlabeled graphs, each 
-            path is represented by a list of nodes; while for labeled graphs, each 
-            path is represented by a string consists of labels of nodes and/or 
+            List of list of paths in all graphs, where for unlabeled graphs, each
+            path is represented by a list of nodes; while for labeled graphs, each
+            path is represented by a string consisting of labels of nodes and/or
             edges on that path.
         k_func : function
-            A kernel function applied using different notions of fingerprint 
+            A kernel function applied using different notions of fingerprint
             similarity.
-    
+
         Return
        ------
        kernel : float
            Path kernel up to h between 2 graphs.
""" all_paths = list(set(paths1 + paths2)) - + if self._k_func == 'tanimoto': length_union = len(set(paths1 + paths2)) kernel = (len(set(paths1)) + len(set(paths2)) - @@ -363,7 +353,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None # vector2 = [(1 if path in paths2 else 0) for path in all_paths] # kernel_uv = np.dot(vector1, vector2) # kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv) - + elif self._k_func == 'MinMax': # MinMax kernel path_count1 = Counter(paths1) path_count2 = Counter(paths2) @@ -373,7 +363,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None for key in all_paths] kernel = np.sum(np.minimum(vector1, vector2)) / \ np.sum(np.maximum(vector1, vector2)) - + elif self._k_func is None: # no sub-kernel used; compare paths directly. path_count1 = Counter(paths1) path_count2 = Counter(paths2) @@ -382,27 +372,27 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0) for key in all_paths] kernel = np.dot(vector1, vector2) - + else: raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax" and None.') - + return kernel - - + + def _wrapper_kernel_do_naive(self, itr): i = itr[0] j = itr[1] return i, j, self._kernel_do_naive(G_plist[i], G_plist[j]) - - + + def _find_all_path_as_trie(self, G): - # all_path = find_all_paths_until_length(G, length, ds_attrs, + # all_path = find_all_paths_until_length(G, length, ds_attrs, # node_label=node_label, # edge_label=edge_label) # ptrie = Trie() # for path in all_path: # ptrie.insertWord(path) - + # ptrie = Trie() # path_l = [[n] for n in G.nodes] # paths of length l # path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) @@ -421,15 +411,15 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None # path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) # for p in path_l_str: # ptrie.insertWord(p) - # + # # print(time.time() - time1) # print(ptrie.root) # print() - - - # traverse all paths up to length h in a graph and construct a trie with - # them. Deep-first search is applied. Notice the reverse of each path is - # also stored to the trie. + + + # traverse all paths up to length h in a graph and construct a trie with + # them. Deep-first search is applied. Notice the reverse of each path is + # also stored to the trie. 
         def traverseGraph(root, ptrie, G, pcurrent=[]):
             if len(pcurrent) < self._depth + 1:
                 for neighbor in G[root]:
@@ -439,8 +429,8 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
                         ptrie.insertWord(plstr[0])
                         traverseGraph(neighbor, ptrie, G, pcurrent)
                 del pcurrent[-1]
-    
-    
+
+
         ptrie = Trie()
         path_l = [[n] for n in G.nodes]  # paths of length l
         path_l_str = self._paths2labelseqs(path_l, G)
@@ -448,18 +438,18 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
             ptrie.insertWord(p)
         for n in G.nodes:
             traverseGraph(n, ptrie, G, pcurrent=[n])
-    
-    
+
+
 #        def traverseGraph(root, all_paths, length, G, ds_attrs, node_label, edge_label,
 #                          pcurrent=[]):
 #            if len(pcurrent) < length + 1:
 #                for neighbor in G[root]:
 #                    if neighbor not in pcurrent:
 #                        pcurrent.append(neighbor)
-#                        plstr = paths2labelseqs([pcurrent], G, ds_attrs, 
+#                        plstr = paths2labelseqs([pcurrent], G, ds_attrs,
 #                                                node_label, edge_label)
 #                        all_paths.append(pcurrent[:])
-#                        traverseGraph(neighbor, all_paths, length, G, ds_attrs, 
+#                        traverseGraph(neighbor, all_paths, length, G, ds_attrs,
 #                                      node_label, edge_label, pcurrent)
 #                del pcurrent[-1]
 #
@@ -470,24 +460,24 @@ class PathUpToH(GraphKernel):  # @todo: add function for k_func is None
 ##        for p in path_l_str:
 ##            ptrie.insertWord(p)
 #        for n in G.nodes:
-#            traverseGraph(n, all_paths, length, G, ds_attrs, node_label, edge_label, 
+#            traverseGraph(n, all_paths, length, G, ds_attrs, node_label, edge_label,
 #                          pcurrent=[n])
-    
+
 #        print(ptrie.root)
         return ptrie
-    
-    
+
+
     def _wrapper_find_all_path_as_trie(self, itr_item):
         g = itr_item[0]
         i = itr_item[1]
         return i, self._find_all_path_as_trie(g)
-    
-    
+
+
     # @todo: (can be removed maybe) this method finds paths repetitively, it could be faster.
     def _find_all_paths_until_length(self, G, tolabelseqs=True):
-        """Find all paths no longer than a certain maximum length in a graph. A 
+        """Find all paths no longer than a certain maximum length in a graph. A
         recursive depth first search is applied.
-    
+
         Parameters
         ----------
         G : NetworkX graphs
             The graph in which paths are searched.
         length : integer
             The maximum length of paths.
         node_label : string
             Node attribute used as label. The default node label is atom.
         edge_label : string
             Edge attribute used as label. The default edge label is bond_type.
-    
+
         Return
         ------
         path : list
-            List of paths retrieved, where for unlabeled graphs, each path is 
-            represented by a list of nodes; while for labeled graphs, each path is 
-            represented by a list of strings consists of labels of nodes and/or 
+            List of paths retrieved, where for unlabeled graphs, each path is
+            represented by a list of nodes; while for labeled graphs, each path is
+            represented by a list of strings consisting of labels of nodes and/or
             edges on that path.
""" # path_l = [tuple([n]) for n in G.nodes] # paths of length l @@ -519,10 +509,10 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None # tmp = path + (neighbor, ) # if tuple(tmp[::-1]) not in path_l_new: # path_l_new.append(tuple(tmp)) - + # all_paths += path_l_new # path_l = path_l_new[:] - + path_l = [[n] for n in G.nodes] # paths of length l all_paths = [p.copy() for p in path_l] for l in range(1, self._depth + 1): @@ -533,28 +523,28 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None tmp = path + [neighbor] # if tmp[::-1] not in path_lplus1: path_lplus1.append(tmp) - + all_paths += path_lplus1 path_l = [p.copy() for p in path_lplus1] - + # for i in range(0, self._depth + 1): # new_paths = find_all_paths(G, i) # if new_paths == []: # break # all_paths.extend(new_paths) - + # consider labels # print(paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)) # print() return (self._paths2labelseqs(all_paths, G) if tolabelseqs else all_paths) - - + + def _wrapper_find_all_paths_until_length(self, tolabelseqs, itr_item): g = itr_item[0] i = itr_item[1] return i, self._find_all_paths_until_length(g, tolabelseqs=tolabelseqs) - - + + def _paths2labelseqs(self, plist, G): if len(self._node_labels) > 0: if len(self._edge_labels) > 0: @@ -589,8 +579,8 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func is None else: return [tuple(['0' for node in path]) for path in plist] # return [tuple([len(path)]) for path in all_paths] - - + + def _add_dummy_labels(self, Gn): if self._k_func is not None: if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): diff --git a/gklearn/kernels/shortest_path.py b/gklearn/kernels/shortest_path.py index 9e553a3..bfea553 100644 --- a/gklearn/kernels/shortest_path.py +++ b/gklearn/kernels/shortest_path.py @@ -15,7 +15,7 @@ import sys from itertools import product # from functools import partial from multiprocessing import Pool -from tqdm import tqdm +from gklearn.utils import get_iters import numpy as np import networkx as nx from gklearn.utils.parallel import parallel_gm, parallel_me @@ -38,10 +38,7 @@ class ShortestPath(GraphKernel): def _compute_gm_series(self): self._all_graphs_have_edges(self._graphs) # get shortest path graph of each graph. - if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='getting sp graphs', file=sys.stdout) - else: - iterator = self._graphs + iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] # compute Gram matrix. 
@@ -49,10 +46,9 @@ class ShortestPath(GraphKernel): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) - if self._verbose >= 2: - iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) - else: - iterator = itr + len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) + iterator = get_iters(itr, desc='Computing kernels', + length=len_itr, file=sys.stdout, verbose=(self._verbose >= 2)) for i, j in iterator: kernel = self._sp_do(self._graphs[i], self._graphs[j]) gram_matrix[i][j] = kernel @@ -71,11 +67,9 @@ class ShortestPath(GraphKernel): chunksize = int(len(self._graphs) / self._n_jobs) + 1 else: chunksize = 100 - if self._verbose >= 2: - iterator = tqdm(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), - desc='getting sp graphs', file=sys.stdout) - else: - iterator = pool.imap_unordered(get_sp_graphs_fun, itr, chunksize) + iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), + desc='getting sp graphs', file=sys.stdout, + length=len(self._graphs), verbose=(self._verbose >= 2)) for i, g in iterator: self._graphs[i] = g pool.close() @@ -98,18 +92,12 @@ class ShortestPath(GraphKernel): self._all_graphs_have_edges([g1] + g_list) # get shortest path graphs of g1 and each graph in g_list. g1 = getSPGraph(g1, edge_weight=self._edge_weight) - if self._verbose >= 2: - iterator = tqdm(g_list, desc='getting sp graphs', file=sys.stdout) - else: - iterator = g_list + iterator = get_iters(g_list, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2)) g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] # compute kernel list. kernel_list = [None] * len(g_list) - if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) - else: - iterator = range(len(g_list)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) for i in iterator: kernel = self._sp_do(g1, g_list[i]) kernel_list[i] = kernel @@ -128,11 +116,9 @@ class ShortestPath(GraphKernel): chunksize = int(len(g_list) / self._n_jobs) + 1 else: chunksize = 100 - if self._verbose >= 2: - iterator = tqdm(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), - desc='getting sp graphs', file=sys.stdout) - else: - iterator = pool.imap_unordered(get_sp_graphs_fun, itr, chunksize) + iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), + desc='getting sp graphs', file=sys.stdout, + length=len(g_list), verbose=(self._verbose >= 2)) for i, g in iterator: g_list[i] = g pool.close() diff --git a/gklearn/kernels/spectral_decomposition.py b/gklearn/kernels/spectral_decomposition.py index abb3dcd..561f632 100644 --- a/gklearn/kernels/spectral_decomposition.py +++ b/gklearn/kernels/spectral_decomposition.py @@ -5,13 +5,13 @@ Created on Thu Aug 20 16:12:45 2020 @author: ljia -@references: +@references: [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
""" import sys -from tqdm import tqdm +from gklearn.utils import get_iters import numpy as np import networkx as nx from scipy.sparse import kron @@ -20,12 +20,12 @@ from gklearn.kernels import RandomWalkMeta class SpectralDecomposition(RandomWalkMeta): - - + + def __init__(self, **kwargs): super().__init__(**kwargs) self._sub_kernel = kwargs.get('sub_kernel', None) - + def _compute_gm_series(self): self._check_edge_weight(self._graphs, self._verbose) @@ -33,18 +33,15 @@ class SpectralDecomposition(RandomWalkMeta): if self._verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') - + # compute Gram matrix. - gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + if self._q is None: # precompute the spectral decomposition of each graph. P_list = [] D_list = [] - if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='spectral decompose', file=sys.stdout) - else: - iterator = self._graphs + iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -60,42 +57,37 @@ class SpectralDecomposition(RandomWalkMeta): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) - if self._verbose >= 2: - iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) - else: - iterator = itr - + len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + for i, j in iterator: kernel = self._kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel) gram_matrix[i][j] = kernel gram_matrix[j][i] = kernel - + else: # @todo pass else: # @todo pass - + return gram_matrix - - + + def _compute_gm_imap_unordered(self): self._check_edge_weight(self._graphs, self._verbose) self._check_graphs(self._graphs) if self._verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') - + # compute Gram matrix. - gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + if self._q is None: # precompute the spectral decomposition of each graph. P_list = [] D_list = [] - if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='spectral decompose', file=sys.stdout) - else: - iterator = self._graphs + iterator = get_iters(self._graphs, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -106,45 +98,42 @@ class SpectralDecomposition(RandomWalkMeta): if self._p is None: # p is uniform distribution as default. q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in self._graphs] # @todo: parallel? 
- + def init_worker(q_T_list_toshare, P_list_toshare, D_list_toshare): global G_q_T_list, G_P_list, G_D_list G_q_T_list = q_T_list_toshare G_P_list = P_list_toshare G_D_list = D_list_toshare - - do_fun = self._wrapper_kernel_do - parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + + do_fun = self._wrapper_kernel_do + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose) else: # @todo pass else: # @todo pass - + return gram_matrix - - + + def _compute_kernel_list_series(self, g1, g_list): self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) if self._verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') - + # compute kernel list. kernel_list = [None] * len(g_list) - + if self._q is None: # precompute the spectral decomposition of each graph. A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() D1, P1 = np.linalg.eig(A1) P_list = [] D_list = [] - if self._verbose >= 2: - iterator = tqdm(g_list, desc='spectral decompose', file=sys.stdout) - else: - iterator = g_list + iterator = get_iters(g_list, desc='spectral decompose', file=sys.stdout, verbose=(self._verbose >= 2)) for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -156,33 +145,30 @@ class SpectralDecomposition(RandomWalkMeta): if self._p is None: # p is uniform distribution as default. q_T1 = 1 / nx.number_of_nodes(g1) q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] - if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) - else: - iterator = range(len(g_list)) - + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + for i in iterator: kernel = self._kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel) kernel_list[i] = kernel - + else: # @todo pass else: # @todo pass - + return kernel_list - - + + def _compute_kernel_list_imap_unordered(self, g1, g_list): self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) if self._verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') - + # compute kernel list. kernel_list = [None] * len(g_list) - + if self._q is None: # precompute the spectral decomposition of each graph. A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() @@ -204,7 +190,7 @@ class SpectralDecomposition(RandomWalkMeta): if self._p is None: # p is uniform distribution as default. q_T1 = 1 / nx.number_of_nodes(g1) q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] # @todo: parallel? 
- + def init_worker(q_T1_toshare, P1_toshare, D1_toshare, q_T_list_toshare, P_list_toshare, D_list_toshare): global G_q_T1, G_P1, G_D1, G_q_T_list, G_P_list, G_D_list G_q_T1 = q_T1_toshare @@ -214,34 +200,34 @@ class SpectralDecomposition(RandomWalkMeta): G_P_list = P_list_toshare G_D_list = D_list_toshare - do_fun = self._wrapper_kernel_list_do - - def func_assign(result, var_to_assign): + do_fun = self._wrapper_kernel_list_do + + def func_assign(result, var_to_assign): var_to_assign[result[0]] = result[1] itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) - + else: # @todo pass else: # @todo pass - + return kernel_list def _wrapper_kernel_list_do(self, itr): return itr, self._kernel_do(G_q_T1, G_q_T_list[itr], G_P1, G_P_list[itr], G_D1, G_D_list[itr], self._weight, self._sub_kernel) - - + + def _compute_single_kernel_series(self, g1, g2): self._check_edge_weight([g1] + [g2], self._verbose) self._check_graphs([g1] + [g2]) if self._verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') - + if self._q is None: # precompute the spectral decomposition of each graph. A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() @@ -257,10 +243,10 @@ class SpectralDecomposition(RandomWalkMeta): pass else: # @todo pass - - return kernel - - + + return kernel + + def _kernel_do(self, q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): # use uniform distribution if there is no prior knowledge. kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense() @@ -276,7 +262,7 @@ class SpectralDecomposition(RandomWalkMeta): kmiddle = np.linalg.inv(kmiddle) return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0] - + def _wrapper_kernel_do(self, itr): i = itr[0] j = itr[1] diff --git a/gklearn/kernels/sylvester_equation.py b/gklearn/kernels/sylvester_equation.py index bf9cb2d..9f8fc66 100644 --- a/gklearn/kernels/sylvester_equation.py +++ b/gklearn/kernels/sylvester_equation.py @@ -5,13 +5,13 @@ Created on Wed Aug 19 17:24:46 2020 @author: ljia -@references: +@references: [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. """ import sys -from tqdm import tqdm +from gklearn.utils import get_iters import numpy as np import networkx as nx from control import dlyap @@ -20,11 +20,11 @@ from gklearn.kernels import RandomWalkMeta class SylvesterEquation(RandomWalkMeta): - - + + def __init__(self, **kwargs): super().__init__(**kwargs) - + def _compute_gm_series(self): self._check_edge_weight(self._graphs, self._verbose) @@ -32,24 +32,21 @@ class SylvesterEquation(RandomWalkMeta): if self._verbose >= 2: import warnings warnings.warn('All labels are ignored.') - + lmda = self._weight - + # compute Gram matrix. - gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. 
- if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout) - else: - iterator = self._graphs + iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # # normalized adjacency matrices # A_wave_list = [] # for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout): - # A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose() + # A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose() # norm = A_tilde.sum(axis=0) # norm[norm == 0] = 1 # A_wave_list.append(A_tilde / norm) @@ -57,119 +54,105 @@ class SylvesterEquation(RandomWalkMeta): if self._p is None: # p is uniform distribution as default. from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) - if self._verbose >= 2: - iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) - else: - iterator = itr - + len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, length=len_itr, verbose=(self._verbose >= 2)) + for i, j in iterator: kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda) gram_matrix[i][j] = kernel gram_matrix[j][i] = kernel - + else: # @todo pass else: # @todo pass - + return gram_matrix - - + + def _compute_gm_imap_unordered(self): self._check_edge_weight(self._graphs, self._verbose) self._check_graphs(self._graphs) if self._verbose >= 2: import warnings warnings.warn('All labels are ignored.') - + # compute Gram matrix. - gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. - if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout) - else: - iterator = self._graphs + iterator = get_iters(self._graphs, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? if self._p is None: # p is uniform distribution as default. def init_worker(A_wave_list_toshare): global G_A_wave_list G_A_wave_list = A_wave_list_toshare - + do_fun = self._wrapper_kernel_do - - parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) else: # @todo pass else: # @todo pass - + return gram_matrix - - + + def _compute_kernel_list_series(self, g1, g_list): self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) if self._verbose >= 2: import warnings warnings.warn('All labels are ignored.') - + lmda = self._weight - + # compute kernel list. kernel_list = [None] * len(g_list) - + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. 
A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() - if self._verbose >= 2: - iterator = tqdm(g_list, desc='compute adjacency matrices', file=sys.stdout) - else: - iterator = g_list + iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] if self._p is None: # p is uniform distribution as default. - if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) - else: - iterator = range(len(g_list)) - + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) + for i in iterator: kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda) kernel_list[i] = kernel - + else: # @todo pass else: # @todo pass - + return kernel_list - - + + def _compute_kernel_list_imap_unordered(self, g1, g_list): self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) if self._verbose >= 2: import warnings warnings.warn('All labels are ignored.') - + # compute kernel list. kernel_list = [None] * len(g_list) - + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() - if self._verbose >= 2: - iterator = tqdm(g_list, desc='compute adjacency matrices', file=sys.stdout) - else: - iterator = g_list + iterator = get_iters(g_list, desc='compute adjacency matrices', file=sys.stdout, verbose=(self._verbose >= 2)) A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? if self._p is None: # p is uniform distribution as default. @@ -178,37 +161,37 @@ class SylvesterEquation(RandomWalkMeta): G_A_wave_1 = A_wave_1_toshare G_A_wave_list = A_wave_list_toshare - do_fun = self._wrapper_kernel_list_do - - def func_assign(result, var_to_assign): + do_fun = self._wrapper_kernel_list_do + + def func_assign(result, var_to_assign): var_to_assign[result[0]] = result[1] itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', + init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) - + else: # @todo pass else: # @todo pass - + return kernel_list def _wrapper_kernel_list_do(self, itr): return itr, self._kernel_do(G_A_wave_1, G_A_wave_list[itr], self._weight) - - + + def _compute_single_kernel_series(self, g1, g2): self._check_edge_weight([g1] + [g2], self._verbose) self._check_graphs([g1] + [g2]) if self._verbose >= 2: import warnings warnings.warn('All labels are ignored.') - + lmda = self._weight - + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. @@ -220,12 +203,12 @@ class SylvesterEquation(RandomWalkMeta): pass else: # @todo pass - - return kernel - - + + return kernel + + def _kernel_do(self, A_wave1, A_wave2, lmda): - + S = lmda * A_wave2 T_t = A_wave1 # use uniform distribution if there is no prior knowledge. 
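The body of `_kernel_do` continues in the next hunk; the elided lines between these hunks solve for X with `control.dlyap`, imported at the top of this module. A self-contained sketch on toy matrices, assuming python-control's three-argument Sylvester form, where X = dlyap(A, Q, C) solves A X Q - X + C = 0 (the toy inputs below are illustrative, not from the patch):

import numpy as np
from control import dlyap  # same dependency sylvester_equation.py imports

# Toy stand-ins for the transposed adjacency matrices A_wave1 / A_wave2.
A_wave1 = np.array([[0., 1.], [1., 0.]])
A_wave2 = np.array([[0., 1.], [1., 0.]])
lmda = 0.1

S = lmda * A_wave2
T_t = A_wave1
p_times = np.full((2, 2), 1 / 4)  # uniform initial distribution over the 4 node pairs

# X is then the fixed point X = p_times + lmda * A_wave2 @ X @ A_wave1
# of the random-walk recursion on the direct product graph.
X = dlyap(S, T_t, p_times)
assert np.allclose(S @ X @ T_t - X + p_times, np.zeros((2, 2)))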
@@ -237,8 +220,8 @@ class SylvesterEquation(RandomWalkMeta): # use uniform distribution if there is no prior knowledge. q_times = np.full((1, nb_pd), p_times_uni) return np.dot(q_times, X) - - + + def _wrapper_kernel_do(self, itr): i = itr[0] j = itr[1] diff --git a/gklearn/kernels/treelet.py b/gklearn/kernels/treelet.py index c27ebde..32cad43 100644 --- a/gklearn/kernels/treelet.py +++ b/gklearn/kernels/treelet.py @@ -5,15 +5,15 @@ Created on Mon Apr 13 18:02:46 2020 @author: ljia -@references: +@references: - [1] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in + [1] Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. """ import sys from multiprocessing import Pool -from tqdm import tqdm +from gklearn.utils import get_iters import numpy as np import networkx as nx from collections import Counter @@ -25,7 +25,7 @@ from gklearn.kernels import GraphKernel class Treelet(GraphKernel): - + def __init__(self, **kwargs): GraphKernel.__init__(self) self._node_labels = kwargs.get('node_labels', []) @@ -38,38 +38,35 @@ class Treelet(GraphKernel): def _compute_gm_series(self): self._add_dummy_labels(self._graphs) - - # get all canonical keys of all graphs before computing kernels to save + + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. canonkeys = [] - if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='getting canonkeys', file=sys.stdout) - else: - iterator = self._graphs + iterator = get_iters(self._graphs, desc='getting canonkeys', file=sys.stdout, + verbose=(self._verbose >= 2)) for g in iterator: canonkeys.append(self._get_canonkeys(g)) - + # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) - if self._verbose >= 2: - iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) - else: - iterator = itr + len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2) + iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout, + length=len_itr, verbose=(self._verbose >= 2)) for i, j in iterator: kernel = self._kernel_do(canonkeys[i], canonkeys[j]) gram_matrix[i][j] = kernel gram_matrix[j][i] = kernel # @todo: no directed graph considered? - + return gram_matrix - - + + def _compute_gm_imap_unordered(self): self._add_dummy_labels(self._graphs) - - # get all canonical keys of all graphs before computing kernels to save + + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. pool = Pool(self._n_jobs) itr = zip(self._graphs, range(0, len(self._graphs))) @@ -79,60 +76,52 @@ class Treelet(GraphKernel): chunksize = 100 canonkeys = [[] for _ in range(len(self._graphs))] get_fun = self._wrapper_get_canonkeys - if self._verbose >= 2: - iterator = tqdm(pool.imap_unordered(get_fun, itr, chunksize), - desc='getting canonkeys', file=sys.stdout) - else: - iterator = pool.imap_unordered(get_fun, itr, chunksize) + iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), + desc='getting canonkeys', file=sys.stdout, + length=len(self._graphs), verbose=(self._verbose >= 2)) for i, ck in iterator: canonkeys[i] = ck pool.close() pool.join() - + # compute Gram matrix. 
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + def init_worker(canonkeys_toshare): global G_canonkeys G_canonkeys = canonkeys_toshare do_fun = self._wrapper_kernel_do - parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose) - + return gram_matrix - - + + def _compute_kernel_list_series(self, g1, g_list): self._add_dummy_labels(g_list + [g1]) - - # get all canonical keys of all graphs before computing kernels to save + + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. canonkeys_1 = self._get_canonkeys(g1) canonkeys_list = [] - if self._verbose >= 2: - iterator = tqdm(g_list, desc='getting canonkeys', file=sys.stdout) - else: - iterator = g_list + iterator = get_iters(g_list, desc='getting canonkeys', file=sys.stdout, verbose=(self._verbose >= 2)) for g in iterator: canonkeys_list.append(self._get_canonkeys(g)) - + # compute kernel list. kernel_list = [None] * len(g_list) - if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) - else: - iterator = range(len(g_list)) + iterator = get_iters(range(len(g_list)), desc='Computing kernels', file=sys.stdout, length=len(g_list), verbose=(self._verbose >= 2)) for i in iterator: kernel = self._kernel_do(canonkeys_1, canonkeys_list[i]) kernel_list[i] = kernel - + return kernel_list - - + + def _compute_kernel_list_imap_unordered(self, g1, g_list): self._add_dummy_labels(g_list + [g1]) - - # get all canonical keys of all graphs before computing kernels to save + + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. canonkeys_1 = self._get_canonkeys(g1) canonkeys_list = [[] for _ in range(len(g_list))] @@ -143,16 +132,14 @@ class Treelet(GraphKernel): else: chunksize = 100 get_fun = self._wrapper_get_canonkeys - if self._verbose >= 2: - iterator = tqdm(pool.imap_unordered(get_fun, itr, chunksize), - desc='getting canonkeys', file=sys.stdout) - else: - iterator = pool.imap_unordered(get_fun, itr, chunksize) + iterator = get_iters(pool.imap_unordered(get_fun, itr, chunksize), + desc='getting canonkeys', file=sys.stdout, + length=len(g_list), verbose=(self._verbose >= 2)) for i, ck in iterator: canonkeys_list[i] = ck pool.close() pool.join() - + # compute kernel list. 
kernel_list = [None] * len(g_list) @@ -161,37 +148,37 @@ class Treelet(GraphKernel): G_ck_1 = ck_1_toshare G_ck_list = ck_list_toshare do_fun = self._wrapper_kernel_list_do - def func_assign(result, var_to_assign): + def func_assign(result, var_to_assign): var_to_assign[result[0]] = result[1] itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered', + init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) - + return kernel_list - - + + def _wrapper_kernel_list_do(self, itr): return itr, self._kernel_do(G_ck_1, G_ck_list[itr]) - - + + def _compute_single_kernel_series(self, g1, g2): self._add_dummy_labels([g1] + [g2]) canonkeys_1 = self._get_canonkeys(g1) canonkeys_2 = self._get_canonkeys(g2) kernel = self._kernel_do(canonkeys_1, canonkeys_2) - return kernel - - + return kernel + + def _kernel_do(self, canonkey1, canonkey2): """Compute treelet graph kernel between 2 graphs. - + Parameters ---------- canonkey1, canonkey2 : dict Dictionaries mapping the canonical keys found in the 2 graphs, each represented by a string, to their numbers of occurrences. - + Return ------ kernel : float @@ -199,38 +186,38 @@ class Treelet(GraphKernel): """ keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find canonical keys shared by both graphs vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) - vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) - kernel = self._sub_kernel(vector1, vector2) + vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) + kernel = self._sub_kernel(vector1, vector2) return kernel - - + + def _wrapper_kernel_do(self, itr): i = itr[0] j = itr[1] return i, j, self._kernel_do(G_canonkeys[i], G_canonkeys[j]) - - + + def _get_canonkeys(self, G): """Generate canonical keys of all treelets in a graph. - + Parameters ---------- G : NetworkX graphs The graph in which keys are generated. - + Return ------ canonkey/canonkey_l : dict - For unlabeled graphs, canonkey is a dictionary which records amount of - every tree pattern. For labeled graphs, canonkey_l is one which keeps + For unlabeled graphs, canonkey is a dictionary which records the number of + occurrences of every tree pattern. For labeled graphs, canonkey_l is one which keeps track of the number of occurrences of every treelet. """ patterns = {} # a dictionary which consists of lists of patterns for all graphlets. canonkey = {} # canonical key, a dictionary which records the count of every tree pattern. - + ### structural analysis ### - ### In this section, a list of patterns is generated for each graphlet, - ### where every pattern is represented by nodes ordered by Morgan's + ### In this section, a list of patterns is generated for each graphlet, + ### where every pattern is represented by nodes ordered by Morgan's ### extended labeling.
# linear patterns patterns['0'] = list(G.nodes()) @@ -238,16 +225,16 @@ class Treelet(GraphKernel): for i in range(1, 6): # for i in range(1, 6): patterns[str(i)] = find_all_paths(G, i, self._ds_infos['directed']) canonkey[str(i)] = len(patterns[str(i)]) - + # n-star patterns patterns['3star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3] patterns['4star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4] - patterns['5star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5] + patterns['5star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5] # n-star patterns canonkey['6'] = len(patterns['3star']) canonkey['8'] = len(patterns['4star']) canonkey['d'] = len(patterns['5star']) - + # pattern 7 patterns['7'] = [] # the 1st line of Table 1 in Ref [1] for pattern in patterns['3star']: @@ -261,7 +248,7 @@ class Treelet(GraphKernel): new_pattern = pattern_t + [neighborx] patterns['7'].append(new_pattern) canonkey['7'] = len(patterns['7']) - + # pattern 11 patterns['11'] = [] # the 4th line of Table 1 in Ref [1] for pattern in patterns['4star']: @@ -274,7 +261,7 @@ class Treelet(GraphKernel): new_pattern = pattern_t + [neighborx] patterns['11'].append(new_pattern) canonkey['b'] = len(patterns['11']) - + # pattern 12 patterns['12'] = [] # the 5th line of Table 1 in Ref [1] rootlist = [] # a list of root nodes, whose extended labels are 3 @@ -294,7 +281,7 @@ class Treelet(GraphKernel): # new_patterns = [ pattern + [neighborx1] + [neighborx2] for neighborx1 in G[pattern[i]] if neighborx1 != pattern[0] for neighborx2 in G[pattern[i]] if (neighborx1 > neighborx2 and neighborx2 != pattern[0]) ] patterns['12'].append(new_pattern) canonkey['c'] = int(len(patterns['12']) / 2) - + # pattern 9 patterns['9'] = [] # the 2nd line of Table 1 in Ref [1] for pattern in patterns['3star']: @@ -311,10 +298,10 @@ class Treelet(GraphKernel): new_pattern = pattern_t + [neighborx1] + [neighborx2] patterns['9'].append(new_pattern) canonkey['9'] = len(patterns['9']) - + # pattern 10 patterns['10'] = [] # the 3rd line of Table 1 in Ref [1] - for pattern in patterns['3star']: + for pattern in patterns['3star']: for i in range(1, len(pattern)): if G.degree(pattern[i]) >= 2: for neighborx in G[pattern[i]]: @@ -324,20 +311,20 @@ class Treelet(GraphKernel): new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ] patterns['10'].extend(new_patterns) canonkey['a'] = len(patterns['10']) - + ### labeling information ### - ### In this section, a list of canonical keys is generated for every - ### pattern obtained in the structural analysis section above, which is a + ### In this section, a list of canonical keys is generated for every + ### pattern obtained in the structural analysis section above, which is a ### string corresponding to a unique treelet. A dictionary is built to keep ### track of the amount of every treelet. if len(self._node_labels) > 0 or len(self._edge_labels) > 0: canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet. 
- + # linear patterns canonkey_t = Counter(get_mlti_dim_node_attrs(G, self._node_labels)) for key in canonkey_t: canonkey_l[('0', key)] = canonkey_t[key] - + for i in range(1, 6): # for i in range(1, 6): treelet = [] for pattern in patterns[str(i)]: @@ -349,7 +336,7 @@ class Treelet(GraphKernel): canonkey_t = canonlist if canonlist < canonlist[::-1] else canonlist[::-1] treelet.append(tuple([str(i)] + canonkey_t)) canonkey_l.update(Counter(treelet)) - + # n-star patterns for i in range(3, 6): treelet = [] @@ -361,12 +348,12 @@ class Treelet(GraphKernel): canonlist.append(tuple((nlabels, elabels))) canonlist.sort() canonlist = list(chain.from_iterable(canonlist)) - canonkey_t = tuple(['d' if i == 5 else str(i * 2)] + - [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonkey_t = tuple(['d' if i == 5 else str(i * 2)] + + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist) treelet.append(canonkey_t) canonkey_l.update(Counter(treelet)) - + # pattern 7 treelet = [] for pattern in patterns['7']: @@ -377,15 +364,15 @@ class Treelet(GraphKernel): canonlist.append(tuple((nlabels, elabels))) canonlist.sort() canonlist = list(chain.from_iterable(canonlist)) - canonkey_t = tuple(['7'] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist - + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonkey_t = tuple(['7'] + + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist + + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] - + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] + + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)]) treelet.append(canonkey_t) canonkey_l.update(Counter(treelet)) - + # pattern 11 treelet = [] for pattern in patterns['11']: @@ -396,15 +383,15 @@ class Treelet(GraphKernel): canonlist.append(tuple((nlabels, elabels))) canonlist.sort() canonlist = list(chain.from_iterable(canonlist)) - canonkey_t = tuple(['b'] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist - + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] + canonkey_t = tuple(['b'] + + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist + + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] + [tuple(G[pattern[4]][pattern[0]][el] for el in self._edge_labels)] - + [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)] + + [tuple(G.nodes[pattern[5]][nl] for nl in self._node_labels)] + [tuple(G[pattern[5]][pattern[4]][el] for el in self._edge_labels)]) treelet.append(canonkey_t) canonkey_l.update(Counter(treelet)) - + # pattern 10 treelet = [] for pattern in patterns['10']: @@ -418,15 +405,15 @@ class Treelet(GraphKernel): canonlist.sort() canonkey0 = list(chain.from_iterable(canonlist)) canonkey_t = tuple(['a'] - + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] - + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] + + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + + [tuple(G.nodes[pattern[4]][nl] for nl in self._node_labels)] + + [tuple(G[pattern[4]][pattern[3]][el] for el in self._edge_labels)] + + [tuple(G.nodes[pattern[0]][nl] for nl in 
self._node_labels)] + + [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] + canonkey4 + canonkey0) treelet.append(canonkey_t) canonkey_l.update(Counter(treelet)) - + # pattern 12 treelet = [] for pattern in patterns['12']: @@ -444,22 +431,22 @@ class Treelet(GraphKernel): canonlist3.append(tuple((nlabels, elabels))) canonlist3.sort() canonlist3 = list(chain.from_iterable(canonlist3)) - - # 2 possible key can be generated from 2 nodes with extended label 3, + + # 2 possible keys can be generated from 2 nodes with extended label 3, # select the one with the lower lexicographic order. - canonkey_t1 = tuple(['c'] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0 - + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] + canonkey_t1 = tuple(['c'] + + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonlist0 + + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + + [tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] + canonlist3) - canonkey_t2 = tuple(['c'] - + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3 - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] - + [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] + canonkey_t2 = tuple(['c'] + + [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels)] + canonlist3 + + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + + [tuple(G[pattern[0]][pattern[3]][el] for el in self._edge_labels)] + canonlist0) treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) canonkey_l.update(Counter(treelet)) - + # pattern 9 treelet = [] for pattern in patterns['9']: @@ -469,7 +456,7 @@ class Treelet(GraphKernel): tuple(G[pattern[5]][pattern[3]][el] for el in self._edge_labels)] prekey2 = [tuple(G.nodes[pattern[2]][nl] for nl in self._node_labels), tuple(G[pattern[2]][pattern[0]][el] for el in self._edge_labels)] - prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels), + prekey3 = [tuple(G.nodes[pattern[3]][nl] for nl in self._node_labels), tuple(G[pattern[3]][pattern[0]][el] for el in self._edge_labels)] if prekey2 + canonkey2 < prekey3 + canonkey3: canonkey_t = [tuple(G.nodes[pattern[1]][nl] for nl in self._node_labels)] \ @@ -480,21 +467,21 @@ class Treelet(GraphKernel): + [tuple(G[pattern[1]][pattern[0]][el] for el in self._edge_labels)] \ + prekey3 + prekey2 + canonkey3 + canonkey2 treelet.append(tuple(['9'] - + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + + [tuple(G.nodes[pattern[0]][nl] for nl in self._node_labels)] + canonkey_t)) canonkey_l.update(Counter(treelet)) - + return canonkey_l - + return canonkey - - + + def _wrapper_get_canonkeys(self, itr_item): g = itr_item[0] i = itr_item[1] return i, self._get_canonkeys(g) - - + + def _add_dummy_labels(self, Gn): if len(self._node_labels) == 0 or (len(self._node_labels) == 1 and self._node_labels[0] == SpecialLabel.DUMMY): for i in range(len(Gn)): diff --git a/gklearn/tests/test_graph_kernels.py b/gklearn/tests/test_graph_kernels.py index 00fcb28..8c593f1 100644 --- a/gklearn/tests/test_graph_kernels.py +++ b/gklearn/tests/test_graph_kernels.py @@ -555,5 +555,12 @@ if __name__ == "__main__": # test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') # test_RandomWalk('Acyclic', 'fp', None, None) # test_RandomWalk('Acyclic', 'spectral', 'exp', 'imap_unordered') - # test_CommonWalk('AIDS', 0.01, 'geo')
+ # test_CommonWalk('Acyclic', 0.01, 'geo') + # test_Marginalized('Acyclic', False) # test_ShortestPath('Acyclic') +# test_PathUpToH('Acyclic', 'MinMax') +# test_Treelet('Acyclic') +# test_SylvesterEquation('Acyclic') +# test_ConjugateGradient('Acyclic') +# test_FixedPoint('Acyclic') +# test_SpectralDecomposition('Acyclic', 'exp') \ No newline at end of file diff --git a/gklearn/utils/__init__.py b/gklearn/utils/__init__.py index 7c120f3..0461a78 100644 --- a/gklearn/utils/__init__.py +++ b/gklearn/utils/__init__.py @@ -25,3 +25,4 @@ from gklearn.utils.utils import normalize_gram_matrix, compute_distance_matrix from gklearn.utils.trie import Trie from gklearn.utils.knn import knn_cv, knn_classification from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel +from gklearn.utils.iters import get_iters diff --git a/gklearn/utils/iters.py b/gklearn/utils/iters.py new file mode 100644 index 0000000..190b602 --- /dev/null +++ b/gklearn/utils/iters.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Dec 24 10:35:26 2020 + +@author: ljia +""" + +from tqdm import tqdm +import math + + +def get_iters(iterable, desc=None, file=None, length=None, verbose=True, **kwargs): + """Wrap `iterable` in a tqdm progress bar when `verbose` is True; otherwise + return it unchanged. `length` supplies the size for iterables without len(). + """ + if verbose: + if 'miniters' not in kwargs: + if length is None: + try: + kwargs['miniters'] = math.ceil(len(iterable) / 100) + except TypeError: + # no len() (e.g. a generator): fall back to a fixed value. + kwargs['miniters'] = 100 + else: + kwargs['miniters'] = math.ceil(length / 100) + if 'maxinterval' not in kwargs: + kwargs['maxinterval'] = 600 + return tqdm(iterable, desc=desc, file=file, **kwargs) + else: + return iterable + + + +# class mytqdm(tqdm): + + +# def __init__(iterable=None, desc=None, total=None, leave=True, +# file=None, ncols=None, mininterval=0.1, maxinterval=10.0, +# miniters=None, ascii=None, disable=False, unit='it', +# unit_scale=False, dynamic_ncols=False, smoothing=0.3, +# bar_format=None, initial=0, position=None, postfix=None, +# unit_divisor=1000, write_bytes=None, lock_args=None, +# nrows=None, +# gui=False, **kwargs): +# if iterable is not None: +# miniters=math.ceil(len(iterable) / 100) +# maxinterval=600 +# super().__init__(iterable=iterable, desc=desc, total=total, leave=leave, +# file=file, ncols=ncols, mininterval=mininterval, maxinterval=maxinterval, +# miniters=miniters, ascii=ascii, disable=disable, unit=unit, +# unit_scale=unit_scale, dynamic_ncols=dynamic_ncols, smoothing=smoothing, +# bar_format=bar_format, initial=initial, position=position, postfix=postfix, +# unit_divisor=unit_divisor, write_bytes=write_bytes, lock_args=lock_args, +# nrows=nrows, +# gui=gui, **kwargs) + +# tqdm = mytqdm
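For reference, a minimal usage sketch of the new helper, showing the pattern this patch applies throughout the kernel classes (the toy graph list and verbosity value here are illustrative, not from the patch):

import sys
import networkx as nx
from gklearn.utils import get_iters

graphs = [nx.path_graph(n) for n in range(2, 6)]  # toy input graphs
verbose = 2

# Old pattern: if verbose >= 2: iterator = tqdm(graphs, ...); else: iterator = graphs.
# New pattern: one call; get_iters returns the bare iterable when verbose is False.
iterator = get_iters(graphs, desc='getting sp graphs', file=sys.stdout,
                     verbose=(verbose >= 2))
for g in iterator:
    _ = nx.adjacency_matrix(g).todense()  # stand-in for the per-graph work

For iterables without len() (pool.imap_unordered, combinations_with_replacement), callers pass length= so a sensible miniters can still be derived.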