From 9710c639c535c977779c7f4537cae6e5a3013dee Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 29 Sep 2020 10:58:55 +0200 Subject: [PATCH 1/5] Add the FixPoint graph kernel class. --- gklearn/kernels/fixed_point.py | 245 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 gklearn/kernels/fixed_point.py diff --git a/gklearn/kernels/fixed_point.py b/gklearn/kernels/fixed_point.py new file mode 100644 index 0000000..4eeeb8a --- /dev/null +++ b/gklearn/kernels/fixed_point.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Aug 20 16:09:51 2020 + +@author: ljia + +@references: + + [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. +""" + +import sys +from tqdm import tqdm +import numpy as np +import networkx as nx +from control import dlyap +from gklearn.utils.parallel import parallel_gm, parallel_me +from gklearn.kernels import RandomWalk + + +class FixedPoint(RandomWalk): + + + def __init__(self, **kwargs): + RandomWalk.__init__(self, **kwargs) + + + def _compute_gm_series(self): + self._check_edge_weight(self._graphs) + self._check_graphs(self._graphs) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored.') + + lmda = self._weight + + # compute Gram matrix. + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + if self._q == None: + # don't normalize adjacency matrices if q is a uniform vector. Note + # A_wave_list actually contains the transposes of the adjacency matrices. + if self._verbose >= 2: + iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout) + else: + iterator = self._graphs + A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] + # # normalized adjacency matrices + # A_wave_list = [] + # for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout): + # A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose() + # norm = A_tilde.sum(axis=0) + # norm[norm == 0] = 1 + # A_wave_list.append(A_tilde / norm) + + if self._p == None: # p is uniform distribution as default. + from itertools import combinations_with_replacement + itr = combinations_with_replacement(range(0, len(self._graphs)), 2) + if self._verbose >= 2: + iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + else: + iterator = itr + + for i, j in iterator: + kernel = self.__kernel_do(A_wave_list[i], A_wave_list[j], lmda) + gram_matrix[i][j] = kernel + gram_matrix[j][i] = kernel + + else: # @todo + pass + else: # @todo + pass + + return gram_matrix + + + def _compute_gm_imap_unordered(self): + self._check_edge_weight(self._graphs) + self._check_graphs(self._graphs) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored.') + + # compute Gram matrix. + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + if self._q == None: + # don't normalize adjacency matrices if q is a uniform vector. Note + # A_wave_list actually contains the transposes of the adjacency matrices. + if self._verbose >= 2: + iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout) + else: + iterator = self._graphs + A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? + + if self._p == None: # p is uniform distribution as default. 
+ def init_worker(A_wave_list_toshare): + global G_A_wave_list + G_A_wave_list = A_wave_list_toshare + + do_fun = self._wrapper_kernel_do + + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) + + else: # @todo + pass + else: # @todo + pass + + return gram_matrix + + + def _compute_kernel_list_series(self, g1, g_list): + self._check_edge_weight(g_list + [g1]) + self._check_graphs(g_list + [g1]) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored.') + + lmda = self._weight + + # compute kernel list. + kernel_list = [None] * len(g_list) + + if self._q == None: + # don't normalize adjacency matrices if q is a uniform vector. Note + # A_wave_list actually contains the transposes of the adjacency matrices. + A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() + if self._verbose >= 2: + iterator = tqdm(range(len(g_list)), desc='compute adjacency matrices', file=sys.stdout) + else: + iterator = range(len(g_list)) + A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] + + if self._p == None: # p is uniform distribution as default. + if self._verbose >= 2: + iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + else: + iterator = range(len(g_list)) + + for i in iterator: + kernel = self.__kernel_do(A_wave_1, A_wave_list[i], lmda) + kernel_list[i] = kernel + + else: # @todo + pass + else: # @todo + pass + + return kernel_list + + + def _compute_kernel_list_imap_unordered(self, g1, g_list): + self._check_edge_weight(g_list + [g1]) + self._check_graphs(g_list + [g1]) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored.') + + # compute kernel list. + kernel_list = [None] * len(g_list) + + if self._q == None: + # don't normalize adjacency matrices if q is a uniform vector. Note + # A_wave_list actually contains the transposes of the adjacency matrices. + A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() + if self._verbose >= 2: + iterator = tqdm(range(len(g_list)), desc='compute adjacency matrices', file=sys.stdout) + else: + iterator = range(len(g_list)) + A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? + + if self._p == None: # p is uniform distribution as default. + def init_worker(A_wave_1_toshare, A_wave_list_toshare): + global G_A_wave_1, G_A_wave_list + G_A_wave_1 = A_wave_1_toshare + G_A_wave_list = A_wave_list_toshare + + do_fun = self._wrapper_kernel_list_do + + def func_assign(result, var_to_assign): + var_to_assign[result[0]] = result[1] + itr = range(len(g_list)) + len_itr = len(g_list) + parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, + init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', + n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + + else: # @todo + pass + else: # @todo + pass + + return kernel_list + + + def _wrapper_kernel_list_do(self, itr): + return itr, self._kernel_do(G_A_wave_1, G_A_wave_list[itr], self._weight) + + + def _compute_single_kernel_series(self, g1, g2): + self._check_edge_weight([g1] + [g2]) + self._check_graphs([g1] + [g2]) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored.') + + lmda = self._weight + + if self._q == None: + # don't normalize adjacency matrices if q is a uniform vector. 
Note + # A_wave_list actually contains the transposes of the adjacency matrices. + A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() + A_wave_2 = nx.adjacency_matrix(g2, self._edge_weight).todense().transpose() + if self._p == None: # p is uniform distribution as default. + kernel = self.__kernel_do(A_wave_1, A_wave_2, lmda) + else: # @todo + pass + else: # @todo + pass + + return kernel + + + def __kernel_do(self, A_wave1, A_wave2, lmda): + + S = lmda * A_wave2 + T_t = A_wave1 + # use uniform distribution if there is no prior knowledge. + nb_pd = len(A_wave1) * len(A_wave2) + p_times_uni = 1 / nb_pd + M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni) + X = dlyap(S, T_t, M0) + X = np.reshape(X, (-1, 1), order='F') + # use uniform distribution if there is no prior knowledge. + q_times = np.full((1, nb_pd), p_times_uni) + return np.dot(q_times, X) + + + def _wrapper_kernel_do(self, itr): + i = itr[0] + j = itr[1] + return i, j, self.__kernel_do(G_A_wave_list[i], G_A_wave_list[j], self._weight) \ No newline at end of file From c898c62eb85b19123c7fd54609e732c0b340d7a3 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 29 Sep 2020 10:58:59 +0200 Subject: [PATCH 2/5] Add the chunksize argument to the function version of graph kernels. --- gklearn/kernels/commonWalkKernel.py | 799 +++++++------- gklearn/kernels/marginalizedKernel.py | 544 +++++----- gklearn/kernels/path_up_to_h.py | 12 +- gklearn/kernels/randomWalkKernel.py | 1647 +++++++++++++++-------------- gklearn/kernels/spKernel.py | 586 +++++----- gklearn/kernels/structuralspKernel.py | 1572 +++++++++++++-------------- gklearn/kernels/treeletKernel.py | 12 +- gklearn/kernels/untilHPathKernel.py | 1320 +++++++++++------------ gklearn/kernels/weisfeilerLehmanKernel.py | 11 +- gklearn/utils/parallel.py | 4 +- 10 files changed, 3265 insertions(+), 3242 deletions(-) diff --git a/gklearn/kernels/commonWalkKernel.py b/gklearn/kernels/commonWalkKernel.py index 56eec30..a5f9cb1 100644 --- a/gklearn/kernels/commonWalkKernel.py +++ b/gklearn/kernels/commonWalkKernel.py @@ -3,9 +3,9 @@ @references: - [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: - Hardness results and efficient alternatives. Learning Theory and Kernel - Machines, pages 129–143, 2003. + [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: + Hardness results and efficient alternatives. Learning Theory and Kernel + Machines, pages 129–143, 2003. """ import sys @@ -22,428 +22,429 @@ from gklearn.utils.parallel import parallel_gm def commonwalkkernel(*args, - node_label='atom', - edge_label='bond_type', -# n=None, - weight=1, - compute_method=None, - n_jobs=None, - verbose=True): - """Calculate common walk graph kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - - G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. - node_label : string - Node attribute used as symbolic label. The default node label is 'atom'. - edge_label : string - Edge attribute used as symbolic label. The default edge label is 'bond_type'. - weight: integer - Weight coefficient of different lengths of walks, which represents beta - in 'exp' method and gamma in 'geo'. - compute_method : string - Method used to compute walk kernel. The Following choices are - available: - - 'exp': method based on exponential serials applied on the direct - product graph, as shown in reference [1]. 
The time complexity is O(n^6) - for graphs with n vertices. - - 'geo': method based on geometric serials applied on the direct product - graph, as shown in reference [1]. The time complexity is O(n^6) for - graphs with n vertices. - - n_jobs : int - Number of jobs for parallelization. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is a common walk kernel between 2 - graphs. - """ -# n : integer -# Longest length of walks. Only useful when applying the 'brute' method. -# 'brute': brute force, simply search for all walks and compare them. - compute_method = compute_method.lower() - # arrange all graphs in a list - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - - # remove graphs with only 1 node, as they do not have adjacency matrices - len_gn = len(Gn) - Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1] - idx = [G[0] for G in Gn] - Gn = [G[1] for G in Gn] - if len(Gn) != len_gn: - if verbose: - print('\n %d graphs are removed as they have only 1 node.\n' % - (len_gn - len(Gn))) - - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'edge_labeled', 'is_directed'], - node_label=node_label, edge_label=edge_label) - if not ds_attrs['node_labeled']: - for G in Gn: - nx.set_node_attributes(G, '0', 'atom') - if not ds_attrs['edge_labeled']: - for G in Gn: - nx.set_edge_attributes(G, '0', 'bond_type') - if not ds_attrs['is_directed']: # convert - Gn = [G.to_directed() for G in Gn] - - start_time = time.time() - - Kmatrix = np.zeros((len(Gn), len(Gn))) - - # ---- use pool.imap_unordered to parallel and track progress. ---- - def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - # direct product graph method - exponential - if compute_method == 'exp': - do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) - # direct product graph method - geometric - elif compute_method == 'geo': - do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs, verbose=verbose) - - -# pool = Pool(n_jobs) -# itr = zip(combinations_with_replacement(Gn, 2), -# combinations_with_replacement(range(0, len(Gn)), 2)) -# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) -# if len_itr < 1000 * n_jobs: -# chunksize = int(len_itr / n_jobs) + 1 -# else: -# chunksize = 1000 + node_label='atom', + edge_label='bond_type', +# n=None, + weight=1, + compute_method=None, + n_jobs=None, + chunksize=None, + verbose=True): + """Calculate common walk graph kernels between graphs. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are calculated. + + G1, G2 : NetworkX graphs + Two graphs between which the kernel is calculated. + node_label : string + Node attribute used as symbolic label. The default node label is 'atom'. + edge_label : string + Edge attribute used as symbolic label. The default edge label is 'bond_type'. + weight: integer + Weight coefficient of different lengths of walks, which represents beta + in 'exp' method and gamma in 'geo'. + compute_method : string + Method used to compute walk kernel. The Following choices are + available: + + 'exp': method based on exponential serials applied on the direct + product graph, as shown in reference [1]. The time complexity is O(n^6) + for graphs with n vertices. + + 'geo': method based on geometric serials applied on the direct product + graph, as shown in reference [1]. The time complexity is O(n^6) for + graphs with n vertices. 
+ + n_jobs : int + Number of jobs for parallelization. + + Return + ------ + Kmatrix : Numpy matrix + Kernel matrix, each element of which is a common walk kernel between 2 + graphs. + """ +# n : integer +# Longest length of walks. Only useful when applying the 'brute' method. +# 'brute': brute force, simply search for all walks and compare them. + compute_method = compute_method.lower() + # arrange all graphs in a list + Gn = args[0] if len(args) == 1 else [args[0], args[1]] + + # remove graphs with only 1 node, as they do not have adjacency matrices + len_gn = len(Gn) + Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1] + idx = [G[0] for G in Gn] + Gn = [G[1] for G in Gn] + if len(Gn) != len_gn: + if verbose: + print('\n %d graphs are removed as they have only 1 node.\n' % + (len_gn - len(Gn))) + + ds_attrs = get_dataset_attributes( + Gn, + attr_names=['node_labeled', 'edge_labeled', 'is_directed'], + node_label=node_label, edge_label=edge_label) + if not ds_attrs['node_labeled']: + for G in Gn: + nx.set_node_attributes(G, '0', 'atom') + if not ds_attrs['edge_labeled']: + for G in Gn: + nx.set_edge_attributes(G, '0', 'bond_type') + if not ds_attrs['is_directed']: # convert + Gn = [G.to_directed() for G in Gn] + + start_time = time.time() + + Kmatrix = np.zeros((len(Gn), len(Gn))) + + # ---- use pool.imap_unordered to parallel and track progress. ---- + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + # direct product graph method - exponential + if compute_method == 'exp': + do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) + # direct product graph method - geometric + elif compute_method == 'geo': + do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + + +# pool = Pool(n_jobs) +# itr = zip(combinations_with_replacement(Gn, 2), +# combinations_with_replacement(range(0, len(Gn)), 2)) +# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) +# if len_itr < 1000 * n_jobs: +# chunksize = int(len_itr / n_jobs) + 1 +# else: +# chunksize = 1000 # -# # direct product graph method - exponential -# if compute_method == 'exp': -# do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) -# # direct product graph method - geometric -# elif compute_method == 'geo': -# do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) +# # direct product graph method - exponential +# if compute_method == 'exp': +# do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight) +# # direct product graph method - geometric +# elif compute_method == 'geo': +# do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight) # -# for i, j, kernel in tqdm( -# pool.imap_unordered(do_partial, itr, chunksize), -# desc='calculating kernels', -# file=sys.stdout): -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel -# pool.close() -# pool.join() - - -# # ---- direct running, normally use single CPU core. 
---- -# # direct product graph method - exponential -# itr = combinations_with_replacement(range(0, len(Gn)), 2) -# if compute_method == 'exp': -# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): -# Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label, -# edge_label, weight) -# Kmatrix[j][i] = Kmatrix[i][j] +# for i, j, kernel in tqdm( +# pool.imap_unordered(do_partial, itr, chunksize), +# desc='calculating kernels', +# file=sys.stdout): +# Kmatrix[i][j] = kernel +# Kmatrix[j][i] = kernel +# pool.close() +# pool.join() + + +# # ---- direct running, normally use single CPU core. ---- +# # direct product graph method - exponential +# itr = combinations_with_replacement(range(0, len(Gn)), 2) +# if compute_method == 'exp': +# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): +# Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label, +# edge_label, weight) +# Kmatrix[j][i] = Kmatrix[i][j] # -# # direct product graph method - geometric -# elif compute_method == 'geo': -# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): -# Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label, -# edge_label, weight) -# Kmatrix[j][i] = Kmatrix[i][j] - - -# # search all paths use brute force. -# elif compute_method == 'brute': -# n = int(n) -# # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset. -# all_walks = [ -# find_all_walks_until_length(Gn[i], n, node_label, edge_label) -# for i in range(0, len(Gn)) -# ] +# # direct product graph method - geometric +# elif compute_method == 'geo': +# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): +# Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label, +# edge_label, weight) +# Kmatrix[j][i] = Kmatrix[i][j] + + +# # search all paths use brute force. +# elif compute_method == 'brute': +# n = int(n) +# # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset. +# all_walks = [ +# find_all_walks_until_length(Gn[i], n, node_label, edge_label) +# for i in range(0, len(Gn)) +# ] # -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# Kmatrix[i][j] = _commonwalkkernel_brute( -# all_walks[i], -# all_walks[j], -# node_label=node_label, -# edge_label=edge_label) -# Kmatrix[j][i] = Kmatrix[i][j] +# for i in range(0, len(Gn)): +# for j in range(i, len(Gn)): +# Kmatrix[i][j] = _commonwalkkernel_brute( +# all_walks[i], +# all_walks[j], +# node_label=node_label, +# edge_label=edge_label) +# Kmatrix[j][i] = Kmatrix[i][j] - run_time = time.time() - start_time - if verbose: - print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---" - % (len(Gn), run_time)) + run_time = time.time() - start_time + if verbose: + print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---" + % (len(Gn), run_time)) - return Kmatrix, run_time, idx + return Kmatrix, run_time, idx def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta): - """Calculate walk graph kernels up to n between 2 graphs using exponential - series. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - node_label : string - Node attribute used as label. - edge_label : string - Edge attribute used as label. - beta : integer - Weight. - ij : tuple of integer - Index of graphs between which the kernel is computed. 
- - Return - ------ - kernel : float - The common walk Kernel between 2 graphs. - """ - - # get tensor product / direct product - gp = direct_product(g1, g2, node_label, edge_label) - # return 0 if the direct product graph have no more than 1 node. - if nx.number_of_nodes(gp) < 2: - return 0 - A = nx.adjacency_matrix(gp).todense() - # print(A) - - # from matplotlib import pyplot as plt - # nx.draw_networkx(G1) - # plt.show() - # nx.draw_networkx(G2) - # plt.show() - # nx.draw_networkx(gp) - # plt.show() - # print(G1.nodes(data=True)) - # print(G2.nodes(data=True)) - # print(gp.nodes(data=True)) - # print(gp.edges(data=True)) - - ew, ev = np.linalg.eig(A) - # print('ew: ', ew) - # print(ev) - # T = np.matrix(ev) - # print('T: ', T) - # T = ev.I - D = np.zeros((len(ew), len(ew))) - for i in range(len(ew)): - D[i][i] = np.exp(beta * ew[i]) - # print('D: ', D) - # print('hshs: ', T.I * D * T) - - # print(np.exp(-2)) - # print(D) - # print(np.exp(weight * D)) - # print(ev) - # print(np.linalg.inv(ev)) - exp_D = ev * D * ev.T - # print(exp_D) - # print(np.exp(weight * A)) - # print('-------') - - return exp_D.sum() + """Calculate walk graph kernels up to n between 2 graphs using exponential + series. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are calculated. + node_label : string + Node attribute used as label. + edge_label : string + Edge attribute used as label. + beta : integer + Weight. + ij : tuple of integer + Index of graphs between which the kernel is computed. + + Return + ------ + kernel : float + The common walk Kernel between 2 graphs. + """ + + # get tensor product / direct product + gp = direct_product(g1, g2, node_label, edge_label) + # return 0 if the direct product graph have no more than 1 node. + if nx.number_of_nodes(gp) < 2: + return 0 + A = nx.adjacency_matrix(gp).todense() + # print(A) + + # from matplotlib import pyplot as plt + # nx.draw_networkx(G1) + # plt.show() + # nx.draw_networkx(G2) + # plt.show() + # nx.draw_networkx(gp) + # plt.show() + # print(G1.nodes(data=True)) + # print(G2.nodes(data=True)) + # print(gp.nodes(data=True)) + # print(gp.edges(data=True)) + + ew, ev = np.linalg.eig(A) + # print('ew: ', ew) + # print(ev) + # T = np.matrix(ev) + # print('T: ', T) + # T = ev.I + D = np.zeros((len(ew), len(ew))) + for i in range(len(ew)): + D[i][i] = np.exp(beta * ew[i]) + # print('D: ', D) + # print('hshs: ', T.I * D * T) + + # print(np.exp(-2)) + # print(D) + # print(np.exp(weight * D)) + # print(ev) + # print(np.linalg.inv(ev)) + exp_D = ev * D * ev.T + # print(exp_D) + # print(np.exp(weight * A)) + # print('-------') + + return exp_D.sum() def wrapper_cw_exp(node_label, edge_label, beta, itr): - i = itr[0] - j = itr[1] - return i, j, _commonwalkkernel_exp(G_gn[i], G_gn[j], node_label, edge_label, beta) + i = itr[0] + j = itr[1] + return i, j, _commonwalkkernel_exp(G_gn[i], G_gn[j], node_label, edge_label, beta) def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma): - """Calculate common walk graph kernels up to n between 2 graphs using - geometric series. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - node_label : string - Node attribute used as label. - edge_label : string - Edge attribute used as label. - gamma: integer - Weight. - ij : tuple of integer - Index of graphs between which the kernel is computed. - - Return - ------ - kernel : float - The common walk Kernel between 2 graphs. 
- """ - # get tensor product / direct product - gp = direct_product(g1, g2, node_label, edge_label) - # return 0 if the direct product graph have no more than 1 node. - if nx.number_of_nodes(gp) < 2: - return 0 - A = nx.adjacency_matrix(gp).todense() - mat = np.identity(len(A)) - gamma * A -# try: - return mat.I.sum() -# except np.linalg.LinAlgError: -# return np.nan - - + """Calculate common walk graph kernels up to n between 2 graphs using + geometric series. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are calculated. + node_label : string + Node attribute used as label. + edge_label : string + Edge attribute used as label. + gamma: integer + Weight. + ij : tuple of integer + Index of graphs between which the kernel is computed. + + Return + ------ + kernel : float + The common walk Kernel between 2 graphs. + """ + # get tensor product / direct product + gp = direct_product(g1, g2, node_label, edge_label) + # return 0 if the direct product graph have no more than 1 node. + if nx.number_of_nodes(gp) < 2: + return 0 + A = nx.adjacency_matrix(gp).todense() + mat = np.identity(len(A)) - gamma * A +# try: + return mat.I.sum() +# except np.linalg.LinAlgError: +# return np.nan + + def wrapper_cw_geo(node_label, edge_label, gama, itr): - i = itr[0] - j = itr[1] - return i, j, _commonwalkkernel_geo(G_gn[i], G_gn[j], node_label, edge_label, gama) + i = itr[0] + j = itr[1] + return i, j, _commonwalkkernel_geo(G_gn[i], G_gn[j], node_label, edge_label, gama) def _commonwalkkernel_brute(walks1, - walks2, - node_label='atom', - edge_label='bond_type', - labeled=True): - """Calculate walk graph kernels up to n between 2 graphs. - - Parameters - ---------- - walks1, walks2 : list - List of walks in 2 graphs, where for unlabeled graphs, each walk is - represented by a list of nodes; while for labeled graphs, each walk is - represented by a string consists of labels of nodes and edges on that - walk. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - labeled : boolean - Whether the graphs are labeled. The default is True. - - Return - ------ - kernel : float - Treelet Kernel between 2 graphs. - """ - counts_walks1 = dict(Counter(walks1)) - counts_walks2 = dict(Counter(walks2)) - all_walks = list(set(walks1 + walks2)) - - vector1 = [(counts_walks1[walk] if walk in walks1 else 0) - for walk in all_walks] - vector2 = [(counts_walks2[walk] if walk in walks2 else 0) - for walk in all_walks] - kernel = np.dot(vector1, vector2) - - return kernel + walks2, + node_label='atom', + edge_label='bond_type', + labeled=True): + """Calculate walk graph kernels up to n between 2 graphs. + + Parameters + ---------- + walks1, walks2 : list + List of walks in 2 graphs, where for unlabeled graphs, each walk is + represented by a list of nodes; while for labeled graphs, each walk is + represented by a string consists of labels of nodes and edges on that + walk. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. + labeled : boolean + Whether the graphs are labeled. The default is True. + + Return + ------ + kernel : float + Treelet Kernel between 2 graphs. 
+ """ + counts_walks1 = dict(Counter(walks1)) + counts_walks2 = dict(Counter(walks2)) + all_walks = list(set(walks1 + walks2)) + + vector1 = [(counts_walks1[walk] if walk in walks1 else 0) + for walk in all_walks] + vector2 = [(counts_walks2[walk] if walk in walks2 else 0) + for walk in all_walks] + kernel = np.dot(vector1, vector2) + + return kernel # this method find walks repetively, it could be faster. def find_all_walks_until_length(G, - length, - node_label='atom', - edge_label='bond_type', - labeled=True): - """Find all walks with a certain maximum length in a graph. - A recursive depth first search is applied. - - Parameters - ---------- - G : NetworkX graphs - The graph in which walks are searched. - length : integer - The maximum length of walks. - node_label : string - node attribute used as label. The default node label is atom. - edge_label : string - edge attribute used as label. The default edge label is bond_type. - labeled : boolean - Whether the graphs are labeled. The default is True. - - Return - ------ - walk : list - List of walks retrieved, where for unlabeled graphs, each walk is - represented by a list of nodes; while for labeled graphs, each walk - is represented by a string consists of labels of nodes and edges on - that walk. - """ - all_walks = [] - # @todo: in this way, the time complexity is close to N(d^n+d^(n+1)+...+1), which could be optimized to O(Nd^n) - for i in range(0, length + 1): - new_walks = find_all_walks(G, i) - if new_walks == []: - break - all_walks.extend(new_walks) - - if labeled == True: # convert paths to strings - walk_strs = [] - for walk in all_walks: - strlist = [ - G.node[node][node_label] + - G[node][walk[walk.index(node) + 1]][edge_label] - for node in walk[:-1] - ] - walk_strs.append(''.join(strlist) + G.node[walk[-1]][node_label]) - - return walk_strs - - return all_walks + length, + node_label='atom', + edge_label='bond_type', + labeled=True): + """Find all walks with a certain maximum length in a graph. + A recursive depth first search is applied. + + Parameters + ---------- + G : NetworkX graphs + The graph in which walks are searched. + length : integer + The maximum length of walks. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. + labeled : boolean + Whether the graphs are labeled. The default is True. + + Return + ------ + walk : list + List of walks retrieved, where for unlabeled graphs, each walk is + represented by a list of nodes; while for labeled graphs, each walk + is represented by a string consists of labels of nodes and edges on + that walk. + """ + all_walks = [] + # @todo: in this way, the time complexity is close to N(d^n+d^(n+1)+...+1), which could be optimized to O(Nd^n) + for i in range(0, length + 1): + new_walks = find_all_walks(G, i) + if new_walks == []: + break + all_walks.extend(new_walks) + + if labeled == True: # convert paths to strings + walk_strs = [] + for walk in all_walks: + strlist = [ + G.node[node][node_label] + + G[node][walk[walk.index(node) + 1]][edge_label] + for node in walk[:-1] + ] + walk_strs.append(''.join(strlist) + G.node[walk[-1]][node_label]) + + return walk_strs + + return all_walks def find_walks(G, source_node, length): - """Find all walks with a certain length those start from a source node. A - recursive depth first search is applied. - - Parameters - ---------- - G : NetworkX graphs - The graph in which walks are searched. 
- source_node : integer - The number of the node from where all walks start. - length : integer - The length of walks. - - Return - ------ - walk : list of list - List of walks retrieved, where each walk is represented by a list of - nodes. - """ - return [[source_node]] if length == 0 else \ - [[source_node] + walk for neighbor in G[source_node] - for walk in find_walks(G, neighbor, length - 1)] + """Find all walks with a certain length those start from a source node. A + recursive depth first search is applied. + + Parameters + ---------- + G : NetworkX graphs + The graph in which walks are searched. + source_node : integer + The number of the node from where all walks start. + length : integer + The length of walks. + + Return + ------ + walk : list of list + List of walks retrieved, where each walk is represented by a list of + nodes. + """ + return [[source_node]] if length == 0 else \ + [[source_node] + walk for neighbor in G[source_node] + for walk in find_walks(G, neighbor, length - 1)] def find_all_walks(G, length): - """Find all walks with a certain length in a graph. A recursive depth first - search is applied. - - Parameters - ---------- - G : NetworkX graphs - The graph in which walks are searched. - length : integer - The length of walks. - - Return - ------ - walk : list of list - List of walks retrieved, where each walk is represented by a list of - nodes. - """ - all_walks = [] - for node in G: - all_walks.extend(find_walks(G, node, length)) - - # The following process is not carried out according to the original article - # all_paths_r = [ path[::-1] for path in all_paths ] - - # # For each path, two presentation are retrieved from its two extremities. Remove one of them. - # for idx, path in enumerate(all_paths[:-1]): - # for path2 in all_paths_r[idx+1::]: - # if path == path2: - # all_paths[idx] = [] - # break - - # return list(filter(lambda a: a != [], all_paths)) - return all_walks + """Find all walks with a certain length in a graph. A recursive depth first + search is applied. + + Parameters + ---------- + G : NetworkX graphs + The graph in which walks are searched. + length : integer + The length of walks. + + Return + ------ + walk : list of list + List of walks retrieved, where each walk is represented by a list of + nodes. + """ + all_walks = [] + for node in G: + all_walks.extend(find_walks(G, node, length)) + + # The following process is not carried out according to the original article + # all_paths_r = [ path[::-1] for path in all_paths ] + + # # For each path, two presentation are retrieved from its two extremities. Remove one of them. + # for idx, path in enumerate(all_paths[:-1]): + # for path2 in all_paths_r[idx+1::]: + # if path == path2: + # all_paths[idx] = [] + # break + + # return list(filter(lambda a: a != [], all_paths)) + return all_walks diff --git a/gklearn/kernels/marginalizedKernel.py b/gklearn/kernels/marginalizedKernel.py index 9915445..950f1a6 100644 --- a/gklearn/kernels/marginalizedKernel.py +++ b/gklearn/kernels/marginalizedKernel.py @@ -3,14 +3,14 @@ @references: - [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between - labeled graphs. In Proceedings of the 20th International Conference on - Machine Learning, Washington, DC, United States, 2003. - - [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and - Jean-Philippe Vert. Extensions of marginalized graph kernels. In - Proceedings of the twenty-first international conference on Machine - learning, page 70. ACM, 2004. + [1] H. Kashima, K. Tsuda, and A. 
Inokuchi. Marginalized kernels between + labeled graphs. In Proceedings of the 20th International Conference on + Machine Learning, Washington, DC, United States, 2003. + + [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and + Jean-Philippe Vert. Extensions of marginalized graph kernels. In + Proceedings of the twenty-first international conference on Machine + learning, page 70. ACM, 2004. """ import sys @@ -31,275 +31,277 @@ from gklearn.utils.parallel import parallel_gm def marginalizedkernel(*args, - node_label='atom', - edge_label='bond_type', - p_quit=0.5, - n_iteration=20, - remove_totters=False, - n_jobs=None, - verbose=True): - """Calculate marginalized graph kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - - G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. - - node_label : string - Node attribute used as symbolic label. The default node label is 'atom'. - - edge_label : string - Edge attribute used as symbolic label. The default edge label is 'bond_type'. - - p_quit : integer - The termination probability in the random walks generating step. - - n_iteration : integer - Time of iterations to calculate R_inf. - - remove_totters : boolean - Whether to remove totterings by method introduced in [2]. The default - value is False. - - n_jobs : int - Number of jobs for parallelization. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the marginalized kernel between - 2 praphs. - """ - # pre-process - n_iteration = int(n_iteration) - Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()] - Gn = [g.copy() for g in Gn] - - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'edge_labeled', 'is_directed'], - node_label=node_label, edge_label=edge_label) - if not ds_attrs['node_labeled'] or node_label == None: - node_label = 'atom' - for G in Gn: - nx.set_node_attributes(G, '0', 'atom') - if not ds_attrs['edge_labeled'] or edge_label == None: - edge_label = 'bond_type' - for G in Gn: - nx.set_edge_attributes(G, '0', 'bond_type') - - start_time = time.time() - - if remove_totters: - # ---- use pool.imap_unordered to parallel and track progress. ---- - pool = Pool(n_jobs) - untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label) - if len(Gn) < 100 * n_jobs: - chunksize = int(len(Gn) / n_jobs) + 1 - else: - chunksize = 100 - for i, g in tqdm( - pool.imap_unordered( - untotter_partial, range(0, len(Gn)), chunksize), - desc='removing tottering', - file=sys.stdout): - Gn[i] = g - pool.close() - pool.join() - -# # ---- direct running, normally use single CPU core. ---- -# Gn = [ -# untotterTransformation(G, node_label, edge_label) -# for G in tqdm(Gn, desc='removing tottering', file=sys.stdout) -# ] - - Kmatrix = np.zeros((len(Gn), len(Gn))) - - # ---- use pool.imap_unordered to parallel and track progress. ---- - def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - do_partial = partial(wrapper_marg_do, node_label, edge_label, - p_quit, n_iteration) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs, verbose=verbose) - - -# # ---- direct running, normally use single CPU core. 
---- -## pbar = tqdm( -## total=(1 + len(Gn)) * len(Gn) / 2, -## desc='calculating kernels', -## file=sys.stdout) -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -## print(i, j) -# Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label, -# edge_label, p_quit, n_iteration) -# Kmatrix[j][i] = Kmatrix[i][j] -## pbar.update(1) - - run_time = time.time() - start_time - if verbose: - print("\n --- marginalized kernel matrix of size %d built in %s seconds ---" - % (len(Gn), run_time)) - - return Kmatrix, run_time + node_label='atom', + edge_label='bond_type', + p_quit=0.5, + n_iteration=20, + remove_totters=False, + n_jobs=None, + chunksize=None, + verbose=True): + """Calculate marginalized graph kernels between graphs. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are calculated. + + G1, G2 : NetworkX graphs + Two graphs between which the kernel is calculated. + + node_label : string + Node attribute used as symbolic label. The default node label is 'atom'. + + edge_label : string + Edge attribute used as symbolic label. The default edge label is 'bond_type'. + + p_quit : integer + The termination probability in the random walks generating step. + + n_iteration : integer + Time of iterations to calculate R_inf. + + remove_totters : boolean + Whether to remove totterings by method introduced in [2]. The default + value is False. + + n_jobs : int + Number of jobs for parallelization. + + Return + ------ + Kmatrix : Numpy matrix + Kernel matrix, each element of which is the marginalized kernel between + 2 praphs. + """ + # pre-process + n_iteration = int(n_iteration) + Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()] + Gn = [g.copy() for g in Gn] + + ds_attrs = get_dataset_attributes( + Gn, + attr_names=['node_labeled', 'edge_labeled', 'is_directed'], + node_label=node_label, edge_label=edge_label) + if not ds_attrs['node_labeled'] or node_label == None: + node_label = 'atom' + for G in Gn: + nx.set_node_attributes(G, '0', 'atom') + if not ds_attrs['edge_labeled'] or edge_label == None: + edge_label = 'bond_type' + for G in Gn: + nx.set_edge_attributes(G, '0', 'bond_type') + + start_time = time.time() + + if remove_totters: + # ---- use pool.imap_unordered to parallel and track progress. ---- + pool = Pool(n_jobs) + untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label) + if chunksize is None: + if len(Gn) < 100 * n_jobs: + chunksize = int(len(Gn) / n_jobs) + 1 + else: + chunksize = 100 + for i, g in tqdm( + pool.imap_unordered( + untotter_partial, range(0, len(Gn)), chunksize), + desc='removing tottering', + file=sys.stdout): + Gn[i] = g + pool.close() + pool.join() + +# # ---- direct running, normally use single CPU core. ---- +# Gn = [ +# untotterTransformation(G, node_label, edge_label) +# for G in tqdm(Gn, desc='removing tottering', file=sys.stdout) +# ] + + Kmatrix = np.zeros((len(Gn), len(Gn))) + + # ---- use pool.imap_unordered to parallel and track progress. ---- + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + do_partial = partial(wrapper_marg_do, node_label, edge_label, + p_quit, n_iteration) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + + +# # ---- direct running, normally use single CPU core. 
---- +## pbar = tqdm( +## total=(1 + len(Gn)) * len(Gn) / 2, +## desc='calculating kernels', +## file=sys.stdout) +# for i in range(0, len(Gn)): +# for j in range(i, len(Gn)): +## print(i, j) +# Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label, +# edge_label, p_quit, n_iteration) +# Kmatrix[j][i] = Kmatrix[i][j] +## pbar.update(1) + + run_time = time.time() - start_time + if verbose: + print("\n --- marginalized kernel matrix of size %d built in %s seconds ---" + % (len(Gn), run_time)) + + return Kmatrix, run_time def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): - """Calculate marginalized graph kernel between 2 graphs. - - Parameters - ---------- - G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - p_quit : integer - the termination probability in the random walks generating step. - n_iteration : integer - time of iterations to calculate R_inf. - - Return - ------ - kernel : float - Marginalized Kernel between 2 graphs. - """ - # init parameters - kernel = 0 - num_nodes_G1 = nx.number_of_nodes(g1) - num_nodes_G2 = nx.number_of_nodes(g2) - # the initial probability distribution in the random walks generating step - # (uniform distribution over |G|) - p_init_G1 = 1 / num_nodes_G1 - p_init_G2 = 1 / num_nodes_G2 - - q = p_quit * p_quit - r1 = q - -# # initial R_inf -# # matrix to save all the R_inf for all pairs of nodes -# R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) + """Calculate marginalized graph kernel between 2 graphs. + + Parameters + ---------- + G1, G2 : NetworkX graphs + 2 graphs between which the kernel is calculated. + node_label : string + node attribute used as label. + edge_label : string + edge attribute used as label. + p_quit : integer + the termination probability in the random walks generating step. + n_iteration : integer + time of iterations to calculate R_inf. + + Return + ------ + kernel : float + Marginalized Kernel between 2 graphs. 
+ """ + # init parameters + kernel = 0 + num_nodes_G1 = nx.number_of_nodes(g1) + num_nodes_G2 = nx.number_of_nodes(g2) + # the initial probability distribution in the random walks generating step + # (uniform distribution over |G|) + p_init_G1 = 1 / num_nodes_G1 + p_init_G2 = 1 / num_nodes_G2 + + q = p_quit * p_quit + r1 = q + +# # initial R_inf +# # matrix to save all the R_inf for all pairs of nodes +# R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) # -# # calculate R_inf with a simple interative method -# for i in range(1, n_iteration): -# R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2]) -# R_inf_new.fill(r1) +# # calculate R_inf with a simple interative method +# for i in range(1, n_iteration): +# R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2]) +# R_inf_new.fill(r1) # -# # calculate R_inf for each pair of nodes -# for node1 in g1.nodes(data=True): -# neighbor_n1 = g1[node1[0]] -# # the transition probability distribution in the random walks -# # generating step (uniform distribution over the vertices adjacent -# # to the current vertex) -# if len(neighbor_n1) > 0: -# p_trans_n1 = (1 - p_quit) / len(neighbor_n1) -# for node2 in g2.nodes(data=True): -# neighbor_n2 = g2[node2[0]] -# if len(neighbor_n2) > 0: -# p_trans_n2 = (1 - p_quit) / len(neighbor_n2) -# -# for neighbor1 in neighbor_n1: -# for neighbor2 in neighbor_n2: -# t = p_trans_n1 * p_trans_n2 * \ -# deltakernel(g1.node[neighbor1][node_label], -# g2.node[neighbor2][node_label]) * \ -# deltakernel( -# neighbor_n1[neighbor1][edge_label], -# neighbor_n2[neighbor2][edge_label]) -# -# R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][ -# neighbor2] # ref [1] equation (8) -# R_inf[:] = R_inf_new +# # calculate R_inf for each pair of nodes +# for node1 in g1.nodes(data=True): +# neighbor_n1 = g1[node1[0]] +# # the transition probability distribution in the random walks +# # generating step (uniform distribution over the vertices adjacent +# # to the current vertex) +# if len(neighbor_n1) > 0: +# p_trans_n1 = (1 - p_quit) / len(neighbor_n1) +# for node2 in g2.nodes(data=True): +# neighbor_n2 = g2[node2[0]] +# if len(neighbor_n2) > 0: +# p_trans_n2 = (1 - p_quit) / len(neighbor_n2) +# +# for neighbor1 in neighbor_n1: +# for neighbor2 in neighbor_n2: +# t = p_trans_n1 * p_trans_n2 * \ +# deltakernel(g1.node[neighbor1][node_label], +# g2.node[neighbor2][node_label]) * \ +# deltakernel( +# neighbor_n1[neighbor1][edge_label], +# neighbor_n2[neighbor2][edge_label]) +# +# R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][ +# neighbor2] # ref [1] equation (8) +# R_inf[:] = R_inf_new # -# # add elements of R_inf up and calculate kernel -# for node1 in g1.nodes(data=True): -# for node2 in g2.nodes(data=True): -# s = p_init_G1 * p_init_G2 * deltakernel( -# node1[1][node_label], node2[1][node_label]) -# kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6) - - - R_inf = {} # dict to save all the R_inf for all pairs of nodes - # initial R_inf, the 1st iteration. - for node1 in g1.nodes(): - for node2 in g2.nodes(): -# R_inf[(node1[0], node2[0])] = r1 - if len(g1[node1]) > 0: - if len(g2[node2]) > 0: - R_inf[(node1, node2)] = r1 - else: - R_inf[(node1, node2)] = p_quit - else: - if len(g2[node2]) > 0: - R_inf[(node1, node2)] = p_quit - else: - R_inf[(node1, node2)] = 1 - - # compute all transition probability first. 
- t_dict = {} - if n_iteration > 1: - for node1 in g1.nodes(): - neighbor_n1 = g1[node1] - # the transition probability distribution in the random walks - # generating step (uniform distribution over the vertices adjacent - # to the current vertex) - if len(neighbor_n1) > 0: - p_trans_n1 = (1 - p_quit) / len(neighbor_n1) - for node2 in g2.nodes(): - neighbor_n2 = g2[node2] - if len(neighbor_n2) > 0: - p_trans_n2 = (1 - p_quit) / len(neighbor_n2) - for neighbor1 in neighbor_n1: - for neighbor2 in neighbor_n2: - t_dict[(node1, node2, neighbor1, neighbor2)] = \ - p_trans_n1 * p_trans_n2 * \ - deltakernel(g1.nodes[neighbor1][node_label], - g2.nodes[neighbor2][node_label]) * \ - deltakernel( - neighbor_n1[neighbor1][edge_label], - neighbor_n2[neighbor2][edge_label]) - - # calculate R_inf with a simple interative method - for i in range(2, n_iteration + 1): - R_inf_old = R_inf.copy() - - # calculate R_inf for each pair of nodes - for node1 in g1.nodes(): - neighbor_n1 = g1[node1] - # the transition probability distribution in the random walks - # generating step (uniform distribution over the vertices adjacent - # to the current vertex) - if len(neighbor_n1) > 0: - for node2 in g2.nodes(): - neighbor_n2 = g2[node2] - if len(neighbor_n2) > 0: - R_inf[(node1, node2)] = r1 - for neighbor1 in neighbor_n1: - for neighbor2 in neighbor_n2: - R_inf[(node1, node2)] += \ - (t_dict[(node1, node2, neighbor1, neighbor2)] * \ - R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) - - # add elements of R_inf up and calculate kernel - for (n1, n2), value in R_inf.items(): - s = p_init_G1 * p_init_G2 * deltakernel( - g1.nodes[n1][node_label], g2.nodes[n2][node_label]) - kernel += s * value # ref [1] equation (6) - - return kernel - - +# # add elements of R_inf up and calculate kernel +# for node1 in g1.nodes(data=True): +# for node2 in g2.nodes(data=True): +# s = p_init_G1 * p_init_G2 * deltakernel( +# node1[1][node_label], node2[1][node_label]) +# kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6) + + + R_inf = {} # dict to save all the R_inf for all pairs of nodes + # initial R_inf, the 1st iteration. + for node1 in g1.nodes(): + for node2 in g2.nodes(): +# R_inf[(node1[0], node2[0])] = r1 + if len(g1[node1]) > 0: + if len(g2[node2]) > 0: + R_inf[(node1, node2)] = r1 + else: + R_inf[(node1, node2)] = p_quit + else: + if len(g2[node2]) > 0: + R_inf[(node1, node2)] = p_quit + else: + R_inf[(node1, node2)] = 1 + + # compute all transition probability first. 
+ t_dict = {} + if n_iteration > 1: + for node1 in g1.nodes(): + neighbor_n1 = g1[node1] + # the transition probability distribution in the random walks + # generating step (uniform distribution over the vertices adjacent + # to the current vertex) + if len(neighbor_n1) > 0: + p_trans_n1 = (1 - p_quit) / len(neighbor_n1) + for node2 in g2.nodes(): + neighbor_n2 = g2[node2] + if len(neighbor_n2) > 0: + p_trans_n2 = (1 - p_quit) / len(neighbor_n2) + for neighbor1 in neighbor_n1: + for neighbor2 in neighbor_n2: + t_dict[(node1, node2, neighbor1, neighbor2)] = \ + p_trans_n1 * p_trans_n2 * \ + deltakernel(g1.nodes[neighbor1][node_label], + g2.nodes[neighbor2][node_label]) * \ + deltakernel( + neighbor_n1[neighbor1][edge_label], + neighbor_n2[neighbor2][edge_label]) + + # calculate R_inf with a simple interative method + for i in range(2, n_iteration + 1): + R_inf_old = R_inf.copy() + + # calculate R_inf for each pair of nodes + for node1 in g1.nodes(): + neighbor_n1 = g1[node1] + # the transition probability distribution in the random walks + # generating step (uniform distribution over the vertices adjacent + # to the current vertex) + if len(neighbor_n1) > 0: + for node2 in g2.nodes(): + neighbor_n2 = g2[node2] + if len(neighbor_n2) > 0: + R_inf[(node1, node2)] = r1 + for neighbor1 in neighbor_n1: + for neighbor2 in neighbor_n2: + R_inf[(node1, node2)] += \ + (t_dict[(node1, node2, neighbor1, neighbor2)] * \ + R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) + + # add elements of R_inf up and calculate kernel + for (n1, n2), value in R_inf.items(): + s = p_init_G1 * p_init_G2 * deltakernel( + g1.nodes[n1][node_label], g2.nodes[n2][node_label]) + kernel += s * value # ref [1] equation (6) + + return kernel + + def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr): - i= itr[0] - j = itr[1] - return i, j, _marginalizedkernel_do(G_gn[i], G_gn[j], node_label, edge_label, p_quit, n_iteration) - + i= itr[0] + j = itr[1] + return i, j, _marginalizedkernel_do(G_gn[i], G_gn[j], node_label, edge_label, p_quit, n_iteration) + def wrapper_untotter(Gn, node_label, edge_label, i): - return i, untotterTransformation(Gn[i], node_label, edge_label) + return i, untotterTransformation(Gn[i], node_label, edge_label) diff --git a/gklearn/kernels/path_up_to_h.py b/gklearn/kernels/path_up_to_h.py index 8952764..1c8b5e2 100644 --- a/gklearn/kernels/path_up_to_h.py +++ b/gklearn/kernels/path_up_to_h.py @@ -373,8 +373,18 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None for key in all_paths] kernel = np.sum(np.minimum(vector1, vector2)) / \ np.sum(np.maximum(vector1, vector2)) + + elif self.__k_func is None: # no sub-kernel used; compare paths directly. + path_count1 = Counter(paths1) + path_count2 = Counter(paths2) + vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0) + for key in all_paths] + vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0) + for key in all_paths] + kernel = np.dot(vector1, vector2) + else: - raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax".') + raise Exception('The given "k_func" cannot be recognized. 
Possible choices include: "tanimoto", "MinMax" and None.') return kernel diff --git a/gklearn/kernels/randomWalkKernel.py b/gklearn/kernels/randomWalkKernel.py index 0bea111..346bc98 100644 --- a/gklearn/kernels/randomWalkKernel.py +++ b/gklearn/kernels/randomWalkKernel.py @@ -3,7 +3,7 @@ @references: - [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. + [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. """ import time @@ -21,888 +21,889 @@ from gklearn.utils.graphdataset import get_dataset_attributes from gklearn.utils.parallel import parallel_gm def randomwalkkernel(*args, - # params for all method. - compute_method=None, - weight=1, - p=None, - q=None, - edge_weight=None, - # params for conjugate and fp method. - node_kernels=None, - edge_kernels=None, - node_label='atom', - edge_label='bond_type', - # params for spectral method. - sub_kernel=None, - n_jobs=None, - verbose=True): - """Calculate random walk graph kernels. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - - G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. - - compute_method : string - Method used to compute kernel. The Following choices are - available: - - 'sylvester' - Sylvester equation method. - - 'conjugate' - conjugate gradient method. - - 'fp' - fixed-point iterations. - - 'spectral' - spectral decomposition. - - weight : float - A constant weight set for random walks of length h. - - p : None - Initial probability distribution on the unlabeled direct product graph - of two graphs. It is set to be uniform over all vertices in the direct - product graph. - - q : None - Stopping probability distribution on the unlabeled direct product graph - of two graphs. It is set to be uniform over all vertices in the direct - product graph. - - edge_weight : float - - Edge attribute name corresponding to the edge weight. - - node_kernels: dict - A dictionary of kernel functions for nodes, including 3 items: 'symb' - for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' - for both labels. The first 2 functions take two node labels as - parameters, and the 'mix' function takes 4 parameters, a symbolic and a - non-symbolic label for each the two nodes. Each label is in form of 2-D - dimension array (n_samples, n_features). Each function returns a number - as the kernel value. Ignored when nodes are unlabeled. This argument - is designated to conjugate gradient method and fixed-point iterations. - - edge_kernels: dict - A dictionary of kernel functions for edges, including 3 items: 'symb' - for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix' - for both labels. The first 2 functions take two edge labels as - parameters, and the 'mix' function takes 4 parameters, a symbolic and a - non-symbolic label for each the two edges. Each label is in form of 2-D - dimension array (n_samples, n_features). Each function returns a number - as the kernel value. Ignored when edges are unlabeled. This argument - is designated to conjugate gradient method and fixed-point iterations. - - node_label: string - Node attribute used as label. The default node label is atom. This - argument is designated to conjugate gradient method and fixed-point - iterations. 
- - edge_label : string - Edge attribute used as label. The default edge label is bond_type. This - argument is designated to conjugate gradient method and fixed-point - iterations. - - sub_kernel: string - Method used to compute walk kernel. The Following choices are - available: - 'exp' : method based on exponential serials. - 'geo' : method based on geometric serials. - - n_jobs: int - Number of jobs for parallelization. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the path kernel up to d between 2 praphs. - """ - compute_method = compute_method.lower() - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - Gn = [g.copy() for g in Gn] - - eweight = None - if edge_weight == None: - if verbose: - print('\n None edge weight specified. Set all weight to 1.\n') - else: - try: - some_weight = list( - nx.get_edge_attributes(Gn[0], edge_weight).values())[0] - if isinstance(some_weight, float) or isinstance(some_weight, int): - eweight = edge_weight - else: - if verbose: - print('\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' - % edge_weight) - except: - if verbose: - print('\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' - % edge_weight) - - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', - 'edge_attr_dim', 'is_directed'], - node_label=node_label, - edge_label=edge_label) - - # remove graphs with no edges, as no walk can be found in their structures, - # so the weight matrix between such a graph and itself might be zero. - len_gn = len(Gn) - Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] - idx = [G[0] for G in Gn] - Gn = [G[1] for G in Gn] - if len(Gn) != len_gn: - if verbose: - print('\n %d graphs are removed as they don\'t contain edges.\n' % - (len_gn - len(Gn))) - - start_time = time.time() - -# # get vertex and edge concatenated labels for each graph -# label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed']) -# gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed']) - - if compute_method == 'sylvester': - if verbose: - import warnings - warnings.warn('All labels are ignored.') - Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs, verbose=verbose) - - elif compute_method == 'conjugate': - Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs, node_kernels, - edge_kernels, node_label, edge_label, - eweight, n_jobs, verbose=verbose) - - elif compute_method == 'fp': - Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels, - edge_kernels, node_label, edge_label, - eweight, n_jobs, verbose=verbose) - - elif compute_method == 'spectral': - if verbose: - import warnings - warnings.warn('All labels are ignored. Only works for undirected graphs.') - Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel, - eweight, n_jobs, verbose=verbose) - - elif compute_method == 'kron': - pass - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j], - node_label, edge_label) - Kmatrix[j][i] = Kmatrix[i][j] - else: - raise Exception( - 'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".' 
- ) - - run_time = time.time() - start_time - if verbose: - print("\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---" - % (len(Gn), run_time)) - - return Kmatrix, run_time, idx + # params for all method. + compute_method=None, + weight=1, + p=None, + q=None, + edge_weight=None, + # params for conjugate and fp method. + node_kernels=None, + edge_kernels=None, + node_label='atom', + edge_label='bond_type', + # params for spectral method. + sub_kernel=None, + n_jobs=None, + chunksize=None, + verbose=True): + """Calculate random walk graph kernels. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are calculated. + + G1, G2 : NetworkX graphs + Two graphs between which the kernel is calculated. + + compute_method : string + Method used to compute kernel. The Following choices are + available: + + 'sylvester' - Sylvester equation method. + + 'conjugate' - conjugate gradient method. + + 'fp' - fixed-point iterations. + + 'spectral' - spectral decomposition. + + weight : float + A constant weight set for random walks of length h. + + p : None + Initial probability distribution on the unlabeled direct product graph + of two graphs. It is set to be uniform over all vertices in the direct + product graph. + + q : None + Stopping probability distribution on the unlabeled direct product graph + of two graphs. It is set to be uniform over all vertices in the direct + product graph. + + edge_weight : float + + Edge attribute name corresponding to the edge weight. + + node_kernels: dict + A dictionary of kernel functions for nodes, including 3 items: 'symb' + for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' + for both labels. The first 2 functions take two node labels as + parameters, and the 'mix' function takes 4 parameters, a symbolic and a + non-symbolic label for each the two nodes. Each label is in form of 2-D + dimension array (n_samples, n_features). Each function returns a number + as the kernel value. Ignored when nodes are unlabeled. This argument + is designated to conjugate gradient method and fixed-point iterations. + + edge_kernels: dict + A dictionary of kernel functions for edges, including 3 items: 'symb' + for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix' + for both labels. The first 2 functions take two edge labels as + parameters, and the 'mix' function takes 4 parameters, a symbolic and a + non-symbolic label for each the two edges. Each label is in form of 2-D + dimension array (n_samples, n_features). Each function returns a number + as the kernel value. Ignored when edges are unlabeled. This argument + is designated to conjugate gradient method and fixed-point iterations. + + node_label: string + Node attribute used as label. The default node label is atom. This + argument is designated to conjugate gradient method and fixed-point + iterations. + + edge_label : string + Edge attribute used as label. The default edge label is bond_type. This + argument is designated to conjugate gradient method and fixed-point + iterations. + + sub_kernel: string + Method used to compute walk kernel. The Following choices are + available: + 'exp' : method based on exponential serials. + 'geo' : method based on geometric serials. + + n_jobs: int + Number of jobs for parallelization. + + Return + ------ + Kmatrix : Numpy matrix + Kernel matrix, each element of which is the path kernel up to d between 2 praphs. 
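# Illustrative usage sketch, not part of the patch: the graphs and parameter
# values below are assumptions chosen for the example. The Sylvester-equation
# method ignores all labels, so plain unlabeled graphs suffice; it relies on
# dlyap from the `control` package. The new `chunksize` argument is simply
# forwarded to parallel_gm; leaving it at None is assumed to pick a default.
import networkx as nx
from gklearn.kernels.randomWalkKernel import randomwalkkernel

Gn = [nx.path_graph(4), nx.cycle_graph(5), nx.star_graph(3)]
Kmatrix, run_time, idx = randomwalkkernel(
    Gn, compute_method='sylvester', weight=1e-3, n_jobs=1, verbose=False)
# Kmatrix is the 3 x 3 Gram matrix; idx lists the graphs kept after edgeless
# graphs are removed.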
+ """ + compute_method = compute_method.lower() + Gn = args[0] if len(args) == 1 else [args[0], args[1]] + Gn = [g.copy() for g in Gn] + + eweight = None + if edge_weight == None: + if verbose: + print('\n None edge weight specified. Set all weight to 1.\n') + else: + try: + some_weight = list( + nx.get_edge_attributes(Gn[0], edge_weight).values())[0] + if isinstance(some_weight, float) or isinstance(some_weight, int): + eweight = edge_weight + else: + if verbose: + print('\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' + % edge_weight) + except: + if verbose: + print('\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' + % edge_weight) + + ds_attrs = get_dataset_attributes( + Gn, + attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', + 'edge_attr_dim', 'is_directed'], + node_label=node_label, + edge_label=edge_label) + + # remove graphs with no edges, as no walk can be found in their structures, + # so the weight matrix between such a graph and itself might be zero. + len_gn = len(Gn) + Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] + idx = [G[0] for G in Gn] + Gn = [G[1] for G in Gn] + if len(Gn) != len_gn: + if verbose: + print('\n %d graphs are removed as they don\'t contain edges.\n' % + (len_gn - len(Gn))) + + start_time = time.time() + +# # get vertex and edge concatenated labels for each graph +# label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed']) +# gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed']) + + if compute_method == 'sylvester': + if verbose: + import warnings + warnings.warn('All labels are ignored.') + Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs, chunksize, verbose=verbose) + + elif compute_method == 'conjugate': + Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs, node_kernels, + edge_kernels, node_label, edge_label, + eweight, n_jobs, chunksize, verbose=verbose) + + elif compute_method == 'fp': + Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels, + edge_kernels, node_label, edge_label, + eweight, n_jobs, chunksize, verbose=verbose) + + elif compute_method == 'spectral': + if verbose: + import warnings + warnings.warn('All labels are ignored. Only works for undirected graphs.') + Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel, + eweight, n_jobs, chunksize, verbose=verbose) + + elif compute_method == 'kron': + pass + for i in range(0, len(Gn)): + for j in range(i, len(Gn)): + Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j], + node_label, edge_label) + Kmatrix[j][i] = Kmatrix[i][j] + else: + raise Exception( + 'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".' + ) + + run_time = time.time() - start_time + if verbose: + print("\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---" + % (len(Gn), run_time)) + + return Kmatrix, run_time, idx ############################################################################### -def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True): - """Calculate walk graph kernels up to n between 2 graphs using Sylvester method. - - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - - Return - ------ - kernel : float - Kernel between 2 graphs. 
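# Sketch of a naive reference computation (an assumption-labeled example, not
# part of the patch): for tiny unlabeled graphs with uniform p and q, the
# geometric random-walk kernel of [1] that the Sylvester, conjugate-gradient
# and fixed-point variants compute can be checked directly as
# q^T (I - lmda * (A1 kron A2))^(-1) p, which is useful as a sanity check.
import numpy as np
import networkx as nx

def naive_rw_kernel(g1, g2, lmda=1e-3):
    A1 = np.asarray(nx.adjacency_matrix(g1).todense()).T
    A2 = np.asarray(nx.adjacency_matrix(g2).todense()).T
    W = np.kron(A1, A2)                    # direct-product adjacency (transposed)
    n = W.shape[0]
    p = np.full((n, 1), 1.0 / n)           # uniform starting distribution
    q = np.full((1, n), 1.0 / n)           # uniform stopping distribution
    return q.dot(np.linalg.solve(np.eye(n) - lmda * W, p)).item()

print(naive_rw_kernel(nx.path_graph(3), nx.cycle_graph(4)))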
- """ - Kmatrix = np.zeros((len(Gn), len(Gn))) - - if q == None: - # don't normalize adjacency matrices if q is a uniform vector. Note - # A_wave_list actually contains the transposes of the adjacency matrices. - A_wave_list = [ - nx.adjacency_matrix(G, eweight).todense().transpose() for G in - (tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) if - verbose else Gn) - ] -# # normalized adjacency matrices -# A_wave_list = [] -# for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout): -# A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose() -# norm = A_tilde.sum(axis=0) -# norm[norm == 0] = 1 -# A_wave_list.append(A_tilde / norm) - if p == None: # p is uniform distribution as default. - def init_worker(Awl_toshare): - global G_Awl - G_Awl = Awl_toshare - do_partial = partial(wrapper_se_do, lmda) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(A_wave_list,), n_jobs=n_jobs, verbose=verbose) - -# pbar = tqdm( -# total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', -# file=sys.stdout) -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# S = lmda * A_wave_list[j] -# T_t = A_wave_list[i] -# # use uniform distribution if there is no prior knowledge. -# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j]) -# p_times_uni = 1 / nb_pd -# M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni) -# X = dlyap(S, T_t, M0) -# X = np.reshape(X, (-1, 1), order='F') -# # use uniform distribution if there is no prior knowledge. -# q_times = np.full((1, nb_pd), p_times_uni) -# Kmatrix[i][j] = np.dot(q_times, X) -# Kmatrix[j][i] = Kmatrix[i][j] -# pbar.update(1) - - return Kmatrix +def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, chunksize, verbose=True): + """Calculate walk graph kernels up to n between 2 graphs using Sylvester method. + + Parameters + ---------- + G1, G2 : NetworkX graph + Graphs between which the kernel is calculated. + node_label : string + node attribute used as label. + edge_label : string + edge attribute used as label. + + Return + ------ + kernel : float + Kernel between 2 graphs. + """ + Kmatrix = np.zeros((len(Gn), len(Gn))) + + if q == None: + # don't normalize adjacency matrices if q is a uniform vector. Note + # A_wave_list actually contains the transposes of the adjacency matrices. + A_wave_list = [ + nx.adjacency_matrix(G, eweight).todense().transpose() for G in + (tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) if + verbose else Gn) + ] +# # normalized adjacency matrices +# A_wave_list = [] +# for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout): +# A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose() +# norm = A_tilde.sum(axis=0) +# norm[norm == 0] = 1 +# A_wave_list.append(A_tilde / norm) + if p == None: # p is uniform distribution as default. + def init_worker(Awl_toshare): + global G_Awl + G_Awl = Awl_toshare + do_partial = partial(wrapper_se_do, lmda) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(A_wave_list,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + +# pbar = tqdm( +# total=(1 + len(Gn)) * len(Gn) / 2, +# desc='calculating kernels', +# file=sys.stdout) +# for i in range(0, len(Gn)): +# for j in range(i, len(Gn)): +# S = lmda * A_wave_list[j] +# T_t = A_wave_list[i] +# # use uniform distribution if there is no prior knowledge. 
+# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j]) +# p_times_uni = 1 / nb_pd +# M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni) +# X = dlyap(S, T_t, M0) +# X = np.reshape(X, (-1, 1), order='F') +# # use uniform distribution if there is no prior knowledge. +# q_times = np.full((1, nb_pd), p_times_uni) +# Kmatrix[i][j] = np.dot(q_times, X) +# Kmatrix[j][i] = Kmatrix[i][j] +# pbar.update(1) + + return Kmatrix def wrapper_se_do(lmda, itr): - i = itr[0] - j = itr[1] - return i, j, _se_do(G_Awl[i], G_Awl[j], lmda) + i = itr[0] + j = itr[1] + return i, j, _se_do(G_Awl[i], G_Awl[j], lmda) def _se_do(A_wave1, A_wave2, lmda): - from control import dlyap - S = lmda * A_wave2 - T_t = A_wave1 - # use uniform distribution if there is no prior knowledge. - nb_pd = len(A_wave1) * len(A_wave2) - p_times_uni = 1 / nb_pd - M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni) - X = dlyap(S, T_t, M0) - X = np.reshape(X, (-1, 1), order='F') - # use uniform distribution if there is no prior knowledge. - q_times = np.full((1, nb_pd), p_times_uni) - return np.dot(q_times, X) + from control import dlyap + S = lmda * A_wave2 + T_t = A_wave1 + # use uniform distribution if there is no prior knowledge. + nb_pd = len(A_wave1) * len(A_wave2) + p_times_uni = 1 / nb_pd + M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni) + X = dlyap(S, T_t, M0) + X = np.reshape(X, (-1, 1), order='F') + # use uniform distribution if there is no prior knowledge. + q_times = np.full((1, nb_pd), p_times_uni) + return np.dot(q_times, X) ############################################################################### def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, - node_label, edge_label, eweight, n_jobs, verbose=True): - """Calculate walk graph kernels up to n between 2 graphs using conjugate method. - - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - - Return - ------ - kernel : float - Kernel between 2 graphs. - """ - Kmatrix = np.zeros((len(Gn), len(Gn))) - -# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ -# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1: -# # this is faster from unlabeled graphs. @todo: why? -# if q == None: -# # don't normalize adjacency matrices if q is a uniform vector. Note -# # A_wave_list actually contains the transposes of the adjacency matrices. -# A_wave_list = [ -# nx.adjacency_matrix(G, eweight).todense().transpose() for G in -# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) -# ] -# if p == None: # p is uniform distribution as default. -# def init_worker(Awl_toshare): -# global G_Awl -# G_Awl = Awl_toshare -# do_partial = partial(wrapper_cg_unlabled_do, lmda) -# parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, -# glbv=(A_wave_list,), n_jobs=n_jobs) -# else: - # reindex nodes using consecutive integers for convenience of kernel calculation. - Gn = [nx.convert_node_labels_to_integers( - g, first_label=0, label_attribute='label_orignal') for g in (tqdm( - Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)] - - if p == None and q == None: # p and q are uniform distributions as default. 
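# Illustrative sketch of the labeled case, not part of the patch: the tiny
# graphs, their labels and the delta kernels below are assumptions. The
# 'conjugate' (and 'fp') methods are the ones that make use of node_kernels
# and edge_kernels, as stated in the docstring above.
import networkx as nx
from gklearn.kernels.randomWalkKernel import randomwalkkernel

g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
g2 = nx.Graph()
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '2'})])

delta = lambda x, y: 1.0 if x == y else 0.0      # simple symbolic-label kernel
node_kernels = {'symb': delta, 'nsymb': None, 'mix': None}
edge_kernels = {'symb': delta, 'nsymb': None, 'mix': None}

Kmatrix, run_time, idx = randomwalkkernel(
    [g1, g2], compute_method='conjugate', weight=1e-3,
    node_kernels=node_kernels, edge_kernels=edge_kernels,
    node_label='atom', edge_label='bond_type', n_jobs=1, verbose=False)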
- def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels, - node_label, edge_kernels, edge_label, lmda) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs, verbose=verbose) - -# pbar = tqdm( -# total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', -# file=sys.stdout) -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels, -# node_label, edge_kernels, edge_label, lmda) -# Kmatrix[i][j] = result -# Kmatrix[j][i] = Kmatrix[i][j] -# pbar.update(1) - return Kmatrix + node_label, edge_label, eweight, n_jobs, chunksize, verbose=True): + """Calculate walk graph kernels up to n between 2 graphs using conjugate method. + + Parameters + ---------- + G1, G2 : NetworkX graph + Graphs between which the kernel is calculated. + node_label : string + node attribute used as label. + edge_label : string + edge attribute used as label. + + Return + ------ + kernel : float + Kernel between 2 graphs. + """ + Kmatrix = np.zeros((len(Gn), len(Gn))) + +# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ +# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1: +# # this is faster from unlabeled graphs. @todo: why? +# if q == None: +# # don't normalize adjacency matrices if q is a uniform vector. Note +# # A_wave_list actually contains the transposes of the adjacency matrices. +# A_wave_list = [ +# nx.adjacency_matrix(G, eweight).todense().transpose() for G in +# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) +# ] +# if p == None: # p is uniform distribution as default. +# def init_worker(Awl_toshare): +# global G_Awl +# G_Awl = Awl_toshare +# do_partial = partial(wrapper_cg_unlabled_do, lmda) +# parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, +# glbv=(A_wave_list,), n_jobs=n_jobs) +# else: + # reindex nodes using consecutive integers for convenience of kernel calculation. + Gn = [nx.convert_node_labels_to_integers( + g, first_label=0, label_attribute='label_orignal') for g in (tqdm( + Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)] + + if p == None and q == None: # p and q are uniform distributions as default. + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels, + node_label, edge_kernels, edge_label, lmda) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + +# pbar = tqdm( +# total=(1 + len(Gn)) * len(Gn) / 2, +# desc='calculating kernels', +# file=sys.stdout) +# for i in range(0, len(Gn)): +# for j in range(i, len(Gn)): +# result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels, +# node_label, edge_kernels, edge_label, lmda) +# Kmatrix[i][j] = result +# Kmatrix[j][i] = Kmatrix[i][j] +# pbar.update(1) + return Kmatrix def wrapper_cg_unlabled_do(lmda, itr): - i = itr[0] - j = itr[1] - return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda) + i = itr[0] + j = itr[1] + return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda) def _cg_unlabled_do(A_wave1, A_wave2, lmda): - nb_pd = len(A_wave1) * len(A_wave2) - p_times_uni = 1 / nb_pd - w_times = kron(A_wave1, A_wave2).todense() - A = identity(w_times.shape[0]) - w_times * lmda - b = np.full((nb_pd, 1), p_times_uni) - x, _ = cg(A, b) - # use uniform distribution if there is no prior knowledge. 
- q_times = np.full((1, nb_pd), p_times_uni) - return np.dot(q_times, x) + nb_pd = len(A_wave1) * len(A_wave2) + p_times_uni = 1 / nb_pd + w_times = kron(A_wave1, A_wave2).todense() + A = identity(w_times.shape[0]) - w_times * lmda + b = np.full((nb_pd, 1), p_times_uni) + x, _ = cg(A, b) + # use uniform distribution if there is no prior knowledge. + q_times = np.full((1, nb_pd), p_times_uni) + return np.dot(q_times, x) def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, - edge_label, lmda, itr): - i = itr[0] - j = itr[1] - return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, - node_label, edge_kernels, edge_label, lmda) + edge_label, lmda, itr): + i = itr[0] + j = itr[1] + return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, + node_label, edge_kernels, edge_label, lmda) def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, - edge_kernels, edge_label, lmda): - # Frist, compute kernels between all pairs of nodes, method borrowed - # from FCSP. It is faster than directly computing all edge kernels - # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the - # graphs compared, which is the most case we went though. For very - # sparse graphs, this would be slow. - vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label) - - # Compute weight matrix of the direct product graph. - w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs, - edge_kernels, edge_label) - # use uniform distribution if there is no prior knowledge. - p_times_uni = 1 / w_dim - A = identity(w_times.shape[0]) - w_times * lmda - b = np.full((w_dim, 1), p_times_uni) - x, _ = cg(A, b) - # use uniform distribution if there is no prior knowledge. - q_times = np.full((1, w_dim), p_times_uni) - return np.dot(q_times, x) + edge_kernels, edge_label, lmda): + # Frist, compute kernels between all pairs of nodes, method borrowed + # from FCSP. It is faster than directly computing all edge kernels + # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the + # graphs compared, which is the most case we went though. For very + # sparse graphs, this would be slow. + vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label) + + # Compute weight matrix of the direct product graph. + w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs, + edge_kernels, edge_label) + # use uniform distribution if there is no prior knowledge. + p_times_uni = 1 / w_dim + A = identity(w_times.shape[0]) - w_times * lmda + b = np.full((w_dim, 1), p_times_uni) + x, _ = cg(A, b) + # use uniform distribution if there is no prior knowledge. + q_times = np.full((1, w_dim), p_times_uni) + return np.dot(q_times, x) ############################################################################### def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, - node_label, edge_label, eweight, n_jobs, verbose=True): - """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method. - - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - - Return - ------ - kernel : float - Kernel between 2 graphs. - """ - - - Kmatrix = np.zeros((len(Gn), len(Gn))) - -# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ -# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1: -# # this is faster from unlabeled graphs. @todo: why? 
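# Sketch of the FCSP-style precomputation mentioned in _cg_labled_do above
# (the graphs, label name and delta kernel are example assumptions): vertex
# kernels are evaluated once for every pair of nodes and stored in a dict, so
# that each direct-product edge weight can later be assembled as
# vk[(u1, u2)] * k_edge * vk[(v1, v2)] in computeW() further down.
import networkx as nx

def vertex_kernels(g1, g2, node_label='atom',
                   kn=lambda a, b: 1.0 if a == b else 0.0):
    vk_dict = {}
    for u, du in g1.nodes(data=True):
        for v, dv in g2.nodes(data=True):
            vk_dict[(u, v)] = kn(du[node_label], dv[node_label])
    return vk_dict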
-# if q == None: -# # don't normalize adjacency matrices if q is a uniform vector. Note -# # A_wave_list actually contains the transposes of the adjacency matrices. -# A_wave_list = [ -# nx.adjacency_matrix(G, eweight).todense().transpose() for G in -# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) -# ] -# if p == None: # p is uniform distribution as default. -# pbar = tqdm( -# total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', -# file=sys.stdout) -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# # use uniform distribution if there is no prior knowledge. -# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j]) -# p_times_uni = 1 / nb_pd -# w_times = kron(A_wave_list[i], A_wave_list[j]).todense() -# p_times = np.full((nb_pd, 1), p_times_uni) -# x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times)) -# # use uniform distribution if there is no prior knowledge. -# q_times = np.full((1, nb_pd), p_times_uni) -# Kmatrix[i][j] = np.dot(q_times, x) -# Kmatrix[j][i] = Kmatrix[i][j] -# pbar.update(1) -# else: - # reindex nodes using consecutive integers for convenience of kernel calculation. - Gn = [nx.convert_node_labels_to_integers( - g, first_label=0, label_attribute='label_orignal') for g in (tqdm( - Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)] - - if p == None and q == None: # p and q are uniform distributions as default. - def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels, - node_label, edge_kernels, edge_label, lmda) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs, verbose=verbose) - return Kmatrix + node_label, edge_label, eweight, n_jobs, chunksize, verbose=True): + """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method. + + Parameters + ---------- + G1, G2 : NetworkX graph + Graphs between which the kernel is calculated. + node_label : string + node attribute used as label. + edge_label : string + edge attribute used as label. + + Return + ------ + kernel : float + Kernel between 2 graphs. + """ + + + Kmatrix = np.zeros((len(Gn), len(Gn))) + +# if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ +# not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1: +# # this is faster from unlabeled graphs. @todo: why? +# if q == None: +# # don't normalize adjacency matrices if q is a uniform vector. Note +# # A_wave_list actually contains the transposes of the adjacency matrices. +# A_wave_list = [ +# nx.adjacency_matrix(G, eweight).todense().transpose() for G in +# tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) +# ] +# if p == None: # p is uniform distribution as default. +# pbar = tqdm( +# total=(1 + len(Gn)) * len(Gn) / 2, +# desc='calculating kernels', +# file=sys.stdout) +# for i in range(0, len(Gn)): +# for j in range(i, len(Gn)): +# # use uniform distribution if there is no prior knowledge. +# nb_pd = len(A_wave_list[i]) * len(A_wave_list[j]) +# p_times_uni = 1 / nb_pd +# w_times = kron(A_wave_list[i], A_wave_list[j]).todense() +# p_times = np.full((nb_pd, 1), p_times_uni) +# x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times)) +# # use uniform distribution if there is no prior knowledge. +# q_times = np.full((1, nb_pd), p_times_uni) +# Kmatrix[i][j] = np.dot(q_times, x) +# Kmatrix[j][i] = Kmatrix[i][j] +# pbar.update(1) +# else: + # reindex nodes using consecutive integers for convenience of kernel calculation. 
+ Gn = [nx.convert_node_labels_to_integers( + g, first_label=0, label_attribute='label_orignal') for g in (tqdm( + Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)] + + if p == None and q == None: # p and q are uniform distributions as default. + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels, + node_label, edge_kernels, edge_label, lmda) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + return Kmatrix def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, - edge_label, lmda, itr): - i = itr[0] - j = itr[1] - return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, - node_label, edge_kernels, edge_label, lmda) + edge_label, lmda, itr): + i = itr[0] + j = itr[1] + return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, + node_label, edge_kernels, edge_label, lmda) def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label, - edge_kernels, edge_label, lmda): - # Frist, compute kernels between all pairs of nodes, method borrowed - # from FCSP. It is faster than directly computing all edge kernels - # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the - # graphs compared, which is the most case we went though. For very - # sparse graphs, this would be slow. - vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label) - - # Compute weight matrix of the direct product graph. - w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs, - edge_kernels, edge_label) - # use uniform distribution if there is no prior knowledge. - p_times_uni = 1 / w_dim - p_times = np.full((w_dim, 1), p_times_uni) - x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times), - xtol=1e-06, maxiter=1000) - # use uniform distribution if there is no prior knowledge. - q_times = np.full((1, w_dim), p_times_uni) - return np.dot(q_times, x) + edge_kernels, edge_label, lmda): + # Frist, compute kernels between all pairs of nodes, method borrowed + # from FCSP. It is faster than directly computing all edge kernels + # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the + # graphs compared, which is the most case we went though. For very + # sparse graphs, this would be slow. + vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label) + + # Compute weight matrix of the direct product graph. + w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs, + edge_kernels, edge_label) + # use uniform distribution if there is no prior knowledge. + p_times_uni = 1 / w_dim + p_times = np.full((w_dim, 1), p_times_uni) + x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times), + xtol=1e-06, maxiter=1000) + # use uniform distribution if there is no prior knowledge. + q_times = np.full((1, w_dim), p_times_uni) + return np.dot(q_times, x) def func_fp(x, p_times, lmda, w_times): - haha = w_times * x - haha = lmda * haha - haha = p_times + haha - return p_times + lmda * np.dot(w_times, x) + haha = w_times * x + haha = lmda * haha + haha = p_times + haha + return p_times + lmda * np.dot(w_times, x) ############################################################################### -def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, verbose=True): - """Calculate walk graph kernels up to n between 2 unlabeled graphs using - spectral decomposition method. Labels will be ignored. 
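# Sketch of the fixed-point iteration used by func_fp above (the weight matrix
# and lmda are example assumptions): the 'fp' method solves x = p + lmda * W x
# by iterating from the uniform distribution instead of inverting
# (I - lmda * W) explicitly.
import numpy as np
from scipy.optimize import fixed_point

W = np.array([[0., 1., 0.],
              [1., 0., 1.],
              [0., 1., 0.]])                 # a tiny direct-product weight matrix
lmda = 1e-2
p = np.full((3, 1), 1 / 3)                   # uniform starting distribution
x = fixed_point(lambda x: p + lmda * np.dot(W, x), p, xtol=1e-6, maxiter=1000)
kernel = np.dot(np.full((1, 3), 1 / 3), x).item()   # uniform stopping distribution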
- - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. - - Return - ------ - kernel : float - Kernel between 2 graphs. - """ - Kmatrix = np.zeros((len(Gn), len(Gn))) - - if q == None: - # precompute the spectral decomposition of each graph. - P_list = [] - D_list = [] - for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout) if - verbose else Gn): - # don't normalize adjacency matrices if q is a uniform vector. Note - # A actually is the transpose of the adjacency matrix. - A = nx.adjacency_matrix(G, eweight).todense().transpose() - ew, ev = np.linalg.eig(A) - D_list.append(ew) - P_list.append(ev) -# P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs? - - if p == None: # p is uniform distribution as default. - q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn] -# q_T_list = [q.T for q in q_list] - def init_worker(q_T_toshare, P_toshare, D_toshare): - global G_q_T, G_P, G_D - G_q_T = q_T_toshare - G_P = P_toshare - G_D = D_toshare - do_partial = partial(wrapper_sd_do, weight, sub_kernel) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs, - verbose=verbose) - - -# pbar = tqdm( -# total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', -# file=sys.stdout) -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], -# D_list[i], D_list[j], weight, sub_kernel) -# Kmatrix[i][j] = result -# Kmatrix[j][i] = Kmatrix[i][j] -# pbar.update(1) - return Kmatrix +def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, chunksize, verbose=True): + """Calculate walk graph kernels up to n between 2 unlabeled graphs using + spectral decomposition method. Labels will be ignored. + + Parameters + ---------- + G1, G2 : NetworkX graph + Graphs between which the kernel is calculated. + node_label : string + node attribute used as label. + edge_label : string + edge attribute used as label. + + Return + ------ + kernel : float + Kernel between 2 graphs. + """ + Kmatrix = np.zeros((len(Gn), len(Gn))) + + if q == None: + # precompute the spectral decomposition of each graph. + P_list = [] + D_list = [] + for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout) if + verbose else Gn): + # don't normalize adjacency matrices if q is a uniform vector. Note + # A actually is the transpose of the adjacency matrix. + A = nx.adjacency_matrix(G, eweight).todense().transpose() + ew, ev = np.linalg.eig(A) + D_list.append(ew) + P_list.append(ev) +# P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs? + + if p == None: # p is uniform distribution as default. 
+ q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn] +# q_T_list = [q.T for q in q_list] + def init_worker(q_T_toshare, P_toshare, D_toshare): + global G_q_T, G_P, G_D + G_q_T = q_T_toshare + G_P = P_toshare + G_D = D_toshare + do_partial = partial(wrapper_sd_do, weight, sub_kernel) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs, + chunksize=chunksize, verbose=verbose) + + +# pbar = tqdm( +# total=(1 + len(Gn)) * len(Gn) / 2, +# desc='calculating kernels', +# file=sys.stdout) +# for i in range(0, len(Gn)): +# for j in range(i, len(Gn)): +# result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], +# D_list[i], D_list[j], weight, sub_kernel) +# Kmatrix[i][j] = result +# Kmatrix[j][i] = Kmatrix[i][j] +# pbar.update(1) + return Kmatrix def wrapper_sd_do(weight, sub_kernel, itr): - i = itr[0] - j = itr[1] - return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j], - weight, sub_kernel) - - -def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): - # use uniform distribution if there is no prior knowledge. - kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense() - # @todo: this is not be needed when p = q (kr = kl.T) for undirected graphs -# kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense() - if sub_kernel == 'exp': - D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2]) - kmiddle = np.diag(np.exp(weight * D_diag)) - elif sub_kernel == 'geo': - D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2]) - kmiddle = np.diag(weight * D_diag) - kmiddle = np.identity(len(kmiddle)) - weight * kmiddle - kmiddle = np.linalg.inv(kmiddle) - return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0] + i = itr[0] + j = itr[1] + return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j], + weight, sub_kernel) + + +def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): + # use uniform distribution if there is no prior knowledge. + kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense() + # @todo: this is not be needed when p = q (kr = kl.T) for undirected graphs +# kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense() + if sub_kernel == 'exp': + D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2]) + kmiddle = np.diag(np.exp(weight * D_diag)) + elif sub_kernel == 'geo': + D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2]) + kmiddle = np.diag(weight * D_diag) + kmiddle = np.identity(len(kmiddle)) - weight * kmiddle + kmiddle = np.linalg.inv(kmiddle) + return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0] ############################################################################### def _randomwalkkernel_kron(G1, G2, node_label, edge_label): - """Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method. + """Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method. - Parameters - ---------- - G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. - node_label : string - node attribute used as label. - edge_label : string - edge attribute used as label. + Parameters + ---------- + G1, G2 : NetworkX graph + Graphs between which the kernel is calculated. + node_label : string + node attribute used as label. + edge_label : string + edge attribute used as label. - Return - ------ - kernel : float - Kernel between 2 graphs. - """ - pass + Return + ------ + kernel : float + Kernel between 2 graphs. 
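# Sketch of the two sub_kernel choices used by _sd_do above (the eigenvalues
# and weight are example assumptions): given eigenvalues D1, D2 of the two
# transposed adjacency matrices, 'exp' weights each eigenvalue product with an
# exponential series while 'geo' uses the geometric series (I - weight*D)^(-1).
import numpy as np

D1 = np.array([1.2, -0.5])
D2 = np.array([0.9, 0.3, -0.7])
weight = 1e-2
D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
kmiddle_exp = np.diag(np.exp(weight * D_diag))              # sub_kernel == 'exp'
kmiddle_geo = np.linalg.inv(np.identity(len(D_diag))
                            - weight * np.diag(D_diag))     # sub_kernel == 'geo'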
+ """ + pass ############################################################################### def getLabels(Gn, node_label, edge_label, directed): - """Get symbolic labels of a graph dataset, where vertex labels are dealt - with by concatenating them to the edge labels of adjacent edges. - """ - label_list = [] - label_set = set() - for g in Gn: - label_g = {} - for e in g.edges(data=True): - nl1 = g.node[e[0]][node_label] - nl2 = g.node[e[1]][node_label] - if not directed and nl1 > nl2: - nl1, nl2 = nl2, nl1 - label = (nl1, e[2][edge_label], nl2) - label_g[(e[0], e[1])] = label - label_list.append(label_g) - label_set = set([l for lg in label_list for l in lg.values()]) - return label_list, len(label_set) + """Get symbolic labels of a graph dataset, where vertex labels are dealt + with by concatenating them to the edge labels of adjacent edges. + """ + label_list = [] + label_set = set() + for g in Gn: + label_g = {} + for e in g.edges(data=True): + nl1 = g.node[e[0]][node_label] + nl2 = g.node[e[1]][node_label] + if not directed and nl1 > nl2: + nl1, nl2 = nl2, nl1 + label = (nl1, e[2][edge_label], nl2) + label_g[(e[0], e[1])] = label + label_list.append(label_g) + label_set = set([l for lg in label_list for l in lg.values()]) + return label_list, len(label_set) def filterGramMatrix(gmt, label_dict, label, directed): - """Compute (the transpose of) the Gram matrix filtered by a label. - """ - gmf = np.zeros(gmt.shape) - for (n1, n2), l in label_dict.items(): - if l == label: - gmf[n2, n1] = gmt[n2, n1] - if not directed: - gmf[n1, n2] = gmt[n1, n2] - return gmf + """Compute (the transpose of) the Gram matrix filtered by a label. + """ + gmf = np.zeros(gmt.shape) + for (n1, n2), l in label_dict.items(): + if l == label: + gmf[n2, n1] = gmt[n2, n1] + if not directed: + gmf[n1, n2] = gmt[n1, n2] + return gmf def computeVK(g1, g2, ds_attrs, node_kernels, node_label): - '''Compute vertex kernels between vertices of two graphs. - ''' - vk_dict = {} # shortest path matrices dict - if ds_attrs['node_labeled']: - # node symb and non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['mix'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn( - n1[1][node_label], n2[1][node_label], - n1[1]['attributes'], n2[1]['attributes']) - # node symb labeled - else: - kn = node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], - n2[1][node_label]) - else: - # node non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], - n2[1]['attributes']) - # node unlabeled - else: - pass - return vk_dict + '''Compute vertex kernels between vertices of two graphs. 
+ ''' + vk_dict = {} # shortest path matrices dict + if ds_attrs['node_labeled']: + # node symb and non-synb labeled + if ds_attrs['node_attr_dim'] > 0: + kn = node_kernels['mix'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + vk_dict[(n1[0], n2[0])] = kn( + n1[1][node_label], n2[1][node_label], + n1[1]['attributes'], n2[1]['attributes']) + # node symb labeled + else: + kn = node_kernels['symb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], + n2[1][node_label]) + else: + # node non-synb labeled + if ds_attrs['node_attr_dim'] > 0: + kn = node_kernels['nsymb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], + n2[1]['attributes']) + # node unlabeled + else: + pass + return vk_dict def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label): - '''Compute weight matrix of the direct product graph. - ''' - w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2) - w_times = np.zeros((w_dim, w_dim)) - if vk_dict: # node labeled - if ds_attrs['is_directed']: - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] - # edge unlabeled - else: - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * vk_dict[(e1[1], e2[1])] - else: # undirected - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] \ - + vk_dict[(e1[0], e2[1])] \ - * ek_temp * vk_dict[(e1[1], e2[0])] - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in 
g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] \ - + vk_dict[(e1[0], e2[1])] \ - * ek_temp * vk_dict[(e1[1], e2[0])] - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * ek_temp * vk_dict[(e1[1], e2[1])] \ - + vk_dict[(e1[0], e2[1])] \ - * ek_temp * vk_dict[(e1[1], e2[0])] - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - # edge unlabeled - else: - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ - * vk_dict[(e1[1], e2[1])] \ - + vk_dict[(e1[0], e2[1])] \ - * vk_dict[(e1[1], e2[0])] - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - else: # node unlabeled - if ds_attrs['is_directed']: - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - # edge unlabeled - else: - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = 1 - else: # undirected - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], 
e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = ek_temp - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - # edge unlabeled - else: - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], - e1[1] * nx.number_of_nodes(g2) + e2[1]) - w_times[w_idx] = 1 - w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] - w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], - e1[1] * nx.number_of_nodes(g2) + e2[0]) - w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] - w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] - return w_times, w_dim + '''Compute weight matrix of the direct product graph. 
+ ''' + w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2) + w_times = np.zeros((w_dim, w_dim)) + if vk_dict: # node labeled + if ds_attrs['is_directed']: + if ds_attrs['edge_labeled']: + # edge symb and non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['mix'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label], + e1[2]['attributes'], e2[2]['attributes']) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ + * ek_temp * vk_dict[(e1[1], e2[1])] + # edge symb labeled + else: + ke = edge_kernels['symb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ + * ek_temp * vk_dict[(e1[1], e2[1])] + else: + # edge non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['nsymb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ + * ek_temp * vk_dict[(e1[1], e2[1])] + # edge unlabeled + else: + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ + * vk_dict[(e1[1], e2[1])] + else: # undirected + if ds_attrs['edge_labeled']: + # edge symb and non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['mix'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label], + e1[2]['attributes'], e2[2]['attributes']) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ + * ek_temp * vk_dict[(e1[1], e2[1])] \ + + vk_dict[(e1[0], e2[1])] \ + * ek_temp * vk_dict[(e1[1], e2[0])] + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], + e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + # edge symb labeled + else: + ke = edge_kernels['symb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ + * ek_temp * vk_dict[(e1[1], e2[1])] \ + + vk_dict[(e1[0], e2[1])] \ + * ek_temp * vk_dict[(e1[1], e2[0])] + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], + e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + else: + # edge non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['nsymb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ + * ek_temp * 
vk_dict[(e1[1], e2[1])] \ + + vk_dict[(e1[0], e2[1])] \ + * ek_temp * vk_dict[(e1[1], e2[0])] + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], + e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + # edge unlabeled + else: + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] \ + * vk_dict[(e1[1], e2[1])] \ + + vk_dict[(e1[0], e2[1])] \ + * vk_dict[(e1[1], e2[0])] + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], + e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + else: # node unlabeled + if ds_attrs['is_directed']: + if ds_attrs['edge_labeled']: + # edge symb and non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['mix'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label], + e1[2]['attributes'], e2[2]['attributes']) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp + # edge symb labeled + else: + ke = edge_kernels['symb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp + else: + # edge non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['nsymb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp + # edge unlabeled + else: + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = 1 + else: # undirected + if ds_attrs['edge_labeled']: + # edge symb and non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['mix'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label], + e1[2]['attributes'], e2[2]['attributes']) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], + e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + # edge symb labeled + else: + ke = edge_kernels['symb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], + e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = 
w_times[w_idx[0], w_idx[1]] + else: + # edge non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['nsymb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], + e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + # edge unlabeled + else: + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], + e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = 1 + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], + e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + return w_times, w_dim diff --git a/gklearn/kernels/spKernel.py b/gklearn/kernels/spKernel.py index 615d2fc..b48a905 100644 --- a/gklearn/kernels/spKernel.py +++ b/gklearn/kernels/spKernel.py @@ -2,9 +2,9 @@ @author: linlin @references: - - [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData - Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE. + + [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData + Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE. """ import sys @@ -22,303 +22,305 @@ from gklearn.utils.graphdataset import get_dataset_attributes from gklearn.utils.parallel import parallel_gm def spkernel(*args, - node_label='atom', - edge_weight=None, - node_kernels=None, - parallel='imap_unordered', - n_jobs=None, - verbose=True): - """Calculate shortest-path kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - - G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. - - node_label : string - Node attribute used as label. The default node label is atom. - - edge_weight : string - Edge attribute name corresponding to the edge weight. - - node_kernels : dict - A dictionary of kernel functions for nodes, including 3 items: 'symb' - for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' - for both labels. The first 2 functions take two node labels as - parameters, and the 'mix' function takes 4 parameters, a symbolic and a - non-symbolic label for each the two nodes. Each label is in form of 2-D - dimension array (n_samples, n_features). Each function returns an - number as the kernel value. Ignored when nodes are unlabeled. - - n_jobs : int - Number of jobs for parallelization. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the sp kernel between 2 praphs. - """ - # pre-process - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - Gn = [g.copy() for g in Gn] - weight = None - if edge_weight is None: - if verbose: - print('\n None edge weight specified. Set all weight to 1.\n') - else: - try: - some_weight = list( - nx.get_edge_attributes(Gn[0], edge_weight).values())[0] - if isinstance(some_weight, (float, int)): - weight = edge_weight - else: - if verbose: - print( - '\n Edge weight with name %s is not float or integer. 
Set all weight to 1.\n' - % edge_weight) - except: - if verbose: - print( - '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' - % edge_weight) - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'node_attr_dim', 'is_directed'], - node_label=node_label) - - # remove graphs with no edges, as no sp can be found in their structures, - # so the kernel between such a graph and itself will be zero. - len_gn = len(Gn) - Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] - idx = [G[0] for G in Gn] - Gn = [G[1] for G in Gn] - if len(Gn) != len_gn: - if verbose: - print('\n %d graphs are removed as they don\'t contain edges.\n' % - (len_gn - len(Gn))) - - start_time = time.time() - - if parallel == 'imap_unordered': - pool = Pool(n_jobs) - # get shortest path graphs of Gn - getsp_partial = partial(wrapper_getSPGraph, weight) - itr = zip(Gn, range(0, len(Gn))) - if len(Gn) < 100 * n_jobs: - # # use default chunksize as pool.map when iterable is less than 100 - # chunksize, extra = divmod(len(Gn), n_jobs * 4) - # if extra: - # chunksize += 1 - chunksize = int(len(Gn) / n_jobs) + 1 - else: - chunksize = 100 - if verbose: - iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize), - desc='getting sp graphs', file=sys.stdout) - else: - iterator = pool.imap_unordered(getsp_partial, itr, chunksize) - for i, g in iterator: - Gn[i] = g - pool.close() - pool.join() - - elif parallel is None: - pass -# # ---- direct running, normally use single CPU core. ---- -# for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout): -# i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i)) - - # # ---- use pool.map to parallel ---- - # result_sp = pool.map(getsp_partial, range(0, len(Gn))) - # for i in result_sp: - # Gn[i[0]] = i[1] - # or - # getsp_partial = partial(wrap_getSPGraph, Gn, weight) - # for i, g in tqdm( - # pool.map(getsp_partial, range(0, len(Gn))), - # desc='getting sp graphs', - # file=sys.stdout): - # Gn[i] = g - - # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) - # sp_ml = [0] * len(Gn) # shortest path matrices - # for i in result_sp: - # sp_ml[i[0]] = i[1] - # edge_x_g = [[] for i in range(len(sp_ml))] - # edge_y_g = [[] for i in range(len(sp_ml))] - # edge_w_g = [[] for i in range(len(sp_ml))] - # for idx, item in enumerate(sp_ml): - # for i1 in range(len(item)): - # for i2 in range(i1 + 1, len(item)): - # if item[i1, i2] != np.inf: - # edge_x_g[idx].append(i1) - # edge_y_g[idx].append(i2) - # edge_w_g[idx].append(item[i1, i2]) - # print(len(edge_x_g[0])) - # print(len(edge_y_g[0])) - # print(len(edge_w_g[0])) - - Kmatrix = np.zeros((len(Gn), len(Gn))) - - # ---- use pool.imap_unordered to parallel and track progress. ---- - def init_worker(gn_toshare): - global G_gn - G_gn = gn_toshare - do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(Gn,), n_jobs=n_jobs, verbose=verbose) - - - # # ---- use pool.map to parallel. ---- - # # result_perf = pool.map(do_partial, itr) - # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) - # itr = combinations_with_replacement(range(0, len(Gn)), 2) - # for i, j, kernel in tqdm( - # pool.map(do_partial, itr), desc='calculating kernels', - # file=sys.stdout): - # Kmatrix[i][j] = kernel - # Kmatrix[j][i] = kernel - # pool.close() - # pool.join() - - # # ---- use joblib.Parallel to parallel and track progress. 
---- - # result_perf = Parallel( - # n_jobs=n_jobs, verbose=10)( - # delayed(do_partial)(ij) - # for ij in combinations_with_replacement(range(0, len(Gn)), 2)) - # result_perf = [ - # do_partial(ij) - # for ij in combinations_with_replacement(range(0, len(Gn)), 2) - # ] - # for i in result_perf: - # Kmatrix[i[0]][i[1]] = i[2] - # Kmatrix[i[1]][i[0]] = i[2] - -# # ---- direct running, normally use single CPU core. ---- -# from itertools import combinations_with_replacement -# itr = combinations_with_replacement(range(0, len(Gn)), 2) -# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): -# kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels) -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel - - run_time = time.time() - start_time - if verbose: - print( - "\n --- shortest path kernel matrix of size %d built in %s seconds ---" - % (len(Gn), run_time)) - - return Kmatrix, run_time, idx + node_label='atom', + edge_weight=None, + node_kernels=None, + parallel='imap_unordered', + n_jobs=None, + chunksize=None, + verbose=True): + """Calculate shortest-path kernels between graphs. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are calculated. + + G1, G2 : NetworkX graphs + Two graphs between which the kernel is calculated. + + node_label : string + Node attribute used as label. The default node label is atom. + + edge_weight : string + Edge attribute name corresponding to the edge weight. + + node_kernels : dict + A dictionary of kernel functions for nodes, including 3 items: 'symb' + for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' + for both labels. The first 2 functions take two node labels as + parameters, and the 'mix' function takes 4 parameters, a symbolic and a + non-symbolic label for each the two nodes. Each label is in form of 2-D + dimension array (n_samples, n_features). Each function returns an + number as the kernel value. Ignored when nodes are unlabeled. + + n_jobs : int + Number of jobs for parallelization. + + Return + ------ + Kmatrix : Numpy matrix + Kernel matrix, each element of which is the sp kernel between 2 praphs. + """ + # pre-process + Gn = args[0] if len(args) == 1 else [args[0], args[1]] + Gn = [g.copy() for g in Gn] + weight = None + if edge_weight is None: + if verbose: + print('\n None edge weight specified. Set all weight to 1.\n') + else: + try: + some_weight = list( + nx.get_edge_attributes(Gn[0], edge_weight).values())[0] + if isinstance(some_weight, (float, int)): + weight = edge_weight + else: + if verbose: + print( + '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' + % edge_weight) + except: + if verbose: + print( + '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' + % edge_weight) + ds_attrs = get_dataset_attributes( + Gn, + attr_names=['node_labeled', 'node_attr_dim', 'is_directed'], + node_label=node_label) + + # remove graphs with no edges, as no sp can be found in their structures, + # so the kernel between such a graph and itself will be zero. 
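Because graphs without edges are dropped at this point, the Gram matrix computed below can be smaller than the input list; the `idx` value returned at the end records which input positions survived. A minimal sketch of mapping the reduced matrix back onto the original dataset positions (the helper name `reembed` and its use are illustrative, not part of the patch):

    import numpy as np

    def reembed(K_small, kept, n_total):
        # Place the reduced Gram matrix back into an n_total x n_total frame,
        # using the surviving positions returned as the third output of spkernel.
        K_full = np.zeros((n_total, n_total))
        for a, i in enumerate(kept):
            for b, j in enumerate(kept):
                K_full[i, j] = K_small[a, b]
        return K_full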
+ len_gn = len(Gn) + Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0] + idx = [G[0] for G in Gn] + Gn = [G[1] for G in Gn] + if len(Gn) != len_gn: + if verbose: + print('\n %d graphs are removed as they don\'t contain edges.\n' % + (len_gn - len(Gn))) + + start_time = time.time() + + if parallel == 'imap_unordered': + pool = Pool(n_jobs) + # get shortest path graphs of Gn + getsp_partial = partial(wrapper_getSPGraph, weight) + itr = zip(Gn, range(0, len(Gn))) + if chunksize is None: + if len(Gn) < 100 * n_jobs: + # # use default chunksize as pool.map when iterable is less than 100 + # chunksize, extra = divmod(len(Gn), n_jobs * 4) + # if extra: + # chunksize += 1 + chunksize = int(len(Gn) / n_jobs) + 1 + else: + chunksize = 100 + if verbose: + iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize), + desc='getting sp graphs', file=sys.stdout) + else: + iterator = pool.imap_unordered(getsp_partial, itr, chunksize) + for i, g in iterator: + Gn[i] = g + pool.close() + pool.join() + + elif parallel is None: + pass +# # ---- direct running, normally use single CPU core. ---- +# for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout): +# i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i)) + + # # ---- use pool.map to parallel ---- + # result_sp = pool.map(getsp_partial, range(0, len(Gn))) + # for i in result_sp: + # Gn[i[0]] = i[1] + # or + # getsp_partial = partial(wrap_getSPGraph, Gn, weight) + # for i, g in tqdm( + # pool.map(getsp_partial, range(0, len(Gn))), + # desc='getting sp graphs', + # file=sys.stdout): + # Gn[i] = g + + # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) + # sp_ml = [0] * len(Gn) # shortest path matrices + # for i in result_sp: + # sp_ml[i[0]] = i[1] + # edge_x_g = [[] for i in range(len(sp_ml))] + # edge_y_g = [[] for i in range(len(sp_ml))] + # edge_w_g = [[] for i in range(len(sp_ml))] + # for idx, item in enumerate(sp_ml): + # for i1 in range(len(item)): + # for i2 in range(i1 + 1, len(item)): + # if item[i1, i2] != np.inf: + # edge_x_g[idx].append(i1) + # edge_y_g[idx].append(i2) + # edge_w_g[idx].append(item[i1, i2]) + # print(len(edge_x_g[0])) + # print(len(edge_y_g[0])) + # print(len(edge_w_g[0])) + + Kmatrix = np.zeros((len(Gn), len(Gn))) + + # ---- use pool.imap_unordered to parallel and track progress. ---- + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + + + # # ---- use pool.map to parallel. ---- + # # result_perf = pool.map(do_partial, itr) + # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) + # itr = combinations_with_replacement(range(0, len(Gn)), 2) + # for i, j, kernel in tqdm( + # pool.map(do_partial, itr), desc='calculating kernels', + # file=sys.stdout): + # Kmatrix[i][j] = kernel + # Kmatrix[j][i] = kernel + # pool.close() + # pool.join() + + # # ---- use joblib.Parallel to parallel and track progress. ---- + # result_perf = Parallel( + # n_jobs=n_jobs, verbose=10)( + # delayed(do_partial)(ij) + # for ij in combinations_with_replacement(range(0, len(Gn)), 2)) + # result_perf = [ + # do_partial(ij) + # for ij in combinations_with_replacement(range(0, len(Gn)), 2) + # ] + # for i in result_perf: + # Kmatrix[i[0]][i[1]] = i[2] + # Kmatrix[i[1]][i[0]] = i[2] + +# # ---- direct running, normally use single CPU core. 
---- +# from itertools import combinations_with_replacement +# itr = combinations_with_replacement(range(0, len(Gn)), 2) +# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): +# kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels) +# Kmatrix[i][j] = kernel +# Kmatrix[j][i] = kernel + + run_time = time.time() - start_time + if verbose: + print( + "\n --- shortest path kernel matrix of size %d built in %s seconds ---" + % (len(Gn), run_time)) + + return Kmatrix, run_time, idx def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels): - - kernel = 0 - - # compute shortest path matrices first, method borrowed from FCSP. - vk_dict = {} # shortest path matrices dict - if ds_attrs['node_labeled']: - # node symb and non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['mix'] - for n1, n2 in product( - g1.nodes(data=True), g2.nodes(data=True)): - vk_dict[(n1[0], n2[0])] = kn( - n1[1][node_label], n2[1][node_label], - n1[1]['attributes'], n2[1]['attributes']) - # node symb labeled - else: - kn = node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], - n2[1][node_label]) - else: - # node non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], - n2[1]['attributes']) - # node unlabeled - else: - for e1, e2 in product( - g1.edges(data=True), g2.edges(data=True)): - if e1[2]['cost'] == e2[2]['cost']: - kernel += 1 - return kernel - - # compute graph kernels - if ds_attrs['is_directed']: - for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): - if e1[2]['cost'] == e2[2]['cost']: - nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], - e2[1])] - kn1 = nk11 * nk22 - kernel += kn1 - else: - for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): - if e1[2]['cost'] == e2[2]['cost']: - # each edge walk is counted twice, starting from both its extreme nodes. - nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( - e1[0], e2[1])], vk_dict[(e1[1], - e2[0])], vk_dict[(e1[1], - e2[1])] - kn1 = nk11 * nk22 - kn2 = nk12 * nk21 - kernel += kn1 + kn2 - - # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation - # # compute vertex kernels - # try: - # vk_mat = np.zeros((nx.number_of_nodes(g1), - # nx.number_of_nodes(g2))) - # g1nl = enumerate(g1.nodes(data=True)) - # g2nl = enumerate(g2.nodes(data=True)) - # for i1, n1 in g1nl: - # for i2, n2 in g2nl: - # vk_mat[i1][i2] = kn( - # n1[1][node_label], n2[1][node_label], - # [n1[1]['attributes']], [n2[1]['attributes']]) - - # range1 = range(0, len(edge_w_g[i])) - # range2 = range(0, len(edge_w_g[j])) - # for i1 in range1: - # x1 = edge_x_g[i][i1] - # y1 = edge_y_g[i][i1] - # w1 = edge_w_g[i][i1] - # for i2 in range2: - # x2 = edge_x_g[j][i2] - # y2 = edge_y_g[j][i2] - # w2 = edge_w_g[j][i2] - # ke = (w1 == w2) - # if ke > 0: - # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] - # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] - # kernel += kn1 + kn2 - - return kernel + + kernel = 0 + + # compute shortest path matrices first, method borrowed from FCSP. 
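The node-kernel dispatch below only assumes that `node_kernels` provides the three entries described in the docstring ('symb', 'nsymb', 'mix'). A hedged sketch of such a dictionary, using a delta kernel on symbolic labels and an RBF on attribute vectors (both choices and the `gamma` value are illustrative, not mandated by the patch):

    import numpy as np

    def k_symb(l1, l2):
        # delta kernel on symbolic node labels
        return 1.0 if l1 == l2 else 0.0

    def k_nsymb(a1, a2, gamma=1.0):
        # RBF kernel on non-symbolic attribute vectors
        a1, a2 = np.asarray(a1, dtype=float), np.asarray(a2, dtype=float)
        return float(np.exp(-gamma * np.sum((a1 - a2) ** 2)))

    node_kernels = {
        'symb': k_symb,
        'nsymb': k_nsymb,
        'mix': lambda l1, l2, a1, a2: k_symb(l1, l2) * k_nsymb(a1, a2),
    }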
+ vk_dict = {} # shortest path matrices dict + if ds_attrs['node_labeled']: + # node symb and non-synb labeled + if ds_attrs['node_attr_dim'] > 0: + kn = node_kernels['mix'] + for n1, n2 in product( + g1.nodes(data=True), g2.nodes(data=True)): + vk_dict[(n1[0], n2[0])] = kn( + n1[1][node_label], n2[1][node_label], + n1[1]['attributes'], n2[1]['attributes']) + # node symb labeled + else: + kn = node_kernels['symb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], + n2[1][node_label]) + else: + # node non-synb labeled + if ds_attrs['node_attr_dim'] > 0: + kn = node_kernels['nsymb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], + n2[1]['attributes']) + # node unlabeled + else: + for e1, e2 in product( + g1.edges(data=True), g2.edges(data=True)): + if e1[2]['cost'] == e2[2]['cost']: + kernel += 1 + return kernel + + # compute graph kernels + if ds_attrs['is_directed']: + for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): + if e1[2]['cost'] == e2[2]['cost']: + nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], + e2[1])] + kn1 = nk11 * nk22 + kernel += kn1 + else: + for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): + if e1[2]['cost'] == e2[2]['cost']: + # each edge walk is counted twice, starting from both its extreme nodes. + nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( + e1[0], e2[1])], vk_dict[(e1[1], + e2[0])], vk_dict[(e1[1], + e2[1])] + kn1 = nk11 * nk22 + kn2 = nk12 * nk21 + kernel += kn1 + kn2 + + # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation + # # compute vertex kernels + # try: + # vk_mat = np.zeros((nx.number_of_nodes(g1), + # nx.number_of_nodes(g2))) + # g1nl = enumerate(g1.nodes(data=True)) + # g2nl = enumerate(g2.nodes(data=True)) + # for i1, n1 in g1nl: + # for i2, n2 in g2nl: + # vk_mat[i1][i2] = kn( + # n1[1][node_label], n2[1][node_label], + # [n1[1]['attributes']], [n2[1]['attributes']]) + + # range1 = range(0, len(edge_w_g[i])) + # range2 = range(0, len(edge_w_g[j])) + # for i1 in range1: + # x1 = edge_x_g[i][i1] + # y1 = edge_y_g[i][i1] + # w1 = edge_w_g[i][i1] + # for i2 in range2: + # x2 = edge_x_g[j][i2] + # y2 = edge_y_g[j][i2] + # w2 = edge_w_g[j][i2] + # ke = (w1 == w2) + # if ke > 0: + # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] + # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] + # kernel += kn1 + kn2 + + return kernel def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr): - i = itr[0] - j = itr[1] - return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels) + i = itr[0] + j = itr[1] + return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels) #def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr_item): -# g1 = itr_item[0][0] -# g2 = itr_item[0][1] -# i = itr_item[1][0] -# j = itr_item[1][1] -# return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels) +# g1 = itr_item[0][0] +# g2 = itr_item[0][1] +# i = itr_item[1][0] +# j = itr_item[1][1] +# return i, j, spkernel_do(g1, g2, ds_attrs, node_label, node_kernels) def wrapper_getSPGraph(weight, itr_item): - g = itr_item[0] - i = itr_item[1] - return i, getSPGraph(g, edge_weight=weight) - # return i, nx.floyd_warshall_numpy(g, weight=weight) + g = itr_item[0] + i = itr_item[1] + return i, getSPGraph(g, edge_weight=weight) + # return i, nx.floyd_warshall_numpy(g, weight=weight) diff --git 
a/gklearn/kernels/structuralspKernel.py b/gklearn/kernels/structuralspKernel.py index b200b36..fb8dbf9 100644 --- a/gklearn/kernels/structuralspKernel.py +++ b/gklearn/kernels/structuralspKernel.py @@ -7,8 +7,8 @@ Created on Thu Sep 27 10:56:23 2018 @references: - [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For - Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). + [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For + Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). """ import sys @@ -26,836 +26,838 @@ from gklearn.utils.parallel import parallel_gm from gklearn.utils.trie import Trie def structuralspkernel(*args, - node_label='atom', - edge_weight=None, - edge_label='bond_type', - node_kernels=None, - edge_kernels=None, - compute_method='naive', - parallel='imap_unordered', -# parallel=None, - n_jobs=None, - verbose=True): - """Calculate mean average structural shortest path kernels between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - - G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. - - node_label : string - Node attribute used as label. The default node label is atom. - - edge_weight : string - Edge attribute name corresponding to the edge weight. Applied for the - computation of the shortest paths. - - edge_label : string - Edge attribute used as label. The default edge label is bond_type. - - node_kernels : dict - A dictionary of kernel functions for nodes, including 3 items: 'symb' - for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' - for both labels. The first 2 functions take two node labels as - parameters, and the 'mix' function takes 4 parameters, a symbolic and a - non-symbolic label for each the two nodes. Each label is in form of 2-D - dimension array (n_samples, n_features). Each function returns a number - as the kernel value. Ignored when nodes are unlabeled. - - edge_kernels : dict - A dictionary of kernel functions for edges, including 3 items: 'symb' - for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix' - for both labels. The first 2 functions take two edge labels as - parameters, and the 'mix' function takes 4 parameters, a symbolic and a - non-symbolic label for each the two edges. Each label is in form of 2-D - dimension array (n_samples, n_features). Each function returns a number - as the kernel value. Ignored when edges are unlabeled. - - compute_method : string - Computation method to store the shortest paths and compute the graph - kernel. The Following choices are available: - - 'trie': store paths as tries. - - 'naive': store paths to lists. - - n_jobs : int - Number of jobs for parallelization. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the mean average structural - shortest path kernel between 2 praphs. - """ - # pre-process - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - Gn = [g.copy() for g in Gn] - weight = None - if edge_weight is None: - if verbose: - print('\n None edge weight specified. Set all weight to 1.\n') - else: - try: - some_weight = list( - nx.get_edge_attributes(Gn[0], edge_weight).values())[0] - if isinstance(some_weight, (float, int)): - weight = edge_weight - else: - if verbose: - print( - '\n Edge weight with name %s is not float or integer. 
Set all weight to 1.\n' - % edge_weight) - except: - if verbose: - print( - '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' - % edge_weight) - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', - 'edge_attr_dim', 'is_directed'], - node_label=node_label, edge_label=edge_label) - - start_time = time.time() - - # get shortest paths of each graph in Gn - if parallel == 'imap_unordered': - splist = [None] * len(Gn) - pool = Pool(n_jobs) - itr = zip(Gn, range(0, len(Gn))) - if len(Gn) < 100 * n_jobs: - chunksize = int(len(Gn) / n_jobs) + 1 - else: - chunksize = 100 - # get shortest path graphs of Gn - if compute_method == 'trie': - getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed']) - else: - getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed']) - if verbose: - iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize), - desc='getting shortest paths', file=sys.stdout) - else: - iterator = pool.imap_unordered(getsp_partial, itr, chunksize) - for i, sp in iterator: - splist[i] = sp - # time.sleep(10) - pool.close() - pool.join() - # ---- direct running, normally use single CPU core. ---- - elif parallel is None: - splist = [] - if verbose: - iterator = tqdm(Gn, desc='getting sp graphs', file=sys.stdout) - else: - iterator = Gn - if compute_method == 'trie': - for g in iterator: - splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed'])) - else: - for g in iterator: - splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) - -# ss = 0 -# ss += sys.getsizeof(splist) -# for spss in splist: -# ss += sys.getsizeof(spss) -# for spp in spss: -# ss += sys.getsizeof(spp) - - -# time.sleep(20) - - - - # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) - # sp_ml = [0] * len(Gn) # shortest path matrices - # for i in result_sp: - # sp_ml[i[0]] = i[1] - # edge_x_g = [[] for i in range(len(sp_ml))] - # edge_y_g = [[] for i in range(len(sp_ml))] - # edge_w_g = [[] for i in range(len(sp_ml))] - # for idx, item in enumerate(sp_ml): - # for i1 in range(len(item)): - # for i2 in range(i1 + 1, len(item)): - # if item[i1, i2] != np.inf: - # edge_x_g[idx].append(i1) - # edge_y_g[idx].append(i2) - # edge_w_g[idx].append(item[i1, i2]) - # print(len(edge_x_g[0])) - # print(len(edge_y_g[0])) - # print(len(edge_w_g[0])) - - Kmatrix = np.zeros((len(Gn), len(Gn))) - - # ---- use pool.imap_unordered to parallel and track progress. ---- - if parallel == 'imap_unordered': - def init_worker(spl_toshare, gs_toshare): - global G_spl, G_gs - G_spl = spl_toshare - G_gs = gs_toshare - if compute_method == 'trie': - do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label, - node_kernels, edge_kernels) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose) - else: - do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, - node_kernels, edge_kernels) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose) - # ---- direct running, normally use single CPU core. 
---- - elif parallel is None: - from itertools import combinations_with_replacement - itr = combinations_with_replacement(range(0, len(Gn)), 2) - if verbose: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) - else: - iterator = itr - if compute_method == 'trie': - for i, j in iterator: - kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j], - ds_attrs, node_label, edge_label, node_kernels, edge_kernels) - Kmatrix[i][j] = kernel - Kmatrix[j][i] = kernel - else: - for i, j in iterator: - kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j], - ds_attrs, node_label, edge_label, node_kernels, edge_kernels) - # if(kernel > 1): - # print("error here ") - Kmatrix[i][j] = kernel - Kmatrix[j][i] = kernel - -# # ---- use pool.map to parallel. ---- -# pool = Pool(n_jobs) -# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, -# node_kernels, edge_kernels) -# itr = zip(combinations_with_replacement(Gn, 2), -# combinations_with_replacement(splist, 2), -# combinations_with_replacement(range(0, len(Gn)), 2)) -# for i, j, kernel in tqdm( -# pool.map(do_partial, itr), desc='calculating kernels', -# file=sys.stdout): -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel -# pool.close() -# pool.join() - -# # ---- use pool.imap_unordered to parallel and track progress. ---- -# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, -# node_kernels, edge_kernels) -# itr = zip(combinations_with_replacement(Gn, 2), -# combinations_with_replacement(splist, 2), -# combinations_with_replacement(range(0, len(Gn)), 2)) -# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) -# if len_itr < 1000 * n_jobs: -# chunksize = int(len_itr / n_jobs) + 1 -# else: -# chunksize = 1000 -# from contextlib import closing -# with closing(Pool(n_jobs)) as pool: -# for i, j, kernel in tqdm( -# pool.imap_unordered(do_partial, itr, 1000), -# desc='calculating kernels', -# file=sys.stdout): -# Kmatrix[i][j] = kernel -# Kmatrix[j][i] = kernel -# pool.close() -# pool.join() - - - - run_time = time.time() - start_time - if verbose: - print("\n --- shortest path kernel matrix of size %d built in %s seconds ---" - % (len(Gn), run_time)) - - return Kmatrix, run_time + node_label='atom', + edge_weight=None, + edge_label='bond_type', + node_kernels=None, + edge_kernels=None, + compute_method='naive', + parallel='imap_unordered', +# parallel=None, + n_jobs=None, + chunksize=None, + verbose=True): + """Calculate mean average structural shortest path kernels between graphs. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are calculated. + + G1, G2 : NetworkX graphs + Two graphs between which the kernel is calculated. + + node_label : string + Node attribute used as label. The default node label is atom. + + edge_weight : string + Edge attribute name corresponding to the edge weight. Applied for the + computation of the shortest paths. + + edge_label : string + Edge attribute used as label. The default edge label is bond_type. + + node_kernels : dict + A dictionary of kernel functions for nodes, including 3 items: 'symb' + for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' + for both labels. The first 2 functions take two node labels as + parameters, and the 'mix' function takes 4 parameters, a symbolic and a + non-symbolic label for each the two nodes. Each label is in form of 2-D + dimension array (n_samples, n_features). Each function returns a number + as the kernel value. Ignored when nodes are unlabeled. 
+ + edge_kernels : dict + A dictionary of kernel functions for edges, including 3 items: 'symb' + for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix' + for both labels. The first 2 functions take two edge labels as + parameters, and the 'mix' function takes 4 parameters, a symbolic and a + non-symbolic label for each the two edges. Each label is in form of 2-D + dimension array (n_samples, n_features). Each function returns a number + as the kernel value. Ignored when edges are unlabeled. + + compute_method : string + Computation method to store the shortest paths and compute the graph + kernel. The Following choices are available: + + 'trie': store paths as tries. + + 'naive': store paths to lists. + + n_jobs : int + Number of jobs for parallelization. + + Return + ------ + Kmatrix : Numpy matrix + Kernel matrix, each element of which is the mean average structural + shortest path kernel between 2 praphs. + """ + # pre-process + Gn = args[0] if len(args) == 1 else [args[0], args[1]] + Gn = [g.copy() for g in Gn] + weight = None + if edge_weight is None: + if verbose: + print('\n None edge weight specified. Set all weight to 1.\n') + else: + try: + some_weight = list( + nx.get_edge_attributes(Gn[0], edge_weight).values())[0] + if isinstance(some_weight, (float, int)): + weight = edge_weight + else: + if verbose: + print( + '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' + % edge_weight) + except: + if verbose: + print( + '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n' + % edge_weight) + ds_attrs = get_dataset_attributes( + Gn, + attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', + 'edge_attr_dim', 'is_directed'], + node_label=node_label, edge_label=edge_label) + + start_time = time.time() + + # get shortest paths of each graph in Gn + if parallel == 'imap_unordered': + splist = [None] * len(Gn) + pool = Pool(n_jobs) + itr = zip(Gn, range(0, len(Gn))) + if chunksize is None: + if len(Gn) < 100 * n_jobs: + chunksize = int(len(Gn) / n_jobs) + 1 + else: + chunksize = 100 + # get shortest path graphs of Gn + if compute_method == 'trie': + getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed']) + else: + getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed']) + if verbose: + iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize), + desc='getting shortest paths', file=sys.stdout) + else: + iterator = pool.imap_unordered(getsp_partial, itr, chunksize) + for i, sp in iterator: + splist[i] = sp + # time.sleep(10) + pool.close() + pool.join() + # ---- direct running, normally use single CPU core. 
---- + elif parallel is None: + splist = [] + if verbose: + iterator = tqdm(Gn, desc='getting sp graphs', file=sys.stdout) + else: + iterator = Gn + if compute_method == 'trie': + for g in iterator: + splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed'])) + else: + for g in iterator: + splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed'])) + +# ss = 0 +# ss += sys.getsizeof(splist) +# for spss in splist: +# ss += sys.getsizeof(spss) +# for spp in spss: +# ss += sys.getsizeof(spp) + + +# time.sleep(20) + + + + # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP) + # sp_ml = [0] * len(Gn) # shortest path matrices + # for i in result_sp: + # sp_ml[i[0]] = i[1] + # edge_x_g = [[] for i in range(len(sp_ml))] + # edge_y_g = [[] for i in range(len(sp_ml))] + # edge_w_g = [[] for i in range(len(sp_ml))] + # for idx, item in enumerate(sp_ml): + # for i1 in range(len(item)): + # for i2 in range(i1 + 1, len(item)): + # if item[i1, i2] != np.inf: + # edge_x_g[idx].append(i1) + # edge_y_g[idx].append(i2) + # edge_w_g[idx].append(item[i1, i2]) + # print(len(edge_x_g[0])) + # print(len(edge_y_g[0])) + # print(len(edge_w_g[0])) + + Kmatrix = np.zeros((len(Gn), len(Gn))) + + # ---- use pool.imap_unordered to parallel and track progress. ---- + if parallel == 'imap_unordered': + def init_worker(spl_toshare, gs_toshare): + global G_spl, G_gs + G_spl = spl_toshare + G_gs = gs_toshare + if compute_method == 'trie': + do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label, + node_kernels, edge_kernels) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + else: + do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, + node_kernels, edge_kernels) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + # ---- direct running, normally use single CPU core. ---- + elif parallel is None: + from itertools import combinations_with_replacement + itr = combinations_with_replacement(range(0, len(Gn)), 2) + if verbose: + iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + else: + iterator = itr + if compute_method == 'trie': + for i, j in iterator: + kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j], + ds_attrs, node_label, edge_label, node_kernels, edge_kernels) + Kmatrix[i][j] = kernel + Kmatrix[j][i] = kernel + else: + for i, j in iterator: + kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j], + ds_attrs, node_label, edge_label, node_kernels, edge_kernels) + # if(kernel > 1): + # print("error here ") + Kmatrix[i][j] = kernel + Kmatrix[j][i] = kernel + +# # ---- use pool.map to parallel. ---- +# pool = Pool(n_jobs) +# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, +# node_kernels, edge_kernels) +# itr = zip(combinations_with_replacement(Gn, 2), +# combinations_with_replacement(splist, 2), +# combinations_with_replacement(range(0, len(Gn)), 2)) +# for i, j, kernel in tqdm( +# pool.map(do_partial, itr), desc='calculating kernels', +# file=sys.stdout): +# Kmatrix[i][j] = kernel +# Kmatrix[j][i] = kernel +# pool.close() +# pool.join() + +# # ---- use pool.imap_unordered to parallel and track progress. 
---- +# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label, +# node_kernels, edge_kernels) +# itr = zip(combinations_with_replacement(Gn, 2), +# combinations_with_replacement(splist, 2), +# combinations_with_replacement(range(0, len(Gn)), 2)) +# len_itr = int(len(Gn) * (len(Gn) + 1) / 2) +# if len_itr < 1000 * n_jobs: +# chunksize = int(len_itr / n_jobs) + 1 +# else: +# chunksize = 1000 +# from contextlib import closing +# with closing(Pool(n_jobs)) as pool: +# for i, j, kernel in tqdm( +# pool.imap_unordered(do_partial, itr, 1000), +# desc='calculating kernels', +# file=sys.stdout): +# Kmatrix[i][j] = kernel +# Kmatrix[j][i] = kernel +# pool.close() +# pool.join() + + + + run_time = time.time() - start_time + if verbose: + print("\n --- shortest path kernel matrix of size %d built in %s seconds ---" + % (len(Gn), run_time)) + + return Kmatrix, run_time def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label, - node_kernels, edge_kernels): - - kernel = 0 - - # First, compute shortest path matrices, method borrowed from FCSP. - vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs) - # Then, compute kernels between all pairs of edges, which is an idea of - # extension of FCSP. It suits sparse graphs, which is the most case we - # went though. For dense graphs, this would be slow. - ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs) - - # compute graph kernels - if vk_dict: - if ek_dict: - for p1, p2 in product(spl1, spl2): - if len(p1) == len(p2): - kpath = vk_dict[(p1[0], p2[0])] - if kpath: - for idx in range(1, len(p1)): - kpath *= vk_dict[(p1[idx], p2[idx])] * \ - ek_dict[((p1[idx-1], p1[idx]), - (p2[idx-1], p2[idx]))] - if not kpath: - break - kernel += kpath # add up kernels of all paths - else: - for p1, p2 in product(spl1, spl2): - if len(p1) == len(p2): - kpath = vk_dict[(p1[0], p2[0])] - if kpath: - for idx in range(1, len(p1)): - kpath *= vk_dict[(p1[idx], p2[idx])] - if not kpath: - break - kernel += kpath # add up kernels of all paths - else: - if ek_dict: - for p1, p2 in product(spl1, spl2): - if len(p1) == len(p2): - if len(p1) == 0: - kernel += 1 - else: - kpath = 1 - for idx in range(0, len(p1) - 1): - kpath *= ek_dict[((p1[idx], p1[idx+1]), - (p2[idx], p2[idx+1]))] - if not kpath: - break - kernel += kpath # add up kernels of all paths - else: - for p1, p2 in product(spl1, spl2): - if len(p1) == len(p2): - kernel += 1 - try: - kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average - except ZeroDivisionError: - print(spl1, spl2) - print(g1.nodes(data=True)) - print(g1.edges(data=True)) - raise Exception - - # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation - # # compute vertex kernel matrix - # try: - # vk_mat = np.zeros((nx.number_of_nodes(g1), - # nx.number_of_nodes(g2))) - # g1nl = enumerate(g1.nodes(data=True)) - # g2nl = enumerate(g2.nodes(data=True)) - # for i1, n1 in g1nl: - # for i2, n2 in g2nl: - # vk_mat[i1][i2] = kn( - # n1[1][node_label], n2[1][node_label], - # [n1[1]['attributes']], [n2[1]['attributes']]) - - # range1 = range(0, len(edge_w_g[i])) - # range2 = range(0, len(edge_w_g[j])) - # for i1 in range1: - # x1 = edge_x_g[i][i1] - # y1 = edge_y_g[i][i1] - # w1 = edge_w_g[i][i1] - # for i2 in range2: - # x2 = edge_x_g[j][i2] - # y2 = edge_y_g[j][i2] - # w2 = edge_w_g[j][i2] - # ke = (w1 == w2) - # if ke > 0: - # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] - # kn2 = 
vk_mat[x1][y2] * vk_mat[y1][x2] - # Kmatrix += kn1 + kn2 - return kernel + node_kernels, edge_kernels): + + kernel = 0 + + # First, compute shortest path matrices, method borrowed from FCSP. + vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs) + # Then, compute kernels between all pairs of edges, which is an idea of + # extension of FCSP. It suits sparse graphs, which is the most case we + # went though. For dense graphs, this would be slow. + ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs) + + # compute graph kernels + if vk_dict: + if ek_dict: + for p1, p2 in product(spl1, spl2): + if len(p1) == len(p2): + kpath = vk_dict[(p1[0], p2[0])] + if kpath: + for idx in range(1, len(p1)): + kpath *= vk_dict[(p1[idx], p2[idx])] * \ + ek_dict[((p1[idx-1], p1[idx]), + (p2[idx-1], p2[idx]))] + if not kpath: + break + kernel += kpath # add up kernels of all paths + else: + for p1, p2 in product(spl1, spl2): + if len(p1) == len(p2): + kpath = vk_dict[(p1[0], p2[0])] + if kpath: + for idx in range(1, len(p1)): + kpath *= vk_dict[(p1[idx], p2[idx])] + if not kpath: + break + kernel += kpath # add up kernels of all paths + else: + if ek_dict: + for p1, p2 in product(spl1, spl2): + if len(p1) == len(p2): + if len(p1) == 0: + kernel += 1 + else: + kpath = 1 + for idx in range(0, len(p1) - 1): + kpath *= ek_dict[((p1[idx], p1[idx+1]), + (p2[idx], p2[idx+1]))] + if not kpath: + break + kernel += kpath # add up kernels of all paths + else: + for p1, p2 in product(spl1, spl2): + if len(p1) == len(p2): + kernel += 1 + try: + kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average + except ZeroDivisionError: + print(spl1, spl2) + print(g1.nodes(data=True)) + print(g1.edges(data=True)) + raise Exception + + # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation + # # compute vertex kernel matrix + # try: + # vk_mat = np.zeros((nx.number_of_nodes(g1), + # nx.number_of_nodes(g2))) + # g1nl = enumerate(g1.nodes(data=True)) + # g2nl = enumerate(g2.nodes(data=True)) + # for i1, n1 in g1nl: + # for i2, n2 in g2nl: + # vk_mat[i1][i2] = kn( + # n1[1][node_label], n2[1][node_label], + # [n1[1]['attributes']], [n2[1]['attributes']]) + + # range1 = range(0, len(edge_w_g[i])) + # range2 = range(0, len(edge_w_g[j])) + # for i1 in range1: + # x1 = edge_x_g[i][i1] + # y1 = edge_y_g[i][i1] + # w1 = edge_w_g[i][i1] + # for i2 in range2: + # x2 = edge_x_g[j][i2] + # y2 = edge_y_g[j][i2] + # w2 = edge_w_g[j][i2] + # ke = (w1 == w2) + # if ke > 0: + # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] + # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] + # Kmatrix += kn1 + kn2 + return kernel def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels, - edge_kernels, itr): - i = itr[0] - j = itr[1] - return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j], - ds_attrs, node_label, edge_label, - node_kernels, edge_kernels) - - + edge_kernels, itr): + i = itr[0] + j = itr[1] + return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j], + ds_attrs, node_label, edge_label, + node_kernels, edge_kernels) + + def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, - node_kernels, edge_kernels): - -# # traverse all paths in graph1. Deep-first search is applied. 
-# def traverseBothTrie(root, trie2, kernel, pcurrent=[]): -# for key, node in root['children'].items(): -# pcurrent.append(key) -# if node['isEndOfWord']: -# # print(node['count']) -# traverseTrie2(trie2.root, pcurrent, kernel, -# pcurrent=[]) -# if node['children'] != {}: -# traverseBothTrie(node, trie2, kernel, pcurrent) -# else: -# del pcurrent[-1] -# if pcurrent != []: -# del pcurrent[-1] -# -# -# # traverse all paths in graph2 and find out those that are not in -# # graph1. Deep-first search is applied. -# def traverseTrie2(root, p1, kernel, pcurrent=[]): -# for key, node in root['children'].items(): -# pcurrent.append(key) -# if node['isEndOfWord']: -# # print(node['count']) -# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict) -# if node['children'] != {}: -# traverseTrie2(node, p1, kernel, pcurrent) -# else: -# del pcurrent[-1] -# if pcurrent != []: -# del pcurrent[-1] + node_kernels, edge_kernels): + +# # traverse all paths in graph1. Deep-first search is applied. +# def traverseBothTrie(root, trie2, kernel, pcurrent=[]): +# for key, node in root['children'].items(): +# pcurrent.append(key) +# if node['isEndOfWord']: +# # print(node['count']) +# traverseTrie2(trie2.root, pcurrent, kernel, +# pcurrent=[]) +# if node['children'] != {}: +# traverseBothTrie(node, trie2, kernel, pcurrent) +# else: +# del pcurrent[-1] +# if pcurrent != []: +# del pcurrent[-1] +# +# +# # traverse all paths in graph2 and find out those that are not in +# # graph1. Deep-first search is applied. +# def traverseTrie2(root, p1, kernel, pcurrent=[]): +# for key, node in root['children'].items(): +# pcurrent.append(key) +# if node['isEndOfWord']: +# # print(node['count']) +# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict) +# if node['children'] != {}: +# traverseTrie2(node, p1, kernel, pcurrent) +# else: +# del pcurrent[-1] +# if pcurrent != []: +# del pcurrent[-1] # -# -# kernel = [0] +# +# kernel = [0] # -# # First, compute shortest path matrices, method borrowed from FCSP. -# vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs) -# # Then, compute kernels between all pairs of edges, which is an idea of -# # extension of FCSP. It suits sparse graphs, which is the most case we -# # went though. For dense graphs, this would be slow. -# ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs) +# # First, compute shortest path matrices, method borrowed from FCSP. +# vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs) +# # Then, compute kernels between all pairs of edges, which is an idea of +# # extension of FCSP. It suits sparse graphs, which is the most case we +# # went though. For dense graphs, this would be slow. +# ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs) # -# # compute graph kernels -# traverseBothTrie(trie1[0].root, trie2[0], kernel) +# # compute graph kernels +# traverseBothTrie(trie1[0].root, trie2[0], kernel) # -# kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average - -# # traverse all paths in graph1. Deep-first search is applied. 
-# def traverseBothTrie(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): -# for key, node in root['children'].items(): -# pcurrent.append(key) -# if node['isEndOfWord']: -# # print(node['count']) -# traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict, -# pcurrent=[]) -# if node['children'] != {}: -# traverseBothTrie(node, trie2, kernel, vk_dict, ek_dict, pcurrent) -# else: -# del pcurrent[-1] -# if pcurrent != []: -# del pcurrent[-1] -# -# -# # traverse all paths in graph2 and find out those that are not in -# # graph1. Deep-first search is applied. -# def traverseTrie2(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): -# for key, node in root['children'].items(): -# pcurrent.append(key) -# if node['isEndOfWord']: -# # print(node['count']) -# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict) -# if node['children'] != {}: -# traverseTrie2(node, p1, kernel, vk_dict, ek_dict, pcurrent) -# else: -# del pcurrent[-1] -# if pcurrent != []: -# del pcurrent[-1] +# kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average + +# # traverse all paths in graph1. Deep-first search is applied. +# def traverseBothTrie(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): +# for key, node in root['children'].items(): +# pcurrent.append(key) +# if node['isEndOfWord']: +# # print(node['count']) +# traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict, +# pcurrent=[]) +# if node['children'] != {}: +# traverseBothTrie(node, trie2, kernel, vk_dict, ek_dict, pcurrent) +# else: +# del pcurrent[-1] +# if pcurrent != []: +# del pcurrent[-1] +# +# +# # traverse all paths in graph2 and find out those that are not in +# # graph1. Deep-first search is applied. +# def traverseTrie2(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): +# for key, node in root['children'].items(): +# pcurrent.append(key) +# if node['isEndOfWord']: +# # print(node['count']) +# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict) +# if node['children'] != {}: +# traverseTrie2(node, p1, kernel, vk_dict, ek_dict, pcurrent) +# else: +# del pcurrent[-1] +# if pcurrent != []: +# del pcurrent[-1] - - kernel = [0] - - # First, compute shortest path matrices, method borrowed from FCSP. - vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs) - # Then, compute kernels between all pairs of edges, which is an idea of - # extension of FCSP. It suits sparse graphs, which is the most case we - # went though. For dense graphs, this would be slow. - ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs) - - # compute graph kernels -# traverseBothTrie(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) - if vk_dict: - if ek_dict: - traverseBothTriem(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) - else: - traverseBothTriev(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) - else: - if ek_dict: - traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) - else: - traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) - - kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average - - return kernel + + kernel = [0] + + # First, compute shortest path matrices, method borrowed from FCSP. + vk_dict = getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs) + # Then, compute kernels between all pairs of edges, which is an idea of + # extension of FCSP. It suits sparse graphs, which is the most case we + # went though. For dense graphs, this would be slow. 
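For two equal-length shortest paths, the traversals dispatched below accumulate the product of node kernels along the paths times edge kernels between consecutive steps, and the final kernel is the mean of these terms over all path pairs. A standalone sketch of the per-pair term for the case where both node and edge kernels are available (assuming `vk_dict` and `ek_dict` built as in the helpers that follow; the function name is illustrative):

    def path_pair_kernel(p1, p2, vk_dict, ek_dict):
        # kernel between two shortest paths given as sequences of node ids;
        # the product is abandoned as soon as any factor vanishes
        if len(p1) != len(p2):
            return 0
        if len(p1) == 0:
            return 1
        k = vk_dict[(p1[0], p2[0])]
        for i in range(1, len(p1)):
            if not k:
                return 0
            k *= vk_dict[(p1[i], p2[i])] * ek_dict[((p1[i - 1], p1[i]), (p2[i - 1], p2[i]))]
        return k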
+ ek_dict = getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs) + + # compute graph kernels +# traverseBothTrie(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + if vk_dict: + if ek_dict: + traverseBothTriem(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + else: + traverseBothTriev(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + else: + if ek_dict: + traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + else: + traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) + + kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average + + return kernel def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels, - edge_kernels, itr): - i = itr[0] - j = itr[1] - return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs, - node_label, edge_label, node_kernels, edge_kernels) - + edge_kernels, itr): + i = itr[0] + j = itr[1] + return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs, + node_label, edge_label, node_kernels, edge_kernels) + def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs): - # compute shortest path matrices, method borrowed from FCSP. - vk_dict = {} # shortest path matrices dict - if ds_attrs['node_labeled']: - # node symb and non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['mix'] - for n1, n2 in product( - g1.nodes(data=True), g2.nodes(data=True)): - vk_dict[(n1[0], n2[0])] = kn( - n1[1][node_label], n2[1][node_label], - n1[1]['attributes'], n2[1]['attributes']) - # node symb labeled - else: - kn = node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], - n2[1][node_label]) - else: - # node non-synb labeled - if ds_attrs['node_attr_dim'] > 0: - kn = node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], - n2[1]['attributes']) - # node unlabeled - else: - pass - - return vk_dict + # compute shortest path matrices, method borrowed from FCSP. + vk_dict = {} # shortest path matrices dict + if ds_attrs['node_labeled']: + # node symb and non-synb labeled + if ds_attrs['node_attr_dim'] > 0: + kn = node_kernels['mix'] + for n1, n2 in product( + g1.nodes(data=True), g2.nodes(data=True)): + vk_dict[(n1[0], n2[0])] = kn( + n1[1][node_label], n2[1][node_label], + n1[1]['attributes'], n2[1]['attributes']) + # node symb labeled + else: + kn = node_kernels['symb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], + n2[1][node_label]) + else: + # node non-synb labeled + if ds_attrs['node_attr_dim'] > 0: + kn = node_kernels['nsymb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'], + n2[1]['attributes']) + # node unlabeled + else: + pass + + return vk_dict def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs): - # compute kernels between all pairs of edges, which is an idea of - # extension of FCSP. It suits sparse graphs, which is the most case we - # went though. For dense graphs, this would be slow. 
- ek_dict = {} # dict of edge kernels - if ds_attrs['edge_labeled']: - # edge symb and non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['mix'] - for e1, e2 in product( - g1.edges(data=True), g2.edges(data=True)): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label], - e1[2]['attributes'], e2[2]['attributes']) - ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp - # edge symb labeled - else: - ke = edge_kernels['symb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) - ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp - else: - # edge non-synb labeled - if ds_attrs['edge_attr_dim'] > 0: - ke = edge_kernels['nsymb'] - for e1 in g1.edges(data=True): - for e2 in g2.edges(data=True): - ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) - ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp - ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp - ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp - # edge unlabeled - else: - pass - - return ek_dict - - + # compute kernels between all pairs of edges, which is an idea of + # extension of FCSP. It suits sparse graphs, which is the most case we + # went though. For dense graphs, this would be slow. + ek_dict = {} # dict of edge kernels + if ds_attrs['edge_labeled']: + # edge symb and non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['mix'] + for e1, e2 in product( + g1.edges(data=True), g2.edges(data=True)): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label], + e1[2]['attributes'], e2[2]['attributes']) + ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp + # edge symb labeled + else: + ke = edge_kernels['symb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2][edge_label], e2[2][edge_label]) + ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp + else: + # edge non-synb labeled + if ds_attrs['edge_attr_dim'] > 0: + ke = edge_kernels['nsymb'] + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + ek_temp = ke(e1[2]['attributes'], e2[2]['attributes']) + ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp + ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp + ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp + # edge unlabeled + else: + pass + + return ek_dict + + # traverse all paths in graph1. Deep-first search is applied. 
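getAllEdgeKernels stores every edge-pair kernel under all four node orderings, so the traversals that follow can look up `((u, v), (x, y))` without first canonicalising the direction of an undirected edge. A tiny illustrative check of that convention (the endpoints and the value 0.7 are made up):

    ek_dict = {}
    k = 0.7                                  # some edge-kernel value
    (u, v), (x, y) = (0, 1), (5, 6)          # an edge of g1 and an edge of g2
    for a, b in (((u, v), (x, y)), ((v, u), (x, y)),
                 ((u, v), (y, x)), ((v, u), (y, x))):
        ek_dict[(a, b)] = k
    assert ek_dict[((1, 0), (6, 5))] == ek_dict[((0, 1), (5, 6))] == 0.7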
def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: -# print(node['count']) - traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict, - pcurrent=[]) - if node['children'] != {}: - traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + pcurrent=[]) + if node['children'] != {}: + traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + # traverse all paths in graph2 and find out those that are not in -# graph1. Deep-first search is applied. +# graph1. Deep-first search is applied. def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: -# print(node['count']) - if len(p1) == len(pcurrent): - kpath = vk_dict[(p1[0], pcurrent[0])] - if kpath: - for idx in range(1, len(p1)): - kpath *= vk_dict[(p1[idx], pcurrent[idx])] * \ - ek_dict[((p1[idx-1], p1[idx]), - (pcurrent[idx-1], pcurrent[idx]))] - if not kpath: - break - kernel[0] += kpath # add up kernels of all paths - if node['children'] != {}: - traverseTrie2m(node, p1, kernel, vk_dict, ek_dict, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + if len(p1) == len(pcurrent): + kpath = vk_dict[(p1[0], pcurrent[0])] + if kpath: + for idx in range(1, len(p1)): + kpath *= vk_dict[(p1[idx], pcurrent[idx])] * \ + ek_dict[((p1[idx-1], p1[idx]), + (pcurrent[idx-1], pcurrent[idx]))] + if not kpath: + break + kernel[0] += kpath # add up kernels of all paths + if node['children'] != {}: + traverseTrie2m(node, p1, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + # traverse all paths in graph1. Deep-first search is applied. def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: -# print(node['count']) - traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict, - pcurrent=[]) - if node['children'] != {}: - traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + pcurrent=[]) + if node['children'] != {}: + traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + # traverse all paths in graph2 and find out those that are not in -# graph1. Deep-first search is applied. +# graph1. Deep-first search is applied. 
def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: -# print(node['count']) - if len(p1) == len(pcurrent): - kpath = vk_dict[(p1[0], pcurrent[0])] - if kpath: - for idx in range(1, len(p1)): - kpath *= vk_dict[(p1[idx], pcurrent[idx])] - if not kpath: - break - kernel[0] += kpath # add up kernels of all paths - if node['children'] != {}: - traverseTrie2v(node, p1, kernel, vk_dict, ek_dict, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + if len(p1) == len(pcurrent): + kpath = vk_dict[(p1[0], pcurrent[0])] + if kpath: + for idx in range(1, len(p1)): + kpath *= vk_dict[(p1[idx], pcurrent[idx])] + if not kpath: + break + kernel[0] += kpath # add up kernels of all paths + if node['children'] != {}: + traverseTrie2v(node, p1, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + # traverse all paths in graph1. Deep-first search is applied. def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: -# print(node['count']) - traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict, - pcurrent=[]) - if node['children'] != {}: - traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + pcurrent=[]) + if node['children'] != {}: + traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + # traverse all paths in graph2 and find out those that are not in -# graph1. Deep-first search is applied. +# graph1. Deep-first search is applied. def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: -# print(node['count']) - if len(p1) == len(pcurrent): - if len(p1) == 0: - kernel += 1 - else: - kpath = 1 - for idx in range(0, len(p1) - 1): - kpath *= ek_dict[((p1[idx], p1[idx+1]), - (pcurrent[idx], pcurrent[idx+1]))] - if not kpath: - break - kernel[0] += kpath # add up kernels of all paths - if node['children'] != {}: - traverseTrie2e(node, p1, kernel, vk_dict, ek_dict, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + if len(p1) == len(pcurrent): + if len(p1) == 0: + kernel += 1 + else: + kpath = 1 + for idx in range(0, len(p1) - 1): + kpath *= ek_dict[((p1[idx], p1[idx+1]), + (pcurrent[idx], pcurrent[idx+1]))] + if not kpath: + break + kernel[0] += kpath # add up kernels of all paths + if node['children'] != {}: + traverseTrie2e(node, p1, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + # traverse all paths in graph1. Deep-first search is applied. 
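All of the traverse* helpers walk the same node layout produced by gklearn.utils.trie.Trie: each node is a dict with a 'children' mapping and an 'isEndOfWord' flag, and the trie object exposes it through `.root`. A minimal sketch of reading the stored paths back out of such a node, which is what the paired depth-first traversals effectively enumerate (a hedged illustration of the assumed layout, not part of the patch):

    def iter_paths(node, prefix=()):
        # yield every complete path stored at or below this trie node
        if node.get('isEndOfWord'):
            yield prefix
        for key, child in node['children'].items():
            yield from iter_paths(child, prefix + (key,))

    # e.g. the path count used for the mean normalisation above could be
    # recovered as: n_paths = sum(1 for _ in iter_paths(some_trie.root))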
def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: -# print(node['count']) - traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict, - pcurrent=[]) - if node['children'] != {}: - traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict, + pcurrent=[]) + if node['children'] != {}: + traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + # traverse all paths in graph2 and find out those that are not in -# graph1. Deep-first search is applied. +# graph1. Deep-first search is applied. def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: -# print(node['count']) - if len(p1) == len(pcurrent): - kernel[0] += 1 - if node['children'] != {}: - traverseTrie2u(node, p1, kernel, vk_dict, ek_dict, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + if len(p1) == len(pcurrent): + kernel[0] += 1 + if node['children'] != {}: + traverseTrie2u(node, p1, kernel, vk_dict, ek_dict, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + #def computePathKernel(p1, p2, vk_dict, ek_dict): -# kernel = 0 -# if vk_dict: -# if ek_dict: -# if len(p1) == len(p2): -# kpath = vk_dict[(p1[0], p2[0])] -# if kpath: -# for idx in range(1, len(p1)): -# kpath *= vk_dict[(p1[idx], p2[idx])] * \ -# ek_dict[((p1[idx-1], p1[idx]), -# (p2[idx-1], p2[idx]))] -# if not kpath: -# break -# kernel += kpath # add up kernels of all paths -# else: -# if len(p1) == len(p2): -# kpath = vk_dict[(p1[0], p2[0])] -# if kpath: -# for idx in range(1, len(p1)): -# kpath *= vk_dict[(p1[idx], p2[idx])] -# if not kpath: -# break -# kernel += kpath # add up kernels of all paths -# else: -# if ek_dict: -# if len(p1) == len(p2): -# if len(p1) == 0: -# kernel += 1 -# else: -# kpath = 1 -# for idx in range(0, len(p1) - 1): -# kpath *= ek_dict[((p1[idx], p1[idx+1]), -# (p2[idx], p2[idx+1]))] -# if not kpath: -# break -# kernel += kpath # add up kernels of all paths -# else: -# if len(p1) == len(p2): -# kernel += 1 -# -# return kernel +# kernel = 0 +# if vk_dict: +# if ek_dict: +# if len(p1) == len(p2): +# kpath = vk_dict[(p1[0], p2[0])] +# if kpath: +# for idx in range(1, len(p1)): +# kpath *= vk_dict[(p1[idx], p2[idx])] * \ +# ek_dict[((p1[idx-1], p1[idx]), +# (p2[idx-1], p2[idx]))] +# if not kpath: +# break +# kernel += kpath # add up kernels of all paths +# else: +# if len(p1) == len(p2): +# kpath = vk_dict[(p1[0], p2[0])] +# if kpath: +# for idx in range(1, len(p1)): +# kpath *= vk_dict[(p1[idx], p2[idx])] +# if not kpath: +# break +# kernel += kpath # add up kernels of all paths +# else: +# if ek_dict: +# if len(p1) == len(p2): +# if len(p1) == 0: +# kernel += 1 +# else: +# kpath = 1 +# for idx in range(0, len(p1) - 1): +# kpath *= ek_dict[((p1[idx], p1[idx+1]), +# (p2[idx], p2[idx+1]))] +# if not kpath: +# break +# kernel += kpath # add up kernels of all paths +# else: +# if len(p1) == len(p2): +# kernel += 1 +# 
+# return kernel def get_shortest_paths(G, weight, directed): - """Get all shortest paths of a graph. - - Parameters - ---------- - G : NetworkX graphs - The graphs whose paths are calculated. - weight : string/None - edge attribute used as weight to calculate the shortest path. - directed: boolean - Whether graph is directed. - - Return - ------ - sp : list of list - List of shortest paths of the graph, where each path is represented by a list of nodes. - """ - sp = [] - for n1, n2 in combinations(G.nodes(), 2): - try: - spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) - except nx.NetworkXNoPath: # nodes not connected - # sp.append([]) - pass - else: - sp += spltemp - # each edge walk is counted twice, starting from both its extreme nodes. - if not directed: - sp += [sptemp[::-1] for sptemp in spltemp] - - # add single nodes as length 0 paths. - sp += [[n] for n in G.nodes()] - return sp + """Get all shortest paths of a graph. + + Parameters + ---------- + G : NetworkX graphs + The graphs whose paths are calculated. + weight : string/None + edge attribute used as weight to calculate the shortest path. + directed: boolean + Whether graph is directed. + + Return + ------ + sp : list of list + List of shortest paths of the graph, where each path is represented by a list of nodes. + """ + sp = [] + for n1, n2 in combinations(G.nodes(), 2): + try: + spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) + except nx.NetworkXNoPath: # nodes not connected + # sp.append([]) + pass + else: + sp += spltemp + # each edge walk is counted twice, starting from both its extreme nodes. + if not directed: + sp += [sptemp[::-1] for sptemp in spltemp] + + # add single nodes as length 0 paths. + sp += [[n] for n in G.nodes()] + return sp def wrapper_getSP_naive(weight, directed, itr_item): - g = itr_item[0] - i = itr_item[1] - return i, get_shortest_paths(g, weight, directed) + g = itr_item[0] + i = itr_item[1] + return i, get_shortest_paths(g, weight, directed) def get_sps_as_trie(G, weight, directed): - """Get all shortest paths of a graph and insert them into a trie. - - Parameters - ---------- - G : NetworkX graphs - The graphs whose paths are calculated. - weight : string/None - edge attribute used as weight to calculate the shortest path. - directed: boolean - Whether graph is directed. - - Return - ------ - sp : list of list - List of shortest paths of the graph, where each path is represented by a list of nodes. - """ - sptrie = Trie() - lensp = 0 - for n1, n2 in combinations(G.nodes(), 2): - try: - spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) - except nx.NetworkXNoPath: # nodes not connected - pass - else: - lensp += len(spltemp) - if not directed: - lensp += len(spltemp) - for sp in spltemp: - sptrie.insertWord(sp) - # each edge walk is counted twice, starting from both its extreme nodes. - if not directed: - sptrie.insertWord(sp[::-1]) - - # add single nodes as length 0 paths. - for n in G.nodes(): - sptrie.insertWord([n]) - - return sptrie, lensp + nx.number_of_nodes(G) + """Get all shortest paths of a graph and insert them into a trie. + + Parameters + ---------- + G : NetworkX graphs + The graphs whose paths are calculated. + weight : string/None + edge attribute used as weight to calculate the shortest path. + directed: boolean + Whether graph is directed. + + Return + ------ + sp : list of list + List of shortest paths of the graph, where each path is represented by a list of nodes. 
+ """ + sptrie = Trie() + lensp = 0 + for n1, n2 in combinations(G.nodes(), 2): + try: + spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight)) + except nx.NetworkXNoPath: # nodes not connected + pass + else: + lensp += len(spltemp) + if not directed: + lensp += len(spltemp) + for sp in spltemp: + sptrie.insertWord(sp) + # each edge walk is counted twice, starting from both its extreme nodes. + if not directed: + sptrie.insertWord(sp[::-1]) + + # add single nodes as length 0 paths. + for n in G.nodes(): + sptrie.insertWord([n]) + + return sptrie, lensp + nx.number_of_nodes(G) def wrapper_getSP_trie(weight, directed, itr_item): - g = itr_item[0] - i = itr_item[1] - return i, get_sps_as_trie(g, weight, directed) + g = itr_item[0] + i = itr_item[1] + return i, get_sps_as_trie(g, weight, directed) diff --git a/gklearn/kernels/treeletKernel.py b/gklearn/kernels/treeletKernel.py index d4447b1..809a623 100644 --- a/gklearn/kernels/treeletKernel.py +++ b/gklearn/kernels/treeletKernel.py @@ -27,6 +27,7 @@ def treeletkernel(*args, edge_label='bond_type', parallel='imap_unordered', n_jobs=None, + chunksize=None, verbose=True): """Calculate treelet graph kernels between graphs. @@ -92,10 +93,11 @@ def treeletkernel(*args, # time, but this may cost a lot of memory for large dataset. pool = Pool(n_jobs) itr = zip(Gn, range(0, len(Gn))) - if len(Gn) < 100 * n_jobs: - chunksize = int(len(Gn) / n_jobs) + 1 - else: - chunksize = 100 + if chunksize is None: + if len(Gn) < 100 * n_jobs: + chunksize = int(len(Gn) / n_jobs) + 1 + else: + chunksize = 100 canonkeys = [[] for _ in range(len(Gn))] get_partial = partial(wrapper_get_canonkeys, node_label, edge_label, labeled, ds_attrs['is_directed']) @@ -115,7 +117,7 @@ def treeletkernel(*args, G_canonkeys = canonkeys_toshare do_partial = partial(wrapper_treeletkernel_do, sub_kernel) parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(canonkeys,), n_jobs=n_jobs, verbose=verbose) + glbv=(canonkeys,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) # ---- do not use parallelization. ---- elif parallel == None: diff --git a/gklearn/kernels/untilHPathKernel.py b/gklearn/kernels/untilHPathKernel.py index c44300d..9bac28b 100644 --- a/gklearn/kernels/untilHPathKernel.py +++ b/gklearn/kernels/untilHPathKernel.py @@ -3,9 +3,9 @@ @references: - [1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre - Baldi. Graph kernels for chemical informatics. Neural networks, - 18(8):1093–1110, 2005. + [1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre + Baldi. Graph kernels for chemical informatics. Neural networks, + 18(8):1093–1110, 2005. """ import sys @@ -25,700 +25,702 @@ from gklearn.utils.trie import Trie def untilhpathkernel(*args, - node_label='atom', - edge_label='bond_type', - depth=10, - k_func='MinMax', - compute_method='trie', - parallel='imap_unordered', - n_jobs=None, - verbose=True): - """Calculate path graph kernels up to depth/hight h between graphs. - - Parameters - ---------- - Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. - - G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. - - node_label : string - Node attribute used as label. The default node label is atom. - - edge_label : string - Edge attribute used as label. The default edge label is bond_type. - - depth : integer - Depth of search. Longest length of paths. 
- - k_func : function - A kernel function applied using different notions of fingerprint - similarity, defining the type of feature map and normalization method - applied for the graph kernel. The Following choices are available: - - 'MinMax': use the MiniMax kernel and counting feature map. - - 'tanimoto': use the Tanimoto kernel and binary feature map. - - None: no sub-kernel is used, the kernel is computed directly. - - compute_method : string - Computation method to store paths and compute the graph kernel. The - Following choices are available: - - 'trie': store paths as tries. - - 'naive': store paths to lists. - - n_jobs : int - Number of jobs for parallelization. - - Return - ------ - Kmatrix : Numpy matrix - Kernel matrix, each element of which is the path kernel up to h between - 2 praphs. - """ - # pre-process - depth = int(depth) - Gn = args[0] if len(args) == 1 else [args[0], args[1]] - Gn = [g.copy() for g in Gn] - Kmatrix = np.zeros((len(Gn), len(Gn))) - ds_attrs = get_dataset_attributes( - Gn, - attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', - 'edge_attr_dim', 'is_directed'], - node_label=node_label, edge_label=edge_label) - if k_func != None: - if not ds_attrs['node_labeled']: - for G in Gn: - nx.set_node_attributes(G, '0', 'atom') - if not ds_attrs['edge_labeled']: - for G in Gn: - nx.set_edge_attributes(G, '0', 'bond_type') - - start_time = time.time() - - if parallel == 'imap_unordered': - # ---- use pool.imap_unordered to parallel and track progress. ---- - # get all paths of all graphs before calculating kernels to save time, - # but this may cost a lot of memory for large datasets. - pool = Pool(n_jobs) - itr = zip(Gn, range(0, len(Gn))) - if len(Gn) < 100 * n_jobs: - chunksize = int(len(Gn) / n_jobs) + 1 - else: - chunksize = 100 - all_paths = [[] for _ in range(len(Gn))] - if compute_method == 'trie' and k_func != None: - getps_partial = partial(wrapper_find_all_path_as_trie, depth, - ds_attrs, node_label, edge_label) - elif compute_method != 'trie' and k_func != None: - getps_partial = partial(wrapper_find_all_paths_until_length, depth, - ds_attrs, node_label, edge_label, True) - else: - getps_partial = partial(wrapper_find_all_paths_until_length, depth, - ds_attrs, node_label, edge_label, False) - if verbose: - iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize), - desc='getting paths', file=sys.stdout) - else: - iterator = pool.imap_unordered(getps_partial, itr, chunksize) - for i, ps in iterator: - all_paths[i] = ps - pool.close() - pool.join() - -# for g in Gn: -# if compute_method == 'trie' and k_func != None: -# find_all_path_as_trie(g, depth, ds_attrs, node_label, edge_label) -# elif compute_method != 'trie' and k_func != None: -# find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label) -# else: -# find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label, False) - -## size = sys.getsizeof(all_paths) -## for item in all_paths: -## size += sys.getsizeof(item) -## for pppps in item: -## size += sys.getsizeof(pppps) -## print(size) -# -## ttt = time.time() -## # ---- ---- use pool.map to parallel ---- -## for i, ps in tqdm( -## pool.map(getps_partial, range(0, len(Gn))), -## desc='getting paths', file=sys.stdout): -## all_paths[i] = ps -## print(time.time() - ttt) - - if compute_method == 'trie' and k_func != None: - def init_worker(trie_toshare): - global G_trie - G_trie = trie_toshare - do_partial = partial(wrapper_uhpath_do_trie, k_func) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, 
- glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose) - elif compute_method != 'trie' and k_func != None: - def init_worker(plist_toshare): - global G_plist - G_plist = plist_toshare - do_partial = partial(wrapper_uhpath_do_naive, k_func) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose) - else: - def init_worker(plist_toshare): - global G_plist - G_plist = plist_toshare - do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs, edge_kernels) - parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose) - - elif parallel == None: -# from pympler import asizeof - # ---- direct running, normally use single CPU core. ---- -# print(asizeof.asized(all_paths, detail=1).format()) - - if compute_method == 'trie': - all_paths = [ - find_all_path_as_trie(Gn[i], - depth, - ds_attrs, - node_label=node_label, - edge_label=edge_label) for i in tqdm( - range(0, len(Gn)), desc='getting paths', file=sys.stdout) - ] -# sizeof_allpaths = asizeof.asizeof(all_paths) -# print(sizeof_allpaths) - pbar = tqdm( - total=((len(Gn) + 1) * len(Gn) / 2), - desc='calculating kernels', - file=sys.stdout) - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _untilhpathkernel_do_trie(all_paths[i], - all_paths[j], k_func) - Kmatrix[j][i] = Kmatrix[i][j] - pbar.update(1) - else: - all_paths = [ - find_all_paths_until_length( - Gn[i], - depth, - ds_attrs, - node_label=node_label, - edge_label=edge_label) for i in tqdm( - range(0, len(Gn)), desc='getting paths', file=sys.stdout) - ] -# sizeof_allpaths = asizeof.asizeof(all_paths) -# print(sizeof_allpaths) - pbar = tqdm( - total=((len(Gn) + 1) * len(Gn) / 2), - desc='calculating kernels', - file=sys.stdout) - for i in range(0, len(Gn)): - for j in range(i, len(Gn)): - Kmatrix[i][j] = _untilhpathkernel_do_naive(all_paths[i], all_paths[j], - k_func) - Kmatrix[j][i] = Kmatrix[i][j] - pbar.update(1) - - run_time = time.time() - start_time - if verbose: - print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---" - % (depth, len(Gn), run_time)) - -# print(Kmatrix[0][0:10]) - return Kmatrix, run_time + node_label='atom', + edge_label='bond_type', + depth=10, + k_func='MinMax', + compute_method='trie', + parallel='imap_unordered', + n_jobs=None, + chunksize=None, + verbose=True): + """Calculate path graph kernels up to depth/hight h between graphs. + + Parameters + ---------- + Gn : List of NetworkX graph + List of graphs between which the kernels are calculated. + + G1, G2 : NetworkX graphs + Two graphs between which the kernel is calculated. + + node_label : string + Node attribute used as label. The default node label is atom. + + edge_label : string + Edge attribute used as label. The default edge label is bond_type. + + depth : integer + Depth of search. Longest length of paths. + + k_func : function + A kernel function applied using different notions of fingerprint + similarity, defining the type of feature map and normalization method + applied for the graph kernel. The Following choices are available: + + 'MinMax': use the MiniMax kernel and counting feature map. + + 'tanimoto': use the Tanimoto kernel and binary feature map. + + None: no sub-kernel is used, the kernel is computed directly. + + compute_method : string + Computation method to store paths and compute the graph kernel. The + Following choices are available: + + 'trie': store paths as tries. + + 'naive': store paths to lists. 
+ + n_jobs : int + Number of jobs for parallelization. + + Return + ------ + Kmatrix : Numpy matrix + Kernel matrix, each element of which is the path kernel up to h between + 2 praphs. + """ + # pre-process + depth = int(depth) + Gn = args[0] if len(args) == 1 else [args[0], args[1]] + Gn = [g.copy() for g in Gn] + Kmatrix = np.zeros((len(Gn), len(Gn))) + ds_attrs = get_dataset_attributes( + Gn, + attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', + 'edge_attr_dim', 'is_directed'], + node_label=node_label, edge_label=edge_label) + if k_func != None: + if not ds_attrs['node_labeled']: + for G in Gn: + nx.set_node_attributes(G, '0', 'atom') + if not ds_attrs['edge_labeled']: + for G in Gn: + nx.set_edge_attributes(G, '0', 'bond_type') + + start_time = time.time() + + if parallel == 'imap_unordered': + # ---- use pool.imap_unordered to parallel and track progress. ---- + # get all paths of all graphs before calculating kernels to save time, + # but this may cost a lot of memory for large datasets. + pool = Pool(n_jobs) + itr = zip(Gn, range(0, len(Gn))) + if chunksize is None: + if len(Gn) < 100 * n_jobs: + chunksize = int(len(Gn) / n_jobs) + 1 + else: + chunksize = 100 + all_paths = [[] for _ in range(len(Gn))] + if compute_method == 'trie' and k_func != None: + getps_partial = partial(wrapper_find_all_path_as_trie, depth, + ds_attrs, node_label, edge_label) + elif compute_method != 'trie' and k_func != None: + getps_partial = partial(wrapper_find_all_paths_until_length, depth, + ds_attrs, node_label, edge_label, True) + else: + getps_partial = partial(wrapper_find_all_paths_until_length, depth, + ds_attrs, node_label, edge_label, False) + if verbose: + iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize), + desc='getting paths', file=sys.stdout) + else: + iterator = pool.imap_unordered(getps_partial, itr, chunksize) + for i, ps in iterator: + all_paths[i] = ps + pool.close() + pool.join() + +# for g in Gn: +# if compute_method == 'trie' and k_func != None: +# find_all_path_as_trie(g, depth, ds_attrs, node_label, edge_label) +# elif compute_method != 'trie' and k_func != None: +# find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label) +# else: +# find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label, False) + +## size = sys.getsizeof(all_paths) +## for item in all_paths: +## size += sys.getsizeof(item) +## for pppps in item: +## size += sys.getsizeof(pppps) +## print(size) +# +## ttt = time.time() +## # ---- ---- use pool.map to parallel ---- +## for i, ps in tqdm( +## pool.map(getps_partial, range(0, len(Gn))), +## desc='getting paths', file=sys.stdout): +## all_paths[i] = ps +## print(time.time() - ttt) + + if compute_method == 'trie' and k_func != None: + def init_worker(trie_toshare): + global G_trie + G_trie = trie_toshare + do_partial = partial(wrapper_uhpath_do_trie, k_func) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + elif compute_method != 'trie' and k_func != None: + def init_worker(plist_toshare): + global G_plist + G_plist = plist_toshare + do_partial = partial(wrapper_uhpath_do_naive, k_func) + parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, + glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + else: + def init_worker(plist_toshare): + global G_plist + G_plist = plist_toshare + do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs, edge_kernels) + parallel_gm(do_partial, Kmatrix, Gn, 
init_worker=init_worker, + glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) + + elif parallel == None: +# from pympler import asizeof + # ---- direct running, normally use single CPU core. ---- +# print(asizeof.asized(all_paths, detail=1).format()) + + if compute_method == 'trie': + all_paths = [ + find_all_path_as_trie(Gn[i], + depth, + ds_attrs, + node_label=node_label, + edge_label=edge_label) for i in tqdm( + range(0, len(Gn)), desc='getting paths', file=sys.stdout) + ] +# sizeof_allpaths = asizeof.asizeof(all_paths) +# print(sizeof_allpaths) + pbar = tqdm( + total=((len(Gn) + 1) * len(Gn) / 2), + desc='calculating kernels', + file=sys.stdout) + for i in range(0, len(Gn)): + for j in range(i, len(Gn)): + Kmatrix[i][j] = _untilhpathkernel_do_trie(all_paths[i], + all_paths[j], k_func) + Kmatrix[j][i] = Kmatrix[i][j] + pbar.update(1) + else: + all_paths = [ + find_all_paths_until_length( + Gn[i], + depth, + ds_attrs, + node_label=node_label, + edge_label=edge_label) for i in tqdm( + range(0, len(Gn)), desc='getting paths', file=sys.stdout) + ] +# sizeof_allpaths = asizeof.asizeof(all_paths) +# print(sizeof_allpaths) + pbar = tqdm( + total=((len(Gn) + 1) * len(Gn) / 2), + desc='calculating kernels', + file=sys.stdout) + for i in range(0, len(Gn)): + for j in range(i, len(Gn)): + Kmatrix[i][j] = _untilhpathkernel_do_naive(all_paths[i], all_paths[j], + k_func) + Kmatrix[j][i] = Kmatrix[i][j] + pbar.update(1) + + run_time = time.time() - start_time + if verbose: + print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---" + % (depth, len(Gn), run_time)) + +# print(Kmatrix[0][0:10]) + return Kmatrix, run_time def _untilhpathkernel_do_trie(trie1, trie2, k_func): - """Calculate path graph kernels up to depth d between 2 graphs using trie. - - Parameters - ---------- - trie1, trie2 : list - Tries that contains all paths in 2 graphs. - k_func : function - A kernel function applied using different notions of fingerprint - similarity. - - Return - ------ - kernel : float - Path kernel up to h between 2 graphs. - """ - if k_func == 'tanimoto': - # traverse all paths in graph1 and search them in graph2. Deep-first - # search is applied. - def traverseTrie1t(root, trie2, setlist, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: - setlist[1] += 1 - count2 = trie2.searchWord(pcurrent) - if count2 != 0: - setlist[0] += 1 - if node['children'] != {}: - traverseTrie1t(node, trie2, setlist, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - - # traverse all paths in graph2 and find out those that are not in - # graph1. Deep-first search is applied. - def traverseTrie2t(root, trie1, setlist, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: - # print(node['count']) - count1 = trie1.searchWord(pcurrent) - if count1 == 0: - setlist[1] += 1 - if node['children'] != {}: - traverseTrie2t(node, trie1, setlist, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - setlist = [0, 0] # intersection and union of path sets of g1, g2. -# print(trie1.root) -# print(trie2.root) - traverseTrie1t(trie1.root, trie2, setlist) -# print(setlist) - traverseTrie2t(trie2.root, trie1, setlist) -# print(setlist) - kernel = setlist[0] / setlist[1] - - else: # MinMax kernel - # traverse all paths in graph1 and search them in graph2. Deep-first - # search is applied. 
- def traverseTrie1m(root, trie2, sumlist, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: - # print(node['count']) - count1 = node['count'] - count2 = trie2.searchWord(pcurrent) - sumlist[0] += min(count1, count2) - sumlist[1] += max(count1, count2) - if node['children'] != {}: - traverseTrie1m(node, trie2, sumlist, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - # traverse all paths in graph2 and find out those that are not in - # graph1. Deep-first search is applied. - def traverseTrie2m(root, trie1, sumlist, pcurrent=[]): - for key, node in root['children'].items(): - pcurrent.append(key) - if node['isEndOfWord']: - # print(node['count']) - count1 = trie1.searchWord(pcurrent) - if count1 == 0: - sumlist[1] += node['count'] - if node['children'] != {}: - traverseTrie2m(node, trie1, sumlist, pcurrent) - else: - del pcurrent[-1] - if pcurrent != []: - del pcurrent[-1] - - sumlist = [0, 0] # sum of mins and sum of maxs -# print(trie1.root) -# print(trie2.root) - traverseTrie1m(trie1.root, trie2, sumlist) -# print(sumlist) - traverseTrie2m(trie2.root, trie1, sumlist) -# print(sumlist) - kernel = sumlist[0] / sumlist[1] - - return kernel + """Calculate path graph kernels up to depth d between 2 graphs using trie. + + Parameters + ---------- + trie1, trie2 : list + Tries that contains all paths in 2 graphs. + k_func : function + A kernel function applied using different notions of fingerprint + similarity. + + Return + ------ + kernel : float + Path kernel up to h between 2 graphs. + """ + if k_func == 'tanimoto': + # traverse all paths in graph1 and search them in graph2. Deep-first + # search is applied. + def traverseTrie1t(root, trie2, setlist, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: + setlist[1] += 1 + count2 = trie2.searchWord(pcurrent) + if count2 != 0: + setlist[0] += 1 + if node['children'] != {}: + traverseTrie1t(node, trie2, setlist, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + + # traverse all paths in graph2 and find out those that are not in + # graph1. Deep-first search is applied. + def traverseTrie2t(root, trie1, setlist, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: + # print(node['count']) + count1 = trie1.searchWord(pcurrent) + if count1 == 0: + setlist[1] += 1 + if node['children'] != {}: + traverseTrie2t(node, trie1, setlist, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + setlist = [0, 0] # intersection and union of path sets of g1, g2. +# print(trie1.root) +# print(trie2.root) + traverseTrie1t(trie1.root, trie2, setlist) +# print(setlist) + traverseTrie2t(trie2.root, trie1, setlist) +# print(setlist) + kernel = setlist[0] / setlist[1] + + else: # MinMax kernel + # traverse all paths in graph1 and search them in graph2. Deep-first + # search is applied. + def traverseTrie1m(root, trie2, sumlist, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: + # print(node['count']) + count1 = node['count'] + count2 = trie2.searchWord(pcurrent) + sumlist[0] += min(count1, count2) + sumlist[1] += max(count1, count2) + if node['children'] != {}: + traverseTrie1m(node, trie2, sumlist, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + # traverse all paths in graph2 and find out those that are not in + # graph1. 
Deep-first search is applied. + def traverseTrie2m(root, trie1, sumlist, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: + # print(node['count']) + count1 = trie1.searchWord(pcurrent) + if count1 == 0: + sumlist[1] += node['count'] + if node['children'] != {}: + traverseTrie2m(node, trie1, sumlist, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + sumlist = [0, 0] # sum of mins and sum of maxs +# print(trie1.root) +# print(trie2.root) + traverseTrie1m(trie1.root, trie2, sumlist) +# print(sumlist) + traverseTrie2m(trie2.root, trie1, sumlist) +# print(sumlist) + kernel = sumlist[0] / sumlist[1] + + return kernel def wrapper_uhpath_do_trie(k_func, itr): - i = itr[0] - j = itr[1] - return i, j, _untilhpathkernel_do_trie(G_trie[i], G_trie[j], k_func) - + i = itr[0] + j = itr[1] + return i, j, _untilhpathkernel_do_trie(G_trie[i], G_trie[j], k_func) + def _untilhpathkernel_do_naive(paths1, paths2, k_func): - """Calculate path graph kernels up to depth d between 2 graphs naively. - - Parameters - ---------- - paths_list : list of list - List of list of paths in all graphs, where for unlabeled graphs, each - path is represented by a list of nodes; while for labeled graphs, each - path is represented by a string consists of labels of nodes and/or - edges on that path. - k_func : function - A kernel function applied using different notions of fingerprint - similarity. - - Return - ------ - kernel : float - Path kernel up to h between 2 graphs. - """ - all_paths = list(set(paths1 + paths2)) - - if k_func == 'tanimoto': - length_union = len(set(paths1 + paths2)) - kernel = (len(set(paths1)) + len(set(paths2)) - - length_union) / length_union -# vector1 = [(1 if path in paths1 else 0) for path in all_paths] -# vector2 = [(1 if path in paths2 else 0) for path in all_paths] -# kernel_uv = np.dot(vector1, vector2) -# kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv) - - else: # MinMax kernel - path_count1 = Counter(paths1) - path_count2 = Counter(paths2) - vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0) - for key in all_paths] - vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0) - for key in all_paths] - kernel = np.sum(np.minimum(vector1, vector2)) / \ - np.sum(np.maximum(vector1, vector2)) - - return kernel + """Calculate path graph kernels up to depth d between 2 graphs naively. + + Parameters + ---------- + paths_list : list of list + List of list of paths in all graphs, where for unlabeled graphs, each + path is represented by a list of nodes; while for labeled graphs, each + path is represented by a string consists of labels of nodes and/or + edges on that path. + k_func : function + A kernel function applied using different notions of fingerprint + similarity. + + Return + ------ + kernel : float + Path kernel up to h between 2 graphs. 
+ """ + all_paths = list(set(paths1 + paths2)) + + if k_func == 'tanimoto': + length_union = len(set(paths1 + paths2)) + kernel = (len(set(paths1)) + len(set(paths2)) - + length_union) / length_union +# vector1 = [(1 if path in paths1 else 0) for path in all_paths] +# vector2 = [(1 if path in paths2 else 0) for path in all_paths] +# kernel_uv = np.dot(vector1, vector2) +# kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv) + + else: # MinMax kernel + path_count1 = Counter(paths1) + path_count2 = Counter(paths2) + vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0) + for key in all_paths] + vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0) + for key in all_paths] + kernel = np.sum(np.minimum(vector1, vector2)) / \ + np.sum(np.maximum(vector1, vector2)) + + return kernel def wrapper_uhpath_do_naive(k_func, itr): - i = itr[0] - j = itr[1] - return i, j, _untilhpathkernel_do_naive(G_plist[i], G_plist[j], k_func) + i = itr[0] + j = itr[1] + return i, j, _untilhpathkernel_do_naive(G_plist[i], G_plist[j], k_func) def _untilhpathkernel_do_kernelless(paths1, paths2, k_func): - """Calculate path graph kernels up to depth d between 2 graphs naively. - - Parameters - ---------- - paths_list : list of list - List of list of paths in all graphs, where for unlabeled graphs, each - path is represented by a list of nodes; while for labeled graphs, each - path is represented by a string consists of labels of nodes and/or - edges on that path. - k_func : function - A kernel function applied using different notions of fingerprint - similarity. - - Return - ------ - kernel : float - Path kernel up to h between 2 graphs. - """ - all_paths = list(set(paths1 + paths2)) - - if k_func == 'tanimoto': - length_union = len(set(paths1 + paths2)) - kernel = (len(set(paths1)) + len(set(paths2)) - - length_union) / length_union -# vector1 = [(1 if path in paths1 else 0) for path in all_paths] -# vector2 = [(1 if path in paths2 else 0) for path in all_paths] -# kernel_uv = np.dot(vector1, vector2) -# kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv) - - else: # MinMax kernel - path_count1 = Counter(paths1) - path_count2 = Counter(paths2) - vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0) - for key in all_paths] - vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0) - for key in all_paths] - kernel = np.sum(np.minimum(vector1, vector2)) / \ - np.sum(np.maximum(vector1, vector2)) - - return kernel + """Calculate path graph kernels up to depth d between 2 graphs naively. + + Parameters + ---------- + paths_list : list of list + List of list of paths in all graphs, where for unlabeled graphs, each + path is represented by a list of nodes; while for labeled graphs, each + path is represented by a string consists of labels of nodes and/or + edges on that path. + k_func : function + A kernel function applied using different notions of fingerprint + similarity. + + Return + ------ + kernel : float + Path kernel up to h between 2 graphs. 
+ """ + all_paths = list(set(paths1 + paths2)) + + if k_func == 'tanimoto': + length_union = len(set(paths1 + paths2)) + kernel = (len(set(paths1)) + len(set(paths2)) - + length_union) / length_union +# vector1 = [(1 if path in paths1 else 0) for path in all_paths] +# vector2 = [(1 if path in paths2 else 0) for path in all_paths] +# kernel_uv = np.dot(vector1, vector2) +# kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv) + + else: # MinMax kernel + path_count1 = Counter(paths1) + path_count2 = Counter(paths2) + vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0) + for key in all_paths] + vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0) + for key in all_paths] + kernel = np.sum(np.minimum(vector1, vector2)) / \ + np.sum(np.maximum(vector1, vector2)) + + return kernel def wrapper_uhpath_do_kernelless(k_func, itr): - i = itr[0] - j = itr[1] - return i, j, _untilhpathkernel_do_kernelless(G_plist[i], G_plist[j], k_func) + i = itr[0] + j = itr[1] + return i, j, _untilhpathkernel_do_kernelless(G_plist[i], G_plist[j], k_func) # @todo: (can be removed maybe) this method find paths repetively, it could be faster. def find_all_paths_until_length(G, - length, - ds_attrs, - node_label='atom', - edge_label='bond_type', - tolabelseqs=True): - """Find all paths no longer than a certain maximum length in a graph. A - recursive depth first search is applied. - - Parameters - ---------- - G : NetworkX graphs - The graph in which paths are searched. - length : integer - The maximum length of paths. - ds_attrs: dict - Dataset attributes. - node_label : string - Node attribute used as label. The default node label is atom. - edge_label : string - Edge attribute used as label. The default edge label is bond_type. - - Return - ------ - path : list - List of paths retrieved, where for unlabeled graphs, each path is - represented by a list of nodes; while for labeled graphs, each path is - represented by a list of strings consists of labels of nodes and/or - edges on that path. - """ - # path_l = [tuple([n]) for n in G.nodes] # paths of length l - # all_paths = path_l[:] - # for l in range(1, length + 1): - # path_l_new = [] - # for path in path_l: - # for neighbor in G[path[-1]]: - # if len(path) < 2 or neighbor != path[-2]: - # tmp = path + (neighbor, ) - # if tuple(tmp[::-1]) not in path_l_new: - # path_l_new.append(tuple(tmp)) - - # all_paths += path_l_new - # path_l = path_l_new[:] - - path_l = [[n] for n in G.nodes] # paths of length l - all_paths = [p.copy() for p in path_l] - for l in range(1, length + 1): - path_lplus1 = [] - for path in path_l: - for neighbor in G[path[-1]]: - if neighbor not in path: - tmp = path + [neighbor] -# if tmp[::-1] not in path_lplus1: - path_lplus1.append(tmp) - - all_paths += path_lplus1 - path_l = [p.copy() for p in path_lplus1] - - # for i in range(0, length + 1): - # new_paths = find_all_paths(G, i) - # if new_paths == []: - # break - # all_paths.extend(new_paths) - - # consider labels -# print(paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)) -# print() - return (paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label) - if tolabelseqs else all_paths) - - + length, + ds_attrs, + node_label='atom', + edge_label='bond_type', + tolabelseqs=True): + """Find all paths no longer than a certain maximum length in a graph. A + recursive depth first search is applied. + + Parameters + ---------- + G : NetworkX graphs + The graph in which paths are searched. + length : integer + The maximum length of paths. 
+ ds_attrs: dict + Dataset attributes. + node_label : string + Node attribute used as label. The default node label is atom. + edge_label : string + Edge attribute used as label. The default edge label is bond_type. + + Return + ------ + path : list + List of paths retrieved, where for unlabeled graphs, each path is + represented by a list of nodes; while for labeled graphs, each path is + represented by a list of strings consists of labels of nodes and/or + edges on that path. + """ + # path_l = [tuple([n]) for n in G.nodes] # paths of length l + # all_paths = path_l[:] + # for l in range(1, length + 1): + # path_l_new = [] + # for path in path_l: + # for neighbor in G[path[-1]]: + # if len(path) < 2 or neighbor != path[-2]: + # tmp = path + (neighbor, ) + # if tuple(tmp[::-1]) not in path_l_new: + # path_l_new.append(tuple(tmp)) + + # all_paths += path_l_new + # path_l = path_l_new[:] + + path_l = [[n] for n in G.nodes] # paths of length l + all_paths = [p.copy() for p in path_l] + for l in range(1, length + 1): + path_lplus1 = [] + for path in path_l: + for neighbor in G[path[-1]]: + if neighbor not in path: + tmp = path + [neighbor] +# if tmp[::-1] not in path_lplus1: + path_lplus1.append(tmp) + + all_paths += path_lplus1 + path_l = [p.copy() for p in path_lplus1] + + # for i in range(0, length + 1): + # new_paths = find_all_paths(G, i) + # if new_paths == []: + # break + # all_paths.extend(new_paths) + + # consider labels +# print(paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)) +# print() + return (paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label) + if tolabelseqs else all_paths) + + def wrapper_find_all_paths_until_length(length, ds_attrs, node_label, - edge_label, tolabelseqs, itr_item): - g = itr_item[0] - i = itr_item[1] - return i, find_all_paths_until_length(g, length, ds_attrs, - node_label=node_label, edge_label=edge_label, - tolabelseqs=tolabelseqs) + edge_label, tolabelseqs, itr_item): + g = itr_item[0] + i = itr_item[1] + return i, find_all_paths_until_length(g, length, ds_attrs, + node_label=node_label, edge_label=edge_label, + tolabelseqs=tolabelseqs) def find_all_path_as_trie(G, - length, - ds_attrs, - node_label='atom', - edge_label='bond_type'): -# time1 = time.time() - -# all_path = find_all_paths_until_length(G, length, ds_attrs, -# node_label=node_label, -# edge_label=edge_label) -# ptrie = Trie() -# for path in all_path: -# ptrie.insertWord(path) - -# ptrie = Trie() -# path_l = [[n] for n in G.nodes] # paths of length l -# path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) -# for p in path_l_str: -# ptrie.insertWord(p) -# for l in range(1, length + 1): -# path_lplus1 = [] -# for path in path_l: -# for neighbor in G[path[-1]]: -# if neighbor not in path: -# tmp = path + [neighbor] -## if tmp[::-1] not in path_lplus1: -# path_lplus1.append(tmp) -# path_l = path_lplus1[:] -# # consider labels -# path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) -# for p in path_l_str: -# ptrie.insertWord(p) -# -# print(time.time() - time1) -# print(ptrie.root) -# print() - - - # traverse all paths up to length h in a graph and construct a trie with - # them. Deep-first search is applied. Notice the reverse of each path is - # also stored to the trie. 
- def traverseGraph(root, ptrie, length, G, ds_attrs, node_label, edge_label, - pcurrent=[]): - if len(pcurrent) < length + 1: - for neighbor in G[root]: - if neighbor not in pcurrent: - pcurrent.append(neighbor) - plstr = paths2labelseqs([pcurrent], G, ds_attrs, - node_label, edge_label) - ptrie.insertWord(plstr[0]) - traverseGraph(neighbor, ptrie, length, G, ds_attrs, - node_label, edge_label, pcurrent) - del pcurrent[-1] - - - ptrie = Trie() - path_l = [[n] for n in G.nodes] # paths of length l - path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) - for p in path_l_str: - ptrie.insertWord(p) - for n in G.nodes: - traverseGraph(n, ptrie, length, G, ds_attrs, node_label, edge_label, - pcurrent=[n]) - - -# def traverseGraph(root, all_paths, length, G, ds_attrs, node_label, edge_label, -# pcurrent=[]): -# if len(pcurrent) < length + 1: -# for neighbor in G[root]: -# if neighbor not in pcurrent: -# pcurrent.append(neighbor) -# plstr = paths2labelseqs([pcurrent], G, ds_attrs, -# node_label, edge_label) -# all_paths.append(pcurrent[:]) -# traverseGraph(neighbor, all_paths, length, G, ds_attrs, -# node_label, edge_label, pcurrent) -# del pcurrent[-1] + length, + ds_attrs, + node_label='atom', + edge_label='bond_type'): +# time1 = time.time() + +# all_path = find_all_paths_until_length(G, length, ds_attrs, +# node_label=node_label, +# edge_label=edge_label) +# ptrie = Trie() +# for path in all_path: +# ptrie.insertWord(path) + +# ptrie = Trie() +# path_l = [[n] for n in G.nodes] # paths of length l +# path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) +# for p in path_l_str: +# ptrie.insertWord(p) +# for l in range(1, length + 1): +# path_lplus1 = [] +# for path in path_l: +# for neighbor in G[path[-1]]: +# if neighbor not in path: +# tmp = path + [neighbor] +## if tmp[::-1] not in path_lplus1: +# path_lplus1.append(tmp) +# path_l = path_lplus1[:] +# # consider labels +# path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) +# for p in path_l_str: +# ptrie.insertWord(p) +# +# print(time.time() - time1) +# print(ptrie.root) +# print() + + + # traverse all paths up to length h in a graph and construct a trie with + # them. Deep-first search is applied. Notice the reverse of each path is + # also stored to the trie. 
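The traverseGraph helper defined next walks the graph depth-first, growing one simple path at a time and inserting its label sequence into the trie. A stand-alone sketch of that enumeration, stripped of the trie and labelling steps, may make the recursion easier to follow; enumerate_simple_paths is a hypothetical helper added only for illustration.

```python
import networkx as nx

def enumerate_simple_paths(G, length):
    """All simple paths with at most `length` edges, started from every node."""
    paths = [[n] for n in G.nodes]  # length-0 paths (single nodes)

    def _extend(path):
        if len(path) > length:  # path already carries `length` edges
            return
        for neighbor in G[path[-1]]:
            if neighbor not in path:  # keep paths simple (no repeated node)
                paths.append(path + [neighbor])
                _extend(path + [neighbor])

    for n in G.nodes:
        _extend([n])
    return paths

# e.g. a 4-node path graph: 4 single nodes, 6 one-edge paths, 4 two-edge paths
print(len(enumerate_simple_paths(nx.path_graph(4), 2)))  # 14
```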
+ def traverseGraph(root, ptrie, length, G, ds_attrs, node_label, edge_label, + pcurrent=[]): + if len(pcurrent) < length + 1: + for neighbor in G[root]: + if neighbor not in pcurrent: + pcurrent.append(neighbor) + plstr = paths2labelseqs([pcurrent], G, ds_attrs, + node_label, edge_label) + ptrie.insertWord(plstr[0]) + traverseGraph(neighbor, ptrie, length, G, ds_attrs, + node_label, edge_label, pcurrent) + del pcurrent[-1] + + + ptrie = Trie() + path_l = [[n] for n in G.nodes] # paths of length l + path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) + for p in path_l_str: + ptrie.insertWord(p) + for n in G.nodes: + traverseGraph(n, ptrie, length, G, ds_attrs, node_label, edge_label, + pcurrent=[n]) + + +# def traverseGraph(root, all_paths, length, G, ds_attrs, node_label, edge_label, +# pcurrent=[]): +# if len(pcurrent) < length + 1: +# for neighbor in G[root]: +# if neighbor not in pcurrent: +# pcurrent.append(neighbor) +# plstr = paths2labelseqs([pcurrent], G, ds_attrs, +# node_label, edge_label) +# all_paths.append(pcurrent[:]) +# traverseGraph(neighbor, all_paths, length, G, ds_attrs, +# node_label, edge_label, pcurrent) +# del pcurrent[-1] # # -# path_l = [[n] for n in G.nodes] # paths of length l -# all_paths = path_l[:] -# path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) -## for p in path_l_str: -## ptrie.insertWord(p) -# for n in G.nodes: -# traverseGraph(n, all_paths, length, G, ds_attrs, node_label, edge_label, -# pcurrent=[n]) - -# print(ptrie.root) - return ptrie +# path_l = [[n] for n in G.nodes] # paths of length l +# all_paths = path_l[:] +# path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) +## for p in path_l_str: +## ptrie.insertWord(p) +# for n in G.nodes: +# traverseGraph(n, all_paths, length, G, ds_attrs, node_label, edge_label, +# pcurrent=[n]) + +# print(ptrie.root) + return ptrie def wrapper_find_all_path_as_trie(length, ds_attrs, node_label, - edge_label, itr_item): - g = itr_item[0] - i = itr_item[1] - return i, find_all_path_as_trie(g, length, ds_attrs, - node_label=node_label, edge_label=edge_label) + edge_label, itr_item): + g = itr_item[0] + i = itr_item[1] + return i, find_all_path_as_trie(g, length, ds_attrs, + node_label=node_label, edge_label=edge_label) def paths2labelseqs(plist, G, ds_attrs, node_label, edge_label): - if ds_attrs['node_labeled']: - if ds_attrs['edge_labeled']: - path_strs = [ - tuple( - list( - chain.from_iterable( - (G.nodes[node][node_label], - G[node][path[idx + 1]][edge_label]) - for idx, node in enumerate(path[:-1]))) + - [G.nodes[path[-1]][node_label]]) for path in plist - ] - # path_strs = [] - # for path in all_paths: - # strlist = list( - # chain.from_iterable((G.node[node][node_label], - # G[node][path[idx + 1]][edge_label]) - # for idx, node in enumerate(path[:-1]))) - # strlist.append(G.node[path[-1]][node_label]) - # path_strs.append(tuple(strlist)) - else: - path_strs = [ - tuple([G.nodes[node][node_label] for node in path]) - for path in plist - ] - return path_strs - else: - if ds_attrs['edge_labeled']: - return [ - tuple([] if len(path) == 1 else [ - G[node][path[idx + 1]][edge_label] - for idx, node in enumerate(path[:-1]) - ]) for path in plist - ] - else: - return [tuple(['0' for node in path]) for path in plist] -# return [tuple([len(path)]) for path in all_paths] + if ds_attrs['node_labeled']: + if ds_attrs['edge_labeled']: + path_strs = [ + tuple( + list( + chain.from_iterable( + (G.nodes[node][node_label], + G[node][path[idx + 1]][edge_label]) 
+ for idx, node in enumerate(path[:-1]))) + + [G.nodes[path[-1]][node_label]]) for path in plist + ] + # path_strs = [] + # for path in all_paths: + # strlist = list( + # chain.from_iterable((G.node[node][node_label], + # G[node][path[idx + 1]][edge_label]) + # for idx, node in enumerate(path[:-1]))) + # strlist.append(G.node[path[-1]][node_label]) + # path_strs.append(tuple(strlist)) + else: + path_strs = [ + tuple([G.nodes[node][node_label] for node in path]) + for path in plist + ] + return path_strs + else: + if ds_attrs['edge_labeled']: + return [ + tuple([] if len(path) == 1 else [ + G[node][path[idx + 1]][edge_label] + for idx, node in enumerate(path[:-1]) + ]) for path in plist + ] + else: + return [tuple(['0' for node in path]) for path in plist] +# return [tuple([len(path)]) for path in all_paths] # #def paths2GSuffixTree(paths): -# return Tree(paths, builder=ukkonen.Builder) +# return Tree(paths, builder=ukkonen.Builder) # def find_paths(G, source_node, length): -# """Find all paths no longer than a certain length those start from a source node. A recursive depth first search is applied. - -# Parameters -# ---------- -# G : NetworkX graphs -# The graph in which paths are searched. -# source_node : integer -# The number of the node from where all paths start. -# length : integer -# The length of paths. - -# Return -# ------ -# path : list of list -# List of paths retrieved, where each path is represented by a list of nodes. -# """ -# return [[source_node]] if length == 0 else \ -# [[source_node] + path for neighbor in G[source_node] -# for path in find_paths(G, neighbor, length - 1) if source_node not in path] +# """Find all paths no longer than a certain length those start from a source node. A recursive depth first search is applied. + +# Parameters +# ---------- +# G : NetworkX graphs +# The graph in which paths are searched. +# source_node : integer +# The number of the node from where all paths start. +# length : integer +# The length of paths. + +# Return +# ------ +# path : list of list +# List of paths retrieved, where each path is represented by a list of nodes. +# """ +# return [[source_node]] if length == 0 else \ +# [[source_node] + path for neighbor in G[source_node] +# for path in find_paths(G, neighbor, length - 1) if source_node not in path] # def find_all_paths(G, length): -# """Find all paths with a certain length in a graph. A recursive depth first search is applied. - -# Parameters -# ---------- -# G : NetworkX graphs -# The graph in which paths are searched. -# length : integer -# The length of paths. - -# Return -# ------ -# path : list of list -# List of paths retrieved, where each path is represented by a list of nodes. -# """ -# all_paths = [] -# for node in G: -# all_paths.extend(find_paths(G, node, length)) - -# # The following process is not carried out according to the original article -# # all_paths_r = [ path[::-1] for path in all_paths ] - -# # # For each path, two presentation are retrieved from its two extremities. Remove one of them. -# # for idx, path in enumerate(all_paths[:-1]): -# # for path2 in all_paths_r[idx+1::]: -# # if path == path2: -# # all_paths[idx] = [] -# # break - -# # return list(filter(lambda a: a != [], all_paths)) -# return all_paths +# """Find all paths with a certain length in a graph. A recursive depth first search is applied. + +# Parameters +# ---------- +# G : NetworkX graphs +# The graph in which paths are searched. +# length : integer +# The length of paths. 
+ +# Return +# ------ +# path : list of list +# List of paths retrieved, where each path is represented by a list of nodes. +# """ +# all_paths = [] +# for node in G: +# all_paths.extend(find_paths(G, node, length)) + +# # The following process is not carried out according to the original article +# # all_paths_r = [ path[::-1] for path in all_paths ] + +# # # For each path, two presentation are retrieved from its two extremities. Remove one of them. +# # for idx, path in enumerate(all_paths[:-1]): +# # for path2 in all_paths_r[idx+1::]: +# # if path == path2: +# # all_paths[idx] = [] +# # break + +# # return list(filter(lambda a: a != [], all_paths)) +# return all_paths diff --git a/gklearn/kernels/weisfeilerLehmanKernel.py b/gklearn/kernels/weisfeilerLehmanKernel.py index ecbcf49..960d391 100644 --- a/gklearn/kernels/weisfeilerLehmanKernel.py +++ b/gklearn/kernels/weisfeilerLehmanKernel.py @@ -30,6 +30,7 @@ def weisfeilerlehmankernel(*args, base_kernel='subtree', parallel=None, n_jobs=None, + chunksize=None, verbose=True): """Calculate Weisfeiler-Lehman kernels between graphs. @@ -91,7 +92,7 @@ def weisfeilerlehmankernel(*args, # for WL subtree kernel if base_kernel == 'subtree': - Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose) + Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose) # for WL shortest path kernel elif base_kernel == 'sp': @@ -113,7 +114,7 @@ def weisfeilerlehmankernel(*args, return Kmatrix, run_time -def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose): +def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose): """Calculate Weisfeiler-Lehman kernels between graphs. Parameters @@ -146,7 +147,7 @@ def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose) all_num_of_each_label.append(dict(Counter(labels_ori))) # calculate subtree kernel with the 0th iteration and add it to the final kernel - compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False) + compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, False) # iterate each height for h in range(1, height + 1): @@ -304,7 +305,7 @@ def wrapper_wl_iteration(node_label, itr_item): return i, all_multisets -def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose): +def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, verbose): """Compute kernel matrix using the base kernel. 
""" if parallel == 'imap_unordered': @@ -314,7 +315,7 @@ def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, G_alllabels = alllabels_toshare do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix) parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, - glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose) + glbv=(all_num_of_each_label,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) elif parallel == None: for i in range(len(Kmatrix)): for j in range(i, len(Kmatrix)): diff --git a/gklearn/utils/parallel.py b/gklearn/utils/parallel.py index e6edb70..71bb47c 100644 --- a/gklearn/utils/parallel.py +++ b/gklearn/utils/parallel.py @@ -24,7 +24,7 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker n_jobs = multiprocessing.cpu_count() with Pool(processes=n_jobs, initializer=init_worker, initargs=glbv) as pool: - if chunksize == None: + if chunksize is None: if len_itr < 100 * n_jobs: chunksize = int(len_itr / n_jobs) + 1 else: @@ -39,7 +39,7 @@ def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker if n_jobs == None: n_jobs = multiprocessing.cpu_count() with Pool(processes=n_jobs) as pool: - if chunksize == None: + if chunksize is None: if len_itr < 100 * n_jobs: chunksize = int(len_itr / n_jobs) + 1 else: From 0eef570c62ac2052715836e1d1476f49915c861c Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 29 Sep 2020 11:27:42 +0200 Subject: [PATCH 3/5] Fix bugs in exps. --- gklearn/experiments/papers/PRL_2020/runtimes_28cores.py | 17 +++++------------ .../experiments/papers/PRL_2020/synthesized_graphs_N.py | 4 ++-- .../papers/PRL_2020/synthesized_graphs_degrees.py | 4 ++-- .../papers/PRL_2020/synthesized_graphs_num_el.py | 4 ++-- .../papers/PRL_2020/synthesized_graphs_num_nl.py | 3 ++- .../papers/PRL_2020/synthesized_graphs_num_nodes.py | 3 ++- 6 files changed, 15 insertions(+), 20 deletions(-) diff --git a/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py b/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py index 7472bdd..4c827ce 100644 --- a/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py +++ b/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py @@ -10,19 +10,12 @@ from gklearn.utils.graphdataset import load_predefined_dataset import logging -# def get_graphs(ds_name): -# from gklearn.utils.graph_synthesizer import GraphSynthesizer -# gsyzer = GraphSynthesizer() -# graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=num_nodes, num_edges=int(num_nodes*2), num_node_labels=0, num_edge_labels=0, seed=None, directed=False) -# return graphs - - -def xp_runtimes_of_all_7cores(): +def xp_runtimes_of_all_28cores(): # Run and save. import pickle import os - save_dir = 'outputs/runtimes_of_all_7cores/' + save_dir = 'outputs/runtimes_of_all_28cores/' if not os.path.exists(save_dir): os.makedirs(save_dir) @@ -41,16 +34,16 @@ def xp_runtimes_of_all_7cores(): graphs, _ = load_predefined_dataset(ds_name) # Compute Gram matrix. + run_time = 'error' try: gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=28) - run_times[kernel_name].append(run_time) except Exception as exp: - run_times[kernel_name].append('error') print('An exception occured when running this experiment:') LOG_FILENAME = save_dir + 'error.txt' logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) + run_times[kernel_name].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' 
+ ds_name + '.pkl', 'wb')) @@ -61,4 +54,4 @@ def xp_runtimes_of_all_7cores(): if __name__ == '__main__': - xp_runtimes_of_all_7cores() + xp_runtimes_of_all_28cores() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py index 0994e15..36bf1bc 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py @@ -41,16 +41,16 @@ def xp_synthesied_graphs_dataset_size(): sub_graphs = [g.copy() for g in graphs[0:num_graphs]] + run_time = 'error' try: gram_matrix, run_time = compute_graph_kernel(sub_graphs, kernel_name, n_jobs=1) - run_times[kernel_name].append(run_time) except Exception as exp: - run_times[kernel_name].append('error') print('An exception occured when running this experiment:') LOG_FILENAME = save_dir + 'error.txt' logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) + run_times[kernel_name].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_graphs) + '.pkl', 'wb')) diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py index 6efc732..0562d81 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py @@ -40,16 +40,16 @@ def xp_synthesied_graphs_degrees(): graphs = generate_graphs(degree) # Compute Gram matrix. + run_time = 'error' try: gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) - run_times[kernel_name].append(run_time) except Exception as exp: - run_times[kernel_name].append('error') print('An exception occured when running this experiment:') LOG_FILENAME = save_dir + 'error.txt' logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) + run_times[kernel_name].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(degree) + '.pkl', 'wb')) diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py index b28b8f7..9a8e721 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py @@ -40,16 +40,16 @@ def xp_synthesied_graphs_num_edge_label_alphabet(): graphs = generate_graphs(num_el_alp) # Compute Gram matrix. + run_time = 'error' try: gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) - run_times[kernel_name].append(run_time) except Exception as exp: - run_times[kernel_name].append('error') print('An exception occured when running this experiment:') LOG_FILENAME = save_dir + 'error.txt' logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) + run_times[kernel_name].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' 
+ str(num_el_alp) + '.pkl', 'wb')) diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py index 411c3c1..2ab63ee 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py @@ -40,9 +40,9 @@ def xp_synthesied_graphs_num_node_label_alphabet(): graphs = generate_graphs(num_nl_alp) # Compute Gram matrix. + run_time = 'error' try: gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) - run_times[kernel_name].append(run_time) except Exception as exp: run_times[kernel_name].append('error') print('An exception occured when running this experiment:') @@ -50,6 +50,7 @@ def xp_synthesied_graphs_num_node_label_alphabet(): logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) + run_times[kernel_name].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nl_alp) + '.pkl', 'wb')) diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py index 0f5a395..d0d6ebb 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py @@ -40,9 +40,9 @@ def xp_synthesied_graphs_num_nodes(): graphs = generate_graphs(num_nodes) # Compute Gram matrix. + run_time = 'error' try: gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) - run_times[kernel_name].append(run_time) except Exception as exp: run_times[kernel_name].append('error') print('An exception occured when running this experiment:') @@ -50,6 +50,7 @@ def xp_synthesied_graphs_num_nodes(): logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) + run_times[kernel_name].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nodes) + '.pkl', 'wb')) From 5303897fb558f75f4a9698b976e020e5f30cbb12 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Tue, 29 Sep 2020 15:15:09 +0200 Subject: [PATCH 4/5] Add exps. --- Problems.md | 4 +- README.md | 2 +- .../papers/PRL_2020/runtimes_diff_chunksizes.py | 62 ++++++++++++++++++++++ gklearn/experiments/papers/PRL_2020/utils.py | 3 +- 4 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py diff --git a/Problems.md b/Problems.md index 9662dfd..1929f5f 100644 --- a/Problems.md +++ b/Problems.md @@ -1,6 +1,6 @@ # About graph kenrels. -## (Random walk) Sylvester equation kernel. +## (Random walk) Sylvester equation kernel. ### ImportError: cannot import name 'frange' from 'matplotlib.mlab' @@ -18,4 +18,6 @@ Install MKL. 
Then add the following to your path: export PATH=/opt/intel/bin:$PATH export LD_LIBRARY_PATH=/opt/intel/lib/intel64:/opt/intel/mkl/lib/intel64:$LD_LIBRARY_PATH + +export LD_PRELOAD=/opt/intel/mkl/lib/intel64/libmkl_def.so:/opt/intel/mkl/lib/intel64/libmkl_avx2.so:/opt/intel/mkl/lib/intel64/libmkl_core.so:/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.so:/opt/intel/mkl/lib/intel64/libmkl_intel_thread.so:/opt/intel/lib/intel64_lin/libiomp5.so ``` diff --git a/README.md b/README.md index e0d5b25..f9eec77 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Check [`notebooks`](https://github.com/jajupmochi/graphkit-learn/tree/master/not The docs of the library can be found [here](https://graphkit-learn.readthedocs.io/en/master/?badge=master). -## Main contents +## Main contents ### 1 List of graph kernels diff --git a/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py b/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py new file mode 100644 index 0000000..343694c --- /dev/null +++ b/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Sep 21 10:34:26 2020 + +@author: ljia +""" +from utils import Graph_Kernel_List, Dataset_List, compute_graph_kernel +from gklearn.utils.graphdataset import load_predefined_dataset +import logging + + +def xp_runtimes_diff_chunksizes(): + + # Run and save. + import pickle + import os + save_dir = 'outputs/runtimes_diff_chunksizes/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + run_times = {} + + for kernel_name in Graph_Kernel_List: + print() + print('Kernel:', kernel_name) + + run_times[kernel_name] = [] + for ds_name in Dataset_List: + print() + print('Dataset:', ds_name) + + run_times[kernel_name].append([]) + for chunksize in [1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000]: + print() + print('Chunksize:', chunksize) + + # get graphs. + graphs, _ = load_predefined_dataset(ds_name) + + # Compute Gram matrix. + run_time = 'error' + try: + gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, chunksize=chunksize) + except Exception as exp: + print('An exception occured when running this experiment:') + LOG_FILENAME = save_dir + 'error.txt' + logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + logging.exception('') + print(repr(exp)) + run_times[kernel_name][-1].append(run_time) + + pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.' + str(chunksize) + '.pkl', 'wb')) + + # Save all. 
+ pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb')) + + return + + +if __name__ == '__main__': + xp_runtimes_diff_chunksizes() diff --git a/gklearn/experiments/papers/PRL_2020/utils.py b/gklearn/experiments/papers/PRL_2020/utils.py index 4290180..07c82f7 100644 --- a/gklearn/experiments/papers/PRL_2020/utils.py +++ b/gklearn/experiments/papers/PRL_2020/utils.py @@ -27,7 +27,7 @@ Graph_Kernel_List_ECon = ['ConjugateGradient', 'FixedPoint', 'StructuralSP'] Dataset_List = ['Alkane', 'Acyclic', 'MAO', 'PAH', 'MUTAG', 'Letter-med', 'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD'] -def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count()): +def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count(), chunksize=None): if kernel_name == 'CommonWalk': from gklearn.kernels.commonWalkKernel import commonwalkkernel @@ -105,6 +105,7 @@ def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count() # params['parallel'] = None params['n_jobs'] = n_jobs + params['chunksize'] = chunksize params['verbose'] = True results = estimator(graphs, **params) From 5eea854d3df4cb058ec73002cd64da93aa7f1683 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 2 Oct 2020 14:36:58 +0200 Subject: [PATCH 5/5] Add CHANGELOG.md. --- CHANGELOG.md | 0 Problems.md | 2 +- README.md | 28 ++++++++++++++-------------- setup.py | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e69de29 diff --git a/Problems.md b/Problems.md index 1929f5f..cb7dd1e 100644 --- a/Problems.md +++ b/Problems.md @@ -10,7 +10,7 @@ Update your `control` package. ### Intel MKL FATAL ERROR: Cannot load libmkl_avx2.so or libmkl_def.so. -The Intel Math Kernel Library (MKL) is missing or not properly set. I assume the MKL is required by `control` module. +The Intel Math Kernel Library (MKL) is missing or not properly set. I assume MKL is required by the `control` module. Install MKL. Then add the following to your path: diff --git a/README.md b/README.md index f9eec77..9ade200 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,20 @@ A comparison of performances of graph kernels on benchmark datasets can be found Fork the library and open a pull request! Make your own contribute to the community! +## Authors + +* [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie +* [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie +* [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie + +## Citation + +Still waiting... + +## Acknowledgments + +This research was supported by CSC (China Scholarship Council) and the French national research agency (ANR) under the grant APi (ANR-18-CE23-0014). The authors would like to thank the CRIANN (Le Centre Régional Informatique et d’Applications Numériques de Normandie) for providing computational resources. + ## References [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: Hardness results and efficient alternatives. Learning Theory and Kernel Machines, pages 129–143, 2003. @@ -153,17 +167,3 @@ Fork the library and open a pull request! Make your own contribute to the commun [10] Gaüzere, B., Brun, L., Villemin, D., 2012. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters 33, 2038–2047. [11] Shervashidze, N., Schweitzer, P., Leeuwen, E.J.v., Mehlhorn, K., Borgwardt, K.M., 2011. Weisfeiler-lehman graph kernels. 
Journal of Machine Learning Research 12, 2539–2561. - -## Authors - -* [Linlin Jia](https://jajupmochi.github.io/), LITIS, INSA Rouen Normandie -* [Benoit Gaüzère](http://pagesperso.litislab.fr/~bgauzere/#contact_en), LITIS, INSA Rouen Normandie -* [Paul Honeine](http://honeine.fr/paul/Welcome.html), LITIS, Université de Rouen Normandie - -## Citation - -Still waiting... - -## Acknowledgments - -This research was supported by CSC (China Scholarship Council) and the French national research agency (ANR) under the grant APi (ANR-18-CE23-0014). The authors would like to thank the CRIANN (Le Centre Régional Informatique et d’Applications Numériques de Normandie) for providing computational resources. diff --git a/setup.py b/setup.py index f2f1048..ea10603 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ with open('requirements_pypi.txt') as fp: setuptools.setup( name="graphkit-learn", - version="0.2b4", + version="0.2.0", author="Linlin Jia", author_email="linlin.jia@insa-rouen.fr", description="A Python library for graph kernels, graph edit distances, and graph pre-images",
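PATCH 3/5 applies the same fix to each experiment script: `run_time` is initialized to `'error'` before the `try` block, and the per-kernel result list is appended to once, after the `except` block, rather than separately in the success and failure branches. (Note that in `synthesized_graphs_num_nl.py` and `synthesized_graphs_num_nodes.py` the `append('error')` inside the `except` branch is left in place, so those two scripts still append twice when a kernel fails.) Below is a minimal sketch of the intended pattern; `timed_kernel` is a hypothetical helper, and only the call signature of `compute_graph_kernel` is taken from the diffs above:

```python
import logging

def timed_kernel(compute_fn, graphs, kernel_name, save_dir='outputs/'):
    """Hypothetical helper mirroring the PATCH 3/5 pattern.

    `compute_fn` stands in for compute_graph_kernel from
    gklearn/experiments/papers/PRL_2020/utils.py.
    """
    run_time = 'error'  # pre-set so the caller records exactly one value per run
    try:
        gram_matrix, run_time = compute_fn(graphs, kernel_name, n_jobs=1)
    except Exception as exp:
        print('An exception occurred when running this experiment:')
        logging.basicConfig(filename=save_dir + 'error.txt', level=logging.DEBUG)
        logging.exception('')
        print(repr(exp))
    return run_time  # a float on success, the string 'error' on failure
```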
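PATCH 2/5 and PATCH 4/5 together expose the multiprocessing `chunksize`: `parallel_me` now tests `chunksize is None` before falling back to its size heuristic, `compute_graph_kernel` in `gklearn/experiments/papers/PRL_2020/utils.py` gains a `chunksize` keyword forwarded via `params['chunksize']`, and the new `runtimes_diff_chunksizes.py` sweeps that value from 1 to 100000 for each kernel and dataset. The following is a sketch of the default heuristic as it appears in `gklearn/utils/parallel.py`, pulled out into a standalone function for illustration (the real code assigns the value inline inside `parallel_me`):

```python
import multiprocessing

def default_chunksize(len_itr, n_jobs=None, chunksize=None):
    """Reproduce the chunksize fallback used by parallel_me / parallel_gm."""
    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count()
    if chunksize is None:  # PATCH 2/5: identity check instead of '== None'
        if len_itr < 100 * n_jobs:
            # few tasks: spread them roughly evenly over the workers
            chunksize = int(len_itr / n_jobs) + 1
        else:
            # many tasks: cap each chunk at 100 items
            chunksize = 100
    return chunksize
```

With the PATCH 4/5 change, an explicit value can also be passed straight through the experiment helper — e.g. `compute_graph_kernel(graphs, 'CommonWalk', n_jobs=28, chunksize=100)` — while leaving `chunksize=None` keeps the fallback above (the kernel name and values here are illustrative only).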