diff --git a/Problems.md b/Problems.md
new file mode 100644
index 0000000..9662dfd
--- /dev/null
+++ b/Problems.md
@@ -0,0 +1,21 @@
+# About graph kernels.
+
+## (Random walk) Sylvester equation kernel.
+
+### ImportError: cannot import name 'frange' from 'matplotlib.mlab'
+
+You are using an outdated `control` package with a recent `matplotlib`. `mlab.frange` was removed in `matplotlib-3.1.0`, and `control` removed the call in `control-0.8.2`.
+
+Update your `control` package.
+
+### Intel MKL FATAL ERROR: Cannot load libmkl_avx2.so or libmkl_def.so.
+
+The Intel Math Kernel Library (MKL) is missing or not properly configured. The MKL is presumably required by the `control` module.
+
+Install MKL, then add the following to your environment:
+
+```
+export PATH=/opt/intel/bin:$PATH
+
+export LD_LIBRARY_PATH=/opt/intel/lib/intel64:/opt/intel/mkl/lib/intel64:$LD_LIBRARY_PATH
+```
diff --git a/README.md b/README.md
index 386d9a4..085084d 100644
--- a/README.md
+++ b/README.md
@@ -12,12 +12,12 @@ A Python package for graph kernels, graph edit distances and graph pre-image pro
 * python>=3.5
 * numpy>=1.16.2
 * scipy>=1.1.0
-* matplotlib>=3.0.0
+* matplotlib>=3.1.0
 * networkx>=2.2
 * scikit-learn>=0.20.0
 * tabulate>=0.8.2
 * tqdm>=4.26.0
-* control==0.8.0 (for generalized random walk kernels only)
+* control>=0.8.2 (for generalized random walk kernels only)
 * slycot==0.3.3 (for generalized random walk kernels only, which requires a fortran compiler, gfortran for example)
 
 ## How to use?
diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py
new file mode 100644
index 0000000..37762d6
--- /dev/null
+++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 21 10:34:26 2020
+
+@author: ljia
+"""
+from utils import Graph_Kernel_List, compute_graph_kernel
+
+
+def generate_graphs():
+	from gklearn.utils.graph_synthesizer import GraphSynthesizer
+	gsyzer = GraphSynthesizer()
+	graphs = gsyzer.unified_graphs(num_graphs=1000, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=0, seed=None, directed=False)
+	return graphs
+
+
+def xp_synthesied_graphs_dataset_size():
+
+	# Generate graphs.
+	graphs = generate_graphs()
+
+	# Run and save.
+	import pickle
+	import os
+	save_dir = 'outputs/synthesized_graphs_N/'
+	if not os.path.exists(save_dir):
+		os.makedirs(save_dir)
+
+	run_times = {}
+
+	for kernel_name in Graph_Kernel_List:
+		print()
+		print('Kernel:', kernel_name)
+
+		run_times[kernel_name] = []
+		for num_graphs in [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]:
+			print()
+			print('Number of graphs:', num_graphs)
+
+			sub_graphs = [g.copy() for g in graphs[0:num_graphs]]
+			gram_matrix, run_time = compute_graph_kernel(sub_graphs, kernel_name)
+			run_times[kernel_name].append(run_time)
+
+			pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_graphs) + '.pkl', 'wb'))
+
+	# Save all.
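+	# The dumps inside the loop checkpoint the partially filled run_times dict after
+	# each dataset size; the final dump below writes the complete
+	# {kernel_name: [runtime per size]} dict in one file.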
+ pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb')) + + return + + +if __name__ == '__main__': + xp_synthesied_graphs_dataset_size() \ No newline at end of file diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py new file mode 100644 index 0000000..d6e5d8b --- /dev/null +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Sep 21 10:34:26 2020 + +@author: ljia +""" +from utils import Graph_Kernel_List, compute_graph_kernel + + +def generate_graphs(degree): + from gklearn.utils.graph_synthesizer import GraphSynthesizer + gsyzer = GraphSynthesizer() + graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=int(10*degree), num_node_labels=0, num_edge_labels=0, seed=None, directed=False) + return graphs + + +def xp_synthesied_graphs_degrees(): + + # Run and save. + import pickle + import os + save_dir = 'outputs/synthesized_graphs_degrees/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + run_times = {} + + for kernel_name in Graph_Kernel_List: + print() + print('Kernel:', kernel_name) + + run_times[kernel_name] = [] + for degree in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: + print() + print('Degree:', degree) + + # Generate graphs. + graphs = generate_graphs(degree) + + # Compute Gram matrix. + gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name) + run_times[kernel_name].append(run_time) + + pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(degree) + '.pkl', 'wb')) + + # Save all. + pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb')) + + return + + +if __name__ == '__main__': + xp_synthesied_graphs_degrees() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py new file mode 100644 index 0000000..2341ba9 --- /dev/null +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Sep 21 10:34:26 2020 + +@author: ljia +""" +from utils import Graph_Kernel_List_ESym, compute_graph_kernel + + +def generate_graphs(num_el_alp): + from gklearn.utils.graph_synthesizer import GraphSynthesizer + gsyzer = GraphSynthesizer() + graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=num_el_alp, seed=None, directed=False) + return graphs + + +def xp_synthesied_graphs_num_edge_label_alphabet(): + + # Run and save. + import pickle + import os + save_dir = 'outputs/synthesized_graphs_num_edge_label_alphabet/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + run_times = {} + + for kernel_name in Graph_Kernel_List_ESym: + print() + print('Kernel:', kernel_name) + + run_times[kernel_name] = [] + for num_el_alp in [0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40]: + print() + print('Number of edge label alphabet:', num_el_alp) + + # Generate graphs. + graphs = generate_graphs(num_el_alp) + + # Compute Gram matrix. + gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name) + run_times[kernel_name].append(run_time) + + pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_el_alp) + '.pkl', 'wb')) + + # Save all. 
+ pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb')) + + return + + +if __name__ == '__main__': + xp_synthesied_graphs_num_edge_label_alphabet() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py new file mode 100644 index 0000000..005ab35 --- /dev/null +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Sep 21 10:34:26 2020 + +@author: ljia +""" +from utils import Graph_Kernel_List_VSym, compute_graph_kernel + + +def generate_graphs(num_nl_alp): + from gklearn.utils.graph_synthesizer import GraphSynthesizer + gsyzer = GraphSynthesizer() + graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=20, num_edges=40, num_node_labels=num_nl_alp, num_edge_labels=0, seed=None, directed=False) + return graphs + + +def xp_synthesied_graphs_num_node_label_alphabet(): + + # Run and save. + import pickle + import os + save_dir = 'outputs/synthesized_graphs_num_node_label_alphabet/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + run_times = {} + + for kernel_name in Graph_Kernel_List_VSym: + print() + print('Kernel:', kernel_name) + + run_times[kernel_name] = [] + for num_nl_alp in [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]: + print() + print('Number of node label alphabet:', num_nl_alp) + + # Generate graphs. + graphs = generate_graphs(num_nl_alp) + + # Compute Gram matrix. + gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name) + run_times[kernel_name].append(run_time) + + pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nl_alp) + '.pkl', 'wb')) + + # Save all. + pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb')) + + return + + +if __name__ == '__main__': + xp_synthesied_graphs_num_node_label_alphabet() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py new file mode 100644 index 0000000..24ba722 --- /dev/null +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Sep 21 10:34:26 2020 + +@author: ljia +""" +from utils import Graph_Kernel_List, compute_graph_kernel + + +def generate_graphs(num_nodes): + from gklearn.utils.graph_synthesizer import GraphSynthesizer + gsyzer = GraphSynthesizer() + graphs = gsyzer.unified_graphs(num_graphs=100, num_nodes=num_nodes, num_edges=int(num_nodes*2), num_node_labels=0, num_edge_labels=0, seed=None, directed=False) + return graphs + + +def xp_synthesied_graphs_num_nodes(): + + # Run and save. + import pickle + import os + save_dir = 'outputs/synthesized_graphs_num_nodes/' + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + run_times = {} + + for kernel_name in Graph_Kernel_List: + print() + print('Kernel:', kernel_name) + + run_times[kernel_name] = [] + for num_nodes in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]: + print() + print('Number of nodes:', num_nodes) + + # Generate graphs. + graphs = generate_graphs(num_nodes) + + # Compute Gram matrix. + gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name) + run_times[kernel_name].append(run_time) + + pickle.dump(run_times, open(save_dir + 'run_time.' + kernel_name + '.' + str(num_nodes) + '.pkl', 'wb')) + + # Save all. 
+ pickle.dump(run_times, open(save_dir + 'run_times.pkl', 'wb')) + + return + + +if __name__ == '__main__': + xp_synthesied_graphs_num_nodes() diff --git a/gklearn/experiments/papers/PRL_2020/utils.py b/gklearn/experiments/papers/PRL_2020/utils.py new file mode 100644 index 0000000..5cfeca7 --- /dev/null +++ b/gklearn/experiments/papers/PRL_2020/utils.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Sep 22 11:33:28 2020 + +@author: ljia +""" +Graph_Kernel_List = ['PathUpToH', 'WLSubtree', 'SylvesterEquation', 'Marginalized', 'ShortestPath', 'Treelet', 'ConjugateGradient', 'FixedPoint', 'SpectralDecomposition', 'StructuralSP', 'CommonWalk'] +# Graph_Kernel_List = ['CommonWalk', 'Marginalized', 'SylvesterEquation', 'ConjugateGradient', 'FixedPoint', 'SpectralDecomposition', 'ShortestPath', 'StructuralSP', 'PathUpToH', 'Treelet', 'WLSubtree'] + + +Graph_Kernel_List_VSym = ['PathUpToH', 'WLSubtree', 'Marginalized', 'ShortestPath', 'Treelet', 'ConjugateGradient', 'FixedPoint', 'StructuralSP', 'CommonWalk'] + + +Graph_Kernel_List_ESym = ['PathUpToH', 'Marginalized', 'Treelet', 'ConjugateGradient', 'FixedPoint', 'StructuralSP', 'CommonWalk'] + + +Graph_Kernel_List_VCon = ['ShortestPath', 'ConjugateGradient', 'FixedPoint', 'StructuralSP'] + + +Graph_Kernel_List_ECon = ['ConjugateGradient', 'FixedPoint', 'StructuralSP'] + + +def compute_graph_kernel(graphs, kernel_name): + import multiprocessing + + if kernel_name == 'CommonWalk': + from gklearn.kernels.commonWalkKernel import commonwalkkernel + estimator = commonwalkkernel + params = {'compute_method': 'geo', 'weight': 0.1} + + elif kernel_name == 'Marginalized': + from gklearn.kernels.marginalizedKernel import marginalizedkernel + estimator = marginalizedkernel + params = {'p_quit': 0.5, 'n_iteration': 5, 'remove_totters': False} + + elif kernel_name == 'SylvesterEquation': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + params = {'compute_method': 'sylvester', 'weight': 0.1} + + elif kernel_name == 'ConjugateGradient': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + params = {'compute_method': 'conjugate', 'weight': 0.1, 'node_kernels': sub_kernel, 'edge_kernels': sub_kernel} + + elif kernel_name == 'FixedPoint': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + params = {'compute_method': 'fp', 'weight': 1e-3, 'node_kernels': sub_kernel, 'edge_kernels': sub_kernel} + + elif kernel_name == 'SpectralDecomposition': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + params = {'compute_method': 'spectral', 'sub_kernel': 'geo', 'weight': 0.1} + + elif kernel_name == 'ShortestPath': + from gklearn.kernels.spKernel import spkernel + estimator = spkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) 
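+		# The sub-kernel dict below maps label types to base kernels: deltakernel for
+		# symbolic labels, gaussiankernel for non-symbolic (numeric) attributes, and
+		# their product (mixkernel) when a node carries both kinds of labels.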
+ sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + params = {'node_kernels': sub_kernel} + + elif kernel_name == 'StructuralSP': + from gklearn.kernels.structuralspKernel import structuralspkernel + estimator = structuralspkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + params = {'node_kernels': sub_kernel, 'edge_kernels': sub_kernel} + + elif kernel_name == 'PathUpToH': + from gklearn.kernels.untilHPathKernel import untilhpathkernel + estimator = untilhpathkernel + params = {'depth': 5, 'k_func': 'MinMax', 'compute_method': 'trie'} + + elif kernel_name == 'Treelet': + from gklearn.kernels.treeletKernel import treeletkernel + estimator = treeletkernel + from gklearn.utils.kernels import polynomialkernel + import functools + sub_kernel = functools.partial(polynomialkernel, d=4, c=1e+8) + params = {'sub_kernel': sub_kernel} + + elif kernel_name == 'WLSubtree': + from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel + estimator = weisfeilerlehmankernel + params = {'base_kernel': 'subtree', 'height': 5} + +# params['parallel'] = None + params['n_jobs'] = multiprocessing.cpu_count() + params['verbose'] = True + results = estimator(graphs, **params) + + return results[0], results[1] \ No newline at end of file diff --git a/gklearn/ged/env/ged_env.py b/gklearn/ged/env/ged_env.py index 572db58..60b671b 100644 --- a/gklearn/ged/env/ged_env.py +++ b/gklearn/ged/env/ged_env.py @@ -637,6 +637,10 @@ class GEDEnv(object): return [i for i in self.__internal_to_original_node_ids[graph_id].values()] + def get_node_cost(self, node_label_1, node_label_2): + return self.__ged_data.node_cost(node_label_1, node_label_2) + + def get_node_rel_cost(self, node_label_1, node_label_2): """ /*! @@ -650,7 +654,7 @@ class GEDEnv(object): node_label_1 = tuple(sorted(node_label_1.items(), key=lambda kv: kv[0])) if isinstance(node_label_2, dict): node_label_2 = tuple(sorted(node_label_2.items(), key=lambda kv: kv[0])) - return self.__ged_data._edit_cost.node_rel_cost_fun(node_label_1, node_label_2) + return self.__ged_data._edit_cost.node_rel_cost_fun(node_label_1, node_label_2) # @todo: may need to use node_cost() instead (or change node_cost() and modify ged_method for pre-defined cost matrices.) 
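As the `@todo` above hints, the two accessors differ in where the cost comes from: `get_node_rel_cost` evaluates the edit cost function on the two labels, while the new `get_node_cost` looks the value up through `GEDData.node_cost()` (e.g. a pre-defined cost matrix). A rough usage sketch only; the label values and their expected representation are assumptions:

```
# Sketch; assumes an initialized environment `ged_env` with edit costs set.
label_1 = {'chem': 'C'}  # hypothetical node labels
label_2 = {'chem': 'N'}

# Relabeling cost evaluated by the edit cost function (dicts are converted internally).
c_rel = ged_env.get_node_rel_cost(label_1, label_2)

# Cost looked up through GEDData.node_cost(); the expected label representation
# (internal label IDs vs. label tuples) depends on how the cost matrices were set up.
c_mat = ged_env.get_node_cost(label_1, label_2)
```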
def get_node_del_cost(self, node_label): @@ -677,6 +681,10 @@ class GEDEnv(object): if isinstance(node_label, dict): node_label = tuple(sorted(node_label.items(), key=lambda kv: kv[0])) return self.__ged_data._edit_cost.node_ins_cost_fun(node_label) + + + def get_edge_cost(self, edge_label_1, edge_label_2): + return self.__ged_data.edge_cost(edge_label_1, edge_label_2) def get_edge_rel_cost(self, edge_label_1, edge_label_2): diff --git a/gklearn/ged/learning/__init__.py b/gklearn/ged/learning/__init__.py new file mode 100644 index 0000000..f867ab3 --- /dev/null +++ b/gklearn/ged/learning/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jul 7 16:07:25 2020 + +@author: ljia +""" + +from gklearn.ged.learning.cost_matrices_learner import CostMatricesLearner \ No newline at end of file diff --git a/gklearn/ged/learning/cost_matrices_learner.py b/gklearn/ged/learning/cost_matrices_learner.py new file mode 100644 index 0000000..a0d8091 --- /dev/null +++ b/gklearn/ged/learning/cost_matrices_learner.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jul 7 11:42:48 2020 + +@author: ljia +""" +import numpy as np +import cvxpy as cp +import time +from gklearn.ged.learning.costs_learner import CostsLearner +from gklearn.ged.util import compute_geds_cml + + +class CostMatricesLearner(CostsLearner): + + + def __init__(self, edit_cost='CONSTANT', triangle_rule=False, allow_zeros=True, parallel=False, verbose=2): + super().__init__(parallel, verbose) + self._edit_cost = edit_cost + self._triangle_rule = triangle_rule + self._allow_zeros = allow_zeros + + + def fit(self, X, y): + if self._edit_cost == 'LETTER': + raise Exception('Cannot compute for cost "LETTER".') + elif self._edit_cost == 'LETTER2': + raise Exception('Cannot compute for cost "LETTER2".') + elif self._edit_cost == 'NON_SYMBOLIC': + raise Exception('Cannot compute for cost "NON_SYMBOLIC".') + elif self._edit_cost == 'CONSTANT': # @todo: node/edge may not labeled. 
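+			# Each branch below solves a least-squares problem min ||X @ w - y||^2 with
+			# cvxpy: each row of X counts the edit operations between one graph pair,
+			# y holds the target distances, and the constraints keep the learned costs
+			# non-negative (optionally bounded away from zero and/or obeying the
+			# triangle rule).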
+			if not self._triangle_rule and self._allow_zeros:
+				w = cp.Variable(X.shape[1])
+				cost_fun = cp.sum_squares(X @ w - y)
+				constraints = [w >= [0.0 for i in range(X.shape[1])]]
+				prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+				self.execute_cvx(prob)
+				edit_costs_new = w.value
+				residual = np.sqrt(prob.value)
+			elif self._triangle_rule and self._allow_zeros: # @todo
+				x = cp.Variable(X.shape[1])
+				cost_fun = cp.sum_squares(X @ x - y)
+				constraints = [x >= [0.0 for i in range(X.shape[1])],
+							   np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
+							   np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
+							   np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
+							   np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
+							   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+							   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
+				prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+				self.execute_cvx(prob)
+				edit_costs_new = x.value
+				residual = np.sqrt(prob.value)
+			elif not self._triangle_rule and not self._allow_zeros: # @todo
+				x = cp.Variable(X.shape[1])
+				cost_fun = cp.sum_squares(X @ x - y)
+				constraints = [x >= [0.01 for i in range(X.shape[1])]]
+				prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+				self.execute_cvx(prob)
+				edit_costs_new = x.value
+				residual = np.sqrt(prob.value)
+			elif self._triangle_rule and not self._allow_zeros: # @todo
+				x = cp.Variable(X.shape[1])
+				cost_fun = cp.sum_squares(X @ x - y)
+				constraints = [x >= [0.01 for i in range(X.shape[1])],
+							   np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
+							   np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
+				prob = cp.Problem(cp.Minimize(cost_fun), constraints)
+				self.execute_cvx(prob)
+				edit_costs_new = x.value
+				residual = np.sqrt(prob.value)
+		else:
+			raise Exception('The edit cost "' + self._edit_cost + '" is not supported for update progress.')
+
+		self._cost_list.append(edit_costs_new)
+
+
+	def init_geds_and_nb_eo(self, y, graphs):
+		time0 = time.time()
+		self._cost_list.append(np.concatenate((self._ged_options['node_label_costs'],
+											   self._ged_options['edge_label_costs'])))
+		ged_vec, self._nb_eo = self.compute_geds_and_nb_eo(graphs)
+		self._residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - y))))
+		self._runtime_list.append(time.time() - time0)
+
+		if self._verbose >= 2:
+			print('Current node label costs:', self._cost_list[-1][0:len(self._ged_options['node_label_costs'])])
+			print('Current edge label costs:', self._cost_list[-1][len(self._ged_options['node_label_costs']):])
+			print('Residual list:', self._residual_list)
+
+
+	def update_geds_and_nb_eo(self, y, graphs, time0):
+		self._ged_options['node_label_costs'] = self._cost_list[-1][0:len(self._ged_options['node_label_costs'])]
+		self._ged_options['edge_label_costs'] = self._cost_list[-1][len(self._ged_options['node_label_costs']):]
+		ged_vec, self._nb_eo = self.compute_geds_and_nb_eo(graphs)
+		self._residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - y))))
+		self._runtime_list.append(time.time() - time0)
+
+
+	def compute_geds_and_nb_eo(self, graphs):
+		ged_vec, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=self._ged_options, parallel=self._parallel, verbose=(self._verbose > 1))
+		return ged_vec, np.array(n_edit_operations)
+
+
+	def check_convergency(self):
+		self._ec_changed = False
+		for i, cost in enumerate(self._cost_list[-1]):
+			if cost == 0:
+				if self._cost_list[-2][i] > self._epsilon_ec:
+					self._ec_changed = True
+					break
+			elif abs(cost - self._cost_list[-2][i]) / cost > self._epsilon_ec:
+				self._ec_changed = True
+				break
+#			if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec:
+#				ec_changed = True
+#				break
+		self._residual_changed = False
+		if self._residual_list[-1] == 0:
+			if self._residual_list[-2] > self._epsilon_residual:
+				self._residual_changed = True
+		elif abs(self._residual_list[-1] - self._residual_list[-2]) / self._residual_list[-1] > self._epsilon_residual:
+			self._residual_changed = True
+		self._converged = not (self._ec_changed or self._residual_changed)
+		if self._converged:
+			self._itrs_without_update += 1
+		else:
+			self._itrs_without_update = 0
+			self._num_updates_ecs += 1
+
+
+	def print_current_states(self):
+		print()
+		print('-------------------------------------------------------------------------')
+		print('States of iteration', self._itrs + 1)
+		print('-------------------------------------------------------------------------')
+#		print('Time spent:', self.__runtime_optimize_ec)
+		print('Total number of iterations for optimizing:', self._itrs + 1)
+		print('Total number of updating edit costs:', self._num_updates_ecs)
+		print('Has optimization of edit costs converged:', self._converged)
+		print('Did edit costs change:', self._ec_changed)
+		print('Did residual change:', self._residual_changed)
+		print('Iterations without update:', self._itrs_without_update)
+		print('Current node label costs:', self._cost_list[-1][0:len(self._ged_options['node_label_costs'])])
+		print('Current edge label costs:', self._cost_list[-1][len(self._ged_options['node_label_costs']):])
+		print('Residual list:', self._residual_list)
+		print('-------------------------------------------------------------------------')
\ No newline at end of file
diff --git a/gklearn/ged/learning/costs_learner.py b/gklearn/ged/learning/costs_learner.py
new file mode 100644
index 0000000..9c77fc5
--- /dev/null
+++ b/gklearn/ged/learning/costs_learner.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul 7 11:30:31 2020
+
+@author: ljia
+"""
+import numpy as np
+import cvxpy as cp
+import time
+from gklearn.utils import Timer
+
+
+class CostsLearner(object):
+
+
+	def __init__(self, parallel, verbose):
+		### To set.
+		self._parallel = parallel
+		self._verbose = verbose
+		# For update().
+		self._time_limit_in_sec = 0
+		self._max_itrs = 100
+		self._max_itrs_without_update = 3
+		self._epsilon_residual = 0.01
+		self._epsilon_ec = 0.1
+		### To compute.
+		self._residual_list = []
+		self._runtime_list = []
+		self._cost_list = []
+		self._nb_eo = None
+		# For update().
+		self._itrs = 0
+		self._converged = False
+		self._num_updates_ecs = 0
+		self._ec_changed = None
+		self._residual_changed = None
+		self._itrs_without_update = 0
+		### Both set and get.
+		self._ged_options = None
+
+
+	def fit(self, X, y):
+		pass
+
+
+	def preprocess(self):
+		pass # @todo: remove the zero numbers of edit costs.
+
+
+	def postprocess(self):
+		for i in range(len(self._cost_list[-1])):
+			if -1e-9 <= self._cost_list[-1][i] <= 1e-9:
+				self._cost_list[-1][i] = 0
+			if self._cost_list[-1][i] < 0:
+				raise ValueError('The edit cost is negative.')
+
+
+	def set_update_params(self, **kwargs):
+		self._time_limit_in_sec = kwargs.get('time_limit_in_sec', self._time_limit_in_sec)
+		self._max_itrs = kwargs.get('max_itrs', self._max_itrs)
+		self._max_itrs_without_update = kwargs.get('max_itrs_without_update', self._max_itrs_without_update)
+		self._epsilon_residual = kwargs.get('epsilon_residual', self._epsilon_residual)
+		self._epsilon_ec = kwargs.get('epsilon_ec', self._epsilon_ec)
+
+
+	def update(self, y, graphs, ged_options, **kwargs):
+		# Set parameters.
+		self._ged_options = ged_options
+		if kwargs != {}:
+			self.set_update_params(**kwargs)
+
+		# The initial iteration.
+		if self._verbose >= 2:
+			print('\ninitial:')
+		self.init_geds_and_nb_eo(y, graphs)
+
+		self._converged = False
+		self._itrs_without_update = 0
+		self._itrs = 0
+		self._num_updates_ecs = 0
+		timer = Timer(self._time_limit_in_sec)
+		# Run iterations from initial edit costs.
+		while not self.termination_criterion_met(self._converged, timer, self._itrs, self._itrs_without_update):
+			if self._verbose >= 2:
+				print('\niteration', self._itrs + 1)
+			time0 = time.time()
+
+			# Fit GED space to the target space.
+			self.preprocess()
+			self.fit(self._nb_eo, y)
+			self.postprocess()
+
+			# Compute new GEDs and numbers of edit operations.
+			self.update_geds_and_nb_eo(y, graphs, time0)
+
+			# Check convergency.
+			self.check_convergency()
+
+			# Print current states.
+			if self._verbose >= 2:
+				self.print_current_states()
+
+			self._itrs += 1
+
+
+	def init_geds_and_nb_eo(self, y, graphs):
+		pass
+
+
+	def update_geds_and_nb_eo(self, y, graphs, time0):
+		pass
+
+
+	def compute_geds_and_nb_eo(self, graphs):
+		pass
+
+
+	def check_convergency(self):
+		pass
+
+
+	def print_current_states(self):
+		pass
+
+
+	def termination_criterion_met(self, converged, timer, itr, itrs_without_update):
+		if timer.expired() or (itr >= self._max_itrs if self._max_itrs >= 0 else False):
+#			if self.__state == AlgorithmState.TERMINATED:
+#				self.__state = AlgorithmState.INITIALIZED
+			return True
+		return converged or (itrs_without_update > self._max_itrs_without_update if self._max_itrs_without_update >= 0 else False)
+
+
+	def execute_cvx(self, prob):
+		try:
+			prob.solve(verbose=(self._verbose>=2))
+		except MemoryError as error0:
+			if self._verbose >= 2:
+				print('\nUsing solver "OSQP" caused a memory error.')
+				print('the original error message is\n', error0)
+				print('solver status: ', prob.status)
+				print('trying solver "CVXOPT" instead...\n')
+			try:
+				prob.solve(solver=cp.CVXOPT, verbose=(self._verbose>=2))
+			except Exception as error1:
+				if self._verbose >= 2:
+					print('\nAn error occurred when using solver "CVXOPT".')
+					print('the original error message is\n', error1)
+					print('solver status: ', prob.status)
+					print('trying solver "MOSEK" instead. Notice this solver is commercial and a license is required.\n')
+				prob.solve(solver=cp.MOSEK, verbose=(self._verbose>=2))
+			else:
+				if self._verbose >= 2:
+					print('solver status: ', prob.status)
+		else:
+			if self._verbose >= 2:
+				print('solver status: ', prob.status)
+		if self._verbose >= 2:
+			print()
+
+
+	def get_results(self):
+		results = {}
+		results['residual_list'] = self._residual_list
+		results['runtime_list'] = self._runtime_list
+		results['cost_list'] = self._cost_list
+		results['nb_eo'] = self._nb_eo
+		results['itrs'] = self._itrs
+		results['converged'] = self._converged
+		results['num_updates_ecs'] = self._num_updates_ecs
+		results['ec_changed'] = self._ec_changed
+		results['residual_changed'] = self._residual_changed
+		results['itrs_without_update'] = self._itrs_without_update
+		return results
\ No newline at end of file
diff --git a/gklearn/ged/median/__init__.py b/gklearn/ged/median/__init__.py
index 0a96c31..9eb4384 100644
--- a/gklearn/ged/median/__init__.py
+++ b/gklearn/ged/median/__init__.py
@@ -1,3 +1,4 @@
 from gklearn.ged.median.median_graph_estimator import MedianGraphEstimator
 from gklearn.ged.median.median_graph_estimator_py import MedianGraphEstimatorPy
+from gklearn.ged.median.median_graph_estimator_cml import MedianGraphEstimatorCML
 from gklearn.ged.median.utils import constant_node_costs, mge_options_to_string
diff --git a/gklearn/ged/median/median_graph_estimator_cml.py b/gklearn/ged/median/median_graph_estimator_cml.py
new file mode 100644
index 0000000..2d5b110
--- /dev/null
+++ b/gklearn/ged/median/median_graph_estimator_cml.py
@@ -0,0 +1,1676 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Mar 16 18:04:55 2020
+
+@author: ljia
+"""
+import numpy as np
+import time
+from tqdm import tqdm
+import sys
+import networkx as nx
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
+from gklearn.ged.env import AlgorithmState, NodeMap
+from gklearn.ged.util import misc
+from gklearn.utils import Timer, SpecialLabel
+
+
+class MedianGraphEstimatorCML(object): # @todo: differ dummy_node from undefined node?
+	"""Estimate median graphs using the pure Python version of GEDEnv.
+	"""
+
+	def __init__(self, ged_env, constant_node_costs):
+		"""Constructor.
+
+		Parameters
+		----------
+		ged_env : gklearn.ged.env.GEDEnv
+			Initialized GED environment. The edit costs must be set by the user.
+
+		constant_node_costs : Boolean
+			Set to True if the node relabeling costs are constant.
+ """ + self.__ged_env = ged_env + self.__init_method = 'BRANCH_FAST' + self.__init_options = '' + self.__descent_method = 'BRANCH_FAST' + self.__descent_options = '' + self.__refine_method = 'IPFP' + self.__refine_options = '' + self.__constant_node_costs = constant_node_costs + self.__labeled_nodes = (ged_env.get_num_node_labels() > 1) + self.__node_del_cost = ged_env.get_node_del_cost(ged_env.get_node_label(1, to_dict=False)) + self.__node_ins_cost = ged_env.get_node_ins_cost(ged_env.get_node_label(1, to_dict=False)) + self.__labeled_edges = (ged_env.get_num_edge_labels() > 1) + self.__edge_del_cost = ged_env.get_edge_del_cost(ged_env.get_edge_label(1, to_dict=False)) + self.__edge_ins_cost = ged_env.get_edge_ins_cost(ged_env.get_edge_label(1, to_dict=False)) + self.__init_type = 'RANDOM' + self.__num_random_inits = 10 + self.__desired_num_random_inits = 10 + self.__use_real_randomness = True + self.__seed = 0 + self.__parallel = True + self.__update_order = True + self.__sort_graphs = True # sort graphs by size when computing GEDs. + self.__refine = True + self.__time_limit_in_sec = 0 + self.__epsilon = 0.0001 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__num_inits_increase_order = 10 + self.__init_type_increase_order = 'K-MEANS++' + self.__max_itrs_increase_order = 10 + self.__print_to_stdout = 2 + self.__median_id = np.inf # @todo: check + self.__node_maps_from_median = {} + self.__sum_of_distances = 0 + self.__best_init_sum_of_distances = np.inf + self.__converged_sum_of_distances = np.inf + self.__runtime = None + self.__runtime_initialized = None + self.__runtime_converged = None + self.__itrs = [] # @todo: check: {} ? + self.__num_decrease_order = 0 + self.__num_increase_order = 0 + self.__num_converged_descents = 0 + self.__state = AlgorithmState.TERMINATED + self.__label_names = {} + + if ged_env is None: + raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') + elif not ged_env.is_initialized(): + raise Exception('The GED environment is uninitialized. Call gedlibpy.GEDEnv.init() before passing it to the constructor of MedianGraphEstimator.') + + + def set_options(self, options): + """Sets the options of the estimator. + + Parameters + ---------- + options : string + String that specifies with which options to run the estimator. + """ + self.__set_default_options() + options_map = misc.options_string_to_options_map(options) + for opt_name, opt_val in options_map.items(): + if opt_name == 'init-type': + self.__init_type = opt_val + if opt_val != 'MEDOID' and opt_val != 'RANDOM' and opt_val != 'MIN' and opt_val != 'MAX' and opt_val != 'MEAN': + raise Exception('Invalid argument ' + opt_val + ' for option init-type. Usage: options = "[--init-type RANDOM|MEDOID|EMPTY|MIN|MAX|MEAN] [...]"') + elif opt_name == 'random-inits': + try: + self.__num_random_inits = int(opt_val) + self.__desired_num_random_inits = self.__num_random_inits + except: + raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') + + if self.__num_random_inits <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option random-inits. Usage: options = "[--random-inits ]"') + + elif opt_name == 'randomness': + if opt_val == 'PSEUDO': + self.__use_real_randomness = False + + elif opt_val == 'REAL': + self.__use_real_randomness = True + + else: + raise Exception('Invalid argument "' + opt_val + '" for option randomness. 
Usage: options = "[--randomness REAL|PSEUDO] [...]"') + + elif opt_name == 'stdout': + if opt_val == '0': + self.__print_to_stdout = 0 + + elif opt_val == '1': + self.__print_to_stdout = 1 + + elif opt_val == '2': + self.__print_to_stdout = 2 + + else: + raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') + + elif opt_name == 'parallel': + if opt_val == 'TRUE': + self.__parallel = True + + elif opt_val == 'FALSE': + self.__parallel = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option parallel. Usage: options = "[--parallel TRUE|FALSE] [...]"') + + elif opt_name == 'update-order': + if opt_val == 'TRUE': + self.__update_order = True + + elif opt_val == 'FALSE': + self.__update_order = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option update-order. Usage: options = "[--update-order TRUE|FALSE] [...]"') + + elif opt_name == 'sort-graphs': + if opt_val == 'TRUE': + self.__sort_graphs = True + + elif opt_val == 'FALSE': + self.__sort_graphs = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option sort-graphs. Usage: options = "[--sort-graphs TRUE|FALSE] [...]"') + + elif opt_name == 'refine': + if opt_val == 'TRUE': + self.__refine = True + + elif opt_val == 'FALSE': + self.__refine = False + + else: + raise Exception('Invalid argument "' + opt_val + '" for option refine. Usage: options = "[--refine TRUE|FALSE] [...]"') + + elif opt_name == 'time-limit': + try: + self.__time_limit_in_sec = float(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option time-limit. Usage: options = "[--time-limit ] [...]') + + elif opt_name == 'max-itrs': + try: + self.__max_itrs = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs. Usage: options = "[--max-itrs ] [...]') + + elif opt_name == 'max-itrs-without-update': + try: + self.__max_itrs_without_update = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-without-update. Usage: options = "[--max-itrs-without-update ] [...]') + + elif opt_name == 'seed': + try: + self.__seed = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option seed. Usage: options = "[--seed ] [...]') + + elif opt_name == 'epsilon': + try: + self.__epsilon = float(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') + + if self.__epsilon <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option epsilon. Usage: options = "[--epsilon ] [...]') + + elif opt_name == 'inits-increase-order': + try: + self.__num_inits_increase_order = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') + + if self.__num_inits_increase_order <= 0: + raise Exception('Invalid argument "' + opt_val + '" for option inits-increase-order. Usage: options = "[--inits-increase-order ]"') + + elif opt_name == 'init-type-increase-order': + self.__init_type_increase_order = opt_val + if opt_val != 'CLUSTERS' and opt_val != 'K-MEANS++': + raise Exception('Invalid argument ' + opt_val + ' for option init-type-increase-order. 
Usage: options = "[--init-type-increase-order CLUSTERS|K-MEANS++] [...]"') + + elif opt_name == 'max-itrs-increase-order': + try: + self.__max_itrs_increase_order = int(opt_val) + + except: + raise Exception('Invalid argument "' + opt_val + '" for option max-itrs-increase-order. Usage: options = "[--max-itrs-increase-order ] [...]') + + else: + valid_options = '[--init-type ] [--random-inits ] [--randomness ] [--seed ] [--stdout ] ' + valid_options += '[--time-limit ] [--max-itrs ] [--epsilon ] ' + valid_options += '[--inits-increase-order ] [--init-type-increase-order ] [--max-itrs-increase-order ]' + raise Exception('Invalid option "' + opt_name + '". Usage: options = "' + valid_options + '"') + + + def set_init_method(self, init_method, init_options={}): + """Selects method to be used for computing the initial medoid graph. + + Parameters + ---------- + init_method : string + The selected method. Default: ged::Options::GEDMethod::BRANCH_UNIFORM. + + init_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect unless "--init-type MEDOID" is passed to set_options(). + """ + self.__init_method = init_method; + self.__init_options = init_options; + + + def set_descent_method(self, descent_method, descent_options=''): + """Selects method to be used for block gradient descent.. + + Parameters + ---------- + descent_method : string + The selected method. Default: ged::Options::GEDMethod::BRANCH_FAST. + + descent_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect unless "--init-type MEDOID" is passed to set_options(). + """ + self.__descent_method = descent_method; + self.__descent_options = descent_options; + + + def set_refine_method(self, refine_method, refine_options): + """Selects method to be used for improving the sum of distances and the node maps for the converged median. + + Parameters + ---------- + refine_method : string + The selected method. Default: "IPFP". + + refine_options : string + The options for the selected method. Default: "". + + Notes + ----- + Has no effect if "--refine FALSE" is passed to set_options(). + """ + self.__refine_method = refine_method + self.__refine_options = refine_options + + + def run(self, graph_ids, set_median_id, gen_median_id): + """Computes a generalized median graph. + + Parameters + ---------- + graph_ids : list[integer] + The IDs of the graphs for which the median should be computed. Must have been added to the environment passed to the constructor. + + set_median_id : integer + The ID of the computed set-median. A dummy graph with this ID must have been added to the environment passed to the constructor. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). + + + gen_median_id : integer + The ID of the computed generalized median. Upon termination, the computed median can be obtained via gklearn.gedlib.gedlibpy.GEDEnv.get_graph(). + """ + # Sanity checks. + if len(graph_ids) == 0: + raise Exception('Empty vector of graph IDs, unable to compute median.') + all_graphs_empty = True + for graph_id in graph_ids: + if self.__ged_env.get_graph_num_nodes(graph_id) > 0: + all_graphs_empty = False + break + if all_graphs_empty: + raise Exception('All graphs in the collection are empty.') + + # Start timer and record start time. 
+ start = time.time() + timer = Timer(self.__time_limit_in_sec) + self.__median_id = gen_median_id + self.__state = AlgorithmState.TERMINATED + + # Get NetworkX graph representations of the input graphs. + graphs = {} + for graph_id in graph_ids: + # @todo: get_nx_graph() function may need to be modified according to the coming code. + graphs[graph_id] = self.__ged_env.get_nx_graph(graph_id) +# print(self.__ged_env.get_graph_internal_id(0)) +# print(graphs[0].graph) +# print(graphs[0].nodes(data=True)) +# print(graphs[0].edges(data=True)) +# print(nx.adjacency_matrix(graphs[0])) + + # Construct initial medians. + medians = [] + self.__construct_initial_medians(graph_ids, timer, medians) + end_init = time.time() + self.__runtime_initialized = end_init - start +# print(medians[0].graph) +# print(medians[0].nodes(data=True)) +# print(medians[0].edges(data=True)) +# print(nx.adjacency_matrix(medians[0])) + + # Reset information about iterations and number of times the median decreases and increases. + self.__itrs = [0] * len(medians) + self.__num_decrease_order = 0 + self.__num_increase_order = 0 + self.__num_converged_descents = 0 + + # Initialize the best median. + best_sum_of_distances = np.inf + self.__best_init_sum_of_distances = np.inf + node_maps_from_best_median = {} + + # Run block gradient descent from all initial medians. + self.__ged_env.set_method(self.__descent_method, self.__descent_options) + for median_pos in range(0, len(medians)): + + # Terminate if the timer has expired and at least one SOD has been computed. + if timer.expired() and median_pos > 0: + break + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Block gradient descent for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('-----------------------------------------------------------') + + # Get reference to the median. + median = medians[median_pos] + + # Load initial median into the environment. + self.__ged_env.load_nx_graph(median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + + # Compute node maps and sum of distances for initial median. +# xxx = self.__node_maps_from_median + self.__compute_init_node_maps(graph_ids, gen_median_id) +# yyy = self.__node_maps_from_median + + self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) + self.__ged_env.load_nx_graph(median, set_median_id) +# print(self.__best_init_sum_of_distances) + + # Run block gradient descent from initial median. + converged = False + itrs_without_update = 0 + while not self.__termination_criterion_met(converged, timer, self.__itrs[median_pos], itrs_without_update): + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Iteration', str(self.__itrs[median_pos] + 1), 'for initial median', str(median_pos + 1), 'of', str(len(medians)), '.') + print('-----------------------------------------------------------') + + # Initialize flags that tell us what happened in the iteration. + median_modified = False + node_maps_modified = False + decreased_order = False + increased_order = False + + # Update the median. 
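+				# One block of the descent: with the node maps held fixed, update the
+				# median's node labels and edges so that the sum of induced costs does
+				# not increase.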
+ median_modified = self.__update_median(graphs, median) + if self.__update_order: + pass # @todo: +# if not median_modified or self.__itrs[median_pos] == 0: +# decreased_order = self.__decrease_order(graphs, median) +# if not decreased_order or self.__itrs[median_pos] == 0: +# increased_order = self.__increase_order(graphs, median) + + # Update the number of iterations without update of the median. + if median_modified or decreased_order or increased_order: + itrs_without_update = 0 + else: + itrs_without_update += 1 + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Loading median to environment: ... ', end='') + + # Load the median into the environment. + # @todo: should this function use the original node label? + self.__ged_env.load_nx_graph(median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Updating induced costs: ... ', end='') + + # Compute induced costs of the old node maps w.r.t. the updated median. + for graph_id in graph_ids: +# print(self.__node_maps_from_median[graph_id].induced_cost()) +# xxx = self.__node_maps_from_median[graph_id] + self.__ged_env.compute_induced_cost(gen_median_id, graph_id, self.__node_maps_from_median[graph_id]) +# print('---------------------------------------') +# print(self.__node_maps_from_median[graph_id].induced_cost()) + # @todo:!!!!!!!!!!!!!!!!!!!!!!!!!!!!This value is a slight different from the c++ program, which might be a bug! Use it very carefully! + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Update the node maps. + node_maps_modified = self.__update_node_maps() + + # Update the order of the median if no improvement can be found with the current order. + + # Update the sum of distances. + old_sum_of_distances = self.__sum_of_distances + self.__sum_of_distances = 0 + for graph_id, node_map in self.__node_maps_from_median.items(): + self.__sum_of_distances += node_map.induced_cost() +# print(self.__sum_of_distances) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Old local SOD: ', old_sum_of_distances) + print('New local SOD: ', self.__sum_of_distances) + print('Best converged SOD: ', best_sum_of_distances) + print('Modified median: ', median_modified) + print('Modified node maps: ', node_maps_modified) + print('Decreased order: ', decreased_order) + print('Increased order: ', increased_order) + print('===========================================================\n') + + converged = not (median_modified or node_maps_modified or decreased_order or increased_order) + + self.__itrs[median_pos] += 1 + + # Update the best median. + if self.__sum_of_distances < best_sum_of_distances: + best_sum_of_distances = self.__sum_of_distances + node_maps_from_best_median = self.__node_maps_from_median.copy() # @todo: this is a shallow copy, not sure if it is enough. + best_median = median + + # Update the number of converged descents. + if converged: + self.__num_converged_descents += 1 + + # Store the best encountered median. 
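+		# After all descents have finished, the median with the lowest SOD across
+		# all initializations becomes the final generalized median.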
+ self.__sum_of_distances = best_sum_of_distances + self.__node_maps_from_median = node_maps_from_best_median + self.__ged_env.load_nx_graph(best_median, gen_median_id) + self.__ged_env.init(self.__ged_env.get_init_type()) + end_descent = time.time() + self.__runtime_converged = end_descent - start + + # Refine the sum of distances and the node maps for the converged median. + self.__converged_sum_of_distances = self.__sum_of_distances + if self.__refine: + self.__improve_sum_of_distances(timer) + + # Record end time, set runtime and reset the number of initial medians. + end = time.time() + self.__runtime = end - start + self.__num_random_inits = self.__desired_num_random_inits + + # Print global information. + if self.__print_to_stdout != 0: + print('\n===========================================================') + print('Finished computation of generalized median graph.') + print('-----------------------------------------------------------') + print('Best SOD after initialization: ', self.__best_init_sum_of_distances) + print('Converged SOD: ', self.__converged_sum_of_distances) + if self.__refine: + print('Refined SOD: ', self.__sum_of_distances) + print('Overall runtime: ', self.__runtime) + print('Runtime of initialization: ', self.__runtime_initialized) + print('Runtime of block gradient descent: ', self.__runtime_converged - self.__runtime_initialized) + if self.__refine: + print('Runtime of refinement: ', self.__runtime - self.__runtime_converged) + print('Number of initial medians: ', len(medians)) + total_itr = 0 + num_started_descents = 0 + for itr in self.__itrs: + total_itr += itr + if itr > 0: + num_started_descents += 1 + print('Size of graph collection: ', len(graph_ids)) + print('Number of started descents: ', num_started_descents) + print('Number of converged descents: ', self.__num_converged_descents) + print('Overall number of iterations: ', total_itr) + print('Overall number of times the order decreased: ', self.__num_decrease_order) + print('Overall number of times the order increased: ', self.__num_increase_order) + print('===========================================================\n') + + + def __improve_sum_of_distances(self, timer): # @todo: go through and test + # Use method selected for refinement phase. + self.__ged_env.set_method(self.__refine_method, self.__refine_options) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress = tqdm(desc='Improving node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + print('\n===========================================================') + print('Improving node maps and SOD for converged median.') + print('-----------------------------------------------------------') + progress.update(1) + + # Improving the node maps. 
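+		# For each graph, re-run the refinement method (default: IPFP) against the
+		# converged median and keep the new node map whenever it lowers the induced cost.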
+		nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id)
+		for graph_id, node_map in self.__node_maps_from_median.items():
+			if timer.expired():
+				if self.__state == AlgorithmState.TERMINATED:
+					self.__state = AlgorithmState.CONVERGED
+				break
+
+			nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id)
+			if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs:
+				self.__ged_env.run_method(self.__median_id, graph_id)
+				if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost():
+					self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id)
+			else:
+				self.__ged_env.run_method(graph_id, self.__median_id)
+				if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost():
+					node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id)
+					node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map
+					self.__node_maps_from_median[graph_id] = node_map_tmp
+
+			# Print information.
+			if self.__print_to_stdout == 2:
+				progress.update(1)
+
+		# Recompute the sum of distances from the (possibly improved) node maps.
+		self.__sum_of_distances = 0.0
+		for key, val in self.__node_maps_from_median.items():
+			self.__sum_of_distances += val.induced_cost()
+
+		# Print information.
+		if self.__print_to_stdout == 2:
+			print('===========================================================\n')
+
+
+	def __median_available(self):
+		return self.__median_id != np.inf
+
+
+	def get_state(self):
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_state().')
+		return self.__state
+
+
+	def get_sum_of_distances(self, state=''):
+		"""Returns the sum of distances.
+
+		Parameters
+		----------
+		state : string
+			The state of the estimator. Can be 'initialized' or 'converged'. Default: ""
+
+		Returns
+		-------
+		float
+			The sum of distances (SOD) of the median when the estimator was in the state `state` during the last call to run(). If `state` is not given, the converged SOD (without refinement) or refined SOD (with refinement) is returned.
+		"""
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_sum_of_distances().')
+		if state == 'initialized':
+			return self.__best_init_sum_of_distances
+		if state == 'converged':
+			return self.__converged_sum_of_distances
+		return self.__sum_of_distances
+
+
+	def get_runtime(self, state):
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_runtime().')
+		if state == AlgorithmState.INITIALIZED:
+			return self.__runtime_initialized
+		if state == AlgorithmState.CONVERGED:
+			return self.__runtime_converged
+		return self.__runtime
+
+
+	def get_num_itrs(self):
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_num_itrs().')
+		return self.__itrs
+
+
+	def get_num_times_order_decreased(self):
+		if not self.__median_available():
+			raise Exception('No median has been computed. Call run() before calling get_num_times_order_decreased().')
+		return self.__num_decrease_order
+
+
+	def get_num_times_order_increased(self):
+		if not self.__median_available():
+			raise Exception('No median has been computed. 
Call run() before calling get_num_times_order_increased().') + return self.__num_increase_order + + + def get_num_converged_descents(self): + if not self.__median_available(): + raise Exception('No median has been computed. Call run() before calling get_num_converged_descents().') + return self.__num_converged_descents + + + def get_ged_env(self): + return self.__ged_env + + + def __set_default_options(self): + self.__init_type = 'RANDOM' + self.__num_random_inits = 10 + self.__desired_num_random_inits = 10 + self.__use_real_randomness = True + self.__seed = 0 + self.__parallel = True + self.__update_order = True + self.__sort_graphs = True + self.__refine = True + self.__time_limit_in_sec = 0 + self.__epsilon = 0.0001 + self.__max_itrs = 100 + self.__max_itrs_without_update = 3 + self.__num_inits_increase_order = 10 + self.__init_type_increase_order = 'K-MEANS++' + self.__max_itrs_increase_order = 10 + self.__print_to_stdout = 2 + self.__label_names = {} + + + def __construct_initial_medians(self, graph_ids, timer, initial_medians): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n===========================================================') + print('Constructing initial median(s).') + print('-----------------------------------------------------------') + + # Compute or sample the initial median(s). + initial_medians.clear() + if self.__init_type == 'MEDOID': + self.__compute_medoid(graph_ids, timer, initial_medians) + elif self.__init_type == 'MAX': + pass # @todo +# compute_max_order_graph_(graph_ids, initial_medians) + elif self.__init_type == 'MIN': + pass # @todo +# compute_min_order_graph_(graph_ids, initial_medians) + elif self.__init_type == 'MEAN': + pass # @todo +# compute_mean_order_graph_(graph_ids, initial_medians) + else: + pass # @todo +# sample_initial_medians_(graph_ids, initial_medians) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('===========================================================') + + + def __compute_medoid(self, graph_ids, timer, initial_medians): + # Use method selected for initialization phase. + self.__ged_env.set_method(self.__init_method, self.__init_options) + + # Compute the medoid. + if self.__parallel: + # @todo: notice when parallel self.__ged_env is not modified. + sum_of_distances_list = [np.inf] * len(graph_ids) + len_itr = len(graph_ids) + itr = zip(graph_ids, range(0, len(graph_ids))) + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(ged_env_toshare): + global G_ged_env + G_ged_env = ged_env_toshare + do_fun = partial(_compute_medoid_parallel, graph_ids, self.__sort_graphs) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) + if self.__print_to_stdout == 2: + iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), + desc='Computing medoid', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_fun, itr, chunksize) + for i, dis in iterator: + sum_of_distances_list[i] = dis + pool.close() + pool.join() + + medoid_id = np.argmin(sum_of_distances_list) + best_sum_of_distances = sum_of_distances_list[medoid_id] + + initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + + else: + # Print information about current iteration. 
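+			# The medoid is the input graph with the smallest sum of GED upper bounds
+			# to all other input graphs; it serves as the initial median.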
+ if self.__print_to_stdout == 2: + progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout) + + medoid_id = graph_ids[0] + best_sum_of_distances = np.inf + for g_id in graph_ids: + if timer.expired(): + self.__state = AlgorithmState.CALLED + break + nb_nodes_g = self.__ged_env.get_graph_num_nodes(g_id) + sum_of_distances = 0 + for h_id in graph_ids: # @todo: this can be faster, only a half is needed. + nb_nodes_h = self.__ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not self.__sort_graphs: + self.__ged_env.run_method(g_id, h_id) # @todo + sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id) + else: + self.__ged_env.run_method(h_id, g_id) + sum_of_distances += self.__ged_env.get_upper_bound(h_id, g_id) + if sum_of_distances < best_sum_of_distances: + best_sum_of_distances = sum_of_distances + medoid_id = g_id + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + initial_medians.append(self.__ged_env.get_nx_graph(medoid_id)) # @todo + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + + def __compute_init_node_maps(self, graph_ids, gen_median_id): + # Compute node maps and sum of distances for initial median. + if self.__parallel: + # @todo: notice when parallel self.__ged_env is not modified. + self.__sum_of_distances = 0 + self.__node_maps_from_median.clear() + sum_of_distances_list = [0] * len(graph_ids) + + len_itr = len(graph_ids) + itr = graph_ids + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(ged_env_toshare): + global G_ged_env + G_ged_env = ged_env_toshare + nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self.__sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) + if self.__print_to_stdout == 2: + iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), + desc='Computing initial node maps', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_fun, itr, chunksize) + for g_id, sod, node_maps in iterator: + sum_of_distances_list[g_id] = sod + self.__node_maps_from_median[g_id] = node_maps + pool.close() + pool.join() + + self.__sum_of_distances = np.sum(sum_of_distances_list) +# xxx = self.__node_maps_from_median + + else: + # Print information about current iteration. 
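+			# A NodeMap records the node correspondence between the median and one input
+			# graph together with the edit cost it induces; the SOD is the sum of these
+			# induced costs over all graphs.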
+ if self.__print_to_stdout == 2: + progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout) + + self.__sum_of_distances = 0 + self.__node_maps_from_median.clear() + nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id) + for graph_id in graph_ids: + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(gen_median_id, graph_id) + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id) + else: + self.__ged_env.run_method(graph_id, gen_median_id) + node_map_tmp = self.__ged_env.get_node_map(graph_id, gen_median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp + # print(self.__node_maps_from_median[graph_id]) + self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost() + # print(self.__sum_of_distances) + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + + def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): + if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): + if self.__state == AlgorithmState.TERMINATED: + self.__state = AlgorithmState.INITIALIZED + return True + return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) + + + def __update_median(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Updating median: ', end='') + + # Store copy of the old median. + old_median = median.copy() # @todo: this is just a shallow copy. + + # Update the node labels. + if self.__labeled_nodes: + self.__update_node_labels(graphs, median) + + # Update the edges and their labels. + self.__update_edges(graphs, median) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + return not self.__are_graphs_equal(median, old_median) + + + def __update_node_labels(self, graphs, median): +# print('----------------------------') + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('nodes ... ', end='') + + # Collect all possible node labels. + all_labels = self.__ged_env.get_all_node_labels() + + # Iterate through all nodes of the median. + for i in range(0, nx.number_of_nodes(median)): +# print('i: ', i) + + # Collect the labels of the substituted nodes. + node_labels = [] + for graph_id, graph in graphs.items(): + k = self.__node_maps_from_median[graph_id].image(i) + if k != np.inf: + node_labels.append(tuple(graph.nodes[k].items())) # @todo: sort + else: + node_labels.append(SpecialLabel.DUMMY) + + # Compute the median label and update the median. + if len(node_labels) > 0: + fi_min = np.inf + median_label = tuple() + + for label1 in all_labels: + fi = 0 + for label2 in node_labels: + fi += self.__ged_env.get_node_cost(label1, label2) # @todo: check inside, this might be slow + if fi < fi_min: # @todo: fi is too easy to be zero. use <= or consider multiple optimal labels. 
+ fi_min = fi + median_label = label1 + + median_label = {kv[0]: kv[1] for kv in median_label} + nx.set_node_attributes(median, {i: median_label}) + +# median_label = self.__get_median_node_label(node_labels) +# if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: +# nx.set_node_attributes(median, {i: median_label}) + + + def __update_edges(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('edges ... ', end='') + + # Collect all possible edge labels. + all_labels = self.__ged_env.get_all_edge_labels() + + # @todo: what if edge is not labeled? + # Iterate through all possible edges (i,j) of the median. + for i in range(0, nx.number_of_nodes(median)): + for j in range(i + 1, nx.number_of_nodes(median)): + + # Collect the labels of the edges to which (i,j) is mapped by the node maps. + edge_labels = [] + for graph_id, graph in graphs.items(): + k = self.__node_maps_from_median[graph_id].image(i) + l = self.__node_maps_from_median[graph_id].image(j) + if k != np.inf and l != np.inf and graph.has_edge(k, l): + edge_labels.append(tuple(graph.edges[(k, l)].items())) # @todo: sort + else: + edge_labels.append(SpecialLabel.DUMMY) + + # Compute the median edge label and the overall edge relabeling cost. + if self.__labeled_edges and len(edge_labels) > 0: + fij1_min = np.inf + median_label = tuple() + + # Compute f_ij^0. + fij0 = 0 + for label2 in edge_labels: + fij0 += self.__ged_env.get_edge_cost(SpecialLabel.DUMMY, label2) + + for label1 in all_labels: + fij1 = 0 + for label2 in edge_labels: + fij1 += self.__ged_env.get_edge_cost(label1, label2) + + if fij1 < fij1_min: + fij1_min = fij1 + median_label = label1 + + # Update the median. + if median.has_edge(i, j): + median.remove_edge(i, j) + if fij1_min < fij0: # @todo: this never happens. + median_label = {kv[0]: kv[1] for kv in median_label} + median.add_edge(i, j, **median_label) + +# if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: +# median_label = new_median_label + + + def __update_node_maps(self): + # Update the node maps. + if self.__parallel: + # @todo: notice when parallel self.__ged_env is not modified. + node_maps_were_modified = False +# xxx = self.__node_maps_from_median.copy() + + len_itr = len(self.__node_maps_from_median) + itr = [item for item in self.__node_maps_from_median.items()] + n_jobs = multiprocessing.cpu_count() + if len_itr < 100 * n_jobs: + chunksize = int(len_itr / n_jobs) + 1 + else: + chunksize = 100 + def init_worker(ged_env_toshare): + global G_ged_env + G_ged_env = ged_env_toshare + nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) + do_fun = partial(_update_node_maps_parallel, self.__median_id, self.__epsilon, self.__sort_graphs, nb_nodes_median) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,)) + if self.__print_to_stdout == 2: + iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), + desc='Updating node maps', file=sys.stdout) + else: + iterator = pool.imap_unordered(do_fun, itr, chunksize) + for g_id, node_map, nm_modified in iterator: + self.__node_maps_from_median[g_id] = node_map + if nm_modified: + node_maps_were_modified = True + pool.close() + pool.join() +# yyy = self.__node_maps_from_median.copy() + + else: + # Print information about current iteration. 
+ if self.__print_to_stdout == 2: + progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout) + + node_maps_were_modified = False + nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id) + for graph_id, node_map in self.__node_maps_from_median.items(): + nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id) + + if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs: + self.__ged_env.run_method(self.__median_id, graph_id) + if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon: + # xxx = self.__node_maps_from_median[graph_id] + self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id) + node_maps_were_modified = True + + else: + self.__ged_env.run_method(graph_id, self.__median_id) + if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost() - self.__epsilon: + node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id) + node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map + self.__node_maps_from_median[graph_id] = node_map_tmp + node_maps_were_modified = True + + # Print information about current iteration. + if self.__print_to_stdout == 2: + progress.update(1) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('\n') + + # Return true if the node maps were modified. + return node_maps_were_modified + + + def __decrease_order(self, graphs, median): + # Print information about current iteration + if self.__print_to_stdout == 2: + print('Trying to decrease order: ... ', end='') + + if nx.number_of_nodes(median) <= 1: + if self.__print_to_stdout == 2: + print('median graph has only 1 node, skip decrease.') + return False + + # Initialize ID of the node that is to be deleted. + id_deleted_node = [None] # @todo: or np.inf + decreased_order = False + + # Decrease the order as long as the best deletion delta is negative. + while self.__compute_best_deletion_delta(graphs, median, id_deleted_node) < -self.__epsilon: + decreased_order = True + self.__delete_node_from_median(id_deleted_node[0], median) + if nx.number_of_nodes(median) <= 1: + if self.__print_to_stdout == 2: + print('decrease stopped because median graph remains only 1 node. ', end='') + break + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Return true iff the order was decreased. + return decreased_order + + + def __compute_best_deletion_delta(self, graphs, median, id_deleted_node): + best_delta = 0.0 + + # Determine node that should be deleted (if any). + for i in range(0, nx.number_of_nodes(median)): + # Compute cost delta. + delta = 0.0 + for graph_id, graph in graphs.items(): + k = self.__node_maps_from_median[graph_id].image(i) + if k == np.inf: + delta -= self.__node_del_cost + else: + delta += self.__node_ins_cost - self.__ged_env.get_node_rel_cost(median.nodes[i], graph.nodes[k]) + for j, j_label in median[i].items(): + l = self.__node_maps_from_median[graph_id].image(j) + if k == np.inf or l == np.inf: + delta -= self.__edge_del_cost + elif not graph.has_edge(k, l): + delta -= self.__edge_del_cost + else: + delta += self.__edge_ins_cost - self.__ged_env.get_edge_rel_cost(j_label, graph.edges[(k, l)]) + + # Update best deletion delta. 
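+ # (A deletion is only accepted if it lowers the sum of distances by
+ # strictly more than self.__epsilon, so ties keep the current order.)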
+ if delta < best_delta - self.__epsilon: + best_delta = delta + id_deleted_node[0] = i +# id_deleted_node[0] = 3 # @todo: + + return best_delta + + + def __delete_node_from_median(self, id_deleted_node, median): + # Update the median. + mapping = {} + for i in range(0, nx.number_of_nodes(median)): + if i != id_deleted_node: + new_i = (i if i < id_deleted_node else (i - 1)) + mapping[i] = new_i + median.remove_node(id_deleted_node) + nx.relabel_nodes(median, mapping, copy=False) + + # Update the node maps. +# xxx = self.__node_maps_from_median + for key, node_map in self.__node_maps_from_median.items(): + new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) + is_unassigned_target_node = [True] * node_map.num_target_nodes() + for i in range(0, nx.number_of_nodes(median) + 1): + if i != id_deleted_node: + new_i = (i if i < id_deleted_node else (i - 1)) + k = node_map.image(i) + new_node_map.add_assignment(new_i, k) + if k != np.inf: + is_unassigned_target_node[k] = False + for k in range(0, node_map.num_target_nodes()): + if is_unassigned_target_node[k]: + new_node_map.add_assignment(np.inf, k) +# print(self.__node_maps_from_median[key].forward_map, self.__node_maps_from_median[key].backward_map) +# print(new_node_map.forward_map, new_node_map.backward_map + self.__node_maps_from_median[key] = new_node_map + + # Increase overall number of decreases. + self.__num_decrease_order += 1 + + + def __increase_order(self, graphs, median): + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('Trying to increase order: ... ', end='') + + # Initialize the best configuration and the best label of the node that is to be inserted. + best_config = {} + best_label = self.__ged_env.get_node_label(1, to_dict=True) + increased_order = False + + # Increase the order as long as the best insertion delta is negative. + while self.__compute_best_insertion_delta(graphs, best_config, best_label) < - self.__epsilon: + increased_order = True + self.__add_node_to_median(best_config, best_label, median) + + # Print information about current iteration. + if self.__print_to_stdout == 2: + print('done.') + + # Return true iff the order was increased. + return increased_order + + + def __compute_best_insertion_delta(self, graphs, best_config, best_label): + # Construct sets of inserted nodes. + no_inserted_node = True + inserted_nodes = {} + for graph_id, graph in graphs.items(): + inserted_nodes[graph_id] = [] + best_config[graph_id] = np.inf + for k in range(nx.number_of_nodes(graph)): + if self.__node_maps_from_median[graph_id].pre_image(k) == np.inf: + no_inserted_node = False + inserted_nodes[graph_id].append((k, tuple(item for item in graph.nodes[k].items()))) # @todo: can order of label names be garantteed? + + # Return 0.0 if no node is inserted in any of the graphs. + if no_inserted_node: + return 0.0 + + # Compute insertion configuration, label, and delta. + best_delta = 0.0 # @todo + if len(self.__label_names['node_labels']) == 0 and len(self.__label_names['node_attrs']) == 0: # @todo + best_delta = self.__compute_insertion_delta_unlabeled(inserted_nodes, best_config, best_label) + elif len(self.__label_names['node_labels']) > 0: # self.__constant_node_costs: + best_delta = self.__compute_insertion_delta_constant(inserted_nodes, best_config, best_label) + else: + best_delta = self.__compute_insertion_delta_generic(inserted_nodes, best_config, best_label) + + # Return the best delta. 
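+ # (A negative best delta means at least one insertion lowers the sum
+ # of distances; __increase_order keeps inserting nodes while the
+ # returned delta stays below -self.__epsilon.)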
+ return best_delta + + + def __compute_insertion_delta_unlabeled(self, inserted_nodes, best_config, best_label): # @todo: go through and test. + # Construct the nest configuration and compute its insertion delta. + best_delta = 0.0 + best_config.clear() + for graph_id, node_set in inserted_nodes.items(): + if len(node_set) == 0: + best_config[graph_id] = np.inf + best_delta += self.__node_del_cost + else: + best_config[graph_id] = node_set[0][0] + best_delta -= self.__node_ins_cost + + # Return the best insertion delta. + return best_delta + + + def __compute_insertion_delta_constant(self, inserted_nodes, best_config, best_label): + # Construct histogram and inverse label maps. + hist = {} + inverse_label_maps = {} + for graph_id, node_set in inserted_nodes.items(): + inverse_label_maps[graph_id] = {} + for node in node_set: + k = node[0] + label = node[1] + if label not in inverse_label_maps[graph_id]: + inverse_label_maps[graph_id][label] = k + if label not in hist: + hist[label] = 1 + else: + hist[label] += 1 + + # Determine the best label. + best_count = 0 + for key, val in hist.items(): + if val > best_count: + best_count = val + best_label_tuple = key + + # get best label. + best_label.clear() + for key, val in best_label_tuple: + best_label[key] = val + + # Construct the best configuration and compute its insertion delta. + best_config.clear() + best_delta = 0.0 + node_rel_cost = self.__ged_env.get_node_rel_cost(self.__ged_env.get_node_label(1, to_dict=False), self.__ged_env.get_node_label(2, to_dict=False)) + triangle_ineq_holds = (node_rel_cost <= self.__node_del_cost + self.__node_ins_cost) + for graph_id, _ in inserted_nodes.items(): + if best_label_tuple in inverse_label_maps[graph_id]: + best_config[graph_id] = inverse_label_maps[graph_id][best_label_tuple] + best_delta -= self.__node_ins_cost + elif triangle_ineq_holds and not len(inserted_nodes[graph_id]) == 0: + best_config[graph_id] = inserted_nodes[graph_id][0][0] + best_delta += node_rel_cost - self.__node_ins_cost + else: + best_config[graph_id] = np.inf + best_delta += self.__node_del_cost + + # Return the best insertion delta. + return best_delta + + + def __compute_insertion_delta_generic(self, inserted_nodes, best_config, best_label): + # Collect all node labels of inserted nodes. + node_labels = [] + for _, node_set in inserted_nodes.items(): + for node in node_set: + node_labels.append(node[1]) + + # Compute node label medians that serve as initial solutions for block gradient descent. + initial_node_labels = [] + self.__compute_initial_node_labels(node_labels, initial_node_labels) + + # Determine best insertion configuration, label, and delta via parallel block gradient descent from all initial node labels. + best_delta = 0.0 + for node_label in initial_node_labels: + # Construct local configuration. + config = {} + for graph_id, _ in inserted_nodes.items(): + config[graph_id] = tuple((np.inf, self.__ged_env.get_node_label(1, to_dict=False))) + + # Run block gradient descent. + converged = False + itr = 0 + while not self.__insertion_termination_criterion_met(converged, itr): + converged = not self.__update_config(node_label, inserted_nodes, config, node_labels) + node_label_dict = dict(node_label) + converged = converged and (not self.__update_node_label([dict(item) for item in node_labels], node_label_dict)) # @todo: the dict is tupled again in the function, can be better. + node_label = tuple(item for item in node_label_dict.items()) # @todo: watch out: initial_node_labels[i] is not modified here. 
+ + itr += 1 + + # Compute insertion delta of converged solution. + delta = 0.0 + for _, node in config.items(): + if node[0] == np.inf: + delta += self.__node_del_cost + else: + delta += self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost + + # Update best delta and global configuration if improvement has been found. + if delta < best_delta - self.__epsilon: + best_delta = delta + best_label.clear() + for key, val in node_label: + best_label[key] = val + best_config.clear() + for graph_id, val in config.items(): + best_config[graph_id] = val[0] + + # Return the best delta. + return best_delta + + + def __compute_initial_node_labels(self, node_labels, median_labels): + median_labels.clear() + if self.__use_real_randomness: # @todo: may not work if parallelized. + rng = np.random.randint(0, high=2**32 - 1, size=1) + urng = np.random.RandomState(seed=rng[0]) + else: + urng = np.random.RandomState(seed=self.__seed) + + # Generate the initial node label medians. + if self.__init_type_increase_order == 'K-MEANS++': + # Use k-means++ heuristic to generate the initial node label medians. + already_selected = [False] * len(node_labels) + selected_label_id = urng.randint(low=0, high=len(node_labels), size=1)[0] # c++ test: 23 + median_labels.append(node_labels[selected_label_id]) + already_selected[selected_label_id] = True +# xxx = [41, 0, 18, 9, 6, 14, 21, 25, 33] for c++ test +# iii = 0 for c++ test + while len(median_labels) < self.__num_inits_increase_order: + weights = [np.inf] * len(node_labels) + for label_id in range(0, len(node_labels)): + if already_selected[label_id]: + weights[label_id] = 0 + continue + for label in median_labels: + weights[label_id] = min(weights[label_id], self.__ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id]))) + + # get non-zero weights. + weights_p, idx_p = [], [] + for i, w in enumerate(weights): + if w != 0: + weights_p.append(w) + idx_p.append(i) + if len(weights_p) > 0: + p = np.array(weights_p) / np.sum(weights_p) + selected_label_id = urng.choice(range(0, len(weights_p)), size=1, p=p)[0] # for c++ test: xxx[iii] + selected_label_id = idx_p[selected_label_id] +# iii += 1 for c++ test + median_labels.append(node_labels[selected_label_id]) + already_selected[selected_label_id] = True + else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self.__num_inits_increase_order. + break + else: + # Compute the initial node medians as the medians of randomly generated clusters of (roughly) equal size. + # @todo: go through and test. + shuffled_node_labels = [np.inf] * len(node_labels) #@todo: random? + # @todo: std::shuffle(shuffled_node_labels.begin(), shuffled_node_labels.end(), urng);? + cluster_size = len(node_labels) / self.__num_inits_increase_order + pos = 0.0 + cluster = [] + while len(median_labels) < self.__num_inits_increase_order - 1: + while pos < (len(median_labels) + 1) * cluster_size: + cluster.append(shuffled_node_labels[pos]) + pos += 1 + median_labels.append(self.__get_median_node_label(cluster)) + cluster.clear() + while pos < len(shuffled_node_labels): + pos += 1 + cluster.append(shuffled_node_labels[pos]) + median_labels.append(self.__get_median_node_label(cluster)) + cluster.clear() + + # Run Lloyd's Algorithm. 
+ converged = False + closest_median_ids = [np.inf] * len(node_labels) + clusters = [[] for _ in range(len(median_labels))] + itr = 1 + while not self.__insertion_termination_criterion_met(converged, itr): + converged = not self.__update_clusters(node_labels, median_labels, closest_median_ids) + if not converged: + for cluster in clusters: + cluster.clear() + for label_id in range(0, len(node_labels)): + clusters[closest_median_ids[label_id]].append(node_labels[label_id]) + for cluster_id in range(0, len(clusters)): + node_label = dict(median_labels[cluster_id]) + self.__update_node_label([dict(item) for item in clusters[cluster_id]], node_label) # @todo: the dict is tupled again in the function, can be better. + median_labels[cluster_id] = tuple(item for item in node_label.items()) + itr += 1 + + + def __insertion_termination_criterion_met(self, converged, itr): + return converged or (itr >= self.__max_itrs_increase_order if self.__max_itrs_increase_order > 0 else False) + + + def __update_config(self, node_label, inserted_nodes, config, node_labels): + # Determine the best configuration. + config_modified = False + for graph_id, node_set in inserted_nodes.items(): + best_assignment = config[graph_id] + best_cost = 0.0 + if best_assignment[0] == np.inf: + best_cost = self.__node_del_cost + else: + best_cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(best_assignment[1])) - self.__node_ins_cost + for node in node_set: + cost = self.__ged_env.get_node_rel_cost(dict(node_label), dict(node[1])) - self.__node_ins_cost + if cost < best_cost - self.__epsilon: + best_cost = cost + best_assignment = node + config_modified = True + if self.__node_del_cost < best_cost - self.__epsilon: + best_cost = self.__node_del_cost + best_assignment = tuple((np.inf, best_assignment[1])) + config_modified = True + config[graph_id] = best_assignment + + # Collect the node labels contained in the best configuration. + node_labels.clear() + for key, val in config.items(): + if val[0] != np.inf: + node_labels.append(val[1]) + + # Return true if the configuration was modified. + return config_modified + + + def __update_node_label(self, node_labels, node_label): + if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling __update_config(). + return False + new_node_label = self.__get_median_node_label(node_labels) + if self.__ged_env.get_node_rel_cost(new_node_label, node_label) > self.__epsilon: + node_label.clear() + for key, val in new_node_label.items(): + node_label[key] = val + return True + return False + + + def __update_clusters(self, node_labels, median_labels, closest_median_ids): + # Determine the closest median for each node label. + clusters_modified = False + for label_id in range(0, len(node_labels)): + closest_median_id = np.inf + dist_to_closest_median = np.inf + for median_id in range(0, len(median_labels)): + dist_to_median = self.__ged_env.get_node_rel_cost(dict(median_labels[median_id]), dict(node_labels[label_id])) + if dist_to_median < dist_to_closest_median - self.__epsilon: + dist_to_closest_median = dist_to_median + closest_median_id = median_id + if closest_median_id != closest_median_ids[label_id]: + closest_median_ids[label_id] = closest_median_id + clusters_modified = True + + # Return true if the clusters were modified. + return clusters_modified + + + def __add_node_to_median(self, best_config, best_label, median): + # Update the median. 
+ nb_nodes_median = nx.number_of_nodes(median) + median.add_node(nb_nodes_median, **best_label) + + # Update the node maps. + for graph_id, node_map in self.__node_maps_from_median.items(): + node_map_as_rel = [] + node_map.as_relation(node_map_as_rel) + new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes()) + for assignment in node_map_as_rel: + new_node_map.add_assignment(assignment[0], assignment[1]) + new_node_map.add_assignment(nx.number_of_nodes(median) - 1, best_config[graph_id]) + self.__node_maps_from_median[graph_id] = new_node_map + + # Increase overall number of increases. + self.__num_increase_order += 1 + + + def __are_graphs_equal(self, g1, g2): + """ + Check if the two graphs are equal. + + Parameters + ---------- + g1 : NetworkX graph object + Graph 1 to be compared. + + g2 : NetworkX graph object + Graph 2 to be compared. + + Returns + ------- + bool + True if the two graph are equal. + + Notes + ----- + This is not an identical check. Here the two graphs are equal if and only if their original_node_ids, nodes, all node labels, edges and all edge labels are equal. This function is specifically designed for class `MedianGraphEstimator` and should not be used elsewhere. + """ + # check original node ids. + if not g1.graph['original_node_ids'] == g2.graph['original_node_ids']: + return False # @todo: why check this? + # check nodes. + nlist1 = [n for n in g1.nodes(data=True)] # @todo: shallow? + nlist2 = [n for n in g2.nodes(data=True)] + if not nlist1 == nlist2: + return False + # check edges. + elist1 = [n for n in g1.edges(data=True)] + elist2 = [n for n in g2.edges(data=True)] + if not elist1 == elist2: + return False + + return True + + + def compute_my_cost(g, h, node_map): + cost = 0.0 + for node in g.nodes: + cost += 0 + + + def set_label_names(self, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): + self.__label_names = {'node_labels': node_labels, 'edge_labels': edge_labels, + 'node_attrs': node_attrs, 'edge_attrs': edge_attrs} + + +# def __get_median_node_label(self, node_labels): +# if len(self.__label_names['node_labels']) > 0: +# return self.__get_median_label_symbolic(node_labels) +# elif len(self.__label_names['node_attrs']) > 0: +# return self.__get_median_label_nonsymbolic(node_labels) +# else: +# raise Exception('Node label names are not given.') +# +# +# def __get_median_edge_label(self, edge_labels): +# if len(self.__label_names['edge_labels']) > 0: +# return self.__get_median_label_symbolic(edge_labels) +# elif len(self.__label_names['edge_attrs']) > 0: +# return self.__get_median_label_nonsymbolic(edge_labels) +# else: +# raise Exception('Edge label names are not given.') +# +# +# def __get_median_label_symbolic(self, labels): +# f_i = np.inf +# +# for label in labels: +# pass +# +# # Construct histogram. +# hist = {} +# for label in labels: +# label = tuple([kv for kv in label.items()]) # @todo: this may be slow. +# if label not in hist: +# hist[label] = 1 +# else: +# hist[label] += 1 +# +# # Return the label that appears most frequently. +# best_count = 0 +# median_label = {} +# for label, count in hist.items(): +# if count > best_count: +# best_count = count +# median_label = {kv[0]: kv[1] for kv in label} +# +# return median_label +# +# +# def __get_median_label_nonsymbolic(self, labels): +# if len(labels) == 0: +# return {} # @todo +# else: +# # Transform the labels into coordinates and compute mean label as initial solution. 
+# labels_as_coords = [] +# sums = {} +# for key, val in labels[0].items(): +# sums[key] = 0 +# for label in labels: +# coords = {} +# for key, val in label.items(): +# label_f = float(val) +# sums[key] += label_f +# coords[key] = label_f +# labels_as_coords.append(coords) +# median = {} +# for key, val in sums.items(): +# median[key] = val / len(labels) +# +# # Run main loop of Weiszfeld's Algorithm. +# epsilon = 0.0001 +# delta = 1.0 +# num_itrs = 0 +# all_equal = False +# while ((delta > epsilon) and (num_itrs < 100) and (not all_equal)): +# numerator = {} +# for key, val in sums.items(): +# numerator[key] = 0 +# denominator = 0 +# for label_as_coord in labels_as_coords: +# norm = 0 +# for key, val in label_as_coord.items(): +# norm += (val - median[key]) ** 2 +# norm = np.sqrt(norm) +# if norm > 0: +# for key, val in label_as_coord.items(): +# numerator[key] += val / norm +# denominator += 1.0 / norm +# if denominator == 0: +# all_equal = True +# else: +# new_median = {} +# delta = 0.0 +# for key, val in numerator.items(): +# this_median = val / denominator +# new_median[key] = this_median +# delta += np.abs(median[key] - this_median) +# median = new_median +# +# num_itrs += 1 +# +# # Transform the solution to strings and return it. +# median_label = {} +# for key, val in median.items(): +# median_label[key] = str(val) +# return median_label + + +def _compute_medoid_parallel(graph_ids, sort, itr): + g_id = itr[0] + i = itr[1] + # @todo: timer not considered here. +# if timer.expired(): +# self.__state = AlgorithmState.CALLED +# break + nb_nodes_g = G_ged_env.get_graph_num_nodes(g_id) + sum_of_distances = 0 + for h_id in graph_ids: + nb_nodes_h = G_ged_env.get_graph_num_nodes(h_id) + if nb_nodes_g <= nb_nodes_h or not sort: + G_ged_env.run_method(g_id, h_id) + sum_of_distances += G_ged_env.get_upper_bound(g_id, h_id) + else: + G_ged_env.run_method(h_id, g_id) + sum_of_distances += G_ged_env.get_upper_bound(h_id, g_id) + return i, sum_of_distances + + +def _compute_init_node_maps_parallel(gen_median_id, sort, nb_nodes_median, itr): + graph_id = itr + nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not sort: + G_ged_env.run_method(gen_median_id, graph_id) + node_map = G_ged_env.get_node_map(gen_median_id, graph_id) +# print(self.__node_maps_from_median[graph_id]) + else: + G_ged_env.run_method(graph_id, gen_median_id) + node_map = G_ged_env.get_node_map(graph_id, gen_median_id) + node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map + sum_of_distance = node_map.induced_cost() +# print(self.__sum_of_distances) + return graph_id, sum_of_distance, node_map + + +def _update_node_maps_parallel(median_id, epsilon, sort, nb_nodes_median, itr): + graph_id = itr[0] + node_map = itr[1] + + node_maps_were_modified = False + nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id) + if nb_nodes_median <= nb_nodes_g or not sort: + G_ged_env.run_method(median_id, graph_id) + if G_ged_env.get_upper_bound(median_id, graph_id) < node_map.induced_cost() - epsilon: + node_map = G_ged_env.get_node_map(median_id, graph_id) + node_maps_were_modified = True + else: + G_ged_env.run_method(graph_id, median_id) + if G_ged_env.get_upper_bound(graph_id, median_id) < node_map.induced_cost() - epsilon: + node_map = G_ged_env.get_node_map(graph_id, median_id) + node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map + node_maps_were_modified = True + + return graph_id, node_map, node_maps_were_modified \ No 
newline at end of file diff --git a/gklearn/ged/util/__init__.py b/gklearn/ged/util/__init__.py index b2305d3..f885b18 100644 --- a/gklearn/ged/util/__init__.py +++ b/gklearn/ged/util/__init__.py @@ -1,3 +1,3 @@ from gklearn.ged.util.lsape_solver import LSAPESolver from gklearn.ged.util.util import compute_geds, ged_options_to_string -from gklearn.ged.util.util import compute_geds_cml \ No newline at end of file +from gklearn.ged.util.util import compute_geds_cml, label_costs_to_matrix diff --git a/gklearn/kernels/__init__.py b/gklearn/kernels/__init__.py index e642043..c986891 100644 --- a/gklearn/kernels/__init__.py +++ b/gklearn/kernels/__init__.py @@ -8,7 +8,11 @@ __author__ = "Linlin Jia" __date__ = "November 2018" from gklearn.kernels.graph_kernel import GraphKernel +from gklearn.kernels.common_walk import CommonWalk from gklearn.kernels.marginalized import Marginalized +from gklearn.kernels.random_walk import RandomWalk +from gklearn.kernels.sylvester_equation import SylvesterEquation +from gklearn.kernels.spectral_decomposition import SpectralDecomposition from gklearn.kernels.shortest_path import ShortestPath from gklearn.kernels.structural_sp import StructuralSP from gklearn.kernels.path_up_to_h import PathUpToH diff --git a/gklearn/kernels/common_walk.py b/gklearn/kernels/common_walk.py new file mode 100644 index 0000000..f6ee71d --- /dev/null +++ b/gklearn/kernels/common_walk.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Aug 18 11:21:31 2020 + +@author: ljia + +@references: + + [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: + Hardness results and efficient alternatives. Learning Theory and Kernel + Machines, pages 129–143, 2003. +""" + +import sys +from tqdm import tqdm +import numpy as np +import networkx as nx +from gklearn.utils import SpecialLabel +from gklearn.utils.parallel import parallel_gm, parallel_me +from gklearn.utils.utils import direct_product_graph +from gklearn.kernels import GraphKernel + + +class CommonWalk(GraphKernel): + + def __init__(self, **kwargs): + GraphKernel.__init__(self) + self.__node_labels = kwargs.get('node_labels', []) + self.__edge_labels = kwargs.get('edge_labels', []) + self.__weight = kwargs.get('weight', 1) + self.__compute_method = kwargs.get('compute_method', None) + self.__ds_infos = kwargs.get('ds_infos', {}) + self.__compute_method = self.__compute_method.lower() + + + def _compute_gm_series(self): + self.__check_graphs(self._graphs) + self.__add_dummy_labels(self._graphs) + if not self.__ds_infos['directed']: # convert + self._graphs = [G.to_directed() for G in self._graphs] + + # compute Gram matrix. 
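+ # The Gram matrix is symmetric, so only pairs (i, j) with i <= j are
+ # computed (combinations_with_replacement below) and each value is
+ # mirrored to entry (j, i): n * (n + 1) / 2 kernel evaluations for n
+ # graphs instead of n ** 2.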
+ gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + from itertools import combinations_with_replacement + itr = combinations_with_replacement(range(0, len(self._graphs)), 2) + if self._verbose >= 2: + iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + else: + iterator = itr + + # direct product graph method - exponential + if self.__compute_method == 'exp': + for i, j in iterator: + kernel = self.__kernel_do_exp(self._graphs[i], self._graphs[j], self.__weight) + gram_matrix[i][j] = kernel + gram_matrix[j][i] = kernel + # direct product graph method - geometric + elif self.__compute_method == 'geo': + for i, j in iterator: + kernel = self.__kernel_do_geo(self._graphs[i], self._graphs[j], self.__weight) + gram_matrix[i][j] = kernel + gram_matrix[j][i] = kernel + + return gram_matrix + + + def _compute_gm_imap_unordered(self): + self.__check_graphs(self._graphs) + self.__add_dummy_labels(self._graphs) + if not self.__ds_infos['directed']: # convert + self._graphs = [G.to_directed() for G in self._graphs] + + # compute Gram matrix. + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + + # direct product graph method - exponential + if self.__compute_method == 'exp': + do_fun = self._wrapper_kernel_do_exp + # direct product graph method - geometric + elif self.__compute_method == 'geo': + do_fun = self._wrapper_kernel_do_geo + + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + + return gram_matrix + + + def _compute_kernel_list_series(self, g1, g_list): + self.__check_graphs(g_list + [g1]) + self.__add_dummy_labels(g_list + [g1]) + if not self.__ds_infos['directed']: # convert + g1 = g1.to_directed() + g_list = [G.to_directed() for G in g_list] + + # compute kernel list. + kernel_list = [None] * len(g_list) + if self._verbose >= 2: + iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + else: + iterator = range(len(g_list)) + + # direct product graph method - exponential + if self.__compute_method == 'exp': + for i in iterator: + kernel = self.__kernel_do_exp(g1, g_list[i], self.__weight) + kernel_list[i] = kernel + # direct product graph method - geometric + elif self.__compute_method == 'geo': + for i in iterator: + kernel = self.__kernel_do_geo(g1, g_list[i], self.__weight) + kernel_list[i] = kernel + + return kernel_list + + + def _compute_kernel_list_imap_unordered(self, g1, g_list): + self.__check_graphs(g_list + [g1]) + self.__add_dummy_labels(g_list + [g1]) + if not self.__ds_infos['directed']: # convert + g1 = g1.to_directed() + g_list = [G.to_directed() for G in g_list] + + # compute kernel list. 
+ kernel_list = [None] * len(g_list) + + def init_worker(g1_toshare, g_list_toshare): + global G_g1, G_g_list + G_g1 = g1_toshare + G_g_list = g_list_toshare + + # direct product graph method - exponential + if self.__compute_method == 'exp': + do_fun = self._wrapper_kernel_list_do_exp + # direct product graph method - geometric + elif self.__compute_method == 'geo': + do_fun = self._wrapper_kernel_list_do_geo + + def func_assign(result, var_to_assign): + var_to_assign[result[0]] = result[1] + itr = range(len(g_list)) + len_itr = len(g_list) + parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, + init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', + n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + + return kernel_list + + + def _wrapper_kernel_list_do_exp(self, itr): + return itr, self.__kernel_do_exp(G_g1, G_g_list[itr], self.__weight) + + + def _wrapper_kernel_list_do_geo(self, itr): + return itr, self.__kernel_do_geo(G_g1, G_g_list[itr], self.__weight) + + + def _compute_single_kernel_series(self, g1, g2): + self.__check_graphs([g1] + [g2]) + self.__add_dummy_labels([g1] + [g2]) + if not self.__ds_infos['directed']: # convert + g1 = g1.to_directed() + g2 = g2.to_directed() + + # direct product graph method - exponential + if self.__compute_method == 'exp': + kernel = self.__kernel_do_exp(g1, g2, self.__weight) + # direct product graph method - geometric + elif self.__compute_method == 'geo': + kernel = self.__kernel_do_geo(g1, g2, self.__weight) + + return kernel + + + def __kernel_do_exp(self, g1, g2, beta): + """Calculate common walk graph kernel between 2 graphs using exponential + series. + + Parameters + ---------- + g1, g2 : NetworkX graphs + Graphs between which the kernels are calculated. + beta : integer + Weight. + + Return + ------ + kernel : float + The common walk Kernel between 2 graphs. + """ + # get tensor product / direct product + gp = direct_product_graph(g1, g2, self.__node_labels, self.__edge_labels) + # return 0 if the direct product graph have no more than 1 node. + if nx.number_of_nodes(gp) < 2: + return 0 + A = nx.adjacency_matrix(gp).todense() + + ew, ev = np.linalg.eig(A) +# # remove imaginary part if possible. +# # @todo: don't know if it is necessary. +# for i in range(len(ew)): +# if np.abs(ew[i].imag) < 1e-9: +# ew[i] = ew[i].real +# for i in range(ev.shape[0]): +# for j in range(ev.shape[1]): +# if np.abs(ev[i, j].imag) < 1e-9: +# ev[i, j] = ev[i, j].real + + D = np.zeros((len(ew), len(ew)), dtype=complex) # @todo: use complex? + for i in range(len(ew)): + D[i][i] = np.exp(beta * ew[i]) + + exp_D = ev * D * ev.T + kernel = exp_D.sum() + if (kernel.real == 0 and np.abs(kernel.imag) < 1e-9) or np.abs(kernel.imag / kernel.real) < 1e-9: + kernel = kernel.real + + return kernel + + + def _wrapper_kernel_do_exp(self, itr): + i = itr[0] + j = itr[1] + return i, j, self.__kernel_do_exp(G_gn[i], G_gn[j], self.__weight) + + + def __kernel_do_geo(self, g1, g2, gamma): + """Calculate common walk graph kernel between 2 graphs using geometric + series. + + Parameters + ---------- + g1, g2 : NetworkX graphs + Graphs between which the kernels are calculated. + gamma : integer + Weight. + + Return + ------ + kernel : float + The common walk Kernel between 2 graphs. + """ + # get tensor product / direct product + gp = direct_product_graph(g1, g2, self.__node_labels, self.__edge_labels) + # return 0 if the direct product graph have no more than 1 node. 
+ if nx.number_of_nodes(gp) < 2: + return 0 + A = nx.adjacency_matrix(gp).todense() + mat = np.identity(len(A)) - gamma * A + # try: + return mat.I.sum() + # except np.linalg.LinAlgError: + # return np.nan + + + def _wrapper_kernel_do_geo(self, itr): + i = itr[0] + j = itr[1] + return i, j, self.__kernel_do_geo(G_gn[i], G_gn[j], self.__weight) + + + def __check_graphs(self, Gn): + for g in Gn: + if nx.number_of_nodes(g) == 1: + raise Exception('Graphs must contain more than 1 nodes to construct adjacency matrices.') + + + def __add_dummy_labels(self, Gn): + if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY): + for i in range(len(Gn)): + nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) + self.__node_labels = [SpecialLabel.DUMMY] + if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY): + for i in range(len(Gn)): + nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) + self.__edge_labels = [SpecialLabel.DUMMY] \ No newline at end of file diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index db4abf8..7c6afde 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -10,6 +10,7 @@ import networkx as nx import multiprocessing import time + class GraphKernel(object): def __init__(self): diff --git a/gklearn/kernels/marginalized.py b/gklearn/kernels/marginalized.py index 6ddec43..6910468 100644 --- a/gklearn/kernels/marginalized.py +++ b/gklearn/kernels/marginalized.py @@ -51,7 +51,7 @@ class Marginalized(GraphKernel): else: iterator = self._graphs # @todo: this may not work. - self._graphs = [untotterTransformation(G, self.__node_label, self.__edge_label) for G in iterator] + self._graphs = [untotterTransformation(G, self.__node_labels, self.__edge_labels) for G in iterator] # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) @@ -108,13 +108,13 @@ class Marginalized(GraphKernel): self.__add_dummy_labels(g_list + [g1]) if self.__remove_totters: - g1 = untotterTransformation(g1, self.__node_label, self.__edge_label) # @todo: this may not work. + g1 = untotterTransformation(g1, self.__node_labels, self.__edge_labels) # @todo: this may not work. if self._verbose >= 2: iterator = tqdm(g_list, desc='removing tottering', file=sys.stdout) else: iterator = g_list # @todo: this may not work. - g_list = [untotterTransformation(G, self.__node_label, self.__edge_label) for G in iterator] + g_list = [untotterTransformation(G, self.__node_labels, self.__edge_labels) for G in iterator] # compute kernel list. kernel_list = [None] * len(g_list) @@ -133,7 +133,7 @@ class Marginalized(GraphKernel): self.__add_dummy_labels(g_list + [g1]) if self.__remove_totters: - g1 = untotterTransformation(g1, self.__node_label, self.__edge_label) # @todo: this may not work. + g1 = untotterTransformation(g1, self.__node_labels, self.__edge_labels) # @todo: this may not work. pool = Pool(self._n_jobs) itr = range(0, len(g_list)) if len(g_list) < 100 * self._n_jobs: @@ -177,8 +177,8 @@ class Marginalized(GraphKernel): def _compute_single_kernel_series(self, g1, g2): self.__add_dummy_labels([g1] + [g2]) if self.__remove_totters: - g1 = untotterTransformation(g1, self.__node_label, self.__edge_label) # @todo: this may not work. - g2 = untotterTransformation(g2, self.__node_label, self.__edge_label) + g1 = untotterTransformation(g1, self.__node_labels, self.__edge_labels) # @todo: this may not work. 
+		g2 = untotterTransformation(g2, self.__node_labels, self.__edge_labels)
 		kernel = self.__kernel_do(g1, g2)
 		return kernel
@@ -324,7 +324,7 @@ class Marginalized(GraphKernel):
 
 
 	def _wrapper_untotter(self, i):
-		return i, untotterTransformation(self._graphs[i], self.__node_label, self.__edge_label) # @todo: this may not work.
+		return i, untotterTransformation(self._graphs[i], self.__node_labels, self.__edge_labels) # @todo: this may not work.
 
 
 	def __add_dummy_labels(self, Gn):
diff --git a/gklearn/kernels/random_walk.py b/gklearn/kernels/random_walk.py
new file mode 100644
index 0000000..f2d0961
--- /dev/null
+++ b/gklearn/kernels/random_walk.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Aug 19 16:55:17 2020
+
+@author: ljia
+
+@references:
+
+	[1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
+"""
+
+import sys
+from tqdm import tqdm
+import numpy as np
+import networkx as nx
+from gklearn.utils import SpecialLabel
+from gklearn.utils.parallel import parallel_gm, parallel_me
+from gklearn.utils.utils import direct_product_graph
+from gklearn.kernels import GraphKernel
+
+
+class RandomWalk(GraphKernel):
+
+
+	def __init__(self, **kwargs):
+		GraphKernel.__init__(self)
+		self._compute_method = kwargs.get('compute_method', None)
+		self._weight = kwargs.get('weight', 1)
+		self._p = kwargs.get('p', None)
+		self._q = kwargs.get('q', None)
+		self._edge_weight = kwargs.get('edge_weight', None)
+		self._ds_infos = kwargs.get('ds_infos', {})
+
+		# Normalize the compute method name; guard against the default None.
+		if self._compute_method is not None:
+			self._compute_method = self._compute_method.lower()
+
+
+	def _compute_gm_series(self):
+		pass
+
+
+	def _compute_gm_imap_unordered(self):
+		pass
+
+
+	def _compute_kernel_list_series(self, g1, g_list):
+		pass
+
+
+	def _compute_kernel_list_imap_unordered(self, g1, g_list):
+		pass
+
+
+	def _compute_single_kernel_series(self, g1, g2):
+		pass
+
+
+	def _check_graphs(self, Gn):
+		# Reject graphs with no edges: no walk can be found in their structures,
+		# so the weight matrix between such a graph and itself would be zero.
+		for g in Gn:
+			if nx.number_of_edges(g) == 0:
+				raise Exception('Graphs must contain edges to construct weight matrices.')
+
+
+	def _check_edge_weight(self, G0, verbose=2):
+		# G0 may be a single graph or a list of graphs; one sample graph
+		# suffices to check the edge weight attribute.
+		if isinstance(G0, list):
+			G0 = G0[0]
+		eweight = None
+		if self._edge_weight is None:
+			if verbose >= 2:
+				print('\n No edge weight is specified. Set all weight to 1.\n')
+		else:
+			try:
+				some_weight = list(nx.get_edge_attributes(G0, self._edge_weight).values())[0]
+				if isinstance(some_weight, (float, int)):
+					eweight = self._edge_weight
+				else:
+					if verbose >= 2:
+						print('\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' % self._edge_weight)
+			except IndexError:
+				if verbose >= 2:
+					print('\n Edge weight with name "%s" is not found in the edge attributes.
Set all weight to 1.\n' % self._edge_weight) + + self._edge_weight = eweight + + + def _add_dummy_labels(self, Gn): + if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY): + for i in range(len(Gn)): + nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) + self.__node_labels = [SpecialLabel.DUMMY] + if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY): + for i in range(len(Gn)): + nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) + self.__edge_labels = [SpecialLabel.DUMMY] \ No newline at end of file diff --git a/gklearn/kernels/spectral_decomposition.py b/gklearn/kernels/spectral_decomposition.py new file mode 100644 index 0000000..5509ee6 --- /dev/null +++ b/gklearn/kernels/spectral_decomposition.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Aug 20 16:12:45 2020 + +@author: ljia + +@references: + + [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. +""" + +import sys +from tqdm import tqdm +import numpy as np +import networkx as nx +from scipy.sparse import kron +from gklearn.utils.parallel import parallel_gm, parallel_me +from gklearn.kernels import RandomWalk + + +class SpectralDecomposition(RandomWalk): + + + def __init__(self, **kwargs): + RandomWalk.__init__(self, **kwargs) + self._sub_kernel = kwargs.get('sub_kernel', None) + + + def _compute_gm_series(self): + self._check_edge_weight(self._graphs) + self._check_graphs(self._graphs) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored. Only works for undirected graphs.') + + # compute Gram matrix. + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + if self._q == None: + # precompute the spectral decomposition of each graph. + P_list = [] + D_list = [] + if self._verbose >= 2: + iterator = tqdm(self._graphs, desc='spectral decompose', file=sys.stdout) + else: + iterator = self._graphs + for G in iterator: + # don't normalize adjacency matrices if q is a uniform vector. Note + # A actually is the transpose of the adjacency matrix. + A = nx.adjacency_matrix(G, self._edge_weight).todense().transpose() + ew, ev = np.linalg.eig(A) + D_list.append(ew) + P_list.append(ev) +# P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs? + + if self._p == None: # p is uniform distribution as default. + q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in self._graphs] +# q_T_list = [q.T for q in q_list] + + from itertools import combinations_with_replacement + itr = combinations_with_replacement(range(0, len(self._graphs)), 2) + if self._verbose >= 2: + iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + else: + iterator = itr + + for i, j in iterator: + kernel = self.__kernel_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j], D_list[i], D_list[j], self._weight, self._sub_kernel) + gram_matrix[i][j] = kernel + gram_matrix[j][i] = kernel + + else: # @todo + pass + else: # @todo + pass + + return gram_matrix + + + def _compute_gm_imap_unordered(self): + self._check_edge_weight(self._graphs) + self._check_graphs(self._graphs) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored. Only works for undirected graphs.') + + # compute Gram matrix. 
+ gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + if self._q == None: + # precompute the spectral decomposition of each graph. + P_list = [] + D_list = [] + if self._verbose >= 2: + iterator = tqdm(self._graphs, desc='spectral decompose', file=sys.stdout) + else: + iterator = self._graphs + for G in iterator: + # don't normalize adjacency matrices if q is a uniform vector. Note + # A actually is the transpose of the adjacency matrix. + A = nx.adjacency_matrix(G, self._edge_weight).todense().transpose() + ew, ev = np.linalg.eig(A) + D_list.append(ew) + P_list.append(ev) # @todo: parallel? + + if self._p == None: # p is uniform distribution as default. + q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in self._graphs] # @todo: parallel? + + def init_worker(q_T_list_toshare, P_list_toshare, D_list_toshare): + global G_q_T_list, G_P_list, G_D_list + G_q_T_list = q_T_list_toshare + G_P_list = P_list_toshare + G_D_list = D_list_toshare + + do_fun = self._wrapper_kernel_do + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + glbv=(q_T_list, P_list, D_list), n_jobs=self._n_jobs, verbose=self._verbose) + + else: # @todo + pass + else: # @todo + pass + + return gram_matrix + + + def _compute_kernel_list_series(self, g1, g_list): + self._check_edge_weight(g_list + [g1]) + self._check_graphs(g_list + [g1]) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored. Only works for undirected graphs.') + + # compute kernel list. + kernel_list = [None] * len(g_list) + + if self._q == None: + # precompute the spectral decomposition of each graph. + A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() + D1, P1 = np.linalg.eig(A1) + P_list = [] + D_list = [] + if self._verbose >= 2: + iterator = tqdm(range(len(g_list)), desc='spectral decompose', file=sys.stdout) + else: + iterator = range(len(g_list)) + for G in iterator: + # don't normalize adjacency matrices if q is a uniform vector. Note + # A actually is the transpose of the adjacency matrix. + A = nx.adjacency_matrix(G, self._edge_weight).todense().transpose() + ew, ev = np.linalg.eig(A) + D_list.append(ew) + P_list.append(ev) + + if self._p == None: # p is uniform distribution as default. + q_T1 = 1 / nx.number_of_nodes(g1) + q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] + if self._verbose >= 2: + iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + else: + iterator = range(len(g_list)) + + for i in iterator: + kernel = self.__kernel_do(q_T1, q_T_list[i], P1, P_list[i], D1, D_list[i], self._weight, self._sub_kernel) + kernel_list[i] = kernel + + else: # @todo + pass + else: # @todo + pass + + return kernel_list + + + def _compute_kernel_list_imap_unordered(self, g1, g_list): + self._check_edge_weight(g_list + [g1]) + self._check_graphs(g_list + [g1]) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored. Only works for undirected graphs.') + + # compute kernel list. + kernel_list = [None] * len(g_list) + + if self._q == None: + # precompute the spectral decomposition of each graph. 
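+ # Each eigendecomposition costs O(n ** 3) but is computed only once
+ # per graph and reused for every kernel evaluation; note that
+ # np.linalg.eig may return complex eigenvalues and eigenvectors, since
+ # the (transposed) adjacency matrices are not symmetric in general.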
+ A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() + D1, P1 = np.linalg.eig(A1) + P_list = [] + D_list = [] + if self._verbose >= 2: + iterator = tqdm(range(len(g_list)), desc='spectral decompose', file=sys.stdout) + else: + iterator = range(len(g_list)) + for G in iterator: + # don't normalize adjacency matrices if q is a uniform vector. Note + # A actually is the transpose of the adjacency matrix. + A = nx.adjacency_matrix(G, self._edge_weight).todense().transpose() + ew, ev = np.linalg.eig(A) + D_list.append(ew) + P_list.append(ev) # @todo: parallel? + + if self._p == None: # p is uniform distribution as default. + q_T1 = 1 / nx.number_of_nodes(g1) + q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] # @todo: parallel? + + def init_worker(q_T1_toshare, P1_toshare, D1_toshare, q_T_list_toshare, P_list_toshare, D_list_toshare): + global G_q_T1, G_P1, G_D1, G_q_T_list, G_P_list, G_D_list + G_q_T1 = q_T1_toshare + G_P1 = P1_toshare + G_D1 = D1_toshare + G_q_T_list = q_T_list_toshare + G_P_list = P_list_toshare + G_D_list = D_list_toshare + + do_fun = self._wrapper_kernel_list_do + + def func_assign(result, var_to_assign): + var_to_assign[result[0]] = result[1] + itr = range(len(g_list)) + len_itr = len(g_list) + parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, + init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + + else: # @todo + pass + else: # @todo + pass + + return kernel_list + + + def _wrapper_kernel_list_do(self, itr): + return itr, self._kernel_do(G_q_T1, G_q_T_list[itr], G_P1, G_P_list[itr], G_D1, G_D_list[itr], self._weight, self._sub_kernel) + + + def _compute_single_kernel_series(self, g1, g2): + self._check_edge_weight([g1] + [g2]) + self._check_graphs([g1] + [g2]) + if self._verbose >= 2: + import warnings + warnings.warn('All labels are ignored. Only works for undirected graphs.') + + if self._q == None: + # precompute the spectral decomposition of each graph. + A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() + D1, P1 = np.linalg.eig(A1) + A2 = nx.adjacency_matrix(g2, self._edge_weight).todense().transpose() + D2, P2 = np.linalg.eig(A2) + + if self._p == None: # p is uniform distribution as default. + q_T1 = 1 / nx.number_of_nodes(g1) + q_T2 = 1 / nx.number_of_nodes(g2) + kernel = self.__kernel_do(q_T1, q_T2, P1, P2, D1, D2, self._weight, self._sub_kernel) + else: # @todo + pass + else: # @todo + pass + + return kernel + + + def __kernel_do(self, q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): + # use uniform distribution if there is no prior knowledge. + kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense() + # @todo: this is not needed when p = q (kr = kl.T) for undirected graphs. 
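+ # With W = kron(A1, A2) and eigendecompositions Ai = Pi @ Di @ Pi^-1,
+ # the walk kernel q^T f(weight * W) p factorizes as
+ # kron(q1^T @ P1, q2^T @ P2) @ f(weight * kron(D1, D2)) @ kron(P1^-1 @ p1, P2^-1 @ p2),
+ # so only the diagonal middle factor depends on the sub-kernel f
+ # ('exp' or 'geo'); kl below plays the role of the left factor, and for
+ # undirected graphs with p = q the right factor reduces to kl.T.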
+		# kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense()
+		if sub_kernel == 'exp':
+			D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
+			kmiddle = np.diag(np.exp(weight * D_diag))
+		elif sub_kernel == 'geo':
+			# geometric series: (I - weight * D)^-1 on the kronecker eigenvalues.
+			D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
+			kmiddle = np.identity(len(D_diag)) - weight * np.diag(D_diag)
+			kmiddle = np.linalg.inv(kmiddle)
+		return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
+
+
+	def _wrapper_kernel_do(self, itr):
+		i = itr[0]
+		j = itr[1]
+		return i, j, self.__kernel_do(G_q_T_list[i], G_q_T_list[j], G_P_list[i], G_P_list[j], G_D_list[i], G_D_list[j], self._weight, self._sub_kernel)
\ No newline at end of file
diff --git a/gklearn/kernels/sylvester_equation.py b/gklearn/kernels/sylvester_equation.py
new file mode 100644
index 0000000..3879b59
--- /dev/null
+++ b/gklearn/kernels/sylvester_equation.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Aug 19 17:24:46 2020
+
+@author: ljia
+
+@references:
+
+	[1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
+"""
+
+import sys
+from tqdm import tqdm
+import numpy as np
+import networkx as nx
+from control import dlyap
+from gklearn.utils.parallel import parallel_gm, parallel_me
+from gklearn.kernels import RandomWalk
+
+
+class SylvesterEquation(RandomWalk):
+
+
+	def __init__(self, **kwargs):
+		RandomWalk.__init__(self, **kwargs)
+
+
+	def _compute_gm_series(self):
+		self._check_edge_weight(self._graphs)
+		self._check_graphs(self._graphs)
+		if self._verbose >= 2:
+			import warnings
+			warnings.warn('All labels are ignored.')
+
+		lmda = self._weight
+
+		# compute Gram matrix.
+		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
+
+		if self._q is None:
+			# don't normalize adjacency matrices if q is a uniform vector. Note
+			# A_wave_list actually contains the transposes of the adjacency matrices.
+			if self._verbose >= 2:
+				iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout)
+			else:
+				iterator = self._graphs
+			A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator]
+			# # normalized adjacency matrices
+			# A_wave_list = []
+			# for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
+			#	 A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
+			#	 norm = A_tilde.sum(axis=0)
+			#	 norm[norm == 0] = 1
+			#	 A_wave_list.append(A_tilde / norm)
+
+			if self._p is None: # p is uniform distribution as default.
+				from itertools import combinations_with_replacement
+				itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
+				if self._verbose >= 2:
+					iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout)
+				else:
+					iterator = itr
+
+				for i, j in iterator:
+					kernel = self.__kernel_do(A_wave_list[i], A_wave_list[j], lmda)
+					gram_matrix[i][j] = kernel
+					gram_matrix[j][i] = kernel
+
+			else: # @todo
+				pass
+		else: # @todo
+			pass
+
+		return gram_matrix
+
+
+	def _compute_gm_imap_unordered(self):
+		self._check_edge_weight(self._graphs)
+		self._check_graphs(self._graphs)
+		if self._verbose >= 2:
+			import warnings
+			warnings.warn('All labels are ignored.')
+
+		# compute Gram matrix.
+		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
+
+		if self._q is None:
+			# don't normalize adjacency matrices if q is a uniform vector. Note
+			# A_wave_list actually contains the transposes of the adjacency matrices.
+			if self._verbose >= 2:
+				iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout)
+			else:
+				iterator = self._graphs
+			A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel?
+
+			if self._p is None: # p is uniform distribution as default.
+				def init_worker(A_wave_list_toshare):
+					global G_A_wave_list
+					G_A_wave_list = A_wave_list_toshare
+
+				do_fun = self._wrapper_kernel_do
+
+				parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
+							glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose)
+
+			else: # @todo
+				pass
+		else: # @todo
+			pass
+
+		return gram_matrix
+
+
+	def _compute_kernel_list_series(self, g1, g_list):
+		self._check_edge_weight(g_list + [g1])
+		self._check_graphs(g_list + [g1])
+		if self._verbose >= 2:
+			import warnings
+			warnings.warn('All labels are ignored.')
+
+		lmda = self._weight
+
+		# compute kernel list.
+		kernel_list = [None] * len(g_list)
+
+		if self._q is None:
+			# don't normalize adjacency matrices if q is a uniform vector. Note
+			# A_wave_list actually contains the transposes of the adjacency matrices.
+			A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
+			if self._verbose >= 2:
+				iterator = tqdm(g_list, desc='compute adjacency matrices', file=sys.stdout)
+			else:
+				iterator = g_list
+			A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator]
+
+			if self._p is None: # p is uniform distribution as default.
+				if self._verbose >= 2:
+					iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout)
+				else:
+					iterator = range(len(g_list))
+
+				for i in iterator:
+					kernel = self.__kernel_do(A_wave_1, A_wave_list[i], lmda)
+					kernel_list[i] = kernel
+
+			else: # @todo
+				pass
+		else: # @todo
+			pass
+
+		return kernel_list
+
+
+	def _compute_kernel_list_imap_unordered(self, g1, g_list):
+		self._check_edge_weight(g_list + [g1])
+		self._check_graphs(g_list + [g1])
+		if self._verbose >= 2:
+			import warnings
+			warnings.warn('All labels are ignored.')
+
+		# compute kernel list.
+		kernel_list = [None] * len(g_list)
+
+		if self._q is None:
+			# don't normalize adjacency matrices if q is a uniform vector. Note
+			# A_wave_list actually contains the transposes of the adjacency matrices.
+			A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
+			if self._verbose >= 2:
+				iterator = tqdm(g_list, desc='compute adjacency matrices', file=sys.stdout)
+			else:
+				iterator = g_list
+			A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel?
+
+			if self._p is None: # p is uniform distribution as default.
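+				# The arrays passed via glbv are handed to each worker process
+				# once through init_worker (as module-level globals), so the
+				# adjacency matrices are not re-pickled for every kernel pair.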
+				def init_worker(A_wave_1_toshare, A_wave_list_toshare):
+					global G_A_wave_1, G_A_wave_list
+					G_A_wave_1 = A_wave_1_toshare
+					G_A_wave_list = A_wave_list_toshare
+
+				do_fun = self._wrapper_kernel_list_do
+
+				def func_assign(result, var_to_assign):
+					var_to_assign[result[0]] = result[1]
+				itr = range(len(g_list))
+				len_itr = len(g_list)
+				parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
+					init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered',
+					n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose)
+
+			else: # @todo
+				pass
+		else: # @todo
+			pass
+
+		return kernel_list
+
+
+	def _wrapper_kernel_list_do(self, itr):
+		# note the name mangling: __kernel_do is defined on this class.
+		return itr, self.__kernel_do(G_A_wave_1, G_A_wave_list[itr], self._weight)
+
+
+	def _compute_single_kernel_series(self, g1, g2):
+		self._check_edge_weight([g1] + [g2])
+		self._check_graphs([g1] + [g2])
+		if self._verbose >= 2:
+			import warnings
+			warnings.warn('All labels are ignored.')
+
+		lmda = self._weight
+
+		if self._q is None:
+			# don't normalize adjacency matrices if q is a uniform vector. Note
+			# A_wave_1 and A_wave_2 are the transposes of the adjacency matrices.
+			A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose()
+			A_wave_2 = nx.adjacency_matrix(g2, self._edge_weight).todense().transpose()
+			if self._p is None: # p is uniform distribution as default.
+				kernel = self.__kernel_do(A_wave_1, A_wave_2, lmda)
+			else: # @todo
+				pass
+		else: # @todo
+			pass
+
+		return kernel
+
+
+	def __kernel_do(self, A_wave1, A_wave2, lmda):
+
+		S = lmda * A_wave2
+		T_t = A_wave1
+		# use uniform distribution if there is no prior knowledge.
+		nb_pd = len(A_wave1) * len(A_wave2)
+		p_times_uni = 1 / nb_pd
+		M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
+		X = dlyap(S, T_t, M0)
+		X = np.reshape(X, (-1, 1), order='F')
+		# use uniform distribution if there is no prior knowledge.
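+		# The kernel value is q^T vec(X), where X solves the discrete-time
+		# Sylvester equation X = lmda * A_wave2 X A_wave1^T + M0 via dlyap
+		# above; q_times below is the uniform stopping distribution q^T.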
+ q_times = np.full((1, nb_pd), p_times_uni) + return np.dot(q_times, X) + + + def _wrapper_kernel_do(self, itr): + i = itr[0] + j = itr[1] + return i, j, self.__kernel_do(G_A_wave_list[i], G_A_wave_list[j], self._weight) \ No newline at end of file diff --git a/gklearn/kernels/untilHPathKernel.py b/gklearn/kernels/untilHPathKernel.py index bc1d71e..c44300d 100644 --- a/gklearn/kernels/untilHPathKernel.py +++ b/gklearn/kernels/untilHPathKernel.py @@ -649,7 +649,7 @@ def paths2labelseqs(plist, G, ds_attrs, node_label, edge_label): # path_strs.append(tuple(strlist)) else: path_strs = [ - tuple([G.node[node][node_label] for node in path]) + tuple([G.nodes[node][node_label] for node in path]) for path in plist ] return path_strs diff --git a/gklearn/preimage/median_preimage_generator_cml.py b/gklearn/preimage/median_preimage_generator_cml.py index 161475a..e6bca92 100644 --- a/gklearn/preimage/median_preimage_generator_cml.py +++ b/gklearn/preimage/median_preimage_generator_cml.py @@ -10,16 +10,14 @@ import time import random import multiprocessing import networkx as nx -import cvxpy as cp -import itertools from gklearn.preimage import PreimageGenerator from gklearn.preimage.utils import compute_k_dis -from gklearn.ged.util import compute_geds_cml from gklearn.ged.env import GEDEnv -from gklearn.ged.median import MedianGraphEstimatorPy +from gklearn.ged.learning import CostMatricesLearner +from gklearn.ged.median import MedianGraphEstimatorCML from gklearn.ged.median import constant_node_costs, mge_options_to_string -from gklearn.utils import Timer, SpecialLabel from gklearn.utils.utils import get_graph_kernel_by_name +from gklearn.ged.util import label_costs_to_matrix class MedianPreimageGeneratorCML(PreimageGenerator): @@ -28,7 +26,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): def __init__(self, dataset=None): PreimageGenerator.__init__(self, dataset=dataset) - # arguments to set. + ### arguments to set. self.__mge = None self.__ged_options = {} self.__mge_options = {} @@ -38,6 +36,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): self.__parallel = True self.__n_jobs = multiprocessing.cpu_count() self.__ds_name = None + # for cml. self.__time_limit_in_sec = 0 self.__max_itrs = 100 self.__max_itrs_without_update = 3 @@ -45,7 +44,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): self.__epsilon_ec = 0.1 self.__allow_zeros = True # self.__triangle_rule = True - # values to compute. + ### values to compute. self.__runtime_optimize_ec = None self.__runtime_generate_preimage = None self.__runtime_total = None @@ -57,12 +56,13 @@ class MedianPreimageGeneratorCML(PreimageGenerator): self.__k_dis_set_median = None self.__k_dis_gen_median = None self.__k_dis_dataset = None - self.__itrs = 0 - self.__converged = False - self.__num_updates_ecc = 0 self.__node_label_costs = None self.__edge_label_costs = None - # values that can be set or to be computed. + # for cml. + self.__itrs = 0 + self.__converged = False + self.__num_updates_ecs = 0 + ### values that can be set or to be computed. 
self.__edit_cost_constants = [] self.__gram_matrix_unnorm = None self.__runtime_precompute_gm = None @@ -154,7 +154,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): print('================================================================================') print('Finished generation of preimages.') print('--------------------------------------------------------------------------------') - print('The optimized edit cost constants:', self.__edit_cost_constants) + print('The optimized edit costs:', self.__edit_cost_constants) print('SOD of the set median:', self.__sod_set_median) print('SOD of the generalized median:', self.__sod_gen_median) print('Distance in kernel space for set median:', self.__k_dis_set_median) @@ -165,7 +165,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): print('Time to generate pre-images:', self.__runtime_generate_preimage) print('Total time:', self.__runtime_total) print('Total number of iterations for optimizing:', self.__itrs) - print('Total number of updating edit costs:', self.__num_updates_ecc) + print('Total number of updating edit costs:', self.__num_updates_ecs) print('Is optimization of edit costs converged:', self.__converged) print('================================================================================') print() @@ -185,7 +185,7 @@ class MedianPreimageGeneratorCML(PreimageGenerator): results['k_dis_dataset'] = self.__k_dis_dataset results['itrs'] = self.__itrs results['converged'] = self.__converged - results['num_updates_ecc'] = self.__num_updates_ecc + results['num_updates_ecc'] = self.__num_updates_ecs results['mge'] = {} results['mge']['num_decrease_order'] = self.__mge.get_num_times_order_decreased() results['mge']['num_increase_order'] = self.__mge.get_num_times_order_increased() @@ -302,598 +302,33 @@ class MedianPreimageGeneratorCML(PreimageGenerator): dis_k_vec.append(dis_k_mat[i, j]) dis_k_vec = np.array(dis_k_vec) - # init ged. - if self._verbose >= 2: - print('\ninitial:') - time0 = time.time() - graphs = [self.__clean_graph(g) for g in self._dataset.graphs] - self.__edit_cost_constants = self.__init_ecc + # Set GEDEnv options. +# graphs = [self.__clean_graph(g) for g in self._dataset.graphs] +# self.__edit_cost_constants = self.__init_ecc options = self.__ged_options.copy() - options['edit_cost_constants'] = self.__edit_cost_constants # @todo + options['edit_cost_constants'] = self.__edit_cost_constants # @todo: not needed. options['node_labels'] = self._dataset.node_labels options['edge_labels'] = self._dataset.edge_labels - options['node_attrs'] = self._dataset.node_attrs - options['edge_attrs'] = self._dataset.edge_attrs +# options['node_attrs'] = self._dataset.node_attrs +# options['edge_attrs'] = self._dataset.edge_attrs options['node_label_costs'] = self.__node_label_costs options['edge_label_costs'] = self.__edge_label_costs - ged_vec_init, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1)) - residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] - time_list = [time.time() - time0] - edit_cost_list = [self.__init_ecc] - nb_cost_mat = np.array(n_edit_operations) - nb_cost_mat_list = [nb_cost_mat] - if self._verbose >= 2: - print('Current edit cost constants:', self.__edit_cost_constants) - print('Residual list:', residual_list) - - # run iteration from initial edit costs. 
- self.__converged = False - itrs_without_update = 0 - self.__itrs = 0 - self.__num_updates_ecc = 0 - timer = Timer(self.__time_limit_in_sec) - while not self.__termination_criterion_met(self.__converged, timer, self.__itrs, itrs_without_update): - if self._verbose >= 2: - print('\niteration', self.__itrs + 1) - time0 = time.time() - # "fit" geds to distances in feature space by tuning edit costs using theLeast Squares Method. -# np.savez('results/xp_fit_method/fit_data_debug' + str(self.__itrs) + '.gm', -# nb_cost_mat=nb_cost_mat, dis_k_vec=dis_k_vec, -# n_edit_operations=n_edit_operations, ged_vec_init=ged_vec_init, -# ged_mat=ged_mat) - self.__edit_cost_constants, _ = self.__update_ecc(nb_cost_mat, dis_k_vec) - for i in range(len(self.__edit_cost_constants)): - if -1e-9 <= self.__edit_cost_constants[i] <= 1e-9: - self.__edit_cost_constants[i] = 0 - if self.__edit_cost_constants[i] < 0: - raise ValueError('The edit cost is negative.') - # for i in range(len(self.__edit_cost_constants)): - # if self.__edit_cost_constants[i] < 0: - # self.__edit_cost_constants[i] = 0 - - # compute new GEDs and numbers of edit operations. - options = self.__ged_options.copy() # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75]) - options['edit_cost_constants'] = self.__edit_cost_constants # @todo - options['node_labels'] = self._dataset.node_labels - options['edge_labels'] = self._dataset.edge_labels - options['node_attrs'] = self._dataset.node_attrs - options['edge_attrs'] = self._dataset.edge_attrs - ged_vec, ged_mat, n_edit_operations = compute_geds_cml(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1)) - residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) - time_list.append(time.time() - time0) - edit_cost_list.append(self.__edit_cost_constants) - nb_cost_mat = np.array(n_edit_operations) - nb_cost_mat_list.append(nb_cost_mat) - - # check convergency. - ec_changed = False - for i, cost in enumerate(self.__edit_cost_constants): - if cost == 0: - if edit_cost_list[-2][i] > self.__epsilon_ec: - ec_changed = True - break - elif abs(cost - edit_cost_list[-2][i]) / cost > self.__epsilon_ec: - ec_changed = True - break -# if abs(cost - edit_cost_list[-2][i]) > self.__epsilon_ec: -# ec_changed = True -# break - residual_changed = False - if residual_list[-1] == 0: - if residual_list[-2] > self.__epsilon_residual: - residual_changed = True - elif abs(residual_list[-1] - residual_list[-2]) / residual_list[-1] > self.__epsilon_residual: - residual_changed = True - self.__converged = not (ec_changed or residual_changed) - if self.__converged: - itrs_without_update += 1 - else: - itrs_without_update = 0 - self.__num_updates_ecc += 1 - - # print current states. 
- if self._verbose >= 2: - print() - print('-------------------------------------------------------------------------') - print('States of iteration', self.__itrs + 1) - print('-------------------------------------------------------------------------') -# print('Time spend:', self.__runtime_optimize_ec) - print('Total number of iterations for optimizing:', self.__itrs + 1) - print('Total number of updating edit costs:', self.__num_updates_ecc) - print('Was optimization of edit costs converged:', self.__converged) - print('Did edit costs change:', ec_changed) - print('Did residual change:', residual_changed) - print('Iterations without update:', itrs_without_update) - print('Current edit cost constants:', self.__edit_cost_constants) - print('Residual list:', residual_list) - print('-------------------------------------------------------------------------') - - self.__itrs += 1 - - - def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): - if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): -# if self.__state == AlgorithmState.TERMINATED: -# self.__state = AlgorithmState.INITIALIZED - return True - return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) - - - def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'): - # if self.__ds_name == 'Letter-high': - if self.__ged_options['edit_cost'] == 'LETTER': - raise Exception('Cannot compute for cost "LETTER".') - pass - # # method 1: set alpha automatically, just tune c_vir and c_eir by - # # LMS using cvxpy. - # alpha = 0.5 - # coeff = 100 # np.max(alpha * nb_cost_mat[:,4] / dis_k_vec) - ## if np.count_nonzero(nb_cost_mat[:,4]) == 0: - ## alpha = 0.75 - ## else: - ## alpha = np.min([dis_k_vec / c_vs for c_vs in nb_cost_mat[:,4] if c_vs != 0]) - ## alpha = alpha * 0.99 - # param_vir = alpha * (nb_cost_mat[:,0] + nb_cost_mat[:,1]) - # param_eir = (1 - alpha) * (nb_cost_mat[:,4] + nb_cost_mat[:,5]) - # nb_cost_mat_new = np.column_stack((param_vir, param_eir)) - # dis_new = coeff * dis_k_vec - alpha * nb_cost_mat[:,3] - # - # x = cp.Variable(nb_cost_mat_new.shape[1]) - # cost = cp.sum_squares(nb_cost_mat_new * x - dis_new) - # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] - # prob = cp.Problem(cp.Minimize(cost), constraints) - # prob.solve() - # edit_costs_new = x.value - # edit_costs_new = np.array([edit_costs_new[0], edit_costs_new[1], alpha]) - # residual = np.sqrt(prob.value) - - # # method 2: tune c_vir, c_eir and alpha by nonlinear programming by - # # scipy.optimize.minimize. - # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] - # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] - # w2 = nb_cost_mat[:,3] - # w3 = dis_k_vec - # func_min = lambda x: np.sum((w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ - # + w2 * x[2] - w3 * x[3]) ** 2) - # bounds = ((0, None), (0., None), (0.5, 0.5), (0, None)) - # res = minimize(func_min, [0.9, 1.7, 0.75, 10], bounds=bounds) - # edit_costs_new = res.x[0:3] - # residual = res.fun - - # method 3: tune c_vir, c_eir and alpha by nonlinear programming using cvxpy. - - # # method 4: tune c_vir, c_eir and alpha by QP function - # # scipy.optimize.least_squares. An initial guess is required. 
- # w0 = nb_cost_mat[:,0] + nb_cost_mat[:,1] - # w1 = nb_cost_mat[:,4] + nb_cost_mat[:,5] - # w2 = nb_cost_mat[:,3] - # w3 = dis_k_vec - # func = lambda x: (w0 * x[0] * x[3] + w1 * x[1] * (1 - x[2]) \ - # + w2 * x[2] - w3 * x[3]) ** 2 - # res = optimize.root(func, [0.9, 1.7, 0.75, 100]) - # edit_costs_new = res.x - # residual = None - elif self.__ged_options['edit_cost'] == 'LETTER2': - # # 1. if c_vi != c_vr, c_ei != c_er. - # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - # x = cp.Variable(nb_cost_mat_new.shape[1]) - # cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - ## # 1.1 no constraints. - ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] - # # 1.2 c_vs <= c_vi + c_vr. - # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - ## # 2. if c_vi == c_vr, c_ei == c_er. - ## nb_cost_mat_new = nb_cost_mat[:,[0,3,4]] - ## nb_cost_mat_new[:,0] += nb_cost_mat[:,1] - ## nb_cost_mat_new[:,2] += nb_cost_mat[:,5] - ## x = cp.Variable(nb_cost_mat_new.shape[1]) - ## cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - ## # 2.1 no constraints. - ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] - ### # 2.2 c_vs <= c_vi + c_vr. - ### constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - ### np.array([2.0, -1.0, 0.0]).T@x >= 0.0] - # - # prob = cp.Problem(cp.Minimize(cost_fun), constraints) - # prob.solve() - # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] - # edit_costs_new = np.array(edit_costs_new) - # residual = np.sqrt(prob.value) - if not self.__triangle_rule and self.__allow_zeros: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif self.__triangle_rule and self.__allow_zeros: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01, - np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif not self.__triangle_rule and not self.__allow_zeros: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = x.value - residual = np.sqrt(prob.value) - # elif method == 'inequality_modified': - # # c_vs <= c_vi + c_vr. 
- # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - # x = cp.Variable(nb_cost_mat_new.shape[1]) - # cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - # prob = cp.Problem(cp.Minimize(cost_fun), constraints) - # prob.solve() - # # use same costs for insertion and removal rather than the fitted costs. - # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] - # edit_costs_new = np.array(edit_costs_new) - # residual = np.sqrt(prob.value) - elif self.__triangle_rule and not self.__allow_zeros: - # c_vs <= c_vi + c_vr. - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif rw_constraints == '2constraints': # @todo: rearrange it later. - # c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er. - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0, - np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0, - np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - edit_costs_new = x.value - residual = np.sqrt(prob.value) - - elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC': - is_n_attr = np.count_nonzero(nb_cost_mat[:,2]) - is_e_attr = np.count_nonzero(nb_cost_mat[:,5]) - - if self.__ds_name == 'SYNTHETICnew': # @todo: rearrenge this later. 
- # nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] - nb_cost_mat_new = nb_cost_mat[:,[2,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - # np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] - # constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]] - constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])], - np.array([0.0, 1.0, -1.0]).T@x == 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - prob.solve() - # print(x.value) - edit_costs_new = np.concatenate((np.array([0.0, 0.0]), x.value, - np.array([0.0]))) - residual = np.sqrt(prob.value) - - elif not self.__triangle_rule and self.__allow_zeros: - if is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif is_n_attr and not is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value, np.array([0.0]))) - residual = np.sqrt(prob.value) - elif not is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) - residual = np.sqrt(prob.value) - else: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), - x.value[2:], np.array([0.0]))) - residual = np.sqrt(prob.value) - elif self.__triangle_rule and self.__allow_zeros: - if is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 
0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, - np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, - np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif is_n_attr and not is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01, - np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value, np.array([0.0]))) - residual = np.sqrt(prob.value) - elif not is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) - residual = np.sqrt(prob.value) - else: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), - x.value[2:], np.array([0.0]))) - residual = np.sqrt(prob.value) - elif not self.__triangle_rule and not self.__allow_zeros: - if is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif is_n_attr and not is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value, np.array([0.0]))) - residual = np.sqrt(prob.value) - elif not is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) - residual = 
np.sqrt(prob.value) - else: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), - x.value[2:], np.array([0.0]))) - residual = np.sqrt(prob.value) - elif self.__triangle_rule and not self.__allow_zeros: - # c_vs <= c_vi + c_vr. - if is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, - np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif is_n_attr and not is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value, np.array([0.0]))) - residual = np.sqrt(prob.value) - elif not is_n_attr and is_e_attr: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], - np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) - residual = np.sqrt(prob.value) - else: - nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] - x = cp.Variable(nb_cost_mat_new.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), - x.value[2:], np.array([0.0]))) - residual = np.sqrt(prob.value) - - elif self.__ged_options['edit_cost'] == 'CONSTANT': # @todo: node/edge may not labeled. 
- if not self.__triangle_rule and self.__allow_zeros: - x = cp.Variable(nb_cost_mat.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif self.__triangle_rule and self.__allow_zeros: - x = cp.Variable(nb_cost_mat.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], - np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01, - np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01, - np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, - np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif not self.__triangle_rule and not self.__allow_zeros: - x = cp.Variable(nb_cost_mat.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])]] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - elif self.__triangle_rule and not self.__allow_zeros: - x = cp.Variable(nb_cost_mat.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) - constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], - np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, - np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - else: - raise Exception('The edit cost "', self.__ged_options['edit_cost'], '" is not supported for update progress.') - # # method 1: simple least square method. - # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, - # rcond=None) - - # # method 2: least square method with x_i >= 0. - # edit_costs_new, residual = optimize.nnls(nb_cost_mat, dis_k_vec) - - # method 3: solve as a quadratic program with constraints. 
- # P = np.dot(nb_cost_mat.T, nb_cost_mat) - # q_T = -2 * np.dot(dis_k_vec.T, nb_cost_mat) - # G = -1 * np.identity(nb_cost_mat.shape[1]) - # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) - # A = np.array([1 for i in range(nb_cost_mat.shape[1])]) - # b = 1 - # x = cp.Variable(nb_cost_mat.shape[1]) - # prob = cp.Problem(cp.Minimize(cp.quad_form(x, P) + q_T@x), - # [G@x <= h]) - # prob.solve() - # edit_costs_new = x.value - # residual = prob.value - np.dot(dis_k_vec.T, dis_k_vec) - - # G = -1 * np.identity(nb_cost_mat.shape[1]) - # h = np.array([0 for i in range(nb_cost_mat.shape[1])]) - x = cp.Variable(nb_cost_mat.shape[1]) - cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec) - constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], - # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] - np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, - np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] - prob = cp.Problem(cp.Minimize(cost_fun), constraints) - self.__execute_cvx(prob) - edit_costs_new = x.value - residual = np.sqrt(prob.value) - - # method 4: - - return edit_costs_new, residual - - - def __execute_cvx(self, prob): - try: - prob.solve(verbose=(self._verbose>=2)) - except MemoryError as error0: - if self._verbose >= 2: - print('\nUsing solver "OSQP" caused a memory error.') - print('the original error message is\n', error0) - print('solver status: ', prob.status) - print('trying solver "CVXOPT" instead...\n') - try: - prob.solve(solver=cp.CVXOPT, verbose=(self._verbose>=2)) - except Exception as error1: - if self._verbose >= 2: - print('\nAn error occured when using solver "CVXOPT".') - print('the original error message is\n', error1) - print('solver status: ', prob.status) - print('trying solver "MOSEK" instead. Notice this solver is commercial and a lisence is required.\n') - prob.solve(solver=cp.MOSEK, verbose=(self._verbose>=2)) - else: - if self._verbose >= 2: - print('solver status: ', prob.status) - else: - if self._verbose >= 2: - print('solver status: ', prob.status) - if self._verbose >= 2: - print() + # Learner cost matrices. + # Initialize cost learner. + cml = CostMatricesLearner(edit_cost='CONSTANT', triangle_rule=False, allow_zeros=True, parallel=self.__parallel, verbose=self._verbose) # @todo + cml.set_update_params(time_limit_in_sec=self.__time_limit_in_sec, max_itrs=self.__max_itrs, max_itrs_without_update=self.__max_itrs_without_update, epsilon_residual=self.__epsilon_residual, epsilon_ec=self.__epsilon_ec) + # Run cost learner. + cml.update(dis_k_vec, self._dataset.graphs, options) + + # Get results. 
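+		# The results dict is assumed to expose at least 'converged', 'itrs',
+		# 'num_updates_ecs' and 'cost_list', where each entry of cost_list
+		# stacks the node label costs first and the edge label costs after
+		# (see the slicing below).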
+		results = cml.get_results()
+		self.__converged = results['converged']
+		self.__itrs = results['itrs']
+		self.__num_updates_ecs = results['num_updates_ecs']
+		cost_list = results['cost_list']
+		self.__node_label_costs = cost_list[-1][0:len(self.__node_label_costs)]
+		self.__edge_label_costs = cost_list[-1][len(self.__node_label_costs):]
 
 
 	def __gmg_bcu(self):
@@ -913,12 +348,19 @@ class MedianPreimageGeneratorCML(PreimageGenerator):
 		for g in graphs:
 			ged_env.add_nx_graph(g, '')
 		graph_ids = ged_env.get_all_graph_ids()
+
+		node_labels = ged_env.get_all_node_labels()
+		edge_labels = ged_env.get_all_edge_labels()
+		node_label_costs = label_costs_to_matrix(self.__node_label_costs, len(node_labels))
+		edge_label_costs = label_costs_to_matrix(self.__edge_label_costs, len(edge_labels))
+		ged_env.set_label_costs(node_label_costs, edge_label_costs)
+
 		set_median_id = ged_env.add_graph('set_median')
 		gen_median_id = ged_env.add_graph('gen_median')
 		ged_env.init(init_type=self.__ged_options['init_option'])
 
 		# Set up the median graph estimator.
-		self.__mge = MedianGraphEstimatorPy(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
+		self.__mge = MedianGraphEstimatorCML(ged_env, constant_node_costs(self.__ged_options['edit_cost']))
 		self.__mge.set_refine_method(self.__ged_options['method'], self.__ged_options)
 		options = self.__mge_options.copy()
 		if not 'seed' in options:
+ """ + from gklearn.kernels import CommonWalk + import networkx as nx + + dataset = chooseDataset(ds_name) + dataset.load_graphs([g for g in dataset.graphs if nx.number_of_nodes(g) > 1]) -# Gn, y = chooseDataset(ds_name) + try: + graph_kernel = CommonWalk(node_labels=dataset.node_labels, + edge_labels=dataset.edge_labels, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + weight=weight, + compute_method=compute_method) + gram_matrix, run_time = graph_kernel.compute(dataset.graphs, + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) -# try: -# Kmatrix, run_time, idx = commonwalkkernel(Gn, -# node_label='atom', -# edge_label='bond_type', -# weight=weight, -# compute_method=compute_method, -# # parallel=parallel, -# n_jobs=multiprocessing.cpu_count(), -# verbose=True) -# except Exception as exception: -# assert False, exception + except Exception as exception: + assert False, exception -# @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) -# @pytest.mark.parametrize('remove_totters', [True, False]) -# #@pytest.mark.parametrize('parallel', ['imap_unordered', None]) -# def test_marginalizedkernel(ds_name, remove_totters): -# """Test marginalized kernel. -# """ -# from gklearn.kernels.marginalizedKernel import marginalizedkernel +@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) +@pytest.mark.parametrize('remove_totters', [False]) #[True, False]) +@pytest.mark.parametrize('parallel', ['imap_unordered', None]) +def test_Marginalized(ds_name, parallel, remove_totters): + """Test marginalized kernel. 
+ """ + from gklearn.kernels import Marginalized -# Gn, y = chooseDataset(ds_name) + dataset = chooseDataset(ds_name) + + try: + graph_kernel = Marginalized(node_labels=dataset.node_labels, + edge_labels=dataset.edge_labels, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + p_quit=0.5, + n_iteration=2, + remove_totters=remove_totters) + gram_matrix, run_time = graph_kernel.compute(dataset.graphs, + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) -# try: -# Kmatrix, run_time = marginalizedkernel(Gn, -# node_label='atom', -# edge_label='bond_type', -# p_quit=0.5, -# n_iteration=2, -# remove_totters=remove_totters, -# # parallel=parallel, -# n_jobs=multiprocessing.cpu_count(), -# verbose=True) -# except Exception as exception: -# assert False, exception + except Exception as exception: + assert False, exception # @pytest.mark.parametrize( -# 'compute_method,ds_name,sub_kernel', -# [ +# 'compute_method,ds_name,sub_kernel', +# [ # # ('sylvester', 'Alkane', None), # # ('conjugate', 'Alkane', None), # # ('conjugate', 'AIDS', None), # # ('fp', 'Alkane', None), # # ('fp', 'AIDS', None), -# ('spectral', 'Alkane', 'exp'), -# ('spectral', 'Alkane', 'geo'), -# ] +# ('spectral', 'Alkane', 'exp'), +# ('spectral', 'Alkane', 'geo'), +# ] # ) # #@pytest.mark.parametrize('parallel', ['imap_unordered', None]) # def test_randomwalkkernel(ds_name, compute_method, sub_kernel): -# """Test random walk kernel kernel. -# """ -# from gklearn.kernels.randomWalkKernel import randomwalkkernel -# from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct -# import functools +# """Test random walk kernel kernel. 
+# """ +# from gklearn.kernels.randomWalkKernel import randomwalkkernel +# from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct +# import functools -# Gn, y = chooseDataset(ds_name) +# Gn, y = chooseDataset(ds_name) -# mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) -# sub_kernels = [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}] -# try: -# Kmatrix, run_time, idx = randomwalkkernel(Gn, -# compute_method=compute_method, -# weight=1e-3, -# p=None, -# q=None, -# edge_weight=None, -# node_kernels=sub_kernels, -# edge_kernels=sub_kernels, -# node_label='atom', -# edge_label='bond_type', -# sub_kernel=sub_kernel, -# # parallel=parallel, -# n_jobs=multiprocessing.cpu_count(), -# verbose=True) -# except Exception as exception: -# assert False, exception +# mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) +# sub_kernels = [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}] +# try: +# Kmatrix, run_time, idx = randomwalkkernel(Gn, +# compute_method=compute_method, +# weight=1e-3, +# p=None, +# q=None, +# edge_weight=None, +# node_kernels=sub_kernels, +# edge_kernels=sub_kernels, +# node_label='atom', +# edge_label='bond_type', +# sub_kernel=sub_kernel, +# # parallel=parallel, +# n_jobs=multiprocessing.cpu_count(), +# verbose=True) +# except Exception as exception: +# assert False, exception @pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) @@ -157,9 +167,9 @@ def test_ShortestPath(ds_name, parallel): sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} try: graph_kernel = ShortestPath(node_labels=dataset.node_labels, - node_attrs=dataset.node_attrs, - ds_infos=dataset.get_dataset_infos(keys=['directed']), - node_kernels=sub_kernels) + node_attrs=dataset.node_attrs, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + node_kernels=sub_kernels) gram_matrix, run_time = graph_kernel.compute(dataset.graphs, parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], @@ -187,12 +197,12 @@ def test_StructuralSP(ds_name, parallel): sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} try: graph_kernel = StructuralSP(node_labels=dataset.node_labels, - edge_labels=dataset.edge_labels, - node_attrs=dataset.node_attrs, - edge_attrs=dataset.edge_attrs, - ds_infos=dataset.get_dataset_infos(keys=['directed']), - node_kernels=sub_kernels, - edge_kernels=sub_kernels) + edge_labels=dataset.edge_labels, + node_attrs=dataset.node_attrs, + edge_attrs=dataset.edge_attrs, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + node_kernels=sub_kernels, + edge_kernels=sub_kernels) gram_matrix, run_time = graph_kernel.compute(dataset.graphs, parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], @@ -218,9 +228,9 @@ def test_PathUpToH(ds_name, parallel, k_func, compute_method): try: graph_kernel = PathUpToH(node_labels=dataset.node_labels, - edge_labels=dataset.edge_labels, - ds_infos=dataset.get_dataset_infos(keys=['directed']), - depth=2, k_func=k_func, compute_method=compute_method) + edge_labels=dataset.edge_labels, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + depth=2, k_func=k_func, compute_method=compute_method) gram_matrix, run_time = graph_kernel.compute(dataset.graphs, parallel=parallel, 
n_jobs=multiprocessing.cpu_count(), verbose=True) kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], @@ -245,9 +255,9 @@ def test_Treelet(ds_name, parallel): pkernel = functools.partial(polynomialkernel, d=2, c=1e5) try: graph_kernel = Treelet(node_labels=dataset.node_labels, - edge_labels=dataset.edge_labels, - ds_infos=dataset.get_dataset_infos(keys=['directed']), - sub_kernel=pkernel) + edge_labels=dataset.edge_labels, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + sub_kernel=pkernel) gram_matrix, run_time = graph_kernel.compute(dataset.graphs, parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], @@ -271,9 +281,9 @@ def test_WLSubtree(ds_name, parallel): try: graph_kernel = WLSubtree(node_labels=dataset.node_labels, - edge_labels=dataset.edge_labels, - ds_infos=dataset.get_dataset_infos(keys=['directed']), - height=2) + edge_labels=dataset.edge_labels, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + height=2) gram_matrix, run_time = graph_kernel.compute(dataset.graphs, parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], diff --git a/gklearn/utils/graph_synthesizer.py b/gklearn/utils/graph_synthesizer.py new file mode 100644 index 0000000..2c5f650 --- /dev/null +++ b/gklearn/utils/graph_synthesizer.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Sep 11 18:10:06 2020 + +@author: ljia +""" +import numpy as np +import networkx as nx +import random + + +class GraphSynthesizer(object): + + + def __init__(self): + pass + + + def random_graph(self, num_nodes, num_edges, num_node_labels=0, num_edge_labels=0, seed=None, directed=False, max_num_edges=None, all_edges=None): + g = nx.Graph() + if num_node_labels > 0: + node_labels = np.random.randint(0, high=num_node_labels, size=num_nodes) + for i in range(0, num_nodes): + g.add_node(str(i), atom=node_labels[i]) # @todo: update "atom". + else: + for i in range(0, num_nodes): + g.add_node(str(i)) + + if num_edge_labels > 0: + edge_labels = np.random.randint(0, high=num_edge_labels, size=num_edges) + for idx, i in enumerate(random.sample(range(0, max_num_edges), num_edges)): + node1, node2 = all_edges[i] + g.add_edge(str(node1), str(node2), bond_type=edge_labels[idx]) # @todo: update "bond_type". + else: + for i in random.sample(range(0, max_num_edges), num_edges): + node1, node2 = all_edges[i] + g.add_edge(str(node1), str(node2)) + + return g + + + def unified_graphs(self, num_graphs=1000, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=0, seed=None, directed=False): + max_num_edges = int((num_nodes - 1) * num_nodes / 2) + if num_edges > max_num_edges: + raise Exception('Too many edges.') + all_edges = [(i, j) for i in range(0, num_nodes) for j in range(i + 1, num_nodes)] # @todo: optimize. No directed graphs. 
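+		# @todo: `seed` and `directed` are currently unused by random_graph;
+		# every graph samples its `num_edges` edges from the same enumeration
+		# of all (num_nodes choose 2) candidate pairs built above.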
+
+		graphs = []
+		for idx in range(0, num_graphs):
+			graphs.append(self.random_graph(num_nodes, num_edges, num_node_labels=num_node_labels, num_edge_labels=num_edge_labels, seed=seed, directed=directed, max_num_edges=max_num_edges, all_edges=all_edges))
+
+		return graphs
\ No newline at end of file
diff --git a/gklearn/utils/parallel.py b/gklearn/utils/parallel.py
index b5d0579..e6edb70 100644
--- a/gklearn/utils/parallel.py
+++ b/gklearn/utils/parallel.py
@@ -12,7 +12,7 @@ import sys
 def parallel_me(func, func_assign, var_to_assign, itr, len_itr=None, init_worker=None,
 				glbv=None, method=None, n_jobs=None, chunksize=None, itr_desc='',
-				verbose=2):
+				verbose=True):
 	'''
 	'''
 	if method == 'imap_unordered':
@@ -30,7 +30,7 @@
 			else:
 				chunksize = 100
 			for result in (tqdm(pool.imap_unordered(func, itr, chunksize),
-						desc=itr_desc, file=sys.stdout) if verbose == 2 else
+						desc=itr_desc, file=sys.stdout) if verbose else
 					pool.imap_unordered(func, itr, chunksize)):
 				func_assign(result, var_to_assign)
 			pool.close()
@@ -45,7 +45,7 @@
 			else:
 				chunksize = 100
 			for result in (tqdm(pool.imap_unordered(func, itr, chunksize),
-						desc=itr_desc, file=sys.stdout) if verbose == 2 else
+						desc=itr_desc, file=sys.stdout) if verbose else
 					pool.imap_unordered(func, itr, chunksize)):
 				func_assign(result, var_to_assign)
 			pool.close()
@@ -54,7 +54,7 @@
 def parallel_gm(func, Kmatrix, Gn, init_worker=None, glbv=None,
 				method='imap_unordered', n_jobs=None, chunksize=None,
-				verbose=True):
+				verbose=True): # @todo: Gn seems not necessary.
 	from itertools import combinations_with_replacement
 	def func_assign(result, var_to_assign):
 		var_to_assign[result[0]][result[1]] = result[2]
diff --git a/gklearn/utils/utils.py b/gklearn/utils/utils.py
index 19e8db4..c32169d 100644
--- a/gklearn/utils/utils.py
+++ b/gklearn/utils/utils.py
@@ -222,6 +222,70 @@ def direct_product(G1, G2, node_label, edge_label):
 	return gt
 
 
+def direct_product_graph(G1, G2, node_labels, edge_labels):
+	"""Return the direct/tensor product of directed graphs G1 and G2.
+
+	Parameters
+	----------
+	G1, G2 : NetworkX graph
+		The original graphs.
+	node_labels : list
+		A list of node attributes used as labels.
+	edge_labels : list
+		A list of edge attributes used as labels.
+
+	Returns
+	-------
+	gt : NetworkX graph
+		The direct product graph of G1 and G2.
+
+	Notes
+	-----
+	This method differs from networkx.tensor_product in that it only adds nodes and edges whose labels agree in G1 and G2 to the direct product graph.
+
+	References
+	----------
+	.. [1] Thomas Gärtner, Peter Flach, and Stefan Wrobel. On graph kernels: Hardness results and efficient alternatives. Learning Theory and Kernel Machines, pages 129–143, 2003.
+	"""
+	# arrange all graphs in a list
+	from itertools import product
+	# G = G.to_directed()
+	gt = nx.DiGraph()
+	# add nodes
+	for u, v in product(G1, G2):
+		label1 = tuple(G1.nodes[u][nl] for nl in node_labels)
+		label2 = tuple(G2.nodes[v][nl] for nl in node_labels)
+		if label1 == label2:
+			gt.add_node((u, v), node_label=label1)
+
+	# add edges, faster for sparse graphs (not so many edges), which is the most common case for now.
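+	# This scans the |E1| x |E2| edge pairs directly, which is cheaper than
+	# the |V1|^2 * |V2|^2 node-pair scan of the dense variant commented out
+	# below.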
+	for (u1, v1), (u2, v2) in product(G1.edges, G2.edges):
+		if (u1, u2) in gt and (v1, v2) in gt:
+			label1 = tuple(G1.edges[u1, v1][el] for el in edge_labels)
+			label2 = tuple(G2.edges[u2, v2][el] for el in edge_labels)
+			if label1 == label2:
+				gt.add_edge((u1, u2), (v1, v2), edge_label=label1)
+
+
+	# # add edges, faster for dense graphs (many edges; a complete graph is the extreme case).
+	# for u, v in product(gt, gt):
+	#	if (u[0], v[0]) in G1.edges and (
+	#			u[1], v[1]
+	#	) in G2.edges and G1.edges[u[0],
+	#			v[0]][edge_label] == G2.edges[u[1],
+	#			v[1]][edge_label]:
+	#		gt.add_edge((u[0], u[1]), (v[0], v[1]))
+	#		gt.edges[(u[0], u[1]), (v[0], v[1])].update({
+	#			edge_label:
+	#			G1.edges[u[0], v[0]][edge_label]
+	#		})
+
+	# relabel nodes using consecutive integers for convenience of kernel calculation.
+	# gt = nx.convert_node_labels_to_integers(
+	#	 gt, first_label=0, label_attribute='label_original')
+	return gt
+
+
 def graph_deepcopy(G):
 	"""Deep copy a graph, including deep copy of all nodes, edges and
 	attributes of the graph, nodes and edges.
diff --git a/requirements.txt b/requirements.txt
index 24d6efe..b790721 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 numpy>=1.16.2
 scipy>=1.1.0
-matplotlib>=3.0.0
+matplotlib>=3.1.0
 networkx>=2.2
 scikit-learn>=0.20.0
 tabulate>=0.8.2
diff --git a/requirements_pypi.txt b/requirements_pypi.txt
index f4854fc..77147c9 100644
--- a/requirements_pypi.txt
+++ b/requirements_pypi.txt
@@ -1,6 +1,6 @@
 numpy>=1.16.2
 scipy>=1.1.0
-matplotlib>=3.0.0
+matplotlib>=3.1.0
 networkx>=2.2
 scikit-learn>=0.20.0
 tabulate>=0.8.2