#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 11:59:57 2020

@author: ljia

@references:

	[1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
	Measuring Similarity of Shapes. In: ESANN 2007, Apr 25 (pp. 355-360).
"""
import sys
import os
import errno
import time
import pickle
from itertools import product, combinations_with_replacement

import numpy as np
import networkx as nx
from pympler import asizeof

from gklearn.utils import get_iters
from gklearn.utils.utils import get_shortest_paths
from gklearn.kernels import StructuralSP


def load_splist(file_name):
	"""Load the checkpointed shortest-path list, or return a fresh state."""
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			return pickle.load(f)
	else:
		return {'splist': [], 'i': -1, 'completed': False}


def load_results(file_name, fcsp):
	"""Load the checkpointed comparison counts, or return a fresh state."""
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			return pickle.load(f)
	else:
		results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False}
		if fcsp:
			results['vk_dict_mem'] = []
			results['ek_dict_mem'] = []
		return results


def save_results(file_name, results):
	with open(file_name, 'wb') as f:
		pickle.dump(results, f)


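# Note: the checkpointing in this module calls save_results() periodically,
# and pickle.dump() is not crash-safe: a process killed mid-write leaves a
# corrupted checkpoint. A common remedy (a sketch, not part of the original
# script) is to write to a temporary file and rename it atomically:
def save_results_atomic(file_name, results):
	tmp_name = file_name + '.tmp'  # hypothetical temporary-file suffix
	with open(tmp_name, 'wb') as f:
		pickle.dump(results, f)
	os.replace(tmp_name, file_name)  # atomic on both POSIX and Windows

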
def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
	"""Estimate the memory footprint (in bytes) of a node-kernel dict.

	The constants are rough per-object sizes on a 64-bit CPython build:
	~64 bytes for a (n1, n2) tuple key, ~24 bytes for a float value and
	~28 bytes for each int node id.
	"""
	_, val = next(iter(obj.items()))
	dict_flat = sys.getsizeof(obj)  # the dict structure itself, without its contents
	key_mem = 64

	if isinstance(val, float):
		val_mem = 24
		mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
	else:  # values are True/False; the two singletons are counted only once (52 bytes)
		mem = key_mem * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

	return mem


def estimate_ek_memory(obj, nb_nodes1, nb_nodes2):
	"""Estimate the memory footprint (in bytes) of an edge-kernel dict.

	Same model as estimate_vk_memory(), except that a key is a pair of
	edge tuples, estimated at ~192 bytes.
	"""
	_, val = next(iter(obj.items()))
	dict_flat = sys.getsizeof(obj)
	key_mem = 192

	if isinstance(val, float):
		val_mem = 24
		mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
	else:  # values are True/False; the two singletons are counted only once (52 bytes)
		mem = key_mem * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

	return mem


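# A quick way to gauge how rough the estimators above are (a sketch; this
# helper is not part of the original script): build a toy kernel dict and
# compare the estimate against pympler's exact measurement.
def _check_size_estimate():
	vk = {(i, j): float(i * j) for i in range(10) for j in range(10)}
	est = estimate_vk_memory(vk, 10, 10)
	real = asizeof.asizeof(vk)
	print('estimated: %d bytes, measured by pympler: %d bytes' % (est, real))

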
def compute_stats(file_name, results, splist):
	"""Reduce the collected lists to their means, add shortest-path
	statistics and save the final results."""
	del results['i']
	del results['j']
	results['nb_v_comparison'] = np.mean(results['nb_v_comparison'])
	results['nb_e_comparison'] = np.mean(results['nb_e_comparison'])
	results['completed'] = True
	if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
		results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
	if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0:
		results['ek_dict_mem'] = np.mean(results['ek_dict_mem'])
	results['nb_sp_ave'] = np.mean([len(ps) for ps in splist])  # mean number of paths per graph
	results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist])  # mean path length
	results['sp_mem_all'] = asizeof.asizeof(splist)  # exact memory of all path lists
	save_results(file_name, results)


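# Reading the saved statistics back is plain pickle I/O; a minimal sketch
# (the default file name here is hypothetical):
def print_stats(file_name='ssp_space_stats.pkl'):
	with open(file_name, 'rb') as f:
		stats = pickle.load(f)
	for key, val in stats.items():
		print('%s: %s' % (key, val))

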
class SSPSpace(StructuralSP):
	"""A variant of StructuralSP that profiles space usage instead of
	computing actual kernel values: it records the numbers of node/edge
	comparisons and the estimated sizes of the kernel dicts, checkpointing
	partial results to disk."""

	def __init__(self, **kwargs):
		super().__init__(**kwargs)
		self._file_name = kwargs.get('file_name')

	# @profile
	def _compute_gm_series(self):
		# Get the shortest paths of each graph, resuming from a checkpoint
		# if one exists.
		fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl'
		results_path = load_splist(fn_paths)

		if not results_path['completed']:

			iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
			if self._compute_method == 'trie':
				for g in iterator:
					results_path['splist'].append(self._get_sps_as_trie(g))
			else:
				time0 = time.time()
				for i, g in enumerate(iterator):
					if i > results_path['i']:
						results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))
						results_path['i'] = i

					# Checkpoint at most every 10 minutes.
					time1 = time.time()
					if time1 - time0 > 600:
						save_results(fn_paths, results_path)
						time0 = time1

			del results_path['i']
			results_path['completed'] = True
			save_results(fn_paths, results_path)

		splist = results_path['splist']
		results = load_results(self._file_name, self._fcsp)

		# Compute the Gram matrix.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

		itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
		len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
		iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
							 length=len_itr, verbose=(self._verbose >= 2))
		if self._compute_method == 'trie':
			for i, j in iterator:
				kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
				gram_matrix[i][j] = kernel
				gram_matrix[j][i] = kernel
		else:
			time0 = time.time()
			for i, j in iterator:
				# Skip pairs already covered by a previous (checkpointed) run.
				if i > results['i'] or (i == results['i'] and j > results['j']):
					data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j])
					results['nb_v_comparison'].append(data[0])
					results['nb_e_comparison'].append(data[1])
					if self._fcsp:
						if data[2] != {}:
							results['vk_dict_mem'].append(estimate_vk_memory(data[2],
								nx.number_of_nodes(self._graphs[i]),
								nx.number_of_nodes(self._graphs[j])))
						if data[3] != {}:
							results['ek_dict_mem'].append(estimate_ek_memory(data[3],
								nx.number_of_nodes(self._graphs[i]),
								nx.number_of_nodes(self._graphs[j])))
					results['i'] = i
					results['j'] = j

				# Checkpoint at most every 10 minutes.
				time1 = time.time()
				if time1 - time0 > 600:
					save_results(self._file_name, results)
					time0 = time1

		compute_stats(self._file_name, results, splist)
		# @todo: the path file may not be removed if the program stops exactly here.
		try:
			os.remove(fn_paths)
		except OSError as e:
			if e.errno != errno.ENOENT:
				raise

		return gram_matrix


	def _ssp_do_naive_space(self, g1, g2, spl1, spl2):
		if self._fcsp:  # @todo: it may be put outside the _sp_do().
			return self._sp_do_naive_fcsp(g1, g2, spl1, spl2)
		else:
			return self._sp_do_naive_naive(g1, g2, spl1, spl2)


	def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2):
		# First, compute kernels between all pairs of nodes, using the method
		# borrowed from FCSP.
		vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2)
		# Then, compute kernels between all pairs of edges, an extension of
		# the FCSP idea. This suits sparse graphs, which are by far the most
		# common case we encountered; for dense graphs it would be slow.
		ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2)

		return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict


	def _sp_do_naive_naive(self, g1, g2, spl1, spl2):

		nb_v_comparison = 0
		nb_e_comparison = 0

		# Define the function to compute kernels between vertices for each
		# labeling case.
		if len(self._node_labels) > 0:
			# nodes are symbolically and non-symbolically labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['mix']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
			# nodes are symbolically labeled
			else:
				def compute_vk(n1, n2):
					kn = self._node_kernels['symb']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					return kn(n1_labels, n2_labels)
		else:
			# nodes are non-symbolically labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['nsymb']
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_attrs, n2_attrs)
			# nodes are unlabeled: no vertex kernel is needed (handled below).

		# Define the function to compute kernels between edges for each
		# labeling case.
		if len(self._edge_labels) > 0:
			# edges are symbolically and non-symbolically labeled
			if len(self._edge_attrs) > 0:
				def compute_ek(e1, e2):
					ke = self._edge_kernels['mix']
					e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
					e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
					e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
					e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
					return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
			# edges are symbolically labeled
			else:
				def compute_ek(e1, e2):
					ke = self._edge_kernels['symb']
					e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
					e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
					return ke(e1_labels, e2_labels)
		else:
			# edges are non-symbolically labeled
			if len(self._edge_attrs) > 0:
				def compute_ek(e1, e2):
					ke = self._edge_kernels['nsymb']
					e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
					e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
					return ke(e1_attrs, e2_attrs)
			# edges are unlabeled: no edge kernel is needed (handled below).

		# Count comparisons (the kernel values themselves are not accumulated
		# here). Only paths of equal length contribute, and a path pair is
		# abandoned as soon as its running product becomes zero.
		if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
			# both nodes and edges are labeled
			if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
				for p1, p2 in product(spl1, spl2):
					if len(p1) == len(p2):
						kpath = compute_vk(p1[0], p2[0])
						nb_v_comparison += 1
						if kpath:
							for idx in range(1, len(p1)):
								kpath *= compute_vk(p1[idx], p2[idx]) * \
									compute_ek((p1[idx - 1], p1[idx]),
											   (p2[idx - 1], p2[idx]))
								nb_v_comparison += 1
								nb_e_comparison += 1
								if not kpath:
									break
			# only nodes are labeled
			else:
				for p1, p2 in product(spl1, spl2):
					if len(p1) == len(p2):
						kpath = compute_vk(p1[0], p2[0])
						nb_v_comparison += 1
						if kpath:
							for idx in range(1, len(p1)):
								kpath *= compute_vk(p1[idx], p2[idx])
								nb_v_comparison += 1
								if not kpath:
									break
		else:
			# only edges are labeled
			if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
				for p1, p2 in product(spl1, spl2):
					if len(p1) == len(p2) and len(p1) > 0:
						kpath = 1
						for idx in range(0, len(p1) - 1):
							kpath *= compute_ek((p1[idx], p1[idx + 1]),
												(p2[idx], p2[idx + 1]))
							nb_e_comparison += 1
							if not kpath:
								break
			# completely unlabeled: nothing to compare
			else:
				pass

		return nb_v_comparison, nb_e_comparison


	def _get_all_node_kernels(self, g1, g2):
		nb_comparison = 0

		vk_dict = {}  # node-kernel dict: (n1, n2) -> kernel value
		if len(self._node_labels) > 0:
			# nodes are symbolically and non-symbolically labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['mix']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_labels = [n1[1][nl] for nl in self._node_labels]
						n2_labels = [n2[1][nl] for nl in self._node_labels]
						n1_attrs = [n1[1][na] for na in self._node_attrs]
						n2_attrs = [n2[1][na] for na in self._node_attrs]
						vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
						nb_comparison += 1
			# nodes are symbolically labeled
			else:
				kn = self._node_kernels['symb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_labels = [n1[1][nl] for nl in self._node_labels]
						n2_labels = [n2[1][nl] for nl in self._node_labels]
						vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
						nb_comparison += 1
		else:
			# nodes are non-symbolically labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['nsymb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_attrs = [n1[1][na] for na in self._node_attrs]
						n2_attrs = [n2[1][na] for na in self._node_attrs]
						vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
						nb_comparison += 1
			# nodes are unlabeled
			else:
				pass  # @todo: add edge weights.

		return vk_dict, nb_comparison


	def _get_all_edge_kernels(self, g1, g2):
		nb_comparison = 0

		# Compute kernels between all pairs of edges, an extension of the
		# FCSP idea. This suits sparse graphs, which are by far the most
		# common case we encountered; for dense graphs it would be slow.
		ek_dict = {}  # edge-kernel dict: ((u1, v1), (u2, v2)) -> kernel value
		if len(self._edge_labels) > 0:
			# edges are symbolically and non-symbolically labeled
			if len(self._edge_attrs) > 0:
				ke = self._edge_kernels['mix']
				for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
					e1_labels = [e1[2][el] for el in self._edge_labels]
					e2_labels = [e2[2][el] for el in self._edge_labels]
					e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
					e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
					ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
					# Store all four orientations so that undirected edges can
					# be looked up regardless of endpoint order.
					ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
					ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
					ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
					ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
					nb_comparison += 1
			# edges are symbolically labeled
			else:
				ke = self._edge_kernels['symb']
				for e1 in g1.edges(data=True):
					for e2 in g2.edges(data=True):
						e1_labels = [e1[2][el] for el in self._edge_labels]
						e2_labels = [e2[2][el] for el in self._edge_labels]
						ek_temp = ke(e1_labels, e2_labels)
						ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
						nb_comparison += 1
		else:
			# edges are non-symbolically labeled
			if len(self._edge_attrs) > 0:
				ke = self._edge_kernels['nsymb']
				for e1 in g1.edges(data=True):
					for e2 in g2.edges(data=True):
						e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
						e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
						ek_temp = ke(e1_attrs, e2_attrs)
						ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
						nb_comparison += 1
			# edges are unlabeled
			else:
				pass

		return ek_dict, nb_comparison


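if __name__ == '__main__':
	# Minimal usage sketch. The constructor keywords follow StructuralSP's
	# interface in gklearn; the toy graphs, the 'atom' label, the kernel
	# choices and the output file name are illustrative assumptions, not
	# part of the original script.
	import functools
	from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct

	g1 = nx.path_graph(4)
	g2 = nx.cycle_graph(4)
	for g in (g1, g2):
		nx.set_node_attributes(g, {n: str(n % 2) for n in g.nodes}, 'atom')

	mix_kernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
	kernel = SSPSpace(node_labels=['atom'], edge_labels=[], node_attrs=[],
					  edge_attrs=[], ds_infos={'directed': False},
					  fcsp=True, compute_method='naive',
					  node_kernels={'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mix_kernel},
					  edge_kernels={'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mix_kernel},
					  file_name='ssp_space_stats.pkl')  # hypothetical output file
	# parallel=None forces the series code path, which SSPSpace overrides.
	gram_matrix, run_time = kernel.compute([g1, g2], parallel=None, verbose=0)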