#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 15:24:58 2020

@author: ljia

@references:

    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data
    Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp).
    IEEE.
"""

import os
import pickle
import sys
import time
from itertools import combinations_with_replacement, product
# from functools import partial

import networkx as nx
import numpy as np
from pympler import asizeof

from gklearn.kernels import ShortestPath
from gklearn.utils import get_iters
from gklearn.utils.utils import getSPGraph


def load_results(file_name, fcsp):
    """Load checkpointed results from disk, or initialize a fresh record."""
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results = {'nb_comparison': [], 'i': -1, 'j': -1, 'completed': False}
        if fcsp:
            results['vk_dict_mem'] = []
        return results


def save_results(file_name, results):
    """Checkpoint the current results to disk."""
    with open(file_name, 'wb') as f:
        pickle.dump(results, f)


def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
    """Estimate the memory footprint (in bytes) of a vertex-kernel dict.

    A closed-form estimate is used instead of pympler's exact but slow
    measurement: each key (a 2-tuple of node ids) is counted as 64 bytes,
    each float value as 24 bytes, plus the flat size of the dict itself
    and 28 bytes per node id object.
    """
    # asizeof.asized(obj, detail=1).format()
    # return asizeof.asizeof(obj)
    key, val = next(iter(obj.items()))
    # key_mem = asizeof.asizeof(key)
    dict_flat = sys.getsizeof(obj)
    key_mem = 64

    if isinstance(val, float):
        val_mem = 24
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False
        mem = key_mem * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

    # print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
    return mem


def compute_stats(file_name, results):
    """Aggregate the collected counts into means and mark the run completed."""
    del results['i']
    del results['j']
    results['nb_comparison'] = np.mean(results['nb_comparison'])
    results['completed'] = True
    if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
        results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
    save_results(file_name, results)
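# A minimal sanity-check sketch (added here, not part of the original module):
# it compares the closed-form estimate above against pympler's measured size
# for a small dict shaped like vk_dict. The node counts and kernel values are
# made up for illustration.
def _demo_estimate_vk_memory():
    vk_dict = {(i, j): float(i * j) for i in range(10) for j in range(10)}
    estimated = estimate_vk_memory(vk_dict, nb_nodes1=10, nb_nodes2=10)
    measured = asizeof.asizeof(vk_dict)  # exact, but much slower
    print('estimated: %d bytes, measured: %d bytes' % (estimated, measured))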
class SPSpace(ShortestPath):
    """A variant of the shortest-path kernel [1] that, instead of filling the
    Gram matrix, counts vertex-kernel comparisons and estimates the memory
    consumed by the FCSP vertex-kernel dict.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._file_name = kwargs.get('file_name')


    # @profile
    def _compute_gm_series(self):
        self._all_graphs_have_edges(self._graphs)
        # get shortest path graph of each graph.
        iterator = get_iters(self._graphs, desc='getting sp graphs',
                             file=sys.stdout, verbose=(self._verbose >= 2))
        self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

        results = load_results(self._file_name, self._fcsp)

        # compute Gram matrix. (It is left as zeros: only the comparison
        # counts and memory estimates are recorded.)
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
        iterator = get_iters(itr, desc='Computing kernels', length=len_itr,
                             file=sys.stdout, verbose=(self._verbose >= 2))

        time0 = time.time()
        for i, j in iterator:
            # skip pairs already covered by a previous (checkpointed) run.
            if i > results['i'] or (i == results['i'] and j > results['j']):
                data = self._sp_do_space(self._graphs[i], self._graphs[j])
                if self._fcsp:
                    results['nb_comparison'].append(data[0])
                    if data[1] != {}:
                        results['vk_dict_mem'].append(estimate_vk_memory(
                            data[1],
                            nx.number_of_nodes(self._graphs[i]),
                            nx.number_of_nodes(self._graphs[j])))
                else:
                    results['nb_comparison'].append(data)
                results['i'] = i
                results['j'] = j

                # checkpoint every 10 minutes.
                time1 = time.time()
                if time1 - time0 > 600:
                    save_results(self._file_name, results)
                    time0 = time1

        compute_stats(self._file_name, results)

        return gram_matrix


    def _sp_do_space(self, g1, g2):
        if self._fcsp:  # @todo: it may be put outside the _sp_do().
            return self._sp_do_fcsp(g1, g2)
        else:
            return self._sp_do_naive(g1, g2)


    def _sp_do_fcsp(self, g1, g2):
        nb_comparison = 0

        # compute vertex kernels first, method borrowed from FCSP.
        vk_dict = {}  # vertex kernel dict
        if len(self._node_labels) > 0:
            # @todo: it may be put outside the _sp_do().
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['mix']
                for n1, n2 in product(
                        g1.nodes(data=True), g2.nodes(data=True)):
                    n1_labels = [n1[1][nl] for nl in self._node_labels]
                    n2_labels = [n2[1][nl] for nl in self._node_labels]
                    n1_attrs = [n1[1][na] for na in self._node_attrs]
                    n2_attrs = [n2[1][na] for na in self._node_attrs]
                    vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
                    nb_comparison += 1
            # node symb labeled
            else:
                kn = self._node_kernels['symb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
                        nb_comparison += 1
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['nsymb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
                        nb_comparison += 1
            # node unlabeled
            else:
                for e1, e2 in product(
                        g1.edges(data=True), g2.edges(data=True)):
                    pass
                    # if e1[2]['cost'] == e2[2]['cost']:
                    #     kernel += 1
                    #     nb_comparison += 1

        return nb_comparison, vk_dict

        # compute graph kernels (disabled: only comparisons are counted here.)
        # if self._ds_infos['directed']:
        #     for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
        #         if e1[2]['cost'] == e2[2]['cost']:
        #             nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
        #             kn1 = nk11 * nk22
        #             kernel += kn1
        # else:
        #     for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
        #         if e1[2]['cost'] == e2[2]['cost']:
        #             # each edge walk is counted twice, starting from both its extreme nodes.
        #             nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
        #                 e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
        #             kn1 = nk11 * nk22
        #             kn2 = nk12 * nk21
        #             kernel += kn1 + kn2
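    # Commentary on the two counting schemes (added note, not original code):
    # with FCSP, every vertex pair is compared exactly once, so nb_comparison
    # equals |V1| * |V2| regardless of the edge structure; the naive scheme
    # below instead counts 2 vertex-kernel calls per matching edge pair on
    # directed graphs and 4 on undirected ones, since each undirected edge
    # walk is matched from both of its extreme nodes.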
    def _sp_do_naive(self, g1, g2):
        nb_comparison = 0

        # Define the function to compute kernels between vertices in each condition.
        if len(self._node_labels) > 0:
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['mix']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
            # node symb labeled
            else:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['symb']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    return kn(n1_labels, n2_labels)
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['nsymb']
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_attrs, n2_attrs)
            # node unlabeled
            else:
                # for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                #     if e1[2]['cost'] == e2[2]['cost']:
                #         kernel += 1
                return 0

        # compute graph kernels (the kernel itself is disabled; only the
        # number of vertex-kernel calls that would be made is counted.)
        if self._ds_infos['directed']:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    # nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
                    # kn1 = nk11 * nk22
                    # kernel += kn1
                    nb_comparison += 2
        else:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    # each edge walk is counted twice, starting from both its extreme nodes.
                    # nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
                    #     e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
                    # kn1 = nk11 * nk22
                    # kn2 = nk12 * nk21
                    # kernel += kn1 + kn2
                    nb_comparison += 4

        return nb_comparison
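# A hypothetical usage sketch (not part of the original module). It assumes
# the keyword arguments read by gklearn's ShortestPath (node_labels, ds_infos,
# node_kernels, fcsp) and the compute() entry point of its GraphKernel base
# class; the toy graphs and checkpoint file name are illustrative only.
if __name__ == '__main__':
    from functools import partial
    from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct

    # Toy path graphs with a symbolic node label so the 'symb' branch runs.
    graphs = []
    for n in range(3, 6):
        g = nx.path_graph(n)
        nx.set_node_attributes(g, 'C', 'atom')
        graphs.append(g)

    kernel = SPSpace(
        node_labels=['atom'], node_attrs=[], ds_infos={'directed': False},
        node_kernels={'symb': deltakernel, 'nsymb': gaussiankernel,
                      'mix': partial(kernelproduct, deltakernel, gaussiankernel)},
        fcsp=True, file_name='sp_space_checkpoint.pkl')  # hypothetical path
    # compute() returns the (here all-zero) Gram matrix and the run time;
    # the comparison counts land in the checkpoint file.
    gram_matrix, run_time = kernel.compute(graphs, parallel=None, verbose=2)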