diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp.py
index 76e41b7..9a91b27 100644
--- a/gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp.py
+++ b/gklearn/experiments/thesis/graph_kernels/fcsp/compare_fcsp.py
@@ -10,6 +10,7 @@ This script compares the results with and without FCSP.
 from gklearn.dataset import Dataset
 from gklearn.utils import get_graph_kernel_by_name
 from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
+from gklearn.experiments import DATASET_ROOT
 import functools
 import os
 import pickle
@@ -17,50 +18,77 @@ import sys
 import logging
 
 
-def run_all(fcsp):
-    save_dir = 'outputs/' + ('fscp' if fcsp == True else 'naive') + '/'
-    os.makedirs(save_dir, exist_ok=True)
+# def run_all(fcsp):
+
+#     from sklearn.model_selection import ParameterGrid
+
+#     Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
+#                     'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
+#                     'Letter-high', 'Letter-med', 'Letter-low',
+#                     'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
+#                     'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
+#                     'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
+#                     'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
+#                     'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
+#                     'Mutagenicity', 'REDDIT-BINARY']
+
+#     Kernel_List = ['ShortestPath', 'StructuralSP']
+
+#     task_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})
+
+#     for task in list(task_grid):
 
-    from sklearn.model_selection import ParameterGrid
+#         save_file_suffix = '.' + task['kernel'] + '.' + task['dataset']
+#         file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
+#         if not os.path.isfile(file_name):
+#             print()
+#             print((task['kernel'], task['dataset']))
 
-    Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
-                    'PAH_unlabeled', 'PAH', 'MUTAG', 'Letter-high', 'Letter-med', 'Letter-low',
-                    'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
-                    'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
-                    'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
-                    'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
-                    'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
-                    'Mutagenicity', 'REDDIT-BINARY']
+#         try:
+#             gram_matrix, run_time = compute(task['kernel'], task['dataset'], fcsp)
 
-    Kernel_List = ['ShortestPath', 'StructuralSP']
+#         except Exception as exp:
+#             print('An exception occurred when running this experiment:')
+#             LOG_FILENAME = save_dir + 'error.txt'
+#             logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+#             logging.exception('\n--------------' + save_file_suffix + '------------------')
+#             print(repr(exp))
+#         else:
+#             save_file_suffix = '.' + task['kernel'] + task['dataset']
 
-    work_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})
+#             with open(file_name, 'wb') as f:
+#                 pickle.dump(run_time, f)
 
-    for work in list(work_grid):
-        save_file_suffix = '.' + work['kernel'] + '.' + work['dataset']
-        file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
-        if not os.path.isfile(file_name):
-            print()
-            print((work['kernel'], work['dataset']))
-            try:
-                gram_matrix, run_time = run_work(work['kernel'], work['dataset'], fcsp)
-            except Exception as exp:
-                print('An exception occured when running this experiment:')
-                LOG_FILENAME = save_dir + 'error.txt'
-                logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
-                logging.exception(save_file_suffix)
-                print(repr(exp))
+def run_task(kernel_name, ds_name, fcsp):
+    save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
+    file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
 
-        save_file_suffix = '.' + work['kernel'] + work['dataset']
+    if not os.path.isfile(file_name):
+        print()
+        print((kernel_name, ds_name, str(fcsp)))
+        try:
+            gram_matrix, run_time = compute(kernel_name, ds_name, fcsp)
+
+        except Exception as exp:
+            print('An exception occurred when running this experiment:')
+            LOG_FILENAME = os.path.join(save_dir, 'error' + save_file_suffix + '.txt')
+            logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+            logging.exception('\n--------------' + save_file_suffix + '------------------')
+            print(repr(exp))
+
+        else:
             with open(file_name, 'wb') as f:
                 pickle.dump(run_time, f)
 
 
-def run_work(kernel_name, ds_name, fcsp):
-    dataset = Dataset(ds_name, verbose=True)
+def compute(kernel_name, ds_name, fcsp):
+    dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
+    if kernel_name == 'ShortestPath':
+        dataset.trim_dataset(edge_required=True)
+
     mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
     node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
@@ -87,8 +115,15 @@ def run_work(kernel_name, ds_name, fcsp):
 if __name__ == '__main__':
     if len(sys.argv) > 1:
-        fcsp = True if sys.argv[1] == 'True' else False
+        kernel_name = sys.argv[1]
+        ds_name = sys.argv[2]
+        fcsp = True if sys.argv[3] == 'True' else False
     else:
+        kernel_name = 'ShortestPath'
+        ds_name = 'Acyclic'
         fcsp = True
 
-    run_all(fcsp)
+    save_dir = 'outputs/'
+    os.makedirs(save_dir, exist_ok=True)
+
+    run_task(kernel_name, ds_name, fcsp)
\ No newline at end of file
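# Aside (not part of the patch): a minimal sketch of how the refactored CLI
# above can be driven locally, one process per (kernel, dataset, fcsp) task,
# mirroring what the SLURM scripts below do on the cluster. Assumes
# compare_fcsp.py sits in the current directory and gklearn is installed.
import subprocess

for fcsp in ('True', 'False'):
    # Each run writes outputs/run_time.<kernel>.<dataset>.<fcsp>.pkl and
    # logs failures to outputs/error.<kernel>.<dataset>.<fcsp>.txt.
    subprocess.run(['python3', 'compare_fcsp.py', 'ShortestPath', 'Acyclic', fcsp],
                   check=False)  # errors are logged by the script itself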
+    if os.path.isfile(file_name):
+        with open(file_name, 'rb') as f:
+            data = pickle.load(f)
+            if data['completed']:
+                return
+
+    print()
+    print((kernel_name, ds_name, str(fcsp)))
+
+    try:
+        gram_matrix, run_time = compute(kernel_name, ds_name, fcsp, file_name)
+
+    except Exception as exp:
+        print('An exception occurred when running this experiment:')
+        LOG_FILENAME = os.path.join(save_dir, 'error.space' + save_file_suffix + '.txt')
+        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+        logging.exception('\n--------------' + save_file_suffix + '------------------')
+        print(repr(exp))
+
+#    else:
+#        with open(file_name, 'wb') as f:
+#            pickle.dump(run_time, f)
+
+
+def compute(kernel_name, ds_name, fcsp, file_name):
+    dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
+    if kernel_name == 'ShortestPath':
+        dataset.trim_dataset(edge_required=True)
+#        dataset.cut_graphs(range(0, 10))
+        kernel_class = SPSpace
+    else:
+#        dataset.cut_graphs(range(0, 10))
+        kernel_class = SSPSpace
+
+    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+    node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
+    edge_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
+
+    graph_kernel = kernel_class(name=kernel_name,
+                                node_labels=dataset.node_labels,
+                                edge_labels=dataset.edge_labels,
+                                node_attrs=dataset.node_attrs,
+                                edge_attrs=dataset.edge_attrs,
+                                ds_infos=dataset.get_dataset_infos(keys=['directed']),
+                                fcsp=fcsp,
+                                compute_method='naive',
+                                node_kernels=node_kernels,
+                                edge_kernels=edge_kernels,
+                                file_name=file_name
+                                )
+    gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
+                                                 parallel=None,
+                                                 normalize=False,
+                                                 verbose=2
+                                                 )
+    return gram_matrix, run_time
+
+
+if __name__ == '__main__':
+    if len(sys.argv) > 1:
+        kernel_name = sys.argv[1]
+        ds_name = sys.argv[2]
+        fcsp = True if sys.argv[3] == 'True' else False
+    else:
+        kernel_name = 'StructuralSP'
+        ds_name = 'Fingerprint'
+        fcsp = True
+
+    save_dir = 'outputs/'
+    os.makedirs(save_dir, exist_ok=True)
+
+    run_task(kernel_name, ds_name, fcsp)
\ No newline at end of file
diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py
index 53ae39c..b241c22 100644
--- a/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py
+++ b/gklearn/experiments/thesis/graph_kernels/fcsp/run_jobs_compare_fcsp.py
@@ -10,27 +10,60 @@
 import os
 import re
 
 
-def get_job_script(param):
+OUT_TIME_LIST = [('ShortestPath', 'ENZYMES', 'False'),
+                 ('StructuralSP', 'ENZYMES', 'True'),
+                 ('StructuralSP', 'ENZYMES', 'False'),
+                 ('StructuralSP', 'AIDS', 'False'),
+                 ('ShortestPath', 'NCI1', 'False'),
+                 ('StructuralSP', 'NCI1', 'True'),
+                 ('StructuralSP', 'NCI1', 'False'),
+                 ('ShortestPath', 'NCI109', 'False'),
+                 ('StructuralSP', 'NCI109', 'True'),
+                 ('StructuralSP', 'NCI109', 'False'),
+                 ('ShortestPath', 'DD', 'True'),
+                 ('ShortestPath', 'DD', 'False'),
+                 ('StructuralSP', 'BZR', 'False'),
+                 ('ShortestPath', 'COX2', 'False'),
+                 ('StructuralSP', 'COX2', 'False'),
+                 ('ShortestPath', 'DHFR', 'False'),
+                 ]
+
+OUT_MEM_LIST = [('StructuralSP', 'PROTEINS', 'True'),
+                ('StructuralSP', 'PROTEINS', 'False'),
+                ('StructuralSP', 'PROTEINS_full', 'True'),
+                ('StructuralSP', 'PROTEINS_full', 'False'),
+                ('ShortestPath', 'REDDIT-BINARY', 'True'),
+                ]
+
+MISS_LABEL_LIST = [('StructuralSP', 'GREC', 'True'),
+                   ('StructuralSP', 'GREC', 'False'),
+                   ('StructuralSP', 'Web', 'True'),
+                   ('StructuralSP', 'Web', 'False'),
+                   ]
+
+
+def get_job_script(kernel, dataset, fcsp):
     script = r"""
 #!/bin/bash
 #SBATCH --exclusive
-#SBATCH --job-name="fcsp.""" + param + r""""
-#SBATCH --partition=long
+#SBATCH --job-name="fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""""
+#SBATCH --partition=tlong
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user=jajupmochi@gmail.com
-#SBATCH --output="outputs/output_fcsp.""" + param + r""".txt"
-#SBATCH --error="errors/error_fcsp.""" + param + r""".txt"
+#SBATCH --output="outputs/output_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
+#SBATCH --error="errors/error_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
 #
 #SBATCH --ntasks=1
 #SBATCH --nodes=1
 #SBATCH --cpus-per-task=1
-#SBATCH --time=100:00:00
-#SBATCH --mem-per-cpu=4000
+#SBATCH --time=300:00:00
+##SBATCH --mem-per-cpu=4000
+#SBATCH --mem=40000
 
 srun hostname
 srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
-srun python3 compare_fcsp.py """ + param
+srun python3 compare_fcsp.py """ + kernel + r" " + dataset + r" " + fcsp
     script = script.strip()
     script = re.sub('\n\t+', '\n', script)
     script = re.sub('\n +', '\n', script)
@@ -38,15 +71,75 @@ srun python3 compare_fcsp.py """ + param
     return script
 
 
+def check_task_status(save_dir, *params):
+    str_task_id = '.' + '.'.join(params)
+
+    # Check if the task is in the out-of-memory, out-of-time or missing-label lists.
+    if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
+        return True
+
+    # Check if the task is running or queued in SLURM.
+    command = 'squeue --user ljia02 --name "fcsp' + str_task_id + '" --format "%.2t" --noheader'
+    stream = os.popen(command)
+    output = stream.readlines()
+    if len(output) > 0:
+        return True
+
+    # Check if the results are already computed.
+    file_name = os.path.join(save_dir, 'run_time' + str_task_id + '.pkl')
+    if os.path.isfile(file_name):
+        return True
+
+    return False
+
+
 if __name__ == '__main__':
+    save_dir = 'outputs/'
+    os.makedirs(save_dir, exist_ok=True)
     os.makedirs('outputs/', exist_ok=True)
     os.makedirs('errors/', exist_ok=True)
 
-    param_list = ['True', 'False']
-    for param in param_list[:]:
-        job_script = get_job_script(param)
-        command = 'sbatch <
[diff truncated in this excerpt: the rest of run_jobs_compare_fcsp.py's __main__ block and the start of the new file run_jobs_compare_fcsp_space.py (its header, imports, job-script builder and the first half of its check_task_status()) are missing; the text resumes mid-function below.]
+    if len(output) > 0:
+        return True
+
+    # Check if the task is already computed.
+    file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl')
+    if os.path.isfile(file_name):
+        with open(file_name, 'rb') as f:
+            data = pickle.load(f)
+            if data['completed']:
+                return True
+
+    return False
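# Aside (not part of the patch): the squeue probe that both check_task_status()
# variants rely on, reduced to a self-contained sketch. A job is considered
# live when squeue prints any state line for its name. The user name is the
# one hard-coded in the script; a SLURM environment is assumed.
import os

def job_is_queued_or_running(job_name, user='ljia02'):
    command = ('squeue --user ' + user + ' --name "' + job_name
               + '" --format "%.2t" --noheader')
    stream = os.popen(command)
    output = stream.readlines()
    stream.close()
    return len(output) > 0

# e.g. job_is_queued_or_running('fcsp.ShortestPath.MUTAG.True')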
+
+
+if __name__ == '__main__':
+    save_dir = 'outputs/'
+    os.makedirs(save_dir, exist_ok=True)
+    os.makedirs('outputs/', exist_ok=True)
+    os.makedirs('errors/', exist_ok=True)
+
+    from sklearn.model_selection import ParameterGrid
+
+    Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
+                    'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
+                    'Letter-high', 'Letter-med', 'Letter-low',
+                    'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
+                    # new: not so large.
+                    'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
+                    'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
+                    'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
+                    'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
+                    # new: large.
+                    'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
+                    'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
+                    'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
+                    'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
+                    'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
+                    'COLORS-3', 'DBLP_v1', 'REDDIT-MULTI-12K',
+                    'REDDIT-MULTI-12K', 'REDDIT-MULTI-12K',
+                    'REDDIT-MULTI-12K', 'MSRC_9', 'MSRC_21', 'MSRC_21C',
+                    'COLLAB', 'COIL-DEL',
+                    'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
+                    'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K',
+                    'REDDIT-MULTI-12K']
+
+    Kernel_List = ['ShortestPath', 'StructuralSP']
+
+    fcsp_list = ['True', 'False']
+
+    task_grid = ParameterGrid({'kernel': Kernel_List[:],
+                               'dataset': Dataset_List[:],
+                               'fcsp': fcsp_list[:]})
+
+    from tqdm import tqdm
+
+    for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):
+
+        if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
+            job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
+            command = 'sbatch <
[diff truncated in this excerpt: the end of run_jobs_compare_fcsp_space.py's submission loop and the start of the new file shortest_path.py (its header, imports, the load_results()/save_results() helpers, estimate_vk_memory() and most of compute_stats()) are missing; the text resumes at the tail of compute_stats() below.]
+    if len(results['vk_dict_mem']) > 0:
+        results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
+    save_results(file_name, results)
+
+
+class SPSpace(ShortestPath):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._file_name = kwargs.get('file_name')
+
+#    @profile
+    def _compute_gm_series(self):
+        self._all_graphs_have_edges(self._graphs)
+        # get shortest path graph of each graph.
+        iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
+        self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]
+
+        results = load_results(self._file_name, self._fcsp)
+
+        # compute Gram matrix.
+        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
+
+        from itertools import combinations_with_replacement
+        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
+        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
+        iterator = get_iters(itr, desc='Computing kernels',
+                             length=len_itr, file=sys.stdout, verbose=(self._verbose >= 2))
+
+        time0 = time.time()
+        for i, j in iterator:
+            if i > results['i'] or (i == results['i'] and j > results['j']):
+                data = self._sp_do_space(self._graphs[i], self._graphs[j])
+                if self._fcsp:
+                    results['nb_comparison'].append(data[0])
+                    if data[1] != {}:
+                        results['vk_dict_mem'].append(estimate_vk_memory(data[1],
+                            nx.number_of_nodes(self._graphs[i]),
+                            nx.number_of_nodes(self._graphs[j])))
+                else:
+                    results['nb_comparison'].append(data)
+                results['i'] = i
+                results['j'] = j
+
+                time1 = time.time()
+                if time1 - time0 > 600:
+                    save_results(self._file_name, results)
+                    time0 = time1
+
+        compute_stats(self._file_name, results)
+
+        return gram_matrix
+
+
+    def _sp_do_space(self, g1, g2):
+
+        if self._fcsp:  # @todo: it may be put outside the _sp_do().
+            return self._sp_do_fcsp(g1, g2)
+        else:
+            return self._sp_do_naive(g1, g2)
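# Aside (not part of the patch): the resume-and-checkpoint pattern used by
# _compute_gm_series above, reduced to its core. A sketch, not gklearn API:
# remember the last processed (i, j), skip everything up to it on restart,
# and flush the partial state to disk at most every `period` seconds.
import pickle
import time

def checkpointed_pairs(pairs, state, process, path, period=600):
    time0 = time.time()
    for i, j in pairs:
        if i > state['i'] or (i == state['i'] and j > state['j']):
            process(i, j)  # e.g. record comparison counts for this pair
            state['i'], state['j'] = i, j
            time1 = time.time()
            if time1 - time0 > period:
                with open(path, 'wb') as f:
                    pickle.dump(state, f)
                time0 = time1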
+    def _sp_do_fcsp(self, g1, g2):
+
+        nb_comparison = 0
+
+        # compute shortest path matrices first, method borrowed from FCSP.
+        vk_dict = {}  # shortest path matrices dict
+        if len(self._node_labels) > 0:  # @todo: it may be put outside the _sp_do().
+            # node symb and non-symb labeled
+            if len(self._node_attrs) > 0:
+                kn = self._node_kernels['mix']
+                for n1, n2 in product(
+                        g1.nodes(data=True), g2.nodes(data=True)):
+                    n1_labels = [n1[1][nl] for nl in self._node_labels]
+                    n2_labels = [n2[1][nl] for nl in self._node_labels]
+                    n1_attrs = [n1[1][na] for na in self._node_attrs]
+                    n2_attrs = [n2[1][na] for na in self._node_attrs]
+                    vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
+                    nb_comparison += 1
+            # node symb labeled
+            else:
+                kn = self._node_kernels['symb']
+                for n1 in g1.nodes(data=True):
+                    for n2 in g2.nodes(data=True):
+                        n1_labels = [n1[1][nl] for nl in self._node_labels]
+                        n2_labels = [n2[1][nl] for nl in self._node_labels]
+                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
+                        nb_comparison += 1
+        else:
+            # node non-symb labeled
+            if len(self._node_attrs) > 0:
+                kn = self._node_kernels['nsymb']
+                for n1 in g1.nodes(data=True):
+                    for n2 in g2.nodes(data=True):
+                        n1_attrs = [n1[1][na] for na in self._node_attrs]
+                        n2_attrs = [n2[1][na] for na in self._node_attrs]
+                        vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
+                        nb_comparison += 1
+            # node unlabeled
+            else:
+                for e1, e2 in product(
+                        g1.edges(data=True), g2.edges(data=True)):
+                    pass
+#                    if e1[2]['cost'] == e2[2]['cost']:
+#                        kernel += 1
+#                        nb_comparison += 1
+
+        return nb_comparison, vk_dict
+
+#        # compute graph kernels
+#        if self._ds_infos['directed']:
+#            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+#                if e1[2]['cost'] == e2[2]['cost']:
+#                    nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
+#                    kn1 = nk11 * nk22
+#                    kernel += kn1
+#        else:
+#            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+#                if e1[2]['cost'] == e2[2]['cost']:
+#                    # each edge walk is counted twice, starting from both its extreme nodes.
+#                    nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
+#                        e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
+#                    kn1 = nk11 * nk22
+#                    kn2 = nk12 * nk21
+#                    kernel += kn1 + kn2
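# Aside (not part of the patch): what the fcsp=True branch above measures.
# The Fast Computation of Shortest Path kernels (FCSP) trick precomputes
# every pairwise vertex-kernel value once, so the later edge loop only does
# dictionary lookups. A stripped-down sketch, where kn is any vertex kernel
# (e.g. gklearn's deltakernel) and node_labels the symbolic label names:
from itertools import product

def precompute_vertex_kernels(g1, g2, kn, node_labels):
    vk_dict = {}  # one entry per pair of vertices across the two graphs
    for n1, n2 in product(g1.nodes(data=True), g2.nodes(data=True)):
        n1_labels = [n1[1][nl] for nl in node_labels]
        n2_labels = [n2[1][nl] for nl in node_labels]
        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
    return vk_dict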
+
+
+    def _sp_do_naive(self, g1, g2):
+
+        nb_comparison = 0
+
+        # Define the function to compute kernels between vertices in each condition.
+        if len(self._node_labels) > 0:
+            # node symb and non-symb labeled
+            if len(self._node_attrs) > 0:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['mix']
+                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
+            # node symb labeled
+            else:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['symb']
+                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+                    return kn(n1_labels, n2_labels)
+        else:
+            # node non-symb labeled
+            if len(self._node_attrs) > 0:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['nsymb']
+                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+                    return kn(n1_attrs, n2_attrs)
+            # node unlabeled
+            else:
+#                for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+#                    if e1[2]['cost'] == e2[2]['cost']:
+#                        kernel += 1
+                return 0
+
+        # compute graph kernels
+        if self._ds_infos['directed']:
+            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+                if e1[2]['cost'] == e2[2]['cost']:
+#                    nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
+#                    kn1 = nk11 * nk22
+#                    kernel += kn1
+                    nb_comparison += 2
+        else:
+            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+                if e1[2]['cost'] == e2[2]['cost']:
+                    # each edge walk is counted twice, starting from both its extreme nodes.
+#                    nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
+#                        e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
+#                    kn1 = nk11 * nk22
+#                    kn2 = nk12 * nk21
+#                    kernel += kn1 + kn2
+                    nb_comparison += 4
+
+        return nb_comparison
\ No newline at end of file
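# Aside (not part of the patch): what _sp_do_naive above tallies instead of
# kernel values. Each pair of SP-graph edges with equal 'cost' would trigger
# 2 vertex-kernel evaluations on directed graphs and 4 on undirected ones
# (both walk orientations). A toy recount over plain (u, v, cost) edge lists:
def count_naive_comparisons(edges1, edges2, directed=False):
    per_match = 2 if directed else 4
    return sum(per_match for (_, _, c1) in edges1 for (_, _, c2) in edges2
               if c1 == c2)

# count_naive_comparisons([(0, 1, 1.0)], [(0, 1, 1.0), (1, 2, 2.0)])  ->  4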
diff --git a/gklearn/experiments/thesis/graph_kernels/fcsp/structural_sp.py b/gklearn/experiments/thesis/graph_kernels/fcsp/structural_sp.py
new file mode 100644
index 0000000..7f5b721
--- /dev/null
+++ b/gklearn/experiments/thesis/graph_kernels/fcsp/structural_sp.py
@@ -0,0 +1,439 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Mar 30 11:59:57 2020
+
+@author: ljia
+
+@references:
+
+    [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
+    Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
+"""
+import sys
+from itertools import product
+from gklearn.utils import get_iters
+import numpy as np
+import time
+import os, errno
+import pickle
+from pympler import asizeof
+import networkx as nx
+from gklearn.utils.utils import get_shortest_paths
+from gklearn.kernels import StructuralSP
+
+
+def load_splist(file_name):
+    if os.path.isfile(file_name):
+        with open(file_name, 'rb') as f:
+            return pickle.load(f)
+    else:
+        results_path = {'splist': [], 'i': -1, 'completed': False}
+        return results_path
+
+
+def load_results(file_name, fcsp):
+    if os.path.isfile(file_name):
+        with open(file_name, 'rb') as f:
+            return pickle.load(f)
+    else:
+        results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False}
+        if fcsp:
+            results['vk_dict_mem'] = []
+            results['ek_dict_mem'] = []
+        return results
+
+
+def save_results(file_name, results):
+    with open(file_name, 'wb') as f:
+        pickle.dump(results, f)
+
+
+def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
+#    asizeof.asized(obj, detail=1).format()
+#    return asizeof.asizeof(obj)
+    key, val = next(iter(obj.items()))
+#    key = dict.iterkeys().next()
+#    key_mem = asizeof.asizeof(key)
+    dict_flat = sys.getsizeof(obj)
+    key_mem = 64
+
+    if isinstance(val, float):
+        val_mem = 24
+        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
+    else:  # value is True or False
+        mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)
+
+#    print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
+    return mem
+
+
+def estimate_ek_memory(obj, nb_nodes1, nb_nodes2):
+#    asizeof.asized(obj, detail=1).format()
+#    return asizeof.asizeof(obj)
+    key, val = next(iter(obj.items()))
+#    key = dict.iterkeys().next()
+#    key_mem = asizeof.asizeof(key)
+    dict_flat = sys.getsizeof(obj)
+    key_mem = 192
+
+    if isinstance(val, float):
+        val_mem = 24
+        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
+    else:  # value is True or False
+        mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)
+
+#    print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
+    return mem
+
+
+def compute_stats(file_name, results, splist):
+    del results['i']
+    del results['j']
+    results['nb_v_comparison'] = np.mean(results['nb_v_comparison'])
+#    if len(results['nb_e_comparison']) > 0:
+    results['nb_e_comparison'] = np.mean(results['nb_e_comparison'])
+    results['completed'] = True
+    if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
+        results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
+    if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0:
+        results['ek_dict_mem'] = np.mean(results['ek_dict_mem'])
+    results['nb_sp_ave'] = np.mean([len(ps) for ps in splist])
+    results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist])
+    results['sp_mem_all'] = asizeof.asizeof(splist)
+    save_results(file_name, results)
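# Aside (not part of the patch): estimate_vk_memory()/estimate_ek_memory()
# above assume fixed CPython 64-bit sizes (64 bytes per flat tuple key, 192
# per nested edge key, 24 per float value). A quick way to sanity-check those
# constants on the interpreter actually used for the experiments:
import sys

print(sys.getsizeof((0, 1)))             # flat tuple key, typically ~56-64 bytes
print(sys.getsizeof(((0, 1), (2, 3))))   # outer edge key only; inner tuples are separate objects
print(sys.getsizeof(1.0))                # float value, typically 24 bytes
print(sys.getsizeof({}))                 # empty dict overhead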
+
+
+class SSPSpace(StructuralSP):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._file_name = kwargs.get('file_name')
+
+#    @profile
+    def _compute_gm_series(self):
+        # Get the shortest paths of each graph in the dataset.
+        fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl'
+        results_path = load_splist(fn_paths)
+
+        if not results_path['completed']:
+
+            iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
+            if self._compute_method == 'trie':
+                for g in iterator:
+                    results_path['splist'].append(self._get_sps_as_trie(g))
+            else:
+                time0 = time.time()
+                for i, g in enumerate(iterator):
+                    if i > results_path['i']:
+                        results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))
+                        results_path['i'] = i
+
+                        time1 = time.time()
+                        if time1 - time0 > 600:
+                            save_results(fn_paths, results_path)
+                            time0 = time1
+
+            del results_path['i']
+            results_path['completed'] = True
+            save_results(fn_paths, results_path)
+
+        #########
+        splist = results_path['splist']
+        results = load_results(self._file_name, self._fcsp)
+
+        # compute Gram matrix.
+        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
+
+        from itertools import combinations_with_replacement
+        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
+        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
+        iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
+                             length=len_itr, verbose=(self._verbose >= 2))
+        if self._compute_method == 'trie':
+            for i, j in iterator:
+                kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
+                gram_matrix[i][j] = kernel
+                gram_matrix[j][i] = kernel
+        else:
+            time0 = time.time()
+            for i, j in iterator:
+                if i > results['i'] or (i == results['i'] and j > results['j']):
+                    data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j])
+                    results['nb_v_comparison'].append(data[0])
+                    results['nb_e_comparison'].append(data[1])
+                    if self._fcsp:
+                        if data[2] != {}:
+                            results['vk_dict_mem'].append(estimate_vk_memory(data[2],
+                                nx.number_of_nodes(self._graphs[i]),
+                                nx.number_of_nodes(self._graphs[j])))
+                        if data[3] != {}:
+                            results['ek_dict_mem'].append(estimate_ek_memory(data[3],
+                                nx.number_of_nodes(self._graphs[i]),
+                                nx.number_of_nodes(self._graphs[j])))
+                    results['i'] = i
+                    results['j'] = j
+
+                    time1 = time.time()
+                    if time1 - time0 > 600:
+                        save_results(self._file_name, results)
+                        time0 = time1
+
+        compute_stats(self._file_name, results, splist)
+        # @todo: the path file may not be removed if the program stops exactly here.
+        try:
+            os.remove(fn_paths)
+        except OSError as e:
+            if e.errno != errno.ENOENT:
+                raise
+
+        return gram_matrix
+
+
+    def _ssp_do_naive_space(self, g1, g2, spl1, spl2):
+        if self._fcsp:  # @todo: it may be put outside the _sp_do().
+            return self._sp_do_naive_fcsp(g1, g2, spl1, spl2)
+        else:
+            return self._sp_do_naive_naive(g1, g2, spl1, spl2)
+
+
+    def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2):
+
+        # First, compute shortest path matrices, method borrowed from FCSP.
+        vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2)
+        # Then, compute kernels between all pairs of edges, which is an
+        # extension of the FCSP idea. It suits sparse graphs, which covers
+        # most of the cases we went through. For dense graphs, this would be slow.
+        ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2)
+
+        return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict
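# Aside (not part of the patch): when node and edge labels are present, the
# FCSP variant above always evaluates |V1|*|V2| vertex kernels and |E1|*|E2|
# edge kernels per graph pair (see the _get_all_*_kernels helpers further
# below), independently of the shortest-path lists. A sketch of that cost
# model for a quick a-priori estimate:
import networkx as nx

def fcsp_kernel_calls(g1: nx.Graph, g2: nx.Graph) -> int:
    # vertex-kernel calls + edge-kernel calls; each edge kernel is stored
    # under four orientation keys but computed only once.
    return (nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
            + nx.number_of_edges(g1) * nx.number_of_edges(g2))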
+
+
+    def _sp_do_naive_naive(self, g1, g2, spl1, spl2):
+
+        nb_v_comparison = 0
+        nb_e_comparison = 0
+
+        # Define the function to compute kernels between vertices in each condition.
+        if len(self._node_labels) > 0:
+            # node symb and non-symb labeled
+            if len(self._node_attrs) > 0:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['mix']
+                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
+            # node symb labeled
+            else:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['symb']
+                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+                    return kn(n1_labels, n2_labels)
+        else:
+            # node non-symb labeled
+            if len(self._node_attrs) > 0:
+                def compute_vk(n1, n2):
+                    kn = self._node_kernels['nsymb']
+                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+                    return kn(n1_attrs, n2_attrs)
+#            # node unlabeled
+#            else:
+#                for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+#                    if e1[2]['cost'] == e2[2]['cost']:
+#                        kernel += 1
+#                return kernel
+
+        # Define the function to compute kernels between edges in each condition.
+        if len(self._edge_labels) > 0:
+            # edge symb and non-symb labeled
+            if len(self._edge_attrs) > 0:
+                def compute_ek(e1, e2):
+                    ke = self._edge_kernels['mix']
+                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
+                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
+                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
+                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
+                    return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
+            # edge symb labeled
+            else:
+                def compute_ek(e1, e2):
+                    ke = self._edge_kernels['symb']
+                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
+                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
+                    return ke(e1_labels, e2_labels)
+        else:
+            # edge non-symb labeled
+            if len(self._edge_attrs) > 0:
+                def compute_ek(e1, e2):
+                    ke = self._edge_kernels['nsymb']
+                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
+                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
+                    return ke(e1_attrs, e2_attrs)
+
+
+        # compute graph kernels
+        if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
+            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
+                for p1, p2 in product(spl1, spl2):
+                    if len(p1) == len(p2):
+#                        nb_v_comparison = len(p1)
+#                        nb_e_comparison = len(p1) - 1
+                        kpath = compute_vk(p1[0], p2[0])
+                        nb_v_comparison += 1
+                        if kpath:
+                            for idx in range(1, len(p1)):
+                                kpath *= compute_vk(p1[idx], p2[idx]) * \
+                                    compute_ek((p1[idx-1], p1[idx]),
+                                               (p2[idx-1], p2[idx]))
+                                nb_v_comparison += 1
+                                nb_e_comparison += 1
+                                if not kpath:
+                                    break
+#                        kernel += kpath  # add up kernels of all paths
+            else:
+                for p1, p2 in product(spl1, spl2):
+                    if len(p1) == len(p2):
+                        kpath = compute_vk(p1[0], p2[0])
+                        nb_v_comparison += 1
+                        if kpath:
+                            for idx in range(1, len(p1)):
+                                kpath *= compute_vk(p1[idx], p2[idx])
+                                nb_v_comparison += 1
+                                if not kpath:
+                                    break
+#                        kernel += kpath  # add up kernels of all paths
+        else:
+            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
+                for p1, p2 in product(spl1, spl2):
+                    if len(p1) == len(p2):
+                        if len(p1) == 0:
+                            pass
+                        else:
+                            kpath = 1
+                            for idx in range(0, len(p1) - 1):
+                                kpath *= compute_ek((p1[idx], p1[idx+1]),
+                                                    (p2[idx], p2[idx+1]))
+                                nb_e_comparison += 1
+                                if not kpath:
+                                    break
+            else:
+                pass
+#                for p1, p2 in product(spl1, spl2):
+#                    if len(p1) == len(p2):
+#                        kernel += 1
+#        try:
+#            kernel = kernel / (len(spl1) * len(spl2))  # Compute mean average
+#        except ZeroDivisionError:
+#            print(spl1, spl2)
+#            print(g1.nodes(data=True))
+#            print(g1.edges(data=True))
+#            raise Exception
+
+        return nb_v_comparison, nb_e_comparison
+
+
+    def _get_all_node_kernels(self, g1, g2):
+        nb_comparison = 0
+
+        vk_dict = {}  # shortest path matrices dict
+        if len(self._node_labels) > 0:
+            # node symb and non-symb labeled
+            if len(self._node_attrs) > 0:
+                kn = self._node_kernels['mix']
+                for n1 in g1.nodes(data=True):
+                    for n2 in g2.nodes(data=True):
+                        n1_labels = [n1[1][nl] for nl in self._node_labels]
+                        n2_labels = [n2[1][nl] for nl in self._node_labels]
+                        n1_attrs = [n1[1][na] for na in self._node_attrs]
+                        n2_attrs = [n2[1][na] for na in self._node_attrs]
+                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
+                        nb_comparison += 1
+            # node symb labeled
+            else:
+                kn = self._node_kernels['symb']
+                for n1 in g1.nodes(data=True):
+                    for n2 in g2.nodes(data=True):
+                        n1_labels = [n1[1][nl] for nl in self._node_labels]
+                        n2_labels = [n2[1][nl] for nl in self._node_labels]
+                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
+                        nb_comparison += 1
+        else:
+            # node non-symb labeled
+            if len(self._node_attrs) > 0:
+                kn = self._node_kernels['nsymb']
+                for n1 in g1.nodes(data=True):
+                    for n2 in g2.nodes(data=True):
+                        n1_attrs = [n1[1][na] for na in self._node_attrs]
+                        n2_attrs = [n2[1][na] for na in self._node_attrs]
+                        vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
+                        nb_comparison += 1
+            # node unlabeled
+            else:
+                pass  # @todo: add edge weights.
+                # for e1 in g1.edges(data=True):
+                #     for e2 in g2.edges(data=True):
+                #         if e1[2]['cost'] == e2[2]['cost']:
+                #             kernel += 1
+                # return kernel
+
+        return vk_dict, nb_comparison
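# Aside (not part of the patch): _sp_do_naive_naive above multiplies kernel
# values along two equal-length paths and stops as soon as the running
# product hits zero. The same short-circuit in isolation:
def path_product(values):
    k = 1.0
    for v in values:
        k *= v
        if not k:  # a single zero factor kills the whole path kernel
            break
    return k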
+
+
+    def _get_all_edge_kernels(self, g1, g2):
+        nb_comparison = 0
+
+        # Compute kernels between all pairs of edges, which is an extension
+        # of the FCSP idea. It suits sparse graphs, which covers most of the
+        # cases we went through. For dense graphs, this would be slow.
+        ek_dict = {}  # dict of edge kernels
+        if len(self._edge_labels) > 0:
+            # edge symb and non-symb labeled
+            if len(self._edge_attrs) > 0:
+                ke = self._edge_kernels['mix']
+                for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+                    e1_labels = [e1[2][el] for el in self._edge_labels]
+                    e2_labels = [e2[2][el] for el in self._edge_labels]
+                    e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
+                    e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
+                    ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
+                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
+                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
+                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
+                    nb_comparison += 1
+            # edge symb labeled
+            else:
+                ke = self._edge_kernels['symb']
+                for e1 in g1.edges(data=True):
+                    for e2 in g2.edges(data=True):
+                        e1_labels = [e1[2][el] for el in self._edge_labels]
+                        e2_labels = [e2[2][el] for el in self._edge_labels]
+                        ek_temp = ke(e1_labels, e2_labels)
+                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
+                        ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
+                        ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
+                        ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
+                        nb_comparison += 1
+        else:
+            # edge non-symb labeled
+            if len(self._edge_attrs) > 0:
+                ke = self._edge_kernels['nsymb']
+                for e1 in g1.edges(data=True):
+                    for e2 in g2.edges(data=True):
+                        e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
+                        e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
+                        ek_temp = ke(e1_attrs, e2_attrs)
+                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
+                        ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
+                        ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
+                        ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
+                        nb_comparison += 1
+            # edge unlabeled
+            else:
+                pass
+
+        return ek_dict, nb_comparison
\ No newline at end of file
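# Aside (not part of the patch): a design note on _get_all_edge_kernels()
# above. Each undirected edge kernel is stored under all four orientation
# keys so later lookups need no normalisation. The memory-saving alternative
# (one canonical key, sorting endpoints at every access) would be, as a sketch:
def ek_store(ek_dict, e1, e2, value):
    ek_dict[(tuple(sorted(e1)), tuple(sorted(e2)))] = value

def ek_lookup(ek_dict, e1, e2):
    return ek_dict[(tuple(sorted(e1)), tuple(sorted(e2)))]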