@@ -10,6 +10,7 @@ This script compares the results with and without FCSP.
from gklearn.dataset import Dataset
from gklearn.utils import get_graph_kernel_by_name
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.experiments import DATASET_ROOT
import functools
import os
import pickle
@@ -17,50 +18,77 @@ import sys
import logging
def run_all(fcsp):
    save_dir = 'outputs/' + ('fcsp' if fcsp else 'naive') + '/'
    os.makedirs(save_dir, exist_ok=True)
# def run_all(fcsp):
#     from sklearn.model_selection import ParameterGrid

#     Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
#                     'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
#                     'Letter-high', 'Letter-med', 'Letter-low',
#                     'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
#                     'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
#                     'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
#                     'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
#                     'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
#                     'Mutagenicity', 'REDDIT-BINARY']
#     Kernel_List = ['ShortestPath', 'StructuralSP']
#     task_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})
#     for task in list(task_grid):
    from sklearn.model_selection import ParameterGrid
#         save_file_suffix = '.' + task['kernel'] + '.' + task['dataset']
#         file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
#         if not os.path.isfile(file_name):
#             print()
#             print((task['kernel'], task['dataset']))
    Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
                    'PAH_unlabeled', 'PAH', 'MUTAG', 'Letter-high', 'Letter-med', 'Letter-low',
                    'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
                    'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
                    'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
                    'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
                    'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
                    'Mutagenicity', 'REDDIT-BINARY']
#             try:
#                 gram_matrix, run_time = compute(task['kernel'], task['dataset'], fcsp)
    Kernel_List = ['ShortestPath', 'StructuralSP']
#             except Exception as exp:
#                 print('An exception occurred when running this experiment:')
#                 LOG_FILENAME = save_dir + 'error.txt'
#                 logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
#                 logging.exception('\n--------------' + save_file_suffix + '------------------')
#                 print(repr(exp))
#             else:
#                 save_file_suffix = '.' + task['kernel'] + task['dataset']
    work_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})
#                 with open(file_name, 'wb') as f:
#                     pickle.dump(run_time, f)
    for work in list(work_grid):
        save_file_suffix = '.' + work['kernel'] + '.' + work['dataset']
        file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
        if not os.path.isfile(file_name):
            print()
            print((work['kernel'], work['dataset']))
            try:
                gram_matrix, run_time = run_work(work['kernel'], work['dataset'], fcsp)
            except Exception as exp:
                print('An exception occurred when running this experiment:')
                LOG_FILENAME = save_dir + 'error.txt'
                logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
                logging.exception(save_file_suffix)
                print(repr(exp))
def run_task(kernel_name, ds_name, fcsp):
    save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
    file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')
            save_file_suffix = '.' + work['kernel'] + work['dataset']
    if not os.path.isfile(file_name):
        print()
        print((kernel_name, ds_name, str(fcsp)))
        try:
            gram_matrix, run_time = compute(kernel_name, ds_name, fcsp)
        except Exception as exp:
            print('An exception occurred when running this experiment:')
            LOG_FILENAME = os.path.join(save_dir, 'error' + save_file_suffix + '.txt')
            logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
            logging.exception('\n--------------' + save_file_suffix + '------------------')
            print(repr(exp))
        else:
            with open(file_name, 'wb') as f:
                pickle.dump(run_time, f)


def run_work(kernel_name, ds_name, fcsp):
    dataset = Dataset(ds_name, verbose=True)
def compute(kernel_name, ds_name, fcsp):
    dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
    if kernel_name == 'ShortestPath':
        dataset.trim_dataset(edge_required=True)

    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
@@ -87,8 +115,15 @@ def run_work(kernel_name, ds_name, fcsp):
if __name__ == '__main__':
    if len(sys.argv) > 1:
        fcsp = True if sys.argv[1] == 'True' else False
        kernel_name = sys.argv[1]
        ds_name = sys.argv[2]
        fcsp = True if sys.argv[3] == 'True' else False
    else:
        kernel_name = 'ShortestPath'
        ds_name = 'Acyclic'
        fcsp = True
    run_all(fcsp)
    save_dir = 'outputs/'
    os.makedirs(save_dir, exist_ok=True)
    run_task(kernel_name, ds_name, fcsp)
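A minimal usage sketch (not part of the diff): each completed task leaves one
pickle named after save_file_suffix; the concrete file below is hypothetical
and exists only after the corresponding run has finished.

    import pickle

    # Pattern: outputs/run_time.<kernel>.<dataset>.<fcsp>.pkl
    with open('outputs/run_time.ShortestPath.Acyclic.True.pkl', 'rb') as f:
        run_time = pickle.load(f)
    print(run_time)  # wall-clock seconds for the Gram-matrix computation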
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 2 17:41:54 2020

@author: ljia

This script compares the results with and without FCSP.
"""
from gklearn.dataset import Dataset
from shortest_path import SPSpace
from structural_sp import SSPSpace
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.experiments import DATASET_ROOT
import functools
import os
import pickle
import sys
import logging
def run_task(kernel_name, ds_name, fcsp):
    save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
    file_name = os.path.join(save_dir, 'space' + save_file_suffix + '.pkl')

    # Return if the task is already completed.
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            data = pickle.load(f)
            if data['completed']:
                return

    print()
    print((kernel_name, ds_name, str(fcsp)))

    try:
        gram_matrix, run_time = compute(kernel_name, ds_name, fcsp, file_name)
    except Exception as exp:
        print('An exception occurred when running this experiment:')
        LOG_FILENAME = os.path.join(save_dir, 'error.space' + save_file_suffix + '.txt')
        logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
        logging.exception('\n--------------' + save_file_suffix + '------------------')
        print(repr(exp))
#     else:
#         with open(file_name, 'wb') as f:
#             pickle.dump(run_time, f)
def compute(kernel_name, ds_name, fcsp, file_name):
    dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
    if kernel_name == 'ShortestPath':
        dataset.trim_dataset(edge_required=True)
#         dataset.cut_graphs(range(0, 10))
        kernel_class = SPSpace
    else:
#         dataset.cut_graphs(range(0, 10))
        kernel_class = SSPSpace

    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
    edge_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}

    graph_kernel = kernel_class(name=kernel_name,
                                node_labels=dataset.node_labels,
                                edge_labels=dataset.edge_labels,
                                node_attrs=dataset.node_attrs,
                                edge_attrs=dataset.edge_attrs,
                                ds_infos=dataset.get_dataset_infos(keys=['directed']),
                                fcsp=fcsp,
                                compute_method='naive',
                                node_kernels=node_kernels,
                                edge_kernels=edge_kernels,
                                file_name=file_name)

    gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
                                                 parallel=None,
                                                 normalize=False,
                                                 verbose=2)

    return gram_matrix, run_time
if __name__ == '__main__':
    if len(sys.argv) > 1:
        kernel_name = sys.argv[1]
        ds_name = sys.argv[2]
        fcsp = True if sys.argv[3] == 'True' else False
    else:
        kernel_name = 'StructuralSP'
        ds_name = 'Fingerprint'
        fcsp = True

    save_dir = 'outputs/'
    os.makedirs(save_dir, exist_ok=True)

    run_task(kernel_name, ds_name, fcsp)
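A quick inspection sketch (assumption: a finished run has produced the file
below): the 'space' pickles carry the counters accumulated in shortest_path.py
and structural_sp.py, plus the 'completed' flag that run_task() uses to resume.

    import pickle

    with open('outputs/space.ShortestPath.Acyclic.True.pkl', 'rb') as f:
        data = pickle.load(f)
    if data['completed']:
        # For SPSpace runs: mean node-kernel comparisons per graph pair.
        print(data['nb_comparison'])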
@@ -10,27 +10,60 @@ import os
import re
def get_job_script(param):
OUT_TIME_LIST = [('ShortestPath', 'ENZYMES', 'False'),
                 ('StructuralSP', 'ENZYMES', 'True'),
                 ('StructuralSP', 'ENZYMES', 'False'),
                 ('StructuralSP', 'AIDS', 'False'),
                 ('ShortestPath', 'NCI1', 'False'),
                 ('StructuralSP', 'NCI1', 'True'),
                 ('StructuralSP', 'NCI1', 'False'),
                 ('ShortestPath', 'NCI109', 'False'),
                 ('StructuralSP', 'NCI109', 'True'),
                 ('StructuralSP', 'NCI109', 'False'),
                 ('ShortestPath', 'DD', 'True'),
                 ('ShortestPath', 'DD', 'False'),
                 ('StructuralSP', 'BZR', 'False'),
                 ('ShortestPath', 'COX2', 'False'),
                 ('StructuralSP', 'COX2', 'False'),
                 ('ShortestPath', 'DHFR', 'False'),
                 ]

OUT_MEM_LIST = [('StructuralSP', 'PROTEINS', 'True'),
                ('StructuralSP', 'PROTEINS', 'False'),
                ('StructuralSP', 'PROTEINS_full', 'True'),
                ('StructuralSP', 'PROTEINS_full', 'False'),
                ('ShortestPath', 'REDDIT-BINARY', 'True'),
                ]

MISS_LABEL_LIST = [('StructuralSP', 'GREC', 'True'),
                   ('StructuralSP', 'GREC', 'False'),
                   ('StructuralSP', 'Web', 'True'),
                   ('StructuralSP', 'Web', 'False'),
                   ]
def get_job_script(kernel, dataset, fcsp):
    script = r"""
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name="fcsp.""" + param + r""""
#SBATCH --partition=long
#SBATCH --job-name="fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""""
#SBATCH --partition=tlong
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_fcsp.""" + param + r""".txt"
#SBATCH --error="errors/error_fcsp.""" + param + r""".txt"
#SBATCH --output="outputs/output_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#SBATCH --error="errors/error_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=100:00:00
#SBATCH --mem-per-cpu=4000
#SBATCH --time=300:00:00
##SBATCH --mem-per-cpu=4000
#SBATCH --mem=40000

srun hostname
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
srun python3 compare_fcsp.py """ + param
srun python3 compare_fcsp.py """ + kernel + r" " + dataset + r" " + fcsp
    script = script.strip()
    script = re.sub('\n\t+', '\n', script)
    script = re.sub('\n +', '\n', script)
@@ -38,15 +71,75 @@ srun python3 compare_fcsp.py """ + param
    return script
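# A standalone sketch (an addition, not from the diff) of the whitespace
# clean-up above: the script template is indented inside the Python source, so
# tabs/spaces after each newline are stripped to keep every #SBATCH directive
# at column zero.
#
#     import re
#     demo = '\n\t#!/bin/bash\n\t#SBATCH --job-name="fcsp.demo"\n\tsrun hostname'
#     demo = demo.strip()
#     demo = re.sub('\n\t+', '\n', demo)
#     demo = re.sub('\n +', '\n', demo)
#     print(demo)  # every line now starts at column zero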
def check_task_status(save_dir, *params):
    str_task_id = '.' + '.'.join(params)

    # Check if the task is in the out-of-memory, out-of-time or missing-label lists.
    if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
        return True

    # Check if the task is running or queued in SLURM.
    command = 'squeue --user ljia02 --name "fcsp' + str_task_id + '" --format "%.2t" --noheader'
    stream = os.popen(command)
    output = stream.readlines()
    if len(output) > 0:
        return True

    # Check if the results are already computed.
    file_name = os.path.join(save_dir, 'run_time' + str_task_id + '.pkl')
    if os.path.isfile(file_name):
        return True

    return False
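# A minimal sketch (assumes a SLURM cluster and this user) of the squeue probe
# above: '--format "%.2t" --noheader' prints one state code (e.g. R, PD) per
# matching job, so an empty read means the task is neither queued nor running.
#
#     import os
#     cmd = 'squeue --user ljia02 --name "fcsp.ShortestPath.Acyclic.True" --format "%.2t" --noheader'
#     output = os.popen(cmd).readlines()
#     print('job active' if len(output) > 0 else 'no such job')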
if __name__ == '__main__':
    save_dir = 'outputs/'
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs('outputs/', exist_ok=True)
    os.makedirs('errors/', exist_ok=True)

    param_list = ['True', 'False']
    for param in param_list[:]:
        job_script = get_job_script(param)
        command = 'sbatch <<EOF\n' + job_script + '\nEOF'
#         print(command)
        os.system(command)
    from sklearn.model_selection import ParameterGrid

    Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
                    'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
                    'Letter-high', 'Letter-med', 'Letter-low',
                    'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
                    # new: not so large.
                    'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
                    'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
                    'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
                    'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
                    # new: large.
                    'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
                    'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
                    'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
                    'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
                    'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
                    'COLORS-3', 'DBLP_v1', 'MSRC_9', 'MSRC_21', 'MSRC_21C',
                    'COLLAB', 'COIL-DEL',
                    'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
                    'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K',
                    'REDDIT-MULTI-12K']

    Kernel_List = ['ShortestPath', 'StructuralSP']

    fcsp_list = ['True', 'False']

    task_grid = ParameterGrid({'kernel': Kernel_List[:],
                               'dataset': Dataset_List[:],
                               'fcsp': fcsp_list[:]})

    from tqdm import tqdm
    for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):
        if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
            job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
            command = 'sbatch <<EOF\n' + job_script + '\nEOF'
#             print(command)
            os.system(command)
#             os.popen(command)
#             output = stream.readlines()
@@ -0,0 +1,225 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 14 11:49:43 2020

@author: ljia
"""
import os
import re
import pickle
OUT_TIME_LIST = []

OUT_MEM_LIST = [('ShortestPath', 'REDDIT-BINARY', 'True'),
                ('ShortestPath', 'REDDIT-BINARY', 'False'),
                ('ShortestPath', 'DD', 'True'),
                ('ShortestPath', 'DD', 'False'),
                ('ShortestPath', 'MCF-7', 'True'),
                ('ShortestPath', 'MCF-7', 'False'),
                ('StructuralSP', 'MCF-7', 'True'),
                ('StructuralSP', 'MCF-7', 'False'),
                ('ShortestPath', 'MCF-7H', 'True'),
                ('ShortestPath', 'MCF-7H', 'False'),
                ('StructuralSP', 'MCF-7H', 'True'),
                ('StructuralSP', 'MCF-7H', 'False'),
                ('ShortestPath', 'MOLT-4', 'True'),
                ('ShortestPath', 'MOLT-4', 'False'),
                ('StructuralSP', 'MOLT-4', 'True'),
                ('StructuralSP', 'MOLT-4', 'False'),
                ('ShortestPath', 'MOLT-4H', 'True'),
                ('ShortestPath', 'MOLT-4H', 'False'),
                ('StructuralSP', 'MOLT-4H', 'True'),
                ('StructuralSP', 'MOLT-4H', 'False'),
                ('ShortestPath', 'P388', 'True'),
                ('ShortestPath', 'P388H', 'True'),
                ('ShortestPath', 'NCI-H23', 'True'),
                ('ShortestPath', 'NCI-H23', 'False'),
                ('StructuralSP', 'NCI-H23', 'True'),
                ('StructuralSP', 'NCI-H23', 'False'),
                ('ShortestPath', 'NCI-H23H', 'True'),
                ('ShortestPath', 'NCI-H23H', 'False'),
                ('StructuralSP', 'NCI-H23H', 'True'),
                ('StructuralSP', 'NCI-H23H', 'False'),
                ('ShortestPath', 'OVCAR-8', 'True'),
                ('ShortestPath', 'OVCAR-8', 'False'),
                ('StructuralSP', 'OVCAR-8', 'True'),
                ('StructuralSP', 'OVCAR-8', 'False'),
                ('ShortestPath', 'OVCAR-8H', 'False'),
                ('StructuralSP', 'OVCAR-8H', 'False'),
                ('ShortestPath', 'SN12C', 'True'),
                ('ShortestPath', 'SN12C', 'False'),
                ('StructuralSP', 'SN12C', 'True'),
                ('StructuralSP', 'SN12C', 'False'),
                ('ShortestPath', 'SN12CH', 'True'),
                ('ShortestPath', 'SN12CH', 'False'),
                ('ShortestPath', 'SF-295', 'True'),
                ('ShortestPath', 'SF-295', 'False'),
                ('StructuralSP', 'SF-295', 'True'),
                ('StructuralSP', 'SF-295', 'False'),
                ('ShortestPath', 'SF-295H', 'False'),
                ('StructuralSP', 'SF-295H', 'False'),
                ('ShortestPath', 'SW-620', 'True'),
                ('ShortestPath', 'SW-620', 'False'),
                ('StructuralSP', 'SW-620', 'True'),
                ('StructuralSP', 'SW-620', 'False'),
                ('ShortestPath', 'SW-620H', 'False'),
                ('StructuralSP', 'SW-620H', 'False'),
                ('ShortestPath', 'TRIANGLES', 'False'),
                ('StructuralSP', 'TRIANGLES', 'False'),
                ('ShortestPath', 'Yeast', 'True'),
                ('ShortestPath', 'Yeast', 'False'),
                ('StructuralSP', 'Yeast', 'True'),
                ('StructuralSP', 'Yeast', 'False'),
                ('ShortestPath', 'YeastH', 'True'),
                ('ShortestPath', 'FRANKENSTEIN', 'True'),
                ('ShortestPath', 'FRANKENSTEIN', 'False'),
                ('StructuralSP', 'FRANKENSTEIN', 'True'),
                ('StructuralSP', 'FRANKENSTEIN', 'False'),
                ('StructuralSP', 'SN12CH', 'True'),
                ('StructuralSP', 'SN12CH', 'False'),
                ('ShortestPath', 'UACC257', 'True'),
                ('ShortestPath', 'UACC257', 'False'),
                ('StructuralSP', 'UACC257', 'True'),
                ('StructuralSP', 'UACC257', 'False'),
                ('ShortestPath', 'UACC257H', 'True'),
                ('ShortestPath', 'UACC257H', 'False'),
                ('StructuralSP', 'UACC257H', 'True'),
                ('StructuralSP', 'UACC257H', 'False'),
                ('ShortestPath', 'PC-3', 'True'),
                ('ShortestPath', 'PC-3', 'False'),
                ('StructuralSP', 'PC-3', 'True'),
                ('StructuralSP', 'PC-3', 'False'),
                ('ShortestPath', 'PC-3H', 'True'),
                ('ShortestPath', 'PC-3H', 'False'),
                ('StructuralSP', 'PC-3H', 'True'),
                ('StructuralSP', 'PC-3H', 'False'),
                ('ShortestPath', 'DBLP_v1', 'False'),
                ('StructuralSP', 'DBLP_v1', 'True'),
                ('ShortestPath', 'REDDIT-MULTI-12K', 'False'),
                ('StructuralSP', 'REDDIT-MULTI-12K', 'False'),
                ('ShortestPath', 'TWITTER-Real-Graph-Partial', 'True'),
                ('ShortestPath', 'TWITTER-Real-Graph-Partial', 'False'),
                ('StructuralSP', 'TWITTER-Real-Graph-Partial', 'True'),
                ('StructuralSP', 'TWITTER-Real-Graph-Partial', 'False'),
                ]
MISS_LABEL_LIST = [('StructuralSP', 'GREC', 'True'),
                   ('StructuralSP', 'GREC', 'False'),
                   ('StructuralSP', 'Web', 'True'),
                   ('StructuralSP', 'Web', 'False'),
                   ]
def get_job_script(kernel, dataset, fcsp):
#     if (kernel, dataset, fcsp) in OUT_MEM_LIST:
#         mem = '2560000'
#     else:
    mem = '4000'

    script = r"""
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name="fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""""
#SBATCH --partition=court
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#SBATCH --error="errors/error_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=48:00:00
##SBATCH --mem-per-cpu=""" + mem + r"""
#SBATCH --mem=4000

srun hostname
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
srun python3 compare_fcsp_space.py """ + kernel + r" " + dataset + r" " + fcsp
    script = script.strip()
    script = re.sub('\n\t+', '\n', script)
    script = re.sub('\n +', '\n', script)

    return script
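# A sanity-check sketch (hypothetical values, not from the diff): build one job
# script with the function above and print it instead of piping it to sbatch,
# to verify the #SBATCH header before submitting anything.
#
#     script = get_job_script('ShortestPath', 'Acyclic', 'True')
#     print(script)
#     # Expected first lines: #!/bin/bash / #SBATCH --exclusive /
#     # #SBATCH --job-name="fcsp.space.ShortestPath.Acyclic.True"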
def check_task_status(save_dir, *params):
    str_task_id = '.' + '.'.join(params)

    # Check if the task is in the out-of-memory, out-of-time or missing-label lists.
    if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
        return True

    # Check if the task is running or queued in SLURM.
    command = 'squeue --user ljia02 --name "fcsp.space' + str_task_id + '" --format "%.2t" --noheader'
    stream = os.popen(command)
    output = stream.readlines()
    if len(output) > 0:
        return True

    # Check if the task is already computed.
    file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl')
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            data = pickle.load(f)
            if data['completed']:
                return True

    return False
if __name__ == '__main__':
    save_dir = 'outputs/'
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs('errors/', exist_ok=True)

    from sklearn.model_selection import ParameterGrid

    Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
                    'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
                    'Letter-high', 'Letter-med', 'Letter-low',
                    'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
                    # new: not so large.
                    'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
                    'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
                    'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
                    'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
                    # new: large.
                    'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
                    'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
                    'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
                    'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
                    'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
                    'COLORS-3', 'DBLP_v1', 'MSRC_9', 'MSRC_21', 'MSRC_21C',
                    'COLLAB', 'COIL-DEL',
                    'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
                    'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K',
                    'REDDIT-MULTI-12K']

    Kernel_List = ['ShortestPath', 'StructuralSP']

    fcsp_list = ['True', 'False']

    task_grid = ParameterGrid({'kernel': Kernel_List[:],
                               'dataset': Dataset_List[:],
                               'fcsp': fcsp_list[:]})

    from tqdm import tqdm
    for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):
        if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
            job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
            command = 'sbatch <<EOF\n' + job_script + '\nEOF'
#             print(command)
            os.system(command)
#             os.popen(command)
#             output = stream.readlines()
@@ -0,0 +1,253 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 15:24:58 2020

@author: ljia

@references:

    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data
    Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
"""
import sys
from itertools import product
# from functools import partial
from gklearn.utils import get_iters
import numpy as np
from gklearn.utils.utils import getSPGraph
from gklearn.kernels import ShortestPath
import os
import pickle
from pympler import asizeof
import time
import networkx as nx
def load_results(file_name, fcsp):
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results = {'nb_comparison': [], 'i': -1, 'j': -1, 'completed': False}
        if fcsp:
            results['vk_dict_mem'] = []
        return results


def save_results(file_name, results):
    with open(file_name, 'wb') as f:
        pickle.dump(results, f)
def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
#     asizeof.asized(obj, detail=1).format()
#     return asizeof.asizeof(obj)
    key, val = next(iter(obj.items()))
#     key = dict.iterkeys().next()
#     key_mem = asizeof.asizeof(key)
    dict_flat = sys.getsizeof(obj)
    key_mem = 64
    if isinstance(val, float):
        val_mem = 24
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False
        mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)
#     print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
    return mem
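# A toy comparison sketch (not from the experiments): run the closed-form
# estimate above against pympler's exact measurement on a small dict shaped
# like vk_dict (float-valued node-pair kernels); values are illustrative only.
#
#     toy_vk = {(i, j): 1.0 for i in range(10) for j in range(10)}
#     print(estimate_vk_memory(toy_vk, 10, 10))  # estimated bytes
#     print(asizeof.asizeof(toy_vk))             # measured bytes, for reference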
def compute_stats(file_name, results):
    del results['i']
    del results['j']
    results['nb_comparison'] = np.mean(results['nb_comparison'])
    results['completed'] = True
    if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
        results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
    save_results(file_name, results)


class SPSpace(ShortestPath):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._file_name = kwargs.get('file_name')
#     @profile
    def _compute_gm_series(self):
        self._all_graphs_have_edges(self._graphs)
        # get shortest path graph of each graph.
        iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
        self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

        results = load_results(self._file_name, self._fcsp)

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
        iterator = get_iters(itr, desc='Computing kernels',
                             length=len_itr, file=sys.stdout, verbose=(self._verbose >= 2))

        time0 = time.time()
        for i, j in iterator:
            if i > results['i'] or (i == results['i'] and j > results['j']):
                data = self._sp_do_space(self._graphs[i], self._graphs[j])
                if self._fcsp:
                    results['nb_comparison'].append(data[0])
                    if data[1] != {}:
                        results['vk_dict_mem'].append(estimate_vk_memory(data[1],
                            nx.number_of_nodes(self._graphs[i]),
                            nx.number_of_nodes(self._graphs[j])))
                else:
                    results['nb_comparison'].append(data)

                results['i'] = i
                results['j'] = j

                time1 = time.time()
                if time1 - time0 > 600:
                    save_results(self._file_name, results)
                    time0 = time1

        compute_stats(self._file_name, results)

        return gram_matrix
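    # A standalone sketch (an addition, not from the source) of the resume
    # condition used in the loop above: pairs come from the upper triangle in a
    # fixed order, so everything up to the checkpointed (i, j) can be skipped
    # after a restart. The checkpoint values here are made up.
    #
    #     from itertools import combinations_with_replacement
    #     last_i, last_j = 2, 3  # hypothetical values restored from the pickle
    #     for i, j in combinations_with_replacement(range(5), 2):
    #         if i > last_i or (i == last_i and j > last_j):
    #             print('still to compute:', (i, j))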
    def _sp_do_space(self, g1, g2):
        if self._fcsp:  # @todo: it may be put outside the _sp_do().
            return self._sp_do_fcsp(g1, g2)
        else:
            return self._sp_do_naive(g1, g2)
    def _sp_do_fcsp(self, g1, g2):

        nb_comparison = 0

        # compute kernels between all pairs of nodes first, method borrowed from FCSP.
        vk_dict = {}  # node-kernel dict
        if len(self._node_labels) > 0:  # @todo: it may be put outside the _sp_do().
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['mix']
                for n1, n2 in product(
                        g1.nodes(data=True), g2.nodes(data=True)):
                    n1_labels = [n1[1][nl] for nl in self._node_labels]
                    n2_labels = [n2[1][nl] for nl in self._node_labels]
                    n1_attrs = [n1[1][na] for na in self._node_attrs]
                    n2_attrs = [n2[1][na] for na in self._node_attrs]
                    vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
                    nb_comparison += 1
            # node symb labeled
            else:
                kn = self._node_kernels['symb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
                        nb_comparison += 1
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['nsymb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
                        nb_comparison += 1
            # node unlabeled
            else:
                for e1, e2 in product(
                        g1.edges(data=True), g2.edges(data=True)):
                    pass
#                     if e1[2]['cost'] == e2[2]['cost']:
#                         kernel += 1
#                 nb_comparison += 1

        return nb_comparison, vk_dict
#         # compute graph kernels
#         if self._ds_infos['directed']:
#             for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
#                 if e1[2]['cost'] == e2[2]['cost']:
#                     nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
#                     kn1 = nk11 * nk22
#                     kernel += kn1
#         else:
#             for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
#                 if e1[2]['cost'] == e2[2]['cost']:
#                     # each edge walk is counted twice, starting from both its extreme nodes.
#                     nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
#                         e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
#                     kn1 = nk11 * nk22
#                     kn2 = nk12 * nk21
#                     kernel += kn1 + kn2
    def _sp_do_naive(self, g1, g2):

        nb_comparison = 0

        # Define the function to compute kernels between vertices in each condition.
        if len(self._node_labels) > 0:
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['mix']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
            # node symb labeled
            else:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['symb']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    return kn(n1_labels, n2_labels)
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['nsymb']
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_attrs, n2_attrs)
            # node unlabeled
            else:
#                 for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
#                     if e1[2]['cost'] == e2[2]['cost']:
#                         kernel += 1
                return 0

        # compute graph kernels
        if self._ds_infos['directed']:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
#                     nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
#                     kn1 = nk11 * nk22
#                     kernel += kn1
                    nb_comparison += 2
        else:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    # each edge walk is counted twice, starting from both its extreme nodes.
#                     nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
#                         e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
#                     kn1 = nk11 * nk22
#                     kn2 = nk12 * nk21
#                     kernel += kn1 + kn2
                    nb_comparison += 4

        return nb_comparison
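# A back-of-envelope sketch (an addition, not from the source) of why FCSP can
# pay off here: for two graphs with n1, n2 nodes and m1, m2 edges, the FCSP
# variant counts one vertex-kernel evaluation per node pair, while the naive
# variant counts up to 4 per matching undirected edge pair (2 when directed).
#
#     n1 = n2 = 30
#     m1 = m2 = 60
#     print('fcsp :', n1 * n2)      # 900 evaluations, done once
#     print('naive:', 4 * m1 * m2)  # up to 14400 evaluations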
@@ -0,0 +1,439 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 11:59:57 2020

@author: ljia

@references:

    [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
    Measuring Similarity of Shapes. In ESANN 2007 Apr 25 (pp. 355-360).
"""
import sys
from itertools import product
from gklearn.utils import get_iters
import numpy as np
import time
import os, errno
import pickle
from pympler import asizeof
import networkx as nx

from gklearn.utils.utils import get_shortest_paths
from gklearn.kernels import StructuralSP
def load_splist(file_name):
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results_path = {'splist': [], 'i': -1, 'completed': False}
        return results_path


def load_results(file_name, fcsp):
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            return pickle.load(f)
    else:
        results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False}
        if fcsp:
            results['vk_dict_mem'] = []
            results['ek_dict_mem'] = []
        return results


def save_results(file_name, results):
    with open(file_name, 'wb') as f:
        pickle.dump(results, f)
def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
#     asizeof.asized(obj, detail=1).format()
#     return asizeof.asizeof(obj)
    key, val = next(iter(obj.items()))
#     key = dict.iterkeys().next()
#     key_mem = asizeof.asizeof(key)
    dict_flat = sys.getsizeof(obj)
    key_mem = 64
    if isinstance(val, float):
        val_mem = 24
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False
        mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)
#     print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
    return mem
def estimate_ek_memory(obj, nb_nodes1, nb_nodes2):
#     asizeof.asized(obj, detail=1).format()
#     return asizeof.asizeof(obj)
    key, val = next(iter(obj.items()))
#     key = dict.iterkeys().next()
#     key_mem = asizeof.asizeof(key)
    dict_flat = sys.getsizeof(obj)
    key_mem = 192
    if isinstance(val, float):
        val_mem = 24
        mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
    else:  # value is True or False
        mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)
#     print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')
    return mem
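# Note (an observation, not from the source): vk_dict keys are plain node-id
# pairs while ek_dict keys are pairs of node pairs (nested tuples), which is
# why estimate_ek_memory() assumes a larger per-key footprint (192 bytes) than
# estimate_vk_memory() (64 bytes).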
def compute_stats(file_name, results, splist):
    del results['i']
    del results['j']
    results['nb_v_comparison'] = np.mean(results['nb_v_comparison'])
#     if len(results['nb_e_comparison']) > 0:
    results['nb_e_comparison'] = np.mean(results['nb_e_comparison'])
    results['completed'] = True
    if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
        results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
    if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0:
        results['ek_dict_mem'] = np.mean(results['ek_dict_mem'])
    results['nb_sp_ave'] = np.mean([len(ps) for ps in splist])
    results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist])
    results['sp_mem_all'] = asizeof.asizeof(splist)
    save_results(file_name, results)
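# A toy sketch (illustrative values, not from the experiments) of the
# shortest-path statistics computed above: splist holds, per graph, the list
# of its shortest paths, each path a list of node ids.
#
#     splist = [[[0, 1], [0, 1, 2]], [[0, 2]]]  # two hypothetical graphs
#     print(np.mean([len(ps) for ps in splist]))  # nb_sp_ave -> 1.5
#     print(np.mean([np.mean([len(p) for p in ps]) for ps in splist]))  # sp_len_ave -> 2.25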
class SSPSpace(StructuralSP):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._file_name = kwargs.get('file_name')
#     @profile
    def _compute_gm_series(self):
        # get shortest paths of each graph in the graphs.
        fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl'
        results_path = load_splist(fn_paths)

        if not results_path['completed']:
            iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
            if self._compute_method == 'trie':
                for g in iterator:
                    results_path['splist'].append(self._get_sps_as_trie(g))
            else:
                time0 = time.time()
                for i, g in enumerate(iterator):
                    if i > results_path['i']:
                        results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))
                        results_path['i'] = i

                        time1 = time.time()
                        if time1 - time0 > 600:
                            save_results(fn_paths, results_path)
                            time0 = time1

            del results_path['i']
            results_path['completed'] = True
            save_results(fn_paths, results_path)

        #########
        splist = results_path['splist']

        results = load_results(self._file_name, self._fcsp)

        # compute Gram matrix.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
        iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
                             length=len_itr, verbose=(self._verbose >= 2))

        if self._compute_method == 'trie':
            for i, j in iterator:
                kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
                gram_matrix[i][j] = kernel
                gram_matrix[j][i] = kernel
        else:
            time0 = time.time()
            for i, j in iterator:
                if i > results['i'] or (i == results['i'] and j > results['j']):
                    data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j])
                    results['nb_v_comparison'].append(data[0])
                    results['nb_e_comparison'].append(data[1])
                    if self._fcsp:
                        if data[2] != {}:
                            results['vk_dict_mem'].append(estimate_vk_memory(data[2],
                                nx.number_of_nodes(self._graphs[i]),
                                nx.number_of_nodes(self._graphs[j])))
                        if data[3] != {}:
                            results['ek_dict_mem'].append(estimate_ek_memory(data[3],
                                nx.number_of_nodes(self._graphs[i]),
                                nx.number_of_nodes(self._graphs[j])))

                    results['i'] = i
                    results['j'] = j

                    time1 = time.time()
                    if time1 - time0 > 600:
                        save_results(self._file_name, results)
                        time0 = time1

        compute_stats(self._file_name, results, splist)
        # @todo: may not remove the path file if the program stops exactly here.
        try:
            os.remove(fn_paths)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise

        return gram_matrix
    def _ssp_do_naive_space(self, g1, g2, spl1, spl2):
        if self._fcsp:  # @todo: it may be put outside the _sp_do().
            return self._sp_do_naive_fcsp(g1, g2, spl1, spl2)
        else:
            return self._sp_do_naive_naive(g1, g2, spl1, spl2)
    def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2):
        # First, compute kernels between all pairs of nodes, method borrowed from FCSP.
        vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2)
        # Then, compute kernels between all pairs of edges, an extension of the
        # FCSP idea. It suits sparse graphs, which covers most of the cases we
        # encountered; for dense graphs this would be slow.
        ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2)

        return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict
    def _sp_do_naive_naive(self, g1, g2, spl1, spl2):

        nb_v_comparison = 0
        nb_e_comparison = 0

        # Define the function to compute kernels between vertices in each condition.
        if len(self._node_labels) > 0:
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['mix']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
            # node symb labeled
            else:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['symb']
                    n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
                    n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
                    return kn(n1_labels, n2_labels)
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                def compute_vk(n1, n2):
                    kn = self._node_kernels['nsymb']
                    n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
                    n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
                    return kn(n1_attrs, n2_attrs)
#             # node unlabeled
#             else:
#                 for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
#                     if e1[2]['cost'] == e2[2]['cost']:
#                         kernel += 1
#                 return kernel

        # Define the function to compute kernels between edges in each condition.
        if len(self._edge_labels) > 0:
            # edge symb and non-symb labeled
            if len(self._edge_attrs) > 0:
                def compute_ek(e1, e2):
                    ke = self._edge_kernels['mix']
                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
                    return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
            # edge symb labeled
            else:
                def compute_ek(e1, e2):
                    ke = self._edge_kernels['symb']
                    e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
                    e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
                    return ke(e1_labels, e2_labels)
        else:
            # edge non-symb labeled
            if len(self._edge_attrs) > 0:
                def compute_ek(e1, e2):
                    ke = self._edge_kernels['nsymb']
                    e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
                    e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
                    return ke(e1_attrs, e2_attrs)
        # compute graph kernels
        if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
#                         nb_v_comparison = len(p1)
#                         nb_e_comparison = len(p1) - 1
                        kpath = compute_vk(p1[0], p2[0])
                        nb_v_comparison += 1
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= compute_vk(p1[idx], p2[idx]) * \
                                    compute_ek((p1[idx - 1], p1[idx]),
                                               (p2[idx - 1], p2[idx]))
                                nb_v_comparison += 1
                                nb_e_comparison += 1
                                if not kpath:
                                    break
#                         kernel += kpath  # add up kernels of all paths
            else:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kpath = compute_vk(p1[0], p2[0])
                        nb_v_comparison += 1
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= compute_vk(p1[idx], p2[idx])
                                nb_v_comparison += 1
                                if not kpath:
                                    break
#                         kernel += kpath  # add up kernels of all paths
        else:
            if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        if len(p1) == 0:
                            pass
                        else:
                            kpath = 1
                            for idx in range(0, len(p1) - 1):
                                kpath *= compute_ek((p1[idx], p1[idx + 1]),
                                                    (p2[idx], p2[idx + 1]))
                                nb_e_comparison += 1
                                if not kpath:
                                    break
            else:
                pass
#                 for p1, p2 in product(spl1, spl2):
#                     if len(p1) == len(p2):
#                         kernel += 1

#         try:
#             kernel = kernel / (len(spl1) * len(spl2))  # Compute mean average
#         except ZeroDivisionError:
#             print(spl1, spl2)
#             print(g1.nodes(data=True))
#             print(g1.edges(data=True))
#             raise Exception

        return nb_v_comparison, nb_e_comparison
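    # A standalone sketch (an addition, not from the source) of the path-kernel
    # product whose comparisons are counted above: vertex kernels along two
    # equal-length paths, interleaved with edge kernels, with the same early
    # exit once the running product hits zero.
    #
    #     p1, p2 = [0, 1, 2], [5, 6, 7]  # hypothetical same-length paths
    #     vk = lambda a, b: 1.0          # stand-in vertex kernel
    #     ek = lambda e, f: 0.5          # stand-in edge kernel
    #     kpath = vk(p1[0], p2[0])
    #     for idx in range(1, len(p1)):
    #         kpath *= vk(p1[idx], p2[idx]) * ek((p1[idx - 1], p1[idx]), (p2[idx - 1], p2[idx]))
    #     print(kpath)  # 0.25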
    def _get_all_node_kernels(self, g1, g2):
        nb_comparison = 0

        vk_dict = {}  # node-kernel dict
        if len(self._node_labels) > 0:
            # node symb and non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['mix']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
                        nb_comparison += 1
            # node symb labeled
            else:
                kn = self._node_kernels['symb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_labels = [n1[1][nl] for nl in self._node_labels]
                        n2_labels = [n2[1][nl] for nl in self._node_labels]
                        vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
                        nb_comparison += 1
        else:
            # node non-symb labeled
            if len(self._node_attrs) > 0:
                kn = self._node_kernels['nsymb']
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        n1_attrs = [n1[1][na] for na in self._node_attrs]
                        n2_attrs = [n2[1][na] for na in self._node_attrs]
                        vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
                        nb_comparison += 1
            # node unlabeled
            else:
                pass  # @todo: add edge weights.
#                 for e1 in g1.edges(data=True):
#                     for e2 in g2.edges(data=True):
#                         if e1[2]['cost'] == e2[2]['cost']:
#                             kernel += 1
#                 return kernel

        return vk_dict, nb_comparison
    def _get_all_edge_kernels(self, g1, g2):
        nb_comparison = 0

        # Compute kernels between all pairs of edges, an extension of the FCSP
        # idea. It suits sparse graphs, which covers most of the cases we
        # encountered; for dense graphs this would be slow.
        ek_dict = {}  # dict of edge kernels
        if len(self._edge_labels) > 0:
            # edge symb and non-symb labeled
            if len(self._edge_attrs) > 0:
                ke = self._edge_kernels['mix']
                for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                    e1_labels = [e1[2][el] for el in self._edge_labels]
                    e2_labels = [e2[2][el] for el in self._edge_labels]
                    e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
                    e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
                    ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                    ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                    ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
                    nb_comparison += 1
            # edge symb labeled
            else:
                ke = self._edge_kernels['symb']
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        e1_labels = [e1[2][el] for el in self._edge_labels]
                        e2_labels = [e2[2][el] for el in self._edge_labels]
                        ek_temp = ke(e1_labels, e2_labels)
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
                        nb_comparison += 1
        else:
            # edge non-symb labeled
            if len(self._edge_attrs) > 0:
                ke = self._edge_kernels['nsymb']
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
                        e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
                        ek_temp = ke(e1_attrs, e2_attrs)
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
                        ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
                        ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
                        nb_comparison += 1
            # edge unlabeled
            else:
                pass

        return ek_dict, nb_comparison
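# A toy sketch (an addition, not from the source) of the symmetric filling
# above: for undirected graphs each edge-pair kernel is stored under all four
# key orientations, so later lookups need not normalise the (u, v) order.
#
#     ek_dict = {}
#     e1, e2, ek_temp = (0, 1), (7, 8), 0.9  # hypothetical edges and kernel value
#     for k1 in (e1, e1[::-1]):
#         for k2 in (e2, e2[::-1]):
#             ek_dict[(k1, k2)] = ek_temp
#     print(len(ek_dict))  # 4 entries for one comparison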