@@ -10,6 +10,7 @@ This script compares the results with and without FCSP.
from gklearn.dataset import Dataset
from gklearn.utils import get_graph_kernel_by_name
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.experiments import DATASET_ROOT
import functools
import os
import pickle
@@ -17,50 +18,77 @@ import sys
import logging
# def run_all(fcsp):
# 	from sklearn.model_selection import ParameterGrid

# 	Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
# 	                'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
# 	                'Letter-high', 'Letter-med', 'Letter-low',
# 	                'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
# 	                'BZR', 'COX2', 'DHFR', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
# 	                'Cuneiform', 'KKI', 'OHSU', 'Peking_1', 'SYNTHETICnew',
# 	                'Synthie', 'SYNTHETIC', 'Fingerprint', 'IMDB-BINARY',
# 	                'IMDB-MULTI', 'COIL-DEL', 'PROTEINS', 'PROTEINS_full',
# 	                'Mutagenicity', 'REDDIT-BINARY']
# 	Kernel_List = ['ShortestPath', 'StructuralSP']

# 	task_grid = ParameterGrid({'kernel': Kernel_List[:], 'dataset': Dataset_List[:]})

# 	for task in list(task_grid):
# 		save_file_suffix = '.' + task['kernel'] + '.' + task['dataset']
# 		file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')

# 		if not os.path.isfile(file_name):
# 			print()
# 			print((task['kernel'], task['dataset']))

# 			try:
# 				gram_matrix, run_time = compute(task['kernel'], task['dataset'], fcsp)
# 			except Exception as exp:
# 				print('An exception occurred when running this experiment:')
# 				LOG_FILENAME = save_dir + 'error.txt'
# 				logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
# 				logging.exception('\n--------------' + save_file_suffix + '------------------')
# 				print(repr(exp))
# 			else:
# 				with open(file_name, 'wb') as f:
# 					pickle.dump(run_time, f)


def run_task(kernel_name, ds_name, fcsp):
	save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
	file_name = os.path.join(save_dir, 'run_time' + save_file_suffix + '.pkl')

	if not os.path.isfile(file_name):
		print()
		print((kernel_name, ds_name, str(fcsp)))

		try:
			gram_matrix, run_time = compute(kernel_name, ds_name, fcsp)
		except Exception as exp:
			print('An exception occurred when running this experiment:')
			LOG_FILENAME = os.path.join(save_dir, 'error' + save_file_suffix + '.txt')
			logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
			logging.exception('\n--------------' + save_file_suffix + '------------------')
			print(repr(exp))
		else:
			with open(file_name, 'wb') as f:
				pickle.dump(run_time, f)


def compute(kernel_name, ds_name, fcsp):
	dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
	if kernel_name == 'ShortestPath':
		dataset.trim_dataset(edge_required=True)

	mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
	node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
@@ -87,8 +115,15 @@ def run_work(kernel_name, ds_name, fcsp):
if __name__ == '__main__':
	if len(sys.argv) > 1:
		kernel_name = sys.argv[1]
		ds_name = sys.argv[2]
		fcsp = (sys.argv[3] == 'True')
	else:
		kernel_name = 'ShortestPath'
		ds_name = 'Acyclic'
		fcsp = True

	save_dir = 'outputs/'
	os.makedirs(save_dir, exist_ok=True)

	run_task(kernel_name, ds_name, fcsp)
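# Example invocation (illustrative, not from the original file): each run
# handles one (kernel, dataset, fcsp) triple taken from sys.argv, e.g.
#
#     python3 compare_fcsp.py StructuralSP MUTAG False
#
# computes the StructuralSP kernel on MUTAG with FCSP disabled and writes the
# run time to outputs/run_time.StructuralSP.MUTAG.False.pkl.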
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 2 17:41:54 2020

@author: ljia

This script compares the results with and without FCSP.
"""
from gklearn.dataset import Dataset
from shortest_path import SPSpace
from structural_sp import SSPSpace
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
from gklearn.experiments import DATASET_ROOT
import functools
import os
import pickle
import sys
import logging


def run_task(kernel_name, ds_name, fcsp):
	save_file_suffix = '.' + kernel_name + '.' + ds_name + '.' + str(fcsp)
	file_name = os.path.join(save_dir, 'space' + save_file_suffix + '.pkl')

	# Return if the task is already completed.
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			data = pickle.load(f)
			if data['completed']:
				return

	print()
	print((kernel_name, ds_name, str(fcsp)))

	try:
		gram_matrix, run_time = compute(kernel_name, ds_name, fcsp, file_name)
	except Exception as exp:
		print('An exception occurred when running this experiment:')
		LOG_FILENAME = os.path.join(save_dir, 'error.space' + save_file_suffix + '.txt')
		logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
		logging.exception('\n--------------' + save_file_suffix + '------------------')
		print(repr(exp))
# 	else:
# 		with open(file_name, 'wb') as f:
# 			pickle.dump(run_time, f)


def compute(kernel_name, ds_name, fcsp, file_name):
	dataset = Dataset(ds_name, root=DATASET_ROOT, verbose=True)
	if kernel_name == 'ShortestPath':
		dataset.trim_dataset(edge_required=True)
# 		dataset.cut_graphs(range(0, 10))
		kernel_class = SPSpace
	else:
# 		dataset.cut_graphs(range(0, 10))
		kernel_class = SSPSpace

	mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
	node_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
	edge_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}

	graph_kernel = kernel_class(name=kernel_name,
	                            node_labels=dataset.node_labels,
	                            edge_labels=dataset.edge_labels,
	                            node_attrs=dataset.node_attrs,
	                            edge_attrs=dataset.edge_attrs,
	                            ds_infos=dataset.get_dataset_infos(keys=['directed']),
	                            fcsp=fcsp,
	                            compute_method='naive',
	                            node_kernels=node_kernels,
	                            edge_kernels=edge_kernels,
	                            file_name=file_name)
	gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
	                                             parallel=None,
	                                             normalize=False,
	                                             verbose=2)
	return gram_matrix, run_time


if __name__ == '__main__':
	if len(sys.argv) > 1:
		kernel_name = sys.argv[1]
		ds_name = sys.argv[2]
		fcsp = (sys.argv[3] == 'True')
	else:
		kernel_name = 'StructuralSP'
		ds_name = 'Fingerprint'
		fcsp = True

	save_dir = 'outputs/'
	os.makedirs(save_dir, exist_ok=True)

	run_task(kernel_name, ds_name, fcsp)
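# Example invocation (illustrative, not from the original file):
#
#     python3 compare_fcsp_space.py ShortestPath Acyclic True
#
# records comparison counts and estimated kernel-dict memory for the
# ShortestPath kernel on Acyclic with FCSP enabled, checkpointing the
# statistics to outputs/space.ShortestPath.Acyclic.True.pkl.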
@@ -10,27 +10,60 @@ import os
import re
OUT_TIME_LIST = [('ShortestPath', 'ENZYMES', 'False'),
                 ('StructuralSP', 'ENZYMES', 'True'),
                 ('StructuralSP', 'ENZYMES', 'False'),
                 ('StructuralSP', 'AIDS', 'False'),
                 ('ShortestPath', 'NCI1', 'False'),
                 ('StructuralSP', 'NCI1', 'True'),
                 ('StructuralSP', 'NCI1', 'False'),
                 ('ShortestPath', 'NCI109', 'False'),
                 ('StructuralSP', 'NCI109', 'True'),
                 ('StructuralSP', 'NCI109', 'False'),
                 ('ShortestPath', 'DD', 'True'),
                 ('ShortestPath', 'DD', 'False'),
                 ('StructuralSP', 'BZR', 'False'),
                 ('ShortestPath', 'COX2', 'False'),
                 ('StructuralSP', 'COX2', 'False'),
                 ('ShortestPath', 'DHFR', 'False'),
                 ]

OUT_MEM_LIST = [('StructuralSP', 'PROTEINS', 'True'),
                ('StructuralSP', 'PROTEINS', 'False'),
                ('StructuralSP', 'PROTEINS_full', 'True'),
                ('StructuralSP', 'PROTEINS_full', 'False'),
                ('ShortestPath', 'REDDIT-BINARY', 'True'),
                ]

MISS_LABEL_LIST = [('StructuralSP', 'GREC', 'True'),
                   ('StructuralSP', 'GREC', 'False'),
                   ('StructuralSP', 'Web', 'True'),
                   ('StructuralSP', 'Web', 'False'),
                   ]
def get_job_script(kernel, dataset, fcsp):
	script = r"""
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name="fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""""
#SBATCH --partition=tlong
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#SBATCH --error="errors/error_fcsp.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=300:00:00
##SBATCH --mem-per-cpu=4000
#SBATCH --mem=40000

srun hostname
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
srun python3 compare_fcsp.py """ + kernel + r" " + dataset + r" " + fcsp

	script = script.strip()
	script = re.sub('\n\t+', '\n', script)
	script = re.sub('\n +', '\n', script)
@@ -38,15 +71,75 @@ srun python3 compare_fcsp.py """ + param

	return script
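# For illustration (not in the original file), get_job_script('ShortestPath',
# 'MUTAG', 'True') renders roughly to the following, after strip() and the
# re.sub() calls remove the indentation of the template:
#
#     #!/bin/bash
#     #SBATCH --exclusive
#     #SBATCH --job-name="fcsp.ShortestPath.MUTAG.True"
#     #SBATCH --partition=tlong
#     ...
#     srun python3 compare_fcsp.py ShortestPath MUTAG True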
def check_task_status(save_dir, *params):
	str_task_id = '.' + '.'.join(params)

	# Check if the task is in the out-of-memory, out-of-time, or missing-label lists.
	if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
		return True

	# Check if the task is running or queued in SLURM.
	command = 'squeue --user ljia02 --name "fcsp' + str_task_id + '" --format "%.2t" --noheader'
	stream = os.popen(command)
	output = stream.readlines()
	if len(output) > 0:
		return True

	# Check if the results are already computed.
	file_name = os.path.join(save_dir, 'run_time' + str_task_id + '.pkl')
	if os.path.isfile(file_name):
		return True

	return False
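# For reference (not in the original file): with --noheader, squeue prints one
# two-character state code per matching job, e.g. 'PD' (pending) or ' R'
# (running), and prints nothing when no job matches, so a non-empty
# readlines() result means the task is still queued or running.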
if __name__ == '__main__':
	save_dir = 'outputs/'
	os.makedirs(save_dir, exist_ok=True)
	os.makedirs('errors/', exist_ok=True)

	from sklearn.model_selection import ParameterGrid

	Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
	                'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
	                'Letter-high', 'Letter-med', 'Letter-low',
	                'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
	                # new: not so large.
	                'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
	                'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
	                'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
	                'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
	                # new: large.
	                'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
	                'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
	                'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
	                'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
	                'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
	                'COLORS-3', 'DBLP_v1', 'REDDIT-MULTI-12K',
	                'MSRC_9', 'MSRC_21', 'MSRC_21C',
	                'COLLAB', 'COIL-DEL',
	                'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
	                'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K']
	Kernel_List = ['ShortestPath', 'StructuralSP']
	fcsp_list = ['True', 'False']

	task_grid = ParameterGrid({'kernel': Kernel_List[:],
	                           'dataset': Dataset_List[:],
	                           'fcsp': fcsp_list[:]})

	from tqdm import tqdm
	for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):
		if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
			job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
			command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# 			print(command)
			os.system(command)
# 			os.popen(command)
# 			output = stream.readlines()
@@ -0,0 +1,225 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 14 11:49:43 2020

@author: ljia
"""
import os
import re
import pickle

OUT_TIME_LIST = []

OUT_MEM_LIST = [('ShortestPath', 'REDDIT-BINARY', 'True'),
                ('ShortestPath', 'REDDIT-BINARY', 'False'),
                ('ShortestPath', 'DD', 'True'),
                ('ShortestPath', 'DD', 'False'),
                ('ShortestPath', 'MCF-7', 'True'),
                ('ShortestPath', 'MCF-7', 'False'),
                ('StructuralSP', 'MCF-7', 'True'),
                ('StructuralSP', 'MCF-7', 'False'),
                ('ShortestPath', 'MCF-7H', 'True'),
                ('ShortestPath', 'MCF-7H', 'False'),
                ('StructuralSP', 'MCF-7H', 'True'),
                ('StructuralSP', 'MCF-7H', 'False'),
                ('ShortestPath', 'MOLT-4', 'True'),
                ('ShortestPath', 'MOLT-4', 'False'),
                ('StructuralSP', 'MOLT-4', 'True'),
                ('StructuralSP', 'MOLT-4', 'False'),
                ('ShortestPath', 'MOLT-4H', 'True'),
                ('ShortestPath', 'MOLT-4H', 'False'),
                ('StructuralSP', 'MOLT-4H', 'True'),
                ('StructuralSP', 'MOLT-4H', 'False'),
                ('ShortestPath', 'P388', 'True'),
                ('ShortestPath', 'P388H', 'True'),
                ('ShortestPath', 'NCI-H23', 'True'),
                ('ShortestPath', 'NCI-H23', 'False'),
                ('StructuralSP', 'NCI-H23', 'True'),
                ('StructuralSP', 'NCI-H23', 'False'),
                ('ShortestPath', 'NCI-H23H', 'True'),
                ('ShortestPath', 'NCI-H23H', 'False'),
                ('StructuralSP', 'NCI-H23H', 'True'),
                ('StructuralSP', 'NCI-H23H', 'False'),
                ('ShortestPath', 'OVCAR-8', 'True'),
                ('ShortestPath', 'OVCAR-8', 'False'),
                ('StructuralSP', 'OVCAR-8', 'True'),
                ('StructuralSP', 'OVCAR-8', 'False'),
                ('ShortestPath', 'OVCAR-8H', 'False'),
                ('StructuralSP', 'OVCAR-8H', 'False'),
                ('ShortestPath', 'SN12C', 'True'),
                ('ShortestPath', 'SN12C', 'False'),
                ('StructuralSP', 'SN12C', 'True'),
                ('StructuralSP', 'SN12C', 'False'),
                ('ShortestPath', 'SN12CH', 'True'),
                ('ShortestPath', 'SN12CH', 'False'),
                ('ShortestPath', 'SF-295', 'True'),
                ('ShortestPath', 'SF-295', 'False'),
                ('StructuralSP', 'SF-295', 'True'),
                ('StructuralSP', 'SF-295', 'False'),
                ('ShortestPath', 'SF-295H', 'False'),
                ('StructuralSP', 'SF-295H', 'False'),
                ('ShortestPath', 'SW-620', 'True'),
                ('ShortestPath', 'SW-620', 'False'),
                ('StructuralSP', 'SW-620', 'True'),
                ('StructuralSP', 'SW-620', 'False'),
                ('ShortestPath', 'SW-620H', 'False'),
                ('StructuralSP', 'SW-620H', 'False'),
                ('ShortestPath', 'TRIANGLES', 'False'),
                ('StructuralSP', 'TRIANGLES', 'False'),
                ('ShortestPath', 'Yeast', 'True'),
                ('ShortestPath', 'Yeast', 'False'),
                ('StructuralSP', 'Yeast', 'True'),
                ('StructuralSP', 'Yeast', 'False'),
                ('ShortestPath', 'YeastH', 'True'),
                ('ShortestPath', 'FRANKENSTEIN', 'True'),
                ('ShortestPath', 'FRANKENSTEIN', 'False'),
                ('StructuralSP', 'FRANKENSTEIN', 'True'),
                ('StructuralSP', 'FRANKENSTEIN', 'False'),
                ('StructuralSP', 'SN12CH', 'True'),
                ('StructuralSP', 'SN12CH', 'False'),
                ('ShortestPath', 'UACC257', 'True'),
                ('ShortestPath', 'UACC257', 'False'),
                ('StructuralSP', 'UACC257', 'True'),
                ('StructuralSP', 'UACC257', 'False'),
                ('ShortestPath', 'UACC257H', 'True'),
                ('ShortestPath', 'UACC257H', 'False'),
                ('StructuralSP', 'UACC257H', 'True'),
                ('StructuralSP', 'UACC257H', 'False'),
                ('ShortestPath', 'PC-3', 'True'),
                ('ShortestPath', 'PC-3', 'False'),
                ('StructuralSP', 'PC-3', 'True'),
                ('StructuralSP', 'PC-3', 'False'),
                ('ShortestPath', 'PC-3H', 'True'),
                ('ShortestPath', 'PC-3H', 'False'),
                ('StructuralSP', 'PC-3H', 'True'),
                ('StructuralSP', 'PC-3H', 'False'),
                ('ShortestPath', 'DBLP_v1', 'False'),
                ('StructuralSP', 'DBLP_v1', 'True'),
                ('ShortestPath', 'REDDIT-MULTI-12K', 'False'),
                ('StructuralSP', 'REDDIT-MULTI-12K', 'False'),
                ('ShortestPath', 'TWITTER-Real-Graph-Partial', 'True'),
                ('ShortestPath', 'TWITTER-Real-Graph-Partial', 'False'),
                ('StructuralSP', 'TWITTER-Real-Graph-Partial', 'True'),
                ('StructuralSP', 'TWITTER-Real-Graph-Partial', 'False'),
                ]

MISS_LABEL_LIST = [('StructuralSP', 'GREC', 'True'),
                   ('StructuralSP', 'GREC', 'False'),
                   ('StructuralSP', 'Web', 'True'),
                   ('StructuralSP', 'Web', 'False'),
                   ]
def get_job_script(kernel, dataset, fcsp):
# 	if (kernel, dataset, fcsp) in OUT_MEM_LIST:
# 		mem = '2560000'
# 	else:
	mem = '4000'

	script = r"""
#!/bin/bash
#SBATCH --exclusive
#SBATCH --job-name="fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""""
#SBATCH --partition=court
#SBATCH --mail-type=ALL
#SBATCH --mail-user=jajupmochi@gmail.com
#SBATCH --output="outputs/output_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#SBATCH --error="errors/error_fcsp.space.""" + kernel + r"." + dataset + r"." + fcsp + r""".txt"
#
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --time=48:00:00
##SBATCH --mem-per-cpu=""" + mem + r"""
#SBATCH --mem=4000

srun hostname
srun cd /home/2019015/ljia02/graphkit-learn/gklearn/experiments/thesis/graph_kernels/fcsp
srun python3 compare_fcsp_space.py """ + kernel + r" " + dataset + r" " + fcsp

	script = script.strip()
	script = re.sub('\n\t+', '\n', script)
	script = re.sub('\n +', '\n', script)

	return script
def check_task_status(save_dir, *params):
	str_task_id = '.' + '.'.join(params)

	# Check if the task is in the out-of-memory, out-of-time, or missing-label lists.
	if params in OUT_MEM_LIST or params in OUT_TIME_LIST or params in MISS_LABEL_LIST:
		return True

	# Check if the task is running or queued in SLURM.
	command = 'squeue --user ljia02 --name "fcsp.space' + str_task_id + '" --format "%.2t" --noheader'
	stream = os.popen(command)
	output = stream.readlines()
	if len(output) > 0:
		return True

	# Check if the task is already computed.
	file_name = os.path.join(save_dir, 'space' + str_task_id + '.pkl')
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			data = pickle.load(f)
			if data['completed']:
				return True

	return False
if __name__ == '__main__':
	save_dir = 'outputs/'
	os.makedirs(save_dir, exist_ok=True)
	os.makedirs('errors/', exist_ok=True)

	from sklearn.model_selection import ParameterGrid

	Dataset_List = ['Alkane_unlabeled', 'Alkane', 'Acyclic', 'MAO_lite', 'MAO',
	                'PAH_unlabeled', 'PAH', 'MUTAG', 'Monoterpens',
	                'Letter-high', 'Letter-med', 'Letter-low',
	                'ENZYMES', 'AIDS', 'NCI1', 'NCI109', 'DD',
	                # new: not so large.
	                'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR', 'Chiral', 'Vitamin_D',
	                'ACE', 'Steroid', 'KKI', 'Fingerprint', 'IMDB-BINARY',
	                'IMDB-MULTI', 'Peking_1', 'Cuneiform', 'OHSU', 'BZR', 'COX2',
	                'DHFR', 'SYNTHETICnew', 'Synthie', 'SYNTHETIC',
	                # new: large.
	                'TWITTER-Real-Graph-Partial', 'GREC', 'Web', 'MCF-7',
	                'MCF-7H', 'MOLT-4', 'MOLT-4H', 'NCI-H23', 'NCI-H23H',
	                'OVCAR-8', 'OVCAR-8H', 'P388', 'P388H', 'PC-3', 'PC-3H',
	                'SF-295', 'SF-295H', 'SN12C', 'SN12CH', 'SW-620', 'SW-620H',
	                'TRIANGLES', 'UACC257', 'UACC257H', 'Yeast', 'YeastH',
	                'COLORS-3', 'DBLP_v1', 'REDDIT-MULTI-12K',
	                'MSRC_9', 'MSRC_21', 'MSRC_21C',
	                'COLLAB', 'COIL-DEL',
	                'COIL-RAG', 'PROTEINS', 'PROTEINS_full', 'Mutagenicity',
	                'REDDIT-BINARY', 'FRANKENSTEIN', 'REDDIT-MULTI-5K']
	Kernel_List = ['ShortestPath', 'StructuralSP']
	fcsp_list = ['True', 'False']

	task_grid = ParameterGrid({'kernel': Kernel_List[:],
	                           'dataset': Dataset_List[:],
	                           'fcsp': fcsp_list[:]})

	from tqdm import tqdm
	for task in tqdm(list(task_grid), desc='submitting tasks/jobs'):
		if not check_task_status(save_dir, task['kernel'], task['dataset'], task['fcsp']):
			job_script = get_job_script(task['kernel'], task['dataset'], task['fcsp'])
			command = 'sbatch <<EOF\n' + job_script + '\nEOF'
# 			print(command)
			os.system(command)
# 			os.popen(command)
# 			output = stream.readlines()
@@ -0,0 +1,253 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 15:24:58 2020

@author: ljia

@references:

	[1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In: Data
	Mining, Fifth IEEE International Conference on, 2005 Nov 27 (pp. 8-pp).
	IEEE.
"""
import sys
from itertools import product
# from functools import partial
from gklearn.utils import get_iters
import numpy as np
from gklearn.utils.utils import getSPGraph
from gklearn.kernels import ShortestPath
import os
import pickle
from pympler import asizeof
import time
import networkx as nx
def load_results(file_name, fcsp):
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			return pickle.load(f)
	else:
		results = {'nb_comparison': [], 'i': -1, 'j': -1, 'completed': False}
		if fcsp:
			results['vk_dict_mem'] = []
		return results


def save_results(file_name, results):
	with open(file_name, 'wb') as f:
		pickle.dump(results, f)


def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
# 	asizeof.asized(obj, detail=1).format()
# 	return asizeof.asizeof(obj)
	key, val = next(iter(obj.items()))
# 	key = dict.iterkeys().next()
# 	key_mem = asizeof.asizeof(key)
	dict_flat = sys.getsizeof(obj)
	key_mem = 64

	if isinstance(val, float):
		val_mem = 24
		mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
	else: # value is True or False
		mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

# 	print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')

	return mem
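# A rough check of the heuristic above (illustrative, not from the original
# file): key_mem = 64 and val_mem = 24 approximate the sizes of a 2-tuple key
# and a float value on 64-bit CPython (sys.getsizeof(1.0) == 24), and 28 is
# the size of a small int, so for a dict d of float-valued node-pair kernels
# the estimate is (64 + 24) * len(d) + sys.getsizeof(d) + 28 * (n1 + n2).
# It can be compared against pympler's exact measurement:
#
#     d = {(i, 0): float(i) for i in range(100)}
#     print(estimate_vk_memory(d, 100, 1), asizeof.asizeof(d))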
def compute_stats(file_name, results):
	del results['i']
	del results['j']
	results['nb_comparison'] = np.mean(results['nb_comparison'])
	results['completed'] = True
	if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
		results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
	save_results(file_name, results)
class SPSpace(ShortestPath):

	def __init__(self, **kwargs):
		super().__init__(**kwargs)
		self._file_name = kwargs.get('file_name')


# 	@profile
	def _compute_gm_series(self):
		self._all_graphs_have_edges(self._graphs)
		# get shortest path graph of each graph.
		iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
		self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator]

		results = load_results(self._file_name, self._fcsp)

		# compute Gram matrix.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

		from itertools import combinations_with_replacement
		itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
		len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
		iterator = get_iters(itr, desc='Computing kernels',
		                     length=len_itr, file=sys.stdout, verbose=(self._verbose >= 2))

		time0 = time.time()
		for i, j in iterator:
			if i > results['i'] or (i == results['i'] and j > results['j']):
				data = self._sp_do_space(self._graphs[i], self._graphs[j])
				if self._fcsp:
					results['nb_comparison'].append(data[0])
					if data[1] != {}:
						results['vk_dict_mem'].append(estimate_vk_memory(data[1],
							nx.number_of_nodes(self._graphs[i]),
							nx.number_of_nodes(self._graphs[j])))
				else:
					results['nb_comparison'].append(data)

				results['i'] = i
				results['j'] = j

				time1 = time.time()
				if time1 - time0 > 600:
					save_results(self._file_name, results)
					time0 = time1

		compute_stats(self._file_name, results)

		return gram_matrix
	def _sp_do_space(self, g1, g2):
		if self._fcsp: # @todo: it may be put outside the _sp_do().
			return self._sp_do_fcsp(g1, g2)
		else:
			return self._sp_do_naive(g1, g2)


	def _sp_do_fcsp(self, g1, g2):
		nb_comparison = 0

		# compute shortest path matrices first, method borrowed from FCSP.
		vk_dict = {}  # shortest path matrices dict
		if len(self._node_labels) > 0: # @todo: it may be put outside the _sp_do().
			# node symb and non-symb labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['mix']
				for n1, n2 in product(
						g1.nodes(data=True), g2.nodes(data=True)):
					n1_labels = [n1[1][nl] for nl in self._node_labels]
					n2_labels = [n2[1][nl] for nl in self._node_labels]
					n1_attrs = [n1[1][na] for na in self._node_attrs]
					n2_attrs = [n2[1][na] for na in self._node_attrs]
					vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
					nb_comparison += 1
			# node symb labeled
			else:
				kn = self._node_kernels['symb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_labels = [n1[1][nl] for nl in self._node_labels]
						n2_labels = [n2[1][nl] for nl in self._node_labels]
						vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
						nb_comparison += 1
		else:
			# node non-symb labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['nsymb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_attrs = [n1[1][na] for na in self._node_attrs]
						n2_attrs = [n2[1][na] for na in self._node_attrs]
						vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
						nb_comparison += 1
			# node unlabeled
			else:
				for e1, e2 in product(
						g1.edges(data=True), g2.edges(data=True)):
					pass
# 					if e1[2]['cost'] == e2[2]['cost']:
# 						kernel += 1
# 						nb_comparison += 1

		return nb_comparison, vk_dict

# 		# compute graph kernels
# 		if self._ds_infos['directed']:
# 			for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# 				if e1[2]['cost'] == e2[2]['cost']:
# 					nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])]
# 					kn1 = nk11 * nk22
# 					kernel += kn1
# 		else:
# 			for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# 				if e1[2]['cost'] == e2[2]['cost']:
# 					# each edge walk is counted twice, starting from both its extreme nodes.
# 					nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(
# 						e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])]
# 					kn1 = nk11 * nk22
# 					kn2 = nk12 * nk21
# 					kernel += kn1 + kn2
	def _sp_do_naive(self, g1, g2):
		nb_comparison = 0

		# Define the function to compute kernels between vertices in each condition.
		if len(self._node_labels) > 0:
			# node symb and non-symb labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['mix']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
			# node symb labeled
			else:
				def compute_vk(n1, n2):
					kn = self._node_kernels['symb']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					return kn(n1_labels, n2_labels)
		else:
			# node non-symb labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['nsymb']
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_attrs, n2_attrs)
			# node unlabeled
			else:
# 				for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# 					if e1[2]['cost'] == e2[2]['cost']:
# 						kernel += 1
				return 0

		# compute graph kernels
		if self._ds_infos['directed']:
			for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
				if e1[2]['cost'] == e2[2]['cost']:
# 					nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
# 					kn1 = nk11 * nk22
# 					kernel += kn1
					nb_comparison += 2
		else:
			for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
				if e1[2]['cost'] == e2[2]['cost']:
					# each edge walk is counted twice, starting from both its extreme nodes.
# 					nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
# 						e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
# 					kn1 = nk11 * nk22
# 					kn2 = nk12 * nk21
# 					kernel += kn1 + kn2
					nb_comparison += 4

		return nb_comparison
@@ -0,0 +1,439 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 11:59:57 2020

@author: ljia

@references:

	[1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
	Measuring Similarity of Shapes. In: ESANN, 2007 Apr 25 (pp. 355-360).
"""
import sys
from itertools import product
from gklearn.utils import get_iters
import numpy as np
import time
import os, errno
import pickle
from pympler import asizeof
import networkx as nx
from gklearn.utils.utils import get_shortest_paths
from gklearn.kernels import StructuralSP
def load_splist(file_name):
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			return pickle.load(f)
	else:
		results_path = {'splist': [], 'i': -1, 'completed': False}
		return results_path


def load_results(file_name, fcsp):
	if os.path.isfile(file_name):
		with open(file_name, 'rb') as f:
			return pickle.load(f)
	else:
		results = {'nb_v_comparison': [], 'nb_e_comparison': [], 'i': -1, 'j': -1, 'completed': False}
		if fcsp:
			results['vk_dict_mem'] = []
			results['ek_dict_mem'] = []
		return results


def save_results(file_name, results):
	with open(file_name, 'wb') as f:
		pickle.dump(results, f)


def estimate_vk_memory(obj, nb_nodes1, nb_nodes2):
# 	asizeof.asized(obj, detail=1).format()
# 	return asizeof.asizeof(obj)
	key, val = next(iter(obj.items()))
# 	key = dict.iterkeys().next()
# 	key_mem = asizeof.asizeof(key)
	dict_flat = sys.getsizeof(obj)
	key_mem = 64

	if isinstance(val, float):
		val_mem = 24
		mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
	else: # value is True or False
		mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

# 	print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')

	return mem
def estimate_ek_memory(obj, nb_nodes1, nb_nodes2):
# 	asizeof.asized(obj, detail=1).format()
# 	return asizeof.asizeof(obj)
	key, val = next(iter(obj.items()))
# 	key = dict.iterkeys().next()
# 	key_mem = asizeof.asizeof(key)
	dict_flat = sys.getsizeof(obj)
	key_mem = 192

	if isinstance(val, float):
		val_mem = 24
		mem = (key_mem + val_mem) * len(obj) + dict_flat + 28 * (nb_nodes1 + nb_nodes2)
	else: # value is True or False
		mem = (key_mem) * len(obj) + dict_flat + 52 + 28 * (nb_nodes1 + nb_nodes2)

# 	print(mem, asizeof.asizeof(obj), '\n', asizeof.asized(obj, detail=3).format(), '\n')

	return mem
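# Analogous rough check (illustrative, not from the original file): edge-kernel
# keys are pairs of node pairs such as ((0, 1), (2, 3)), hence the larger
# key_mem = 192 compared with 64 in the node case:
#
#     d = {((i, i + 1), (0, 1)): 1.0 for i in range(100)}
#     print(estimate_ek_memory(d, 100, 2), asizeof.asizeof(d))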
def compute_stats(file_name, results, splist):
	del results['i']
	del results['j']
	results['nb_v_comparison'] = np.mean(results['nb_v_comparison'])
# 	if len(results['nb_e_comparison']) > 0:
	results['nb_e_comparison'] = np.mean(results['nb_e_comparison'])
	results['completed'] = True
	if 'vk_dict_mem' in results and len(results['vk_dict_mem']) > 0:
		results['vk_dict_mem'] = np.mean(results['vk_dict_mem'])
	if 'ek_dict_mem' in results and len(results['ek_dict_mem']) > 0:
		results['ek_dict_mem'] = np.mean(results['ek_dict_mem'])
	results['nb_sp_ave'] = np.mean([len(ps) for ps in splist])
	results['sp_len_ave'] = np.mean([np.mean([len(p) for p in ps]) for ps in splist])
	results['sp_mem_all'] = asizeof.asizeof(splist)
	save_results(file_name, results)
class SSPSpace(StructuralSP):

	def __init__(self, **kwargs):
		super().__init__(**kwargs)
		self._file_name = kwargs.get('file_name')


# 	@profile
	def _compute_gm_series(self):
		# get shortest paths of each graph in the graphs.
		fn_paths = os.path.splitext(self._file_name)[0] + '.paths.pkl'
		results_path = load_splist(fn_paths)

		if not results_path['completed']:
			iterator = get_iters(self._graphs, desc='getting sp graphs', file=sys.stdout, verbose=(self._verbose >= 2))
			if self._compute_method == 'trie':
				for g in iterator:
					results_path['splist'].append(self._get_sps_as_trie(g))
			else:
				time0 = time.time()
				for i, g in enumerate(iterator):
					if i > results_path['i']:
						results_path['splist'].append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))
						results_path['i'] = i

						time1 = time.time()
						if time1 - time0 > 600:
							save_results(fn_paths, results_path)
							time0 = time1

			del results_path['i']
			results_path['completed'] = True
			save_results(fn_paths, results_path)

		#########
		splist = results_path['splist']
		results = load_results(self._file_name, self._fcsp)

		# compute Gram matrix.
		gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

		from itertools import combinations_with_replacement
		itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
		len_itr = int(len(self._graphs) * (len(self._graphs) + 1) / 2)
		iterator = get_iters(itr, desc='Computing kernels', file=sys.stdout,
		                     length=len_itr, verbose=(self._verbose >= 2))

		if self._compute_method == 'trie':
			for i, j in iterator:
				kernel = self._ssp_do_trie(self._graphs[i], self._graphs[j], splist[i], splist[j])
				gram_matrix[i][j] = kernel
				gram_matrix[j][i] = kernel
		else:
			time0 = time.time()
			for i, j in iterator:
				if i > results['i'] or (i == results['i'] and j > results['j']):
					data = self._ssp_do_naive_space(self._graphs[i], self._graphs[j], splist[i], splist[j])
					results['nb_v_comparison'].append(data[0])
					results['nb_e_comparison'].append(data[1])
					if self._fcsp:
						if data[2] != {}:
							results['vk_dict_mem'].append(estimate_vk_memory(data[2],
								nx.number_of_nodes(self._graphs[i]),
								nx.number_of_nodes(self._graphs[j])))
						if data[3] != {}:
							results['ek_dict_mem'].append(estimate_ek_memory(data[3],
								nx.number_of_nodes(self._graphs[i]),
								nx.number_of_nodes(self._graphs[j])))

					results['i'] = i
					results['j'] = j

					time1 = time.time()
					if time1 - time0 > 600:
						save_results(self._file_name, results)
						time0 = time1

			compute_stats(self._file_name, results, splist)
			# @todo: the path file may not be removed if the program stops exactly here.
			try:
				os.remove(fn_paths)
			except OSError as e:
				if e.errno != errno.ENOENT:
					raise

		return gram_matrix
	def _ssp_do_naive_space(self, g1, g2, spl1, spl2):
		if self._fcsp: # @todo: it may be put outside the _sp_do().
			return self._sp_do_naive_fcsp(g1, g2, spl1, spl2)
		else:
			return self._sp_do_naive_naive(g1, g2, spl1, spl2)


	def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2):
		# First, compute shortest path matrices, method borrowed from FCSP.
		vk_dict, nb_v_comparison = self._get_all_node_kernels(g1, g2)
		# Then, compute kernels between all pairs of edges, which extends the
		# FCSP idea. It suits sparse graphs, which covers most cases we
		# encountered; for dense graphs it would be slow.
		ek_dict, nb_e_comparison = self._get_all_edge_kernels(g1, g2)

		return nb_v_comparison, nb_e_comparison, vk_dict, ek_dict
	def _sp_do_naive_naive(self, g1, g2, spl1, spl2):
		nb_v_comparison = 0
		nb_e_comparison = 0

		# Define the function to compute kernels between vertices in each condition.
		if len(self._node_labels) > 0:
			# node symb and non-symb labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['mix']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
			# node symb labeled
			else:
				def compute_vk(n1, n2):
					kn = self._node_kernels['symb']
					n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
					n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
					return kn(n1_labels, n2_labels)
		else:
			# node non-symb labeled
			if len(self._node_attrs) > 0:
				def compute_vk(n1, n2):
					kn = self._node_kernels['nsymb']
					n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
					n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
					return kn(n1_attrs, n2_attrs)
# 			# node unlabeled
# 			else:
# 				for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
# 					if e1[2]['cost'] == e2[2]['cost']:
# 						kernel += 1
# 				return kernel

		# Define the function to compute kernels between edges in each condition.
		if len(self._edge_labels) > 0:
			# edge symb and non-symb labeled
			if len(self._edge_attrs) > 0:
				def compute_ek(e1, e2):
					ke = self._edge_kernels['mix']
					e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
					e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
					e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
					e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
					return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
			# edge symb labeled
			else:
				def compute_ek(e1, e2):
					ke = self._edge_kernels['symb']
					e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
					e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
					return ke(e1_labels, e2_labels)
		else:
			# edge non-symb labeled
			if len(self._edge_attrs) > 0:
				def compute_ek(e1, e2):
					ke = self._edge_kernels['nsymb']
					e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
					e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
					return ke(e1_attrs, e2_attrs)

		# compute graph kernels
		if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
			if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
				for p1, p2 in product(spl1, spl2):
					if len(p1) == len(p2):
# 						nb_v_comparison = len(p1)
# 						nb_e_comparison = len(p1) - 1
						kpath = compute_vk(p1[0], p2[0])
						nb_v_comparison += 1
						if kpath:
							for idx in range(1, len(p1)):
								kpath *= compute_vk(p1[idx], p2[idx]) * \
									compute_ek((p1[idx-1], p1[idx]),
									           (p2[idx-1], p2[idx]))
								nb_v_comparison += 1
								nb_e_comparison += 1
								if not kpath:
									break
# 						kernel += kpath  # add up kernels of all paths
			else:
				for p1, p2 in product(spl1, spl2):
					if len(p1) == len(p2):
						kpath = compute_vk(p1[0], p2[0])
						nb_v_comparison += 1
						if kpath:
							for idx in range(1, len(p1)):
								kpath *= compute_vk(p1[idx], p2[idx])
								nb_v_comparison += 1
								if not kpath:
									break
# 						kernel += kpath  # add up kernels of all paths
		else:
			if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
				for p1, p2 in product(spl1, spl2):
					if len(p1) == len(p2) and len(p1) > 0:
						kpath = 1
						for idx in range(0, len(p1) - 1):
							kpath *= compute_ek((p1[idx], p1[idx+1]),
							                    (p2[idx], p2[idx+1]))
							nb_e_comparison += 1
							if not kpath:
								break
			else:
				pass
# 				for p1, p2 in product(spl1, spl2):
# 					if len(p1) == len(p2):
# 						kernel += 1

# 		try:
# 			kernel = kernel / (len(spl1) * len(spl2))  # compute mean average
# 		except ZeroDivisionError:
# 			print(spl1, spl2)
# 			print(g1.nodes(data=True))
# 			print(g1.edges(data=True))
# 			raise Exception

		return nb_v_comparison, nb_e_comparison
	def _get_all_node_kernels(self, g1, g2):
		nb_comparison = 0

		vk_dict = {}  # shortest path matrices dict
		if len(self._node_labels) > 0:
			# node symb and non-symb labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['mix']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_labels = [n1[1][nl] for nl in self._node_labels]
						n2_labels = [n2[1][nl] for nl in self._node_labels]
						n1_attrs = [n1[1][na] for na in self._node_attrs]
						n2_attrs = [n2[1][na] for na in self._node_attrs]
						vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
						nb_comparison += 1
			# node symb labeled
			else:
				kn = self._node_kernels['symb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_labels = [n1[1][nl] for nl in self._node_labels]
						n2_labels = [n2[1][nl] for nl in self._node_labels]
						vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels)
						nb_comparison += 1
		else:
			# node non-symb labeled
			if len(self._node_attrs) > 0:
				kn = self._node_kernels['nsymb']
				for n1 in g1.nodes(data=True):
					for n2 in g2.nodes(data=True):
						n1_attrs = [n1[1][na] for na in self._node_attrs]
						n2_attrs = [n2[1][na] for na in self._node_attrs]
						vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs)
						nb_comparison += 1
			# node unlabeled
			else:
				pass # @todo: add edge weights.
# 				for e1 in g1.edges(data=True):
# 					for e2 in g2.edges(data=True):
# 						if e1[2]['cost'] == e2[2]['cost']:
# 							kernel += 1
# 				return kernel

		return vk_dict, nb_comparison
	def _get_all_edge_kernels(self, g1, g2):
		nb_comparison = 0

		# compute kernels between all pairs of edges, which extends the FCSP
		# idea. It suits sparse graphs, which covers most cases we
		# encountered; for dense graphs it would be slow.
		ek_dict = {}  # dict of edge kernels
		if len(self._edge_labels) > 0:
			# edge symb and non-symb labeled
			if len(self._edge_attrs) > 0:
				ke = self._edge_kernels['mix']
				for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
					e1_labels = [e1[2][el] for el in self._edge_labels]
					e2_labels = [e2[2][el] for el in self._edge_labels]
					e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
					e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
					ek_temp = ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
					ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
					ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
					ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
					ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
					nb_comparison += 1
			# edge symb labeled
			else:
				ke = self._edge_kernels['symb']
				for e1 in g1.edges(data=True):
					for e2 in g2.edges(data=True):
						e1_labels = [e1[2][el] for el in self._edge_labels]
						e2_labels = [e2[2][el] for el in self._edge_labels]
						ek_temp = ke(e1_labels, e2_labels)
						ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
						nb_comparison += 1
		else:
			# edge non-symb labeled
			if len(self._edge_attrs) > 0:
				ke = self._edge_kernels['nsymb']
				for e1 in g1.edges(data=True):
					for e2 in g2.edges(data=True):
						e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
						e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
						ek_temp = ke(e1_attrs, e2_attrs)
						ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
						ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
						ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
						nb_comparison += 1
			# edge unlabeled
			else:
				pass

		return ek_dict, nb_comparison
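# Design note (not from the original file): each edge kernel is stored under
# all four key orderings, e.g. ek_dict[((u, v), (x, y))] equals
# ek_dict[((v, u), (x, y))], so path-kernel lookups need not normalize edge
# direction. A quick check, with hypothetical node ids on undirected graphs:
#
#     # assuming ek_dict was built by _get_all_edge_kernels(g1, g2)
#     assert ek_dict[((0, 1), (2, 3))] == ek_dict[((1, 0), (3, 2))]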