From f67d65bf51c53102918a590b500b4242dd1898a0 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Sun, 15 Nov 2020 17:11:45 +0100 Subject: [PATCH 1/6] Update Dataset class for predefined datasets. --- gklearn/dataset/dataset.py | 121 ++++++++++++++------------------------- gklearn/dataset/file_managers.py | 1 + gklearn/dataset/metadata.py | 12 ++-- 3 files changed, 49 insertions(+), 85 deletions(-) diff --git a/gklearn/dataset/dataset.py b/gklearn/dataset/dataset.py index 0343c0b..cf90051 100644 --- a/gklearn/dataset/dataset.py +++ b/gklearn/dataset/dataset.py @@ -7,23 +7,44 @@ Created on Thu Mar 26 18:48:27 2020 """ import numpy as np import networkx as nx -from gklearn.utils.graph_files import load_dataset import os +from gklearn.dataset import DATASET_META, DataFetcher, DataLoader class Dataset(object): - def __init__(self, filename=None, filename_targets=None, **kwargs): - if filename is None: + def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs): + if inputs is None: self._graphs = None self._targets = None self._node_labels = None self._edge_labels = None self._node_attrs = None self._edge_attrs = None + + # If inputs is a list of graphs. + elif isinstance(inputs, list): + node_labels = kwargs.get('node_labels', None) + node_attrs = kwargs.get('node_attrs', None) + edge_labels = kwargs.get('edge_labels', None) + edge_attrs = kwargs.get('edge_attrs', None) + self.load_graphs(inputs, targets=targets) + self.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs) + if clean_labels: + self.clean_labels() + + elif isinstance(inputs, str): + # If inputs is predefined dataset name. + if inputs in DATASET_META: + self.load_predefined_dataset(inputs, root=root, clean_labels=clean_labels, reload=reload, verbose=verbose) + + # If inputs is a file name. + else: + self.load_dataset(inputs, filename_targets=filename_targets, clean_labels=clean_labels, **kwargs) + else: - self.load_dataset(filename, filename_targets=filename_targets, **kwargs) + raise TypeError('The "inputs" argument cannot be recoganized. 
"Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.') self._substructures = None self._node_label_dim = None @@ -51,13 +72,14 @@ class Dataset(object): self._class_number = None - def load_dataset(self, filename, filename_targets=None, **kwargs): - self._graphs, self._targets, label_names = load_dataset(filename, filename_targets=filename_targets, **kwargs) + def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs): + self._graphs, self._targets, label_names = DataLoader(filename, filename_targets=filename_targets, **kwargs).data self._node_labels = label_names['node_labels'] self._node_attrs = label_names['node_attrs'] self._edge_labels = label_names['edge_labels'] self._edge_attrs = label_names['edge_attrs'] - self.clean_labels() + if clean_labels: + self.clean_labels() def load_graphs(self, graphs, targets=None): @@ -67,84 +89,25 @@ class Dataset(object): # self.set_labels_attrs() # @todo - def load_predefined_dataset(self, ds_name): - current_path = os.path.dirname(os.path.realpath(__file__)) + '/' - if ds_name == 'Acyclic': - ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'AIDS': - ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'Alkane': - ds_file = current_path + '../../datasets/Alkane/dataset.ds' - fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file, filename_targets=fn_targets) - elif ds_name == 'COIL-DEL': - ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'COIL-RAG': - ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'COLORS-3': - ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'Cuneiform': - ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'DD': - ds_file = current_path + '../../datasets/DD/DD_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'ENZYMES': - ds_file = current_path + '../../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'Fingerprint': - ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'FRANKENSTEIN': - ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'Letter-high': # node non-symb - ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'Letter-low': # node non-symb - ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'Letter-med': # node non-symb - ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt' - self._graphs, self._targets, label_names = 
load_dataset(ds_file) - elif ds_name == 'MAO': - ds_file = current_path + '../../datasets/MAO/dataset.ds' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'Monoterpenoides': - ds_file = current_path + '../../datasets/Monoterpenoides/dataset_10+.ds' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'MUTAG': - ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'NCI1': - ds_file = current_path + '../../datasets/NCI1/NCI1_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'NCI109': - ds_file = current_path + '../../datasets/NCI109/NCI109_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'PAH': - ds_file = current_path + '../../datasets/PAH/dataset.ds' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'SYNTHETIC': - pass - elif ds_name == 'SYNTHETICnew': - ds_file = current_path + '../../datasets/SYNTHETICnew/SYNTHETICnew_A.txt' - self._graphs, self._targets, label_names = load_dataset(ds_file) - elif ds_name == 'Synthie': - pass + def load_predefined_dataset(self, ds_name, root='datasets', clean_labels=True, reload=False, verbose=False): + path = DataFetcher(name=ds_name, root=root, reload=reload, verbose=verbose).path + + if DATASET_META[ds_name]['database'] == 'tudataset': + ds_file = os.path.join(path, ds_name + '_A.txt') + fn_targets = None else: - raise Exception('The dataset name "', ds_name, '" is not pre-defined.') + load_files = DATASET_META[ds_name]['load_files'] + ds_file = os.path.join(path, load_files[0]) + fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None + + self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets).data self._node_labels = label_names['node_labels'] self._node_attrs = label_names['node_attrs'] self._edge_labels = label_names['edge_labels'] self._edge_attrs = label_names['edge_attrs'] - self.clean_labels() + if clean_labels: + self.clean_labels() def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]): diff --git a/gklearn/dataset/file_managers.py b/gklearn/dataset/file_managers.py index f2e539e..76ea9b0 100644 --- a/gklearn/dataset/file_managers.py +++ b/gklearn/dataset/file_managers.py @@ -74,6 +74,7 @@ class DataLoader(): label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} with open(filename) as fn: content = fn.read().splitlines() + content = [line for line in content if not line.endswith('.ds')] extension = splitext(content[0].split(' ')[0])[1][1:] if extension == 'ct': load_file_fun = self.load_ct diff --git a/gklearn/dataset/metadata.py b/gklearn/dataset/metadata.py index 4fa48d9..9725517 100644 --- a/gklearn/dataset/metadata.py +++ b/gklearn/dataset/metadata.py @@ -32,7 +32,7 @@ GREYC_META = { 'domain': 'small molecules', 'train_valid_test': [], 'stereoisomerism': True, - 'load_files': [], + 'load_files': ['data.ds'], }, 'Acyclic': { 'database': 'greyc', @@ -165,7 +165,7 @@ GREYC_META = { 'domain': 'small molecules', 'train_valid_test': ['trainset_0.ds', None, 'testset_0.ds'], 'stereoisomerism': False, - 'load_files': [], + 'load_files': ['dataset.ds'], }, 'PTC': { 'database': 'greyc', @@ -654,7 +654,7 @@ TUDataset_META = { 'node_attr_dim': 0, 'geometry': None, 'edge_attr_dim': 0, - 'url': 
'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23.zip-H23',
+ 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23.zip',
 'domain': 'small molecules',
 },
 'NCI-H23H': {
@@ -670,7 +670,7 @@ TUDataset_META = {
 'node_attr_dim': 0,
 'geometry': None,
 'edge_attr_dim': 0,
- 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23H.zip-H23H',
+ 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/NCI-H23H.zip',
 'domain': 'small molecules',
 },
 'OVCAR-8': {
@@ -686,7 +686,7 @@ TUDataset_META = {
 'node_attr_dim': 0,
 'geometry': None,
 'edge_attr_dim': 0,
- 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8.zip-8',
+ 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8.zip',
 'domain': 'small molecules',
 },
 'OVCAR-8H': {
@@ -702,7 +702,7 @@ TUDataset_META = {
 'node_attr_dim': 0,
 'geometry': None,
 'edge_attr_dim': 0,
- 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8H.zip-8H',
+ 'url': 'https://www.chrsmrrs.com/graphkerneldatasets/OVCAR-8H.zip',
 'domain': 'small molecules',
 },
 'P388': {

From 180c614b44bd606a276c3838a56ac318c9476d39 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Wed, 18 Nov 2020 17:02:51 +0100
Subject: [PATCH 2/6] Fix bugs in gklearn.dataset.

---
 gklearn/dataset/data_fetcher.py | 6 ++-
 gklearn/dataset/dataset.py | 88 +++++++++++++++++++++++++++-------------
 gklearn/dataset/file_managers.py | 26 +++++++++---
 gklearn/dataset/metadata.py | 2 +-
 4 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/gklearn/dataset/data_fetcher.py b/gklearn/dataset/data_fetcher.py
index 8f3f167..4349753 100644
--- a/gklearn/dataset/data_fetcher.py
+++ b/gklearn/dataset/data_fetcher.py
@@ -74,6 +74,8 @@ class DataFetcher():
 message = 'Invalid Dataset name "' + self._name + '".'
 message += '\nAvailable datasets are as follows: \n\n'
 message += '\n'.join(ds for ds in sorted(DATASET_META))
+ message += '\n\nThe following special suffixes can be added to the name:'
+ message += '\n\n' + '\n'.join(['_unlabeled'])
 raise ValueError(message)
 else:
 self.write_archive_file(self._name)
@@ -127,9 +129,9 @@ class DataFetcher():
 def write_archive_file(self, ds_name):
 path = osp.join(self._root, ds_name)
- url = DATASET_META[ds_name]['url']
 # filename_dir = osp.join(path,filename)
 if not osp.exists(path) or self._reload:
+ url = DATASET_META[ds_name]['url']
 response = self.download_file(url)
 if response is None:
 return False
@@ -152,7 +154,7 @@ class DataFetcher():
 with tarfile.open(filename_archive, 'r:gz') as tar:
 if self._reload and self._verbose:
 print(filename + ' Downloaded.')
- subpath = os.path.join(path, tar.getnames()[0])
+ subpath = os.path.join(path, tar.getnames()[0].split('/')[0])
 if not osp.exists(subpath) or self._reload:
 tar.extractall(path = path)
 return subpath

diff --git a/gklearn/dataset/dataset.py b/gklearn/dataset/dataset.py
index cf90051..6911b12 100644
--- a/gklearn/dataset/dataset.py
+++ b/gklearn/dataset/dataset.py
@@ -14,7 +14,33 @@ from gklearn.dataset import DATASET_META, DataFetcher, DataLoader
 class Dataset(object):
- def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs):
+ def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs):
+ self._substructures = None
+ self._node_label_dim = None
+ self._edge_label_dim = None
+ self._directed = None
+ self._dataset_size = None
+ self._total_node_num = None
+ self._ave_node_num = None
+ self._min_node_num = None
+ self._max_node_num = None
+ self._total_edge_num = None
+ self._ave_edge_num = None
+ self._min_edge_num = None
+ self._max_edge_num = None
+ self._ave_node_degree = None
+ self._min_node_degree = None
+ self._max_node_degree = None
+ self._ave_fill_factor = None
+ self._min_fill_factor = None
+ self._max_fill_factor = None
+ self._node_label_nums = None
+ self._edge_label_nums = None
+ self._node_attr_dim = None
+ self._edge_attr_dim = None
+ self._class_number = None
+ self._ds_name = None
+
 if inputs is None:
 self._graphs = None
 self._targets = None
@@ -38,38 +64,26 @@ class Dataset(object):
 # If inputs is predefined dataset name.
 if inputs in DATASET_META:
 self.load_predefined_dataset(inputs, root=root, clean_labels=clean_labels, reload=reload, verbose=verbose)
+ self._ds_name = inputs
+
+ elif inputs.endswith('_unlabeled'):
+ self.load_predefined_dataset(inputs[:len(inputs) - 10], root=root, clean_labels=clean_labels, reload=reload, verbose=verbose)
+ self._ds_name = inputs
+
+ # Deal with special suffixes.
+ self.check_special_suffices()
+
+ # If inputs is a file name.
+ elif os.path.isfile(inputs):
+ self.load_dataset(inputs, filename_targets=filename_targets, clean_labels=clean_labels, **kwargs)
 
 # If inputs is a file name.
 else:
- self.load_dataset(inputs, filename_targets=filename_targets, clean_labels=clean_labels, **kwargs)
+ raise ValueError('The "inputs" argument "' + inputs + '" is not a valid dataset name or file name.')
 
 else:
- raise TypeError('The "inputs" argument cannot be recoganized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.')
+ raise TypeError('The "inputs" argument cannot be recognized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.')
 
 
 def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs):
@@ -97,7 +111,10 @@ class Dataset(object):
 fn_targets = None
 else:
 load_files = DATASET_META[ds_name]['load_files']
- ds_file = os.path.join(path, load_files[0])
+ if isinstance(load_files[0], str):
+ ds_file = os.path.join(path, load_files[0])
+ else: # load_files[0] is a list of files.
+ ds_file = [os.path.join(path, fn) for fn in load_files[0]]
 fn_targets = os.path.join(path, load_files[1]) if len(load_files) == 2 else None
 
 self._graphs, self._targets, label_names = DataLoader(ds_file, filename_targets=fn_targets).data
@@ -108,6 +125,11 @@ class Dataset(object):
 self._edge_attrs = label_names['edge_attrs']
 if clean_labels:
 self.clean_labels()
+
+ # Deal with specific datasets.
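+ # Alkane is loaded from a .ds file with separate boiling-point targets;
+ # its graphs are hydrogen-depleted carbon skeletons, so the 'atom_symbol'
+ # node label is presumably constant and uninformative. The branch below
+ # therefore trims graphs without edges and drops that label.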
+ if ds_name == 'Alkane': + self.trim_dataset(edge_required=True) + self.remove_labels(node_labels=['atom_symbol']) def set_labels(self, node_labels=[], node_attrs=[], edge_labels=[], edge_attrs=[]): @@ -536,6 +558,14 @@ class Dataset(object): return dataset + def check_special_suffices(self): + if self._ds_name.endswith('_unlabeled'): + self.remove_labels(node_labels=self._node_labels, + edge_labels=self._edge_labels, + node_attrs=self._node_attrs, + edge_attrs=self._edge_attrs) + + def get_all_node_labels(self): node_labels = [] for g in self._graphs: diff --git a/gklearn/dataset/file_managers.py b/gklearn/dataset/file_managers.py index 76ea9b0..d7d333b 100644 --- a/gklearn/dataset/file_managers.py +++ b/gklearn/dataset/file_managers.py @@ -38,7 +38,11 @@ class DataLoader(): for details. Note here filename is the name of either .txt file in the dataset directory. """ - extension = splitext(filename)[1][1:] + if isinstance(filename, str): + extension = splitext(filename)[1][1:] + else: # filename is a list of files. + extension = splitext(filename[0])[1][1:] + if extension == "ds": self._graphs, self._targets, self._label_names = self.load_from_ds(filename, filename_targets) elif extension == "cxl": @@ -67,14 +71,24 @@ class DataLoader(): Note these graph formats are checked automatically by the extensions of graph files. - """ - dirname_dataset = dirname(filename) + """ + if isinstance(filename, str): + dirname_dataset = dirname(filename) + with open(filename) as f: + content = f.read().splitlines() + else: # filename is a list of files. + dirname_dataset = dirname(filename[0]) + content = [] + for fn in filename: + with open(fn) as f: + content += f.read().splitlines() + # to remove duplicate file names. + data = [] y = [] label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []} - with open(filename) as fn: - content = fn.read().splitlines() - content = [line for line in content if not line.endswith('.ds')] + content = [line for line in content if not line.endswith('.ds')] # Alkane + content = [line for line in content if not line.startswith('#')] # Acyclic extension = splitext(content[0].split(' ')[0])[1][1:] if extension == 'ct': load_file_fun = self.load_ct diff --git a/gklearn/dataset/metadata.py b/gklearn/dataset/metadata.py index 9725517..13844a4 100644 --- a/gklearn/dataset/metadata.py +++ b/gklearn/dataset/metadata.py @@ -165,7 +165,7 @@ GREYC_META = { 'domain': 'small molecules', 'train_valid_test': ['trainset_0.ds', None, 'testset_0.ds'], 'stereoisomerism': False, - 'load_files': ['dataset.ds'], + 'load_files': [['trainset_0.ds', 'testset_0.ds']], }, 'PTC': { 'database': 'greyc', From 10a276008599bec0c44f74f12dac9f04d0d0463f Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Sun, 13 Dec 2020 12:32:25 +0100 Subject: [PATCH 3/6] Enable sp kernel and ssp kernel to NOT use fcsp. 
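A minimal usage sketch (the 'dataset' object is assumed to be loaded beforehand,
as in gklearn/tests/test_graph_kernels.py; the sub-kernel setup below mirrors
those tests). 'fcsp' defaults to True, so existing code keeps the fast
computation of shortest path (FCSP) scheme; passing fcsp=False evaluates the
node kernels naively, pair by pair, and should yield an identical Gram matrix:

    import functools
    import multiprocessing
    from gklearn.kernels import ShortestPath
    from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct

    # Sub-kernels for symbolic labels, non-symbolic attributes, and both mixed.
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}

    graph_kernel = ShortestPath(node_labels=dataset.node_labels,
                                node_attrs=dataset.node_attrs,
                                ds_infos=dataset.get_dataset_infos(keys=['directed']),
                                fcsp=False,  # True (the default) keeps FCSP.
                                node_kernels=sub_kernels)
    gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
                                                 parallel=None,
                                                 n_jobs=multiprocessing.cpu_count(),
                                                 verbose=True)

The updated tests build the Gram matrix once with fcsp=True and once with
fcsp=False and assert equality via np.array_equal.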
--- gklearn/kernels/graph_kernel.py | 107 +++++++++-------- gklearn/kernels/metadata.py | 2 +- gklearn/kernels/shortest_path.py | 150 ++++++++++++++++------- gklearn/kernels/structural_sp.py | 232 ++++++++++++++++++++++++++++-------- gklearn/tests/test_graph_kernels.py | 133 +++++++++++++-------- 5 files changed, 426 insertions(+), 198 deletions(-) diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index d263828..e9a4032 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -9,10 +9,11 @@ import numpy as np import networkx as nx import multiprocessing import time +from gklearn.utils import normalize_gram_matrix class GraphKernel(object): - + def __init__(self): self._graphs = None self._parallel = '' @@ -22,14 +23,14 @@ class GraphKernel(object): self._run_time = 0 self._gram_matrix = None self._gram_matrix_unnorm = None - + def compute(self, *graphs, **kwargs): self._parallel = kwargs.get('parallel', 'imap_unordered') self._n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) self._normalize = kwargs.get('normalize', True) self._verbose = kwargs.get('verbose', 2) - + if len(graphs) == 1: if not isinstance(graphs[0], list): raise Exception('Cannot detect graphs.') @@ -40,9 +41,9 @@ class GraphKernel(object): self._gram_matrix = self._compute_gram_matrix() self._gram_matrix_unnorm = np.copy(self._gram_matrix) if self._normalize: - self._gram_matrix = self.normalize_gm(self._gram_matrix) + self._gram_matrix = normalize_gram_matrix(self._gram_matrix) return self._gram_matrix, self._run_time - + elif len(graphs) == 2: if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): kernel = self._compute_single_kernel(graphs[0].copy(), graphs[1].copy()) @@ -59,14 +60,14 @@ class GraphKernel(object): return kernel_list, self._run_time else: raise Exception('Cannot detect graphs.') - + elif len(graphs) == 0 and self._graphs is None: raise Exception('Please add graphs before computing.') - + else: raise Exception('Cannot detect graphs.') - - + + def normalize_gm(self, gram_matrix): import warnings warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning) @@ -77,8 +78,8 @@ class GraphKernel(object): gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) gram_matrix[j][i] = gram_matrix[i][j] return gram_matrix - - + + def compute_distance_matrix(self): if self._gram_matrix is None: raise Exception('Please compute the Gram matrix before computing distance matrix.') @@ -97,98 +98,98 @@ class GraphKernel(object): dis_min = np.min(np.min(dis_mat[dis_mat != 0])) dis_mean = np.mean(np.mean(dis_mat)) return dis_mat, dis_max, dis_min, dis_mean - - + + def _compute_gram_matrix(self): start_time = time.time() - + if self._parallel == 'imap_unordered': gram_matrix = self._compute_gm_imap_unordered() elif self._parallel is None: gram_matrix = self._compute_gm_series() else: raise Exception('Parallel mode is not set correctly.') - + self._run_time = time.time() - start_time if self._verbose: print('Gram matrix of size %d built in %s seconds.' 
% (len(self._graphs), self._run_time)) - + return gram_matrix - - + + def _compute_gm_series(self): pass def _compute_gm_imap_unordered(self): pass - - + + def _compute_kernel_list(self, g1, g_list): start_time = time.time() - + if self._parallel == 'imap_unordered': kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) elif self._parallel is None: kernel_list = self._compute_kernel_list_series(g1, g_list) else: raise Exception('Parallel mode is not set correctly.') - + self._run_time = time.time() - start_time if self._verbose: print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.' % (len(g_list), self._run_time)) - + return kernel_list - + def _compute_kernel_list_series(self, g1, g_list): pass - + def _compute_kernel_list_imap_unordered(self, g1, g_list): pass - - + + def _compute_single_kernel(self, g1, g2): start_time = time.time() - + kernel = self._compute_single_kernel_series(g1, g2) - + self._run_time = time.time() - start_time if self._verbose: print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time)) - + return kernel - - + + def _compute_single_kernel_series(self, g1, g2): pass - - + + def is_graph(self, graph): if isinstance(graph, nx.Graph): return True if isinstance(graph, nx.DiGraph): - return True + return True if isinstance(graph, nx.MultiGraph): - return True + return True if isinstance(graph, nx.MultiDiGraph): - return True + return True return False - - + + @property def graphs(self): return self._graphs - - + + @property def parallel(self): return self._parallel - - + + @property def n_jobs(self): return self._n_jobs @@ -197,30 +198,30 @@ class GraphKernel(object): @property def verbose(self): return self._verbose - - + + @property def normalize(self): return self._normalize - - + + @property def run_time(self): return self._run_time - - + + @property def gram_matrix(self): return self._gram_matrix - + @gram_matrix.setter def gram_matrix(self, value): self._gram_matrix = value - - + + @property def gram_matrix_unnorm(self): - return self._gram_matrix_unnorm + return self._gram_matrix_unnorm @gram_matrix_unnorm.setter def gram_matrix_unnorm(self, value): diff --git a/gklearn/kernels/metadata.py b/gklearn/kernels/metadata.py index d00d5d7..188fc56 100644 --- a/gklearn/kernels/metadata.py +++ b/gklearn/kernels/metadata.py @@ -12,7 +12,7 @@ GRAPH_KERNELS = { 'common walk': '', 'marginalized': '', 'sylvester equation': '', - 'fixed_point': '', + 'fixed point': '', 'conjugate gradient': '', 'spectral decomposition': '', ### based on paths. diff --git a/gklearn/kernels/shortest_path.py b/gklearn/kernels/shortest_path.py index 794095e..3a29423 100644 --- a/gklearn/kernels/shortest_path.py +++ b/gklearn/kernels/shortest_path.py @@ -5,9 +5,9 @@ Created on Tue Apr 7 15:24:58 2020 @author: ljia -@references: - - [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData +@references: + + [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE. 
""" @@ -23,13 +23,14 @@ from gklearn.kernels import GraphKernel class ShortestPath(GraphKernel): - + def __init__(self, **kwargs): GraphKernel.__init__(self) self._node_labels = kwargs.get('node_labels', []) self._node_attrs = kwargs.get('node_attrs', []) self._edge_weight = kwargs.get('edge_weight', None) self._node_kernels = kwargs.get('node_kernels', None) + self._fcsp = kwargs.get('fcsp', True) self._ds_infos = kwargs.get('ds_infos', {}) @@ -40,10 +41,10 @@ class ShortestPath(GraphKernel): else: iterator = self._graphs self._graphs = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] - + # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: @@ -54,10 +55,10 @@ class ShortestPath(GraphKernel): kernel = self._sp_do(self._graphs[i], self._graphs[j]) gram_matrix[i][j] = kernel gram_matrix[j][i] = kernel - + return gram_matrix - - + + def _compute_gm_imap_unordered(self): # get shortest path graph of each graph. pool = Pool(self._n_jobs) @@ -76,20 +77,20 @@ class ShortestPath(GraphKernel): self._graphs[i] = g pool.close() pool.join() - + # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - + def init_worker(gs_toshare): global G_gs G_gs = gs_toshare do_fun = self._wrapper_sp_do - parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) - + return gram_matrix - - + + def _compute_kernel_list_series(self, g1, g_list): # get shortest path graphs of g1 and each graph in g_list. g1 = getSPGraph(g1, edge_weight=self._edge_weight) @@ -98,7 +99,7 @@ class ShortestPath(GraphKernel): else: iterator = g_list g_list = [getSPGraph(g, edge_weight=self._edge_weight) for g in iterator] - + # compute kernel list. kernel_list = [None] * len(g_list) if self._verbose >= 2: @@ -108,10 +109,10 @@ class ShortestPath(GraphKernel): for i in iterator: kernel = self._sp_do(g1, g_list[i]) kernel_list[i] = kernel - + return kernel_list - - + + def _compute_kernel_list_imap_unordered(self, g1, g_list): # get shortest path graphs of g1 and each graph in g_list. g1 = getSPGraph(g1, edge_weight=self._edge_weight) @@ -131,49 +132,57 @@ class ShortestPath(GraphKernel): g_list[i] = g pool.close() pool.join() - + # compute Gram matrix. 
kernel_list = [None] * len(g_list) def init_worker(g1_toshare, gl_toshare): global G_g1, G_gl - G_g1 = g1_toshare - G_gl = gl_toshare + G_g1 = g1_toshare + G_gl = gl_toshare do_fun = self._wrapper_kernel_list_do - def func_assign(result, var_to_assign): + def func_assign(result, var_to_assign): var_to_assign[result[0]] = result[1] itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) - + return kernel_list - - + + def _wrapper_kernel_list_do(self, itr): return itr, self._sp_do(G_g1, G_gl[itr]) - - + + def _compute_single_kernel_series(self, g1, g2): g1 = getSPGraph(g1, edge_weight=self._edge_weight) g2 = getSPGraph(g2, edge_weight=self._edge_weight) kernel = self._sp_do(g1, g2) - return kernel - - + return kernel + + def _wrapper_get_sp_graphs(self, itr_item): g = itr_item[0] i = itr_item[1] return i, getSPGraph(g, edge_weight=self._edge_weight) - - + + def _sp_do(self, g1, g2): - + + if self._fcsp: # @todo: it may be put outside the _sp_do(). + return self._sp_do_fcsp(g1, g2) + else: + return self._sp_do_naive(g1, g2) + + + def _sp_do_fcsp(self, g1, g2): + kernel = 0 - + # compute shortest path matrices first, method borrowed from FCSP. vk_dict = {} # shortest path matrices dict - if len(self._node_labels) > 0: + if len(self._node_labels) > 0: # @todo: it may be put outside the _sp_do(). # node symb and non-synb labeled if len(self._node_attrs) > 0: kn = self._node_kernels['mix'] @@ -208,7 +217,7 @@ class ShortestPath(GraphKernel): if e1[2]['cost'] == e2[2]['cost']: kernel += 1 return kernel - + # compute graph kernels if self._ds_infos['directed']: for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): @@ -225,7 +234,7 @@ class ShortestPath(GraphKernel): kn1 = nk11 * nk22 kn2 = nk12 * nk21 kernel += kn1 + kn2 - + # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation # # compute vertex kernels # try: @@ -238,7 +247,7 @@ class ShortestPath(GraphKernel): # vk_mat[i1][i2] = kn( # n1[1][node_label], n2[1][node_label], # [n1[1]['attributes']], [n2[1]['attributes']]) - + # range1 = range(0, len(edge_w_g[i])) # range2 = range(0, len(edge_w_g[j])) # for i1 in range1: @@ -254,10 +263,67 @@ class ShortestPath(GraphKernel): # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] # kernel += kn1 + kn2 - + return kernel - - + + + def _sp_do_naive(self, g1, g2): + + kernel = 0 + + # Define the function to compute kernels between vertices in each condition. 
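+ # A single compute_vk() is chosen per graph pair: 'mix' when both symbolic
+ # labels and non-symbolic attributes are present, 'symb' or 'nsymb' when
+ # only one kind is, and no vertex kernel at all for unlabeled nodes, in
+ # which case only the shortest-path edge costs are compared.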
+ if len(self._node_labels) > 0:
+ # node symb and non-symb labeled
+ if len(self._node_attrs) > 0:
+ def compute_vk(n1, n2):
+ kn = self._node_kernels['mix']
+ n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+ n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+ n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+ n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+ return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
+ # node symb labeled
+ else:
+ def compute_vk(n1, n2):
+ kn = self._node_kernels['symb']
+ n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+ n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+ return kn(n1_labels, n2_labels)
+ else:
+ # node non-symb labeled
+ if len(self._node_attrs) > 0:
+ def compute_vk(n1, n2):
+ kn = self._node_kernels['nsymb']
+ n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+ n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+ return kn(n1_attrs, n2_attrs)
+ # node unlabeled
+ else:
+ for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+ if e1[2]['cost'] == e2[2]['cost']:
+ kernel += 1
+ return kernel
+
+ # compute graph kernels
+ if self._ds_infos['directed']:
+ for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+ if e1[2]['cost'] == e2[2]['cost']:
+ nk11, nk22 = compute_vk(e1[0], e2[0]), compute_vk(e1[1], e2[1])
+ kn1 = nk11 * nk22
+ kernel += kn1
+ else:
+ for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+ if e1[2]['cost'] == e2[2]['cost']:
+ # each edge walk is counted twice, starting from both its extreme nodes.
+ nk11, nk12, nk21, nk22 = compute_vk(e1[0], e2[0]), compute_vk(
+ e1[0], e2[1]), compute_vk(e1[1], e2[0]), compute_vk(e1[1], e2[1])
+ kn1 = nk11 * nk22
+ kn2 = nk12 * nk21
+ kernel += kn1 + kn2
+
+ return kernel
+
+
 def _wrapper_sp_do(self, itr):
 i = itr[0]
 j = itr[1]
diff --git a/gklearn/kernels/structural_sp.py b/gklearn/kernels/structural_sp.py
index 19322a7..1464807 100644
--- a/gklearn/kernels/structural_sp.py
+++ b/gklearn/kernels/structural_sp.py
@@ -5,9 +5,9 @@ Created on Mon Mar 30 11:59:57 2020
 
 @author: ljia
 
-@references:
+@references:
 
- [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
+ [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For
 Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360).
 """
 import sys
@@ -23,7 +23,7 @@ from gklearn.kernels import GraphKernel
 
 class StructuralSP(GraphKernel):
-
+
 def __init__(self, **kwargs):
 GraphKernel.__init__(self)
 self._node_labels = kwargs.get('node_labels', [])
@@ -34,6 +34,7 @@ class StructuralSP(GraphKernel):
 self._node_kernels = kwargs.get('node_kernels', None)
 self._edge_kernels = kwargs.get('edge_kernels', None)
 self._compute_method = kwargs.get('compute_method', 'naive')
+ self._fcsp = kwargs.get('fcsp', True)
 self._ds_infos = kwargs.get('ds_infos', {})
 
@@ -50,10 +51,10 @@ class StructuralSP(GraphKernel):
 else:
 for g in iterator:
 splist.append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']))
-
+
 # compute Gram matrix.
 gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))
-
+
 from itertools import combinations_with_replacement
 itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
 if self._verbose >= 2:
@@ -72,10 +73,10 @@ class StructuralSP(GraphKernel):
 # print("error here ")
 gram_matrix[i][j] = kernel
 gram_matrix[j][i] = kernel
-
+
 return gram_matrix
-
-
+
+
 def _compute_gm_imap_unordered(self):
 # get shortest paths of each graph in the graphs.
splist = [None] * len(self._graphs) @@ -87,9 +88,9 @@ class StructuralSP(GraphKernel): chunksize = 100 # get shortest path graphs of self._graphs if self._compute_method == 'trie': - get_sps_fun = self._wrapper_get_sps_trie + get_sps_fun = self._wrapper_get_sps_trie else: - get_sps_fun = self._wrapper_get_sps_naive + get_sps_fun = self._wrapper_get_sps_naive if self.verbose >= 2: iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize), desc='getting shortest paths', file=sys.stdout) @@ -99,24 +100,24 @@ class StructuralSP(GraphKernel): splist[i] = sp pool.close() pool.join() - + # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) def init_worker(spl_toshare, gs_toshare): global G_spl, G_gs G_spl = spl_toshare - G_gs = gs_toshare - if self._compute_method == 'trie': + G_gs = gs_toshare + if self._compute_method == 'trie': do_fun = self._wrapper_ssp_do_trie - else: - do_fun = self._wrapper_ssp_do_naive - parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + else: + do_fun = self._wrapper_ssp_do_naive + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, glbv=(splist, self._graphs), n_jobs=self._n_jobs, verbose=self._verbose) - + return gram_matrix - - + + def _compute_kernel_list_series(self, g1, g_list): # get shortest paths of g1 and each graph in g_list. sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) @@ -131,7 +132,7 @@ class StructuralSP(GraphKernel): else: for g in iterator: splist.append(get_shortest_paths(g, self._edge_weight, self._ds_infos['directed'])) - + # compute kernel list. kernel_list = [None] * len(g_list) if self._verbose >= 2: @@ -146,10 +147,10 @@ class StructuralSP(GraphKernel): for i in iterator: kernel = self._ssp_do_naive(g1, g_list[i], sp1, splist[i]) kernel_list[i] = kernel - + return kernel_list - - + + def _compute_kernel_list_imap_unordered(self, g1, g_list): # get shortest paths of g1 and each graph in g_list. sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) @@ -162,9 +163,9 @@ class StructuralSP(GraphKernel): chunksize = 100 # get shortest path graphs of g_list if self._compute_method == 'trie': - get_sps_fun = self._wrapper_get_sps_trie + get_sps_fun = self._wrapper_get_sps_trie else: - get_sps_fun = self._wrapper_get_sps_naive + get_sps_fun = self._wrapper_get_sps_naive if self.verbose >= 2: iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize), desc='getting shortest paths', file=sys.stdout) @@ -174,7 +175,7 @@ class StructuralSP(GraphKernel): splist[i] = sp pool.close() pool.join() - + # compute Gram matrix. 
kernel_list = [None] * len(g_list) @@ -182,27 +183,27 @@ class StructuralSP(GraphKernel): global G_sp1, G_spl, G_g1, G_gl G_sp1 = sp1_toshare G_spl = spl_toshare - G_g1 = g1_toshare - G_gl = gl_toshare - if self._compute_method == 'trie': + G_g1 = g1_toshare + G_gl = gl_toshare + if self._compute_method == 'trie': do_fun = self._wrapper_ssp_do_trie - else: + else: do_fun = self._wrapper_kernel_list_do - def func_assign(result, var_to_assign): + def func_assign(result, var_to_assign): var_to_assign[result[0]] = result[1] itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) - + return kernel_list - - + + def _wrapper_kernel_list_do(self, itr): return itr, self._ssp_do_naive(G_g1, G_gl[itr], G_sp1, G_spl[itr]) - - + + def _compute_single_kernel_series(self, g1, g2): sp1 = get_shortest_paths(g1, self._edge_weight, self._ds_infos['directed']) sp2 = get_shortest_paths(g2, self._edge_weight, self._ds_infos['directed']) @@ -210,26 +211,33 @@ class StructuralSP(GraphKernel): kernel = self._ssp_do_trie(g1, g2, sp1, sp2) else: kernel = self._ssp_do_naive(g1, g2, sp1, sp2) - return kernel - - + return kernel + + def _wrapper_get_sps_naive(self, itr_item): g = itr_item[0] i = itr_item[1] return i, get_shortest_paths(g, self._edge_weight, self._ds_infos['directed']) - - + + def _ssp_do_naive(self, g1, g2, spl1, spl2): - + if self._fcsp: # @todo: it may be put outside the _sp_do(). + return self._sp_do_naive_fcsp(g1, g2, spl1, spl2) + else: + return self._sp_do_naive_naive(g1, g2, spl1, spl2) + + + def _sp_do_naive_fcsp(self, g1, g2, spl1, spl2): + kernel = 0 - + # First, compute shortest path matrices, method borrowed from FCSP. vk_dict = self._get_all_node_kernels(g1, g2) # Then, compute kernels between all pairs of edges, which is an idea of # extension of FCSP. It suits sparse graphs, which is the most case we # went though. For dense graphs, this would be slow. ek_dict = self._get_all_edge_kernels(g1, g2) - + # compute graph kernels if vk_dict: if ek_dict: @@ -279,7 +287,7 @@ class StructuralSP(GraphKernel): print(g1.nodes(data=True)) print(g1.edges(data=True)) raise Exception - + # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation # # compute vertex kernel matrix # try: @@ -292,7 +300,7 @@ class StructuralSP(GraphKernel): # vk_mat[i1][i2] = kn( # n1[1][node_label], n2[1][node_label], # [n1[1]['attributes']], [n2[1]['attributes']]) - + # range1 = range(0, len(edge_w_g[i])) # range2 = range(0, len(edge_w_g[j])) # for i1 in range1: @@ -309,18 +317,136 @@ class StructuralSP(GraphKernel): # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] # Kmatrix += kn1 + kn2 return kernel - - + + + def _sp_do_naive_naive(self, g1, g2, spl1, spl2): + + kernel = 0 + + # Define the function to compute kernels between vertices in each condition. 
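+ # Unlike the FCSP variant above, which precomputes all pairwise vertex and
+ # edge kernels into vk_dict/ek_dict, the naive variant below builds small
+ # compute_vk()/compute_ek() closures and evaluates them lazily, pair by
+ # pair, while walking the two shortest-path lists.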
+ if len(self._node_labels) > 0:
+ # node symb and non-symb labeled
+ if len(self._node_attrs) > 0:
+ def compute_vk(n1, n2):
+ kn = self._node_kernels['mix']
+ n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+ n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+ n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+ n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+ return kn(n1_labels, n2_labels, n1_attrs, n2_attrs)
+ # node symb labeled
+ else:
+ def compute_vk(n1, n2):
+ kn = self._node_kernels['symb']
+ n1_labels = [g1.nodes[n1][nl] for nl in self._node_labels]
+ n2_labels = [g2.nodes[n2][nl] for nl in self._node_labels]
+ return kn(n1_labels, n2_labels)
+ else:
+ # node non-symb labeled
+ if len(self._node_attrs) > 0:
+ def compute_vk(n1, n2):
+ kn = self._node_kernels['nsymb']
+ n1_attrs = [g1.nodes[n1][na] for na in self._node_attrs]
+ n2_attrs = [g2.nodes[n2][na] for na in self._node_attrs]
+ return kn(n1_attrs, n2_attrs)
+# # node unlabeled
+# else:
+# for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
+# if e1[2]['cost'] == e2[2]['cost']:
+# kernel += 1
+# return kernel
+
+ # Define the function to compute kernels between edges in each condition.
+ if len(self._edge_labels) > 0:
+ # edge symb and non-symb labeled
+ if len(self._edge_attrs) > 0:
+ def compute_ek(e1, e2):
+ ke = self._edge_kernels['mix']
+ e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
+ e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
+ e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
+ e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
+ return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
+ # edge symb labeled
+ else:
+ def compute_ek(e1, e2):
+ ke = self._edge_kernels['symb']
+ e1_labels = [g1.edges[e1][el] for el in self._edge_labels]
+ e2_labels = [g2.edges[e2][el] for el in self._edge_labels]
+ return ke(e1_labels, e2_labels)
+ else:
+ # edge non-symb labeled
+ if len(self._edge_attrs) > 0:
+ def compute_ek(e1, e2):
+ ke = self._edge_kernels['nsymb']
+ e1_attrs = [g1.edges[e1][ea] for ea in self._edge_attrs]
+ e2_attrs = [g2.edges[e2][ea] for ea in self._edge_attrs]
+ return ke(e1_attrs, e2_attrs)
+
+
+ # compute graph kernels
+ if len(self._node_labels) > 0 or len(self._node_attrs) > 0:
+ if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
+ for p1, p2 in product(spl1, spl2):
+ if len(p1) == len(p2):
+ kpath = compute_vk(p1[0], p2[0])
+ if kpath:
+ for idx in range(1, len(p1)):
+ kpath *= compute_vk(p1[idx], p2[idx]) * \
+ compute_ek((p1[idx-1], p1[idx]),
+ (p2[idx-1], p2[idx]))
+ if not kpath:
+ break
+ kernel += kpath # add up kernels of all paths
+ else:
+ for p1, p2 in product(spl1, spl2):
+ if len(p1) == len(p2):
+ kpath = compute_vk(p1[0], p2[0])
+ if kpath:
+ for idx in range(1, len(p1)):
+ kpath *= compute_vk(p1[idx], p2[idx])
+ if not kpath:
+ break
+ kernel += kpath # add up kernels of all paths
+ else:
+ if len(self._edge_labels) > 0 or len(self._edge_attrs) > 0:
+ for p1, p2 in product(spl1, spl2):
+ if len(p1) == len(p2):
+ if len(p1) == 0:
+ kernel += 1
+ else:
+ kpath = 1
+ for idx in range(0, len(p1) - 1):
+ kpath *= compute_ek((p1[idx], p1[idx+1]),
+ (p2[idx], p2[idx+1]))
+ if not kpath:
+ break
+ kernel += kpath # add up kernels of all paths
+ else:
+ for p1, p2 in product(spl1, spl2):
+ if len(p1) == len(p2):
+ kernel += 1
+ try:
+ kernel = kernel / (len(spl1) * len(spl2)) # average over all pairs of paths
+ except ZeroDivisionError:
+ print(spl1, spl2)
+ print(g1.nodes(data=True))
+ print(g1.edges(data=True))
raise Exception + + return kernel + + def _wrapper_ssp_do_naive(self, itr): i = itr[0] j = itr[1] return i, j, self._ssp_do_naive(G_gs[i], G_gs[j], G_spl[i], G_spl[j]) - - + + def _get_all_node_kernels(self, g1, g2): return compute_vertex_kernels(g1, g2, self._node_kernels, node_labels=self._node_labels, node_attrs=self._node_attrs) - - + + def _get_all_edge_kernels(self, g1, g2): # compute kernels between all pairs of edges, which is an idea of # extension of FCSP. It suits sparse graphs, which is the most case we @@ -368,5 +494,5 @@ class StructuralSP(GraphKernel): # edge unlabeled else: pass - + return ek_dict \ No newline at end of file diff --git a/gklearn/tests/test_graph_kernels.py b/gklearn/tests/test_graph_kernels.py index 061b17c..a97635a 100644 --- a/gklearn/tests/test_graph_kernels.py +++ b/gklearn/tests/test_graph_kernels.py @@ -3,13 +3,14 @@ import pytest import multiprocessing +import numpy as np def chooseDataset(ds_name): """Choose dataset according to name. """ from gklearn.utils import Dataset - + dataset = Dataset() # no node labels (and no edge labels). @@ -46,9 +47,9 @@ def chooseDataset(ds_name): elif ds_name == 'Cuneiform': dataset.load_predefined_dataset(ds_name) dataset.trim_dataset(edge_required=True) - + dataset.cut_graphs(range(0, 3)) - + return dataset @@ -57,7 +58,7 @@ def test_list_graph_kernels(): """ from gklearn.kernels import GRAPH_KERNELS, list_of_graph_kernels assert list_of_graph_kernels() == [i for i in GRAPH_KERNELS] - + @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) @@ -68,10 +69,10 @@ def test_CommonWalk(ds_name, parallel, weight, compute_method): """ from gklearn.kernels import CommonWalk import networkx as nx - + dataset = chooseDataset(ds_name) dataset.load_graphs([g for g in dataset.graphs if nx.number_of_nodes(g) > 1]) - + try: graph_kernel = CommonWalk(node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, @@ -87,8 +88,8 @@ def test_CommonWalk(ds_name, parallel, weight, compute_method): except Exception as exception: assert False, exception - - + + @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) @pytest.mark.parametrize('remove_totters', [False]) #[True, False]) @pytest.mark.parametrize('parallel', ['imap_unordered', None]) @@ -96,9 +97,9 @@ def test_Marginalized(ds_name, parallel, remove_totters): """Test marginalized kernel. """ from gklearn.kernels import Marginalized - + dataset = chooseDataset(ds_name) - + try: graph_kernel = Marginalized(node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, @@ -115,15 +116,15 @@ def test_Marginalized(ds_name, parallel, remove_totters): except Exception as exception: assert False, exception - - + + @pytest.mark.parametrize('ds_name', ['Acyclic']) @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_SylvesterEquation(ds_name, parallel): """Test sylvester equation kernel. 
""" from gklearn.kernels import SylvesterEquation - + dataset = chooseDataset(ds_name) try: @@ -139,11 +140,11 @@ def test_SylvesterEquation(ds_name, parallel): parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) - + except Exception as exception: assert False, exception - - + + @pytest.mark.parametrize('ds_name', ['Acyclic', 'AIDS']) @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_ConjugateGradient(ds_name, parallel): @@ -152,9 +153,9 @@ def test_ConjugateGradient(ds_name, parallel): from gklearn.kernels import ConjugateGradient from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct import functools - + dataset = chooseDataset(ds_name) - + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} @@ -177,11 +178,11 @@ def test_ConjugateGradient(ds_name, parallel): parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) - + except Exception as exception: assert False, exception - - + + @pytest.mark.parametrize('ds_name', ['Acyclic', 'AIDS']) @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_FixedPoint(ds_name, parallel): @@ -190,9 +191,9 @@ def test_FixedPoint(ds_name, parallel): from gklearn.kernels import FixedPoint from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct import functools - + dataset = chooseDataset(ds_name) - + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} @@ -215,11 +216,11 @@ def test_FixedPoint(ds_name, parallel): parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) - + except Exception as exception: assert False, exception - - + + @pytest.mark.parametrize('ds_name', ['Acyclic']) @pytest.mark.parametrize('sub_kernel', ['exp', 'geo']) @pytest.mark.parametrize('parallel', ['imap_unordered', None]) @@ -227,7 +228,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel, parallel): """Test spectral decomposition kernel. 
""" from gklearn.kernels import SpectralDecomposition - + dataset = chooseDataset(ds_name) try: @@ -244,11 +245,11 @@ def test_SpectralDecomposition(ds_name, sub_kernel, parallel): parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) - + except Exception as exception: assert False, exception - - + + # @pytest.mark.parametrize( # 'compute_method,ds_name,sub_kernel', # [ @@ -268,7 +269,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel, parallel): # from gklearn.kernels import RandomWalk # from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct # import functools -# +# # dataset = chooseDataset(ds_name) # mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) @@ -297,7 +298,7 @@ def test_SpectralDecomposition(ds_name, sub_kernel, parallel): # except Exception as exception: # assert False, exception - + @pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_ShortestPath(ds_name, parallel): @@ -306,23 +307,38 @@ def test_ShortestPath(ds_name, parallel): from gklearn.kernels import ShortestPath from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct import functools - + dataset = chooseDataset(ds_name) - + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} try: graph_kernel = ShortestPath(node_labels=dataset.node_labels, node_attrs=dataset.node_attrs, ds_infos=dataset.get_dataset_infos(keys=['directed']), + fcsp=True, node_kernels=sub_kernels) - gram_matrix, run_time = graph_kernel.compute(dataset.graphs, + gram_matrix1, run_time = graph_kernel.compute(dataset.graphs, + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + + graph_kernel = ShortestPath(node_labels=dataset.node_labels, + node_attrs=dataset.node_attrs, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + fcsp=False, + node_kernels=sub_kernels) + gram_matrix2, run_time = graph_kernel.compute(dataset.graphs, parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + assert np.array_equal(gram_matrix1, gram_matrix2) + except Exception as exception: assert False, exception @@ -336,26 +352,44 @@ def test_StructuralSP(ds_name, parallel): from gklearn.kernels import StructuralSP from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct import functools - + dataset = chooseDataset(ds_name) - + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} try: graph_kernel = StructuralSP(node_labels=dataset.node_labels, - edge_labels=dataset.edge_labels, + 
edge_labels=dataset.edge_labels, node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs, ds_infos=dataset.get_dataset_infos(keys=['directed']), + fcsp=True, node_kernels=sub_kernels, edge_kernels=sub_kernels) - gram_matrix, run_time = graph_kernel.compute(dataset.graphs, + gram_matrix1, run_time = graph_kernel.compute(dataset.graphs, + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + + graph_kernel = StructuralSP(node_labels=dataset.node_labels, + edge_labels=dataset.edge_labels, + node_attrs=dataset.node_attrs, + edge_attrs=dataset.edge_attrs, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + fcsp=False, + node_kernels=sub_kernels, + edge_kernels=sub_kernels) + gram_matrix2, run_time = graph_kernel.compute(dataset.graphs, parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:], parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1], parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + assert np.array_equal(gram_matrix1, gram_matrix2) + except Exception as exception: assert False, exception @@ -369,9 +403,9 @@ def test_PathUpToH(ds_name, parallel, k_func, compute_method): """Test path kernel up to length $h$. """ from gklearn.kernels import PathUpToH - + dataset = chooseDataset(ds_name) - + try: graph_kernel = PathUpToH(node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, @@ -385,8 +419,8 @@ def test_PathUpToH(ds_name, parallel, k_func, compute_method): parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) except Exception as exception: assert False, exception - - + + @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) @pytest.mark.parametrize('parallel', ['imap_unordered', None]) def test_Treelet(ds_name, parallel): @@ -395,10 +429,10 @@ def test_Treelet(ds_name, parallel): from gklearn.kernels import Treelet from gklearn.utils.kernels import polynomialkernel import functools - + dataset = chooseDataset(ds_name) - pkernel = functools.partial(polynomialkernel, d=2, c=1e5) + pkernel = functools.partial(polynomialkernel, d=2, c=1e5) try: graph_kernel = Treelet(node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, @@ -412,8 +446,8 @@ def test_Treelet(ds_name, parallel): parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) except Exception as exception: assert False, exception - - + + @pytest.mark.parametrize('ds_name', ['Acyclic']) #@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge']) # @pytest.mark.parametrize('base_kernel', ['subtree']) @@ -422,7 +456,7 @@ def test_WLSubtree(ds_name, parallel): """Test Weisfeiler-Lehman subtree kernel. 
""" from gklearn.kernels import WLSubtree - + dataset = chooseDataset(ds_name) try: @@ -438,12 +472,13 @@ def test_WLSubtree(ds_name, parallel): parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) except Exception as exception: assert False, exception - + if __name__ == "__main__": test_list_graph_kernels() # test_spkernel('Alkane', 'imap_unordered') # test_StructuralSP('Fingerprint_edge', 'imap_unordered') + test_StructuralSP('Acyclic', 'imap_unordered') # test_WLSubtree('Acyclic', 'imap_unordered') # test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered') # test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered') From e2af9432627ee64645e193d1f78d7c8b6b01548a Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Mon, 14 Dec 2020 10:28:53 +0100 Subject: [PATCH 4/6] [Very important!!!] fix bugs in ssp kernel functions, before this update symbolic/dicrete edge labels were ignored! --- gklearn/kernels/structural_sp.py | 4 +- gklearn/kernels/structuralspKernel.py | 194 +++++++++++++++++----------------- 2 files changed, 100 insertions(+), 98 deletions(-) diff --git a/gklearn/kernels/structural_sp.py b/gklearn/kernels/structural_sp.py index 1464807..ba98a6c 100644 --- a/gklearn/kernels/structural_sp.py +++ b/gklearn/kernels/structural_sp.py @@ -252,6 +252,7 @@ class StructuralSP(GraphKernel): if not kpath: break kernel += kpath # add up kernels of all paths +# print(kernel, ',', p1, ',', p2) else: for p1, p2 in product(spl1, spl2): if len(p1) == len(p2): @@ -398,6 +399,7 @@ class StructuralSP(GraphKernel): if not kpath: break kernel += kpath # add up kernels of all paths +# print(kernel, ',', p1, ',', p2) else: for p1, p2 in product(spl1, spl2): if len(p1) == len(p2): @@ -495,4 +497,4 @@ class StructuralSP(GraphKernel): else: pass - return ek_dict \ No newline at end of file + return ek_dict \ No newline at end of file diff --git a/gklearn/kernels/structuralspKernel.py b/gklearn/kernels/structuralspKernel.py index cfafc8c..a1d2539 100644 --- a/gklearn/kernels/structuralspKernel.py +++ b/gklearn/kernels/structuralspKernel.py @@ -5,9 +5,9 @@ Created on Thu Sep 27 10:56:23 2018 @author: linlin -@references: +@references: - [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For + [1] Suard F, Rakotomamonjy A, Bensrhair A. Kernel on Bag of Paths For Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). """ @@ -43,7 +43,7 @@ def structuralspkernel(*args, ---------- Gn : List of NetworkX graph List of graphs between which the kernels are computed. - + G1, G2 : NetworkX graphs Two graphs between which the kernel is computed. @@ -51,25 +51,25 @@ def structuralspkernel(*args, Node attribute used as label. The default node label is atom. edge_weight : string - Edge attribute name corresponding to the edge weight. Applied for the + Edge attribute name corresponding to the edge weight. Applied for the computation of the shortest paths. edge_label : string Edge attribute used as label. The default edge label is bond_type. node_kernels : dict - A dictionary of kernel functions for nodes, including 3 items: 'symb' - for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' - for both labels. The first 2 functions take two node labels as + A dictionary of kernel functions for nodes, including 3 items: 'symb' + for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' + for both labels. 
 parameters, and the 'mix' function takes 4 parameters, a symbolic and a
 non-symbolic label for each of the two nodes. Each label is in the form of
 a 2-D array (n_samples, n_features). Each function returns a number as the
 kernel value. Ignored when nodes are unlabeled.

 edge_kernels : dict
- A dictionary of kernel functions for edges, including 3 items: 'symb'
- for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
- for both labels. The first 2 functions take two edge labels as
+ A dictionary of kernel functions for edges, including 3 items: 'symb'
+ for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
+ for both labels. The first 2 functions take two edge labels as
 parameters, and the 'mix' function takes 4 parameters, a symbolic and a
 non-symbolic label for each of the two edges. Each label is in the form of
 a 2-D array (n_samples, n_features). Each function returns a number
@@ -89,7 +89,7 @@ def structuralspkernel(*args,
 Return
 ------
 Kmatrix : Numpy matrix
- Kernel matrix, each element of which is the mean average structural
+ Kernel matrix, each element of which is the mean average structural
 shortest path kernel between 2 graphs.
 """
 # pre-process
@@ -135,9 +135,9 @@ def structuralspkernel(*args,
 chunksize = 100
 # get shortest path graphs of Gn
 if compute_method == 'trie':
- getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed'])
+ getsp_partial = partial(wrapper_getSP_trie, weight, ds_attrs['is_directed'])
 else:
- getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed'])
+ getsp_partial = partial(wrapper_getSP_naive, weight, ds_attrs['is_directed'])
 if verbose:
 iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
 desc='getting shortest paths', file=sys.stdout)
@@ -161,17 +161,17 @@ def structuralspkernel(*args,
 else:
 for g in iterator:
 splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))
-
+
# ss = 0
# ss += sys.getsizeof(splist)
# for spss in splist:
# ss += sys.getsizeof(spss)
# for spp in spss:
# ss += sys.getsizeof(spp)
-
-
+
+
# time.sleep(20)
-
+
# # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)

 Kmatrix = np.zeros((len(Gn), len(Gn)))

- # ---- use pool.imap_unordered to parallel and track progress. ----
+ # ---- use pool.imap_unordered to parallel and track progress. ----
 if parallel == 'imap_unordered':
 def init_worker(spl_toshare, gs_toshare):
 global G_spl, G_gs
 G_spl = spl_toshare
- G_gs = gs_toshare
- if compute_method == 'trie':
- do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label,
- node_kernels, edge_kernels)
- parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
- glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
- else:
- do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
- node_kernels, edge_kernels)
- parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+ G_gs = gs_toshare
+ if compute_method == 'trie':
+ do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label, edge_label,
+ node_kernels, edge_kernels)
+ parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+ glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
+ else:
+ do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
+ node_kernels, edge_kernels)
+ parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
 glbv=(splist, Gn), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
 # ---- direct running, normally use single CPU core. ----
 elif parallel is None:
@@ -232,10 +232,10 @@ def structuralspkernel(*args,
# print("error here ")
 Kmatrix[i][j] = kernel
 Kmatrix[j][i] = kernel
-
+
# # ---- use pool.map to parallel. ----
# pool = Pool(n_jobs)
# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
@@ -249,7 +249,7 @@ def structuralspkernel(*args,
# pool.join()

# # ---- use pool.imap_unordered to parallel and track progress. ----
-# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
+# do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
# node_kernels, edge_kernels)
# itr = zip(combinations_with_replacement(Gn, 2),
# combinations_with_replacement(splist, 2),
@@ -282,7 +282,7 @@ def structuralspkernel(*args,

def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
 node_kernels, edge_kernels):
-
+
 kernel = 0

 # First, compute shortest path matrices, method borrowed from FCSP.
@@ -373,25 +373,25 @@ def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
 return kernel

-def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
+def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
 edge_kernels, itr):
 i = itr[0]
 j = itr[1]
- return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
- ds_attrs, node_label, edge_label,
+ return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
+ ds_attrs, node_label, edge_label,
 node_kernels, edge_kernels)
-
-
+
+
def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
 node_kernels, edge_kernels):
-
+
# # traverse all paths in graph1. Depth-first search is applied.
# def traverseBothTrie(root, trie2, kernel, pcurrent=[]):
# for key, node in root['children'].items():
# pcurrent.append(key)
# if node['isEndOfWord']:
# # print(node['count'])
# traverseTrie2(trie2.root, pcurrent, kernel,
# pcurrent=[])
# if node['children'] != {}:
# traverseBothTrie(node, trie2, kernel, pcurrent)
# else:
# del pcurrent[-1]
# if pcurrent != []:
# del pcurrent[-1]
-#
-#
-# # traverse all paths in graph2 and find out those that are not in
-# # graph1. Depth-first search is applied.
+#
+#
+# # traverse all paths in graph2 and find out those that are not in
+# # graph1. Depth-first search is applied.
# def traverseTrie2(root, p1, kernel, pcurrent=[]):
# for key, node in root['children'].items():
# pcurrent.append(key)
# if node['isEndOfWord']:
# # print(node['count'])
# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
# if node['children'] != {}:
# traverseTrie2(node, p1, kernel, pcurrent)
# else:
# del pcurrent[-1]
# if pcurrent != []:
# del pcurrent[-1]
@@ -415,8 +415,8 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
-#
-#
+#
+#
# kernel = [0]
#
# # First, compute shortest path matrices, method borrowed from FCSP.
@@ -437,7 +437,7 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
# pcurrent.append(key)
# if node['isEndOfWord']:
# # print(node['count'])
-# traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
+# traverseTrie2(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
# pcurrent=[])
# if node['children'] != {}:
# traverseBothTrie(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
# else:
# del pcurrent[-1]
# if pcurrent != []:
# del pcurrent[-1]
@@ -445,14 +445,14 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
-#
-#
-# # traverse all paths in graph2 and find out those that are not in
-# # graph1. Depth-first search is applied.
+#
+#
+# # traverse all paths in graph2 and find out those that are not in
+# # graph1. Depth-first search is applied.
# def traverseTrie2(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
# for key, node in root['children'].items():
# pcurrent.append(key)
# if node['isEndOfWord']:
# # print(node['count'])
# kernel[0] += computePathKernel(p1, pcurrent, vk_dict, ek_dict)
# if node['children'] != {}:
# traverseTrie2(node, p1, kernel, vk_dict, ek_dict, pcurrent)
# else:
# del pcurrent[-1]
# if pcurrent != []:
# del pcurrent[-1]
@@ -461,8 +461,8 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
-
-
+
+
 kernel = [0]

 # First, compute shortest path matrices, method borrowed from FCSP.
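The FCSP remark above is worth one concrete illustration: instead of re-evaluating the node sub-kernel for every pair of compared paths, all node-kernel values between the two graphs are computed once and then only looked up. A minimal sketch of the symbolic-label branch, mirroring getAllNodeKernels from this patch (the function name here is hypothetical):

    def precompute_node_kernels(g1, g2, node_kernels, node_label):
        # FCSP idea: evaluate the node sub-kernel once per node pair and
        # cache it; path comparisons then reduce to dictionary lookups.
        kn = node_kernels['symb']
        vk_dict = {}  # keyed by (node of g1, node of g2)
        for n1 in g1.nodes(data=True):
            for n2 in g2.nodes(data=True):
                vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label], n2[1][node_label])
        return vk_dict

With this cache, comparing two paths of length l costs l + 1 lookups instead of l + 1 kernel evaluations.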
@@ -483,20 +483,20 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label,
 if ek_dict:
 traverseBothTriee(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
 else:
- traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)
+ traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict)

 kernel = kernel[0] / (trie1[1] * trie2[1]) # Compute mean average

 return kernel

-def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels,
+def wrapper_ssp_do_trie(ds_attrs, node_label, edge_label, node_kernels,
 edge_kernels, itr):
 i = itr[0]
 j = itr[1]
- return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs,
+ return i, j, ssp_do_trie(G_gs[i], G_gs[j], G_spl[i], G_spl[j], ds_attrs,
 node_label, edge_label, node_kernels, edge_kernels)
-
+

def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):
 # compute shortest path matrices, method borrowed from FCSP.
@@ -528,7 +528,7 @@ def getAllNodeKernels(g1, g2, node_kernels, node_label, ds_attrs):
 # node unlabeled
 else:
 pass
-
+
 return vk_dict


def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):
@@ -573,17 +573,17 @@ def getAllEdgeKernels(g1, g2, edge_kernels, edge_label, ds_attrs):
 # edge unlabeled
 else:
 pass
-
- return ek_dict
-
-
+
+ return ek_dict
+
+
# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
 for key, node in root['children'].items():
 pcurrent.append(key)
 if node['isEndOfWord']:
# print(node['count'])
- traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
+ traverseTrie2m(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
 pcurrent=[])
 if node['children'] != {}:
 traverseBothTriem(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
 else:
 del pcurrent[-1]
@@ -591,14 +591,14 @@ def traverseBothTriem(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
 del pcurrent[-1]
 if pcurrent != []:
 del pcurrent[-1]
-
-
-# traverse all paths in graph2 and find out those that are not in
-# graph1. Depth-first search is applied.
+
+
+# traverse all paths in graph2 and find out those that are not in
+# graph1. Depth-first search is applied.
def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
 for key, node in root['children'].items():
 pcurrent.append(key)
- if node['isEndOfWord']:
+ if node['isEndOfWord']:
# print(node['count'])
 if len(p1) == len(pcurrent):
 kpath = vk_dict[(p1[0], pcurrent[0])]
@@ -616,7 +616,7 @@ def traverseTrie2m(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
 del pcurrent[-1]
 if pcurrent != []:
 del pcurrent[-1]
-
+

# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
@@ -624,7 +624,7 @@ def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
 pcurrent.append(key)
 if node['isEndOfWord']:
# print(node['count'])
- traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
+ traverseTrie2v(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
 pcurrent=[])
 if node['children'] != {}:
 traverseBothTriev(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
 else:
 del pcurrent[-1]
@@ -632,14 +632,14 @@ def traverseBothTriev(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
 del pcurrent[-1]
 if pcurrent != []:
 del pcurrent[-1]
-
-
-# traverse all paths in graph2 and find out those that are not in
-# graph1. Depth-first search is applied.
+
+
+# traverse all paths in graph2 and find out those that are not in
+# graph1. Depth-first search is applied.
def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
 for key, node in root['children'].items():
 pcurrent.append(key)
- if node['isEndOfWord']:
+ if node['isEndOfWord']:
# print(node['count'])
 if len(p1) == len(pcurrent):
 kpath = vk_dict[(p1[0], pcurrent[0])]
@@ -655,15 +655,15 @@ def traverseTrie2v(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
 del pcurrent[-1]
 if pcurrent != []:
 del pcurrent[-1]
-
-
+
+
# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
 for key, node in root['children'].items():
 pcurrent.append(key)
 if node['isEndOfWord']:
# print(node['count'])
- traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
+ traverseTrie2e(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
 pcurrent=[])
 if node['children'] != {}:
 traverseBothTriee(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
 else:
 del pcurrent[-1]
@@ -671,14 +671,14 @@ def traverseBothTriee(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
 del pcurrent[-1]
 if pcurrent != []:
 del pcurrent[-1]
-
-
-# traverse all paths in graph2 and find out those that are not in
-# graph1. Depth-first search is applied.
+
+
+# traverse all paths in graph2 and find out those that are not in
+# graph1. Depth-first search is applied.
def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
 for key, node in root['children'].items():
 pcurrent.append(key)
- if node['isEndOfWord']:
+ if node['isEndOfWord']:
# print(node['count'])
 if len(p1) == len(pcurrent):
 if len(p1) == 0:
@@ -697,15 +697,15 @@ def traverseTrie2e(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
 del pcurrent[-1]
 if pcurrent != []:
 del pcurrent[-1]
-
-
+
+
# traverse all paths in graph1. Depth-first search is applied.
def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
 for key, node in root['children'].items():
 pcurrent.append(key)
 if node['isEndOfWord']:
# print(node['count'])
- traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
+ traverseTrie2u(trie2.root, pcurrent, kernel, vk_dict, ek_dict,
 pcurrent=[])
 if node['children'] != {}:
 traverseBothTrieu(node, trie2, kernel, vk_dict, ek_dict, pcurrent)
 else:
 del pcurrent[-1]
@@ -713,14 +713,14 @@ def traverseBothTrieu(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]):
 del pcurrent[-1]
 if pcurrent != []:
 del pcurrent[-1]
-
-
-# traverse all paths in graph2 and find out those that are not in
-# graph1. Depth-first search is applied.
+
+
+# traverse all paths in graph2 and find out those that are not in
+# graph1. Depth-first search is applied.
def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
 for key, node in root['children'].items():
 pcurrent.append(key)
- if node['isEndOfWord']:
+ if node['isEndOfWord']:
# print(node['count'])
 if len(p1) == len(pcurrent):
 kernel[0] += 1
@@ -730,8 +730,8 @@ def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
 del pcurrent[-1]
 if pcurrent != []:
 del pcurrent[-1]
-
-
+
+
#def computePathKernel(p1, p2, vk_dict, ek_dict):
# kernel = 0
# if vk_dict:
@@ -771,7 +771,7 @@ def traverseTrie2u(root, p1, kernel, vk_dict, ek_dict, pcurrent=[]):
# else:
# if len(p1) == len(p2):
# kernel += 1
-#
+#
# return kernel


def get_shortest_paths(G, weight, directed):
@@ -804,7 +804,7 @@ def get_shortest_paths(G, weight, directed):
 # each edge walk is counted twice, starting from both its extreme nodes.
 if not directed:
 sp += [sptemp[::-1] for sptemp in spltemp]
-
+
 # add single nodes as length 0 paths.
 sp += [[n] for n in G.nodes()]
 return sp

@@ -849,7 +849,7 @@ def get_sps_as_trie(G, weight, directed):
 # each edge walk is counted twice, starting from both its extreme nodes.
 if not directed:
 sptrie.insertWord(sp[::-1])
-
+
 # add single nodes as length 0 paths.
 for n in G.nodes():
 sptrie.insertWord([n])

From 6e1372e8fab83e5f27c5198f06ad8f0f1d1d36bf Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Mon, 14 Dec 2020 10:43:33 +0100
Subject: [PATCH 5/6] Update: Check that all graphs have edge(s) in
 ShortestPath.

---
 gklearn/kernels/shortest_path.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/gklearn/kernels/shortest_path.py b/gklearn/kernels/shortest_path.py
index 3a29423..9e553a3 100644
--- a/gklearn/kernels/shortest_path.py
+++ b/gklearn/kernels/shortest_path.py
@@ -17,6 +17,7 @@ from itertools import product
 from multiprocessing import Pool
 from tqdm import tqdm
 import numpy as np
+import networkx as nx
 from gklearn.utils.parallel import parallel_gm, parallel_me
 from gklearn.utils.utils import getSPGraph
 from gklearn.kernels import GraphKernel
@@ -35,6 +36,7 @@ class ShortestPath(GraphKernel):


 def _compute_gm_series(self):
+ self._all_graphs_have_edges(self._graphs)
 # get shortest path graph of each graph.
 if self._verbose >= 2:
 iterator = tqdm(self._graphs, desc='getting sp graphs', file=sys.stdout)
@@ -60,6 +62,7 @@ class ShortestPath(GraphKernel):


 def _compute_gm_imap_unordered(self):
+ self._all_graphs_have_edges(self._graphs)
 # get shortest path graph of each graph.
 pool = Pool(self._n_jobs)
 get_sp_graphs_fun = self._wrapper_get_sp_graphs
@@ -92,6 +95,7 @@ class ShortestPath(GraphKernel):


 def _compute_kernel_list_series(self, g1, g_list):
+ self._all_graphs_have_edges([g1] + g_list)
 # get shortest path graphs of g1 and each graph in g_list.
 g1 = getSPGraph(g1, edge_weight=self._edge_weight)
 if self._verbose >= 2:
@@ -114,6 +118,7 @@ class ShortestPath(GraphKernel):


 def _compute_kernel_list_imap_unordered(self, g1, g_list):
+ self._all_graphs_have_edges([g1] + g_list)
 # get shortest path graphs of g1 and each graph in g_list.
 g1 = getSPGraph(g1, edge_weight=self._edge_weight)
 pool = Pool(self._n_jobs)
@@ -156,6 +161,7 @@ class ShortestPath(GraphKernel):


 def _compute_single_kernel_series(self, g1, g2):
+ self._all_graphs_have_edges([g1] + [g2])
 g1 = getSPGraph(g1, edge_weight=self._edge_weight)
 g2 = getSPGraph(g2, edge_weight=self._edge_weight)
 kernel = self._sp_do(g1, g2)
@@ -327,4 +333,10 @@ class ShortestPath(GraphKernel):
 def _wrapper_sp_do(self, itr):
 i = itr[0]
 j = itr[1]
- return i, j, self._sp_do(G_gs[i], G_gs[j])
\ No newline at end of file
+ return i, j, self._sp_do(G_gs[i], G_gs[j])
+
+
+ def _all_graphs_have_edges(self, graphs):
+ for G in graphs:
+ if nx.number_of_edges(G) == 0:
+ raise ValueError('Not all graphs have edges!!!')
\ No newline at end of file

From 35886083566c508ee074ac7ca150fe6d59c2e5a1 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Mon, 14 Dec 2020 10:45:34 +0100
Subject: [PATCH 6/6] Test: whether the sp / ssp kernels yield the same
 results when the fcsp method is turned off.
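In outline, the property under test is that the fcsp switch is a pure
speed-up. A minimal sketch, assuming the StructuralSP settings used in the
test below (the gram() and fcsp_invariant() helpers are hypothetical):

    import multiprocessing
    import numpy as np
    from gklearn.kernels import StructuralSP

    def gram(dataset, sub_kernels, fcsp):
        # identical configuration except for the fcsp flag
        gk = StructuralSP(node_labels=dataset.node_labels,
                          edge_labels=dataset.edge_labels,
                          node_attrs=dataset.node_attrs,
                          edge_attrs=dataset.edge_attrs,
                          ds_infos=dataset.get_dataset_infos(keys=['directed']),
                          fcsp=fcsp,
                          node_kernels=sub_kernels,
                          edge_kernels=sub_kernels)
        matrix, _ = gk.compute(dataset.graphs, parallel=None,
                               n_jobs=multiprocessing.cpu_count(),
                               verbose=False, normalize=False)
        return matrix

    def fcsp_invariant(dataset, sub_kernels):
        # turning the speed-up off must not change the result
        return np.array_equal(gram(dataset, sub_kernels, True),
                              gram(dataset, sub_kernels, False))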
---
 gklearn/tests/test_graph_kernels.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/gklearn/tests/test_graph_kernels.py b/gklearn/tests/test_graph_kernels.py
index a97635a..c2ff6ec 100644
--- a/gklearn/tests/test_graph_kernels.py
+++ b/gklearn/tests/test_graph_kernels.py
@@ -19,6 +19,7 @@ def chooseDataset(ds_name):
 dataset.trim_dataset(edge_required=False)
 irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}
 dataset.remove_labels(**irrelevant_labels)
+ dataset.cut_graphs(range(1, 10))
 # node symbolic labels.
 elif ds_name == 'Acyclic':
 dataset.load_predefined_dataset(ds_name)
@@ -337,11 +338,11 @@ def test_ShortestPath(ds_name, parallel):
 kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
 parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

- assert np.array_equal(gram_matrix1, gram_matrix2)
-
 except Exception as exception:
 assert False, exception

+ assert np.array_equal(gram_matrix1, gram_matrix2)
+

#@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint'])
@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform'])
@@ -367,11 +368,11 @@ def test_StructuralSP(ds_name, parallel):
 node_kernels=sub_kernels,
 edge_kernels=sub_kernels)
 gram_matrix1, run_time = graph_kernel.compute(dataset.graphs,
- parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
+ parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True, normalize=False)
 kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:],
- parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
+ parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
 kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
- parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
+ parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

 graph_kernel = StructuralSP(node_labels=dataset.node_labels,
 edge_labels=dataset.edge_labels,
@@ -382,17 +383,17 @@ def test_StructuralSP(ds_name, parallel):
 node_kernels=sub_kernels,
 edge_kernels=sub_kernels)
 gram_matrix2, run_time = graph_kernel.compute(dataset.graphs,
- parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
+ parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True, normalize=False)
 kernel_list, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1:],
- parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
+ parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
 kernel, run_time = graph_kernel.compute(dataset.graphs[0], dataset.graphs[1],
- parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)
-
- assert np.array_equal(gram_matrix1, gram_matrix2)
+ parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True)

 except Exception as exception:
 assert False, exception

+ assert np.array_equal(gram_matrix1, gram_matrix2)
+

 @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS'])
 @pytest.mark.parametrize('parallel', ['imap_unordered', None])
@@ -477,8 +478,10 @@ def test_WLSubtree(ds_name, parallel):
 if __name__ == "__main__":
 test_list_graph_kernels()
# test_spkernel('Alkane', 'imap_unordered')
+# test_ShortestPath('Alkane', 'imap_unordered')
# test_StructuralSP('Fingerprint_edge', 'imap_unordered')
- test_StructuralSP('Acyclic', 'imap_unordered')
+# test_StructuralSP('Alkane', None)
+# test_StructuralSP('Cuneiform', None)
# test_WLSubtree('Acyclic', 'imap_unordered')
# test_RandomWalk('Acyclic', 'sylvester', None, 'imap_unordered')
# test_RandomWalk('Acyclic', 'conjugate', None, 'imap_unordered')
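For completeness, the 'symb'/'nsymb'/'mix' sub-kernel dictionaries these tests
pass as node_kernels and edge_kernels (documented in PATCH 4/6 above) can be
assembled as follows. deltakernel and gaussiankernel live in
gklearn.utils.kernels; mixkernel below is a hand-rolled stand-in written to
match the 4-parameter 'mix' signature, not necessarily the helper the test
suite itself uses, and the gamma value is an assumed choice:

    import functools
    from gklearn.utils.kernels import deltakernel, gaussiankernel

    # Gaussian kernel on non-symbolic (continuous) attribute vectors.
    knsymb = functools.partial(gaussiankernel, gamma=0.5)

    def mixkernel(symb1, symb2, nsymb1, nsymb2):
        # 'mix' takes 4 parameters: a symbolic and a non-symbolic label for
        # each of the two nodes (or edges); here: Kronecker delta * Gaussian.
        return deltakernel(symb1, symb2) * knsymb(nsymb1, nsymb2)

    sub_kernels = {'symb': deltakernel, 'nsymb': knsymb, 'mix': mixkernel}

The same dictionary can be passed for both node_kernels and edge_kernels, as
the tests above do.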