diff --git a/gklearn/kernels/__init__.py b/gklearn/kernels/__init__.py index 73147ba..60fea24 100644 --- a/gklearn/kernels/__init__.py +++ b/gklearn/kernels/__init__.py @@ -10,3 +10,4 @@ __date__ = "November 2018" from gklearn.kernels.graph_kernel import GraphKernel from gklearn.kernels.structural_sp import StructuralSP from gklearn.kernels.shortest_path import ShortestPath +from gklearn.kernels.path_up_to_h import PathUpToH diff --git a/gklearn/kernels/path_up_to_h.py b/gklearn/kernels/path_up_to_h.py index 4b35463..26639cf 100644 --- a/gklearn/kernels/path_up_to_h.py +++ b/gklearn/kernels/path_up_to_h.py @@ -7,22 +7,23 @@ Created on Fri Apr 10 18:33:13 2020 @references: - [1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre - Baldi. Graph kernels for chemical informatics. Neural networks, - 18(8):1093–1110, 2005. + [1] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre + Baldi. Graph kernels for chemical informatics. Neural networks, + 18(8):1093–1110, 2005. """ import sys -from itertools import product -# from functools import partial from multiprocessing import Pool from tqdm import tqdm import numpy as np +import networkx as nx +from collections import Counter +from functools import partial from gklearn.utils.parallel import parallel_gm, parallel_me -from gklearn.utils.utils import getSPGraph from gklearn.kernels import GraphKernel +from gklearn.utils import Trie -class PathUpToH(GraphKernel): +class PathUpToH(GraphKernel): # @todo: add function for k_func == None def __init__(self, **kwargs): GraphKernel.__init__(self) @@ -35,231 +36,557 @@ class PathUpToH(GraphKernel): def _compute_gm_series(self): - # get shortest path graph of each graph. - if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='getting sp graphs', file=sys.stdout) - else: - iterator = self._graphs - self._graphs = [getSPGraph(g, edge_weight=self.__edge_weight) for g in iterator] - - # compute Gram matrix. - gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + self.__add_dummy_labels(self._graphs) from itertools import combinations_with_replacement - itr = combinations_with_replacement(range(0, len(self._graphs)), 2) + itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + iterator_ps = tqdm(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout) + iterator_kernel = tqdm(itr_kernel, desc='calculating kernels', file=sys.stdout) else: - iterator = itr - for i, j in iterator: - kernel = self.__sp_do_(self._graphs[i], self._graphs[j]) - gram_matrix[i][j] = kernel - gram_matrix[j][i] = kernel + iterator_ps = range(0, len(self._graphs)) + iterator_kernel = itr_kernel + + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + if self.__compute_method == 'trie': + all_paths = [self.__find_all_path_as_trie(self._graphs[i]) for i in iterator_ps] + for i, j in iterator_kernel: + kernel = self.__kernel_do_trie(all_paths[i], all_paths[j]) + gram_matrix[i][j] = kernel + gram_matrix[j][i] = kernel + else: + all_paths = [self.__find_all_paths_until_length(self._graphs[i]) for i in iterator_ps] + for i, j in iterator_kernel: + kernel = self.__kernel_do_naive(all_paths[i], all_paths[j]) + gram_matrix[i][j] = kernel + gram_matrix[j][i] = kernel return gram_matrix def _compute_gm_imap_unordered(self): - # get shortest path graph of each graph. + self.__add_dummy_labels(self._graphs) + + # get all paths of all graphs before calculating kernels to save time, + # but this may cost a lot of memory for large datasets. pool = Pool(self._n_jobs) - get_sp_graphs_fun = self._wrapper_get_sp_graphs itr = zip(self._graphs, range(0, len(self._graphs))) if len(self._graphs) < 100 * self._n_jobs: chunksize = int(len(self._graphs) / self._n_jobs) + 1 else: chunksize = 100 + all_paths = [[] for _ in range(len(self._graphs))] + if self.__compute_method == 'trie' and self.__k_func is not None: + get_ps_fun = self._wrapper_find_all_path_as_trie + elif self.__compute_method != 'trie' and self.__k_func is not None: + get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True) + else: + get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) if self._verbose >= 2: - iterator = tqdm(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), - desc='getting sp graphs', file=sys.stdout) + iterator = tqdm(pool.imap_unordered(get_ps_fun, itr, chunksize), + desc='getting paths', file=sys.stdout) else: - iterator = pool.imap_unordered(get_sp_graphs_fun, itr, chunksize) - for i, g in iterator: - self._graphs[i] = g + iterator = pool.imap_unordered(get_ps_fun, itr, chunksize) + for i, ps in iterator: + all_paths[i] = ps pool.close() pool.join() # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - - def init_worker(gs_toshare): - global G_gs - G_gs = gs_toshare - do_fun = self._wrapper_sp_do + + if self.__compute_method == 'trie' and self.__k_func is not None: + def init_worker(trie_toshare): + global G_trie + G_trie = trie_toshare + do_fun = self._wrapper_kernel_do_trie + elif self.__compute_method != 'trie' and self.__k_func is not None: + def init_worker(plist_toshare): + global G_plist + G_plist = plist_toshare + do_fun = self._wrapper_kernel_do_naive + else: + def init_worker(plist_toshare): + global G_plist + G_plist = plist_toshare + do_fun = self.__wrapper_kernel_do_kernelless # @todo: what is this? parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose) return gram_matrix def _compute_kernel_list_series(self, g1, g_list): - # get shortest path graphs of g1 and each graph in g_list. - g1 = getSPGraph(g1, edge_weight=self.__edge_weight) + self.__add_dummy_labels(g_list + [g1]) + if self._verbose >= 2: - iterator = tqdm(g_list, desc='getting sp graphs', file=sys.stdout) + iterator_ps = tqdm(g_list, desc='getting paths', file=sys.stdout) + iterator_kernel = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) else: - iterator = g_list - g_list = [getSPGraph(g, edge_weight=self.__edge_weight) for g in iterator] - - # compute kernel list. + iterator_ps = g_list + iterator_kernel = range(len(g_list)) + kernel_list = [None] * len(g_list) - if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + + if self.__compute_method == 'trie': + paths_g1 = self.__find_all_path_as_trie(g1) + paths_g_list = [self.__find_all_path_as_trie(self._graphs[i]) for i in iterator_ps] + for i in iterator_kernel: + kernel = self.__kernel_do_trie(paths_g1, paths_g_list[i]) + kernel_list[i] = kernel else: - iterator = range(len(g_list)) - for i in iterator: - kernel = self.__sp_do(g1, g_list[i]) - kernel_list[i] = kernel + paths_g1 = self.__find_all_paths_until_length(g1) + paths_g_list = [self.__find_all_paths_until_length(self._graphs[i]) for i in iterator_ps] + for i in iterator_kernel: + kernel = self.__kernel_do_naive(paths_g1, paths_g_list[i]) + kernel_list[i] = kernel return kernel_list def _compute_kernel_list_imap_unordered(self, g1, g_list): - # get shortest path graphs of g1 and each graph in g_list. - g1 = getSPGraph(g1, edge_weight=self.__edge_weight) + self.__add_dummy_labels(g_list + [g1]) + + # get all paths of all graphs before calculating kernels to save time, + # but this may cost a lot of memory for large datasets. pool = Pool(self._n_jobs) - get_sp_graphs_fun = self._wrapper_get_sp_graphs itr = zip(g_list, range(0, len(g_list))) if len(g_list) < 100 * self._n_jobs: chunksize = int(len(g_list) / self._n_jobs) + 1 else: chunksize = 100 + paths_g_list = [[] for _ in range(len(g_list))] + if self.__compute_method == 'trie' and self.__k_func is not None: + paths_g1 = self.__find_all_path_as_trie(g1) + get_ps_fun = self._wrapper_find_all_path_as_trie + elif self.__compute_method != 'trie' and self.__k_func is not None: + paths_g1 = self.__find_all_paths_until_length(g1) + get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True) + else: + paths_g1 = self.__find_all_paths_until_length(g1) + get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False) if self._verbose >= 2: - iterator = tqdm(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize), - desc='getting sp graphs', file=sys.stdout) + iterator = tqdm(pool.imap_unordered(get_ps_fun, itr, chunksize), + desc='getting paths', file=sys.stdout) else: - iterator = pool.imap_unordered(get_sp_graphs_fun, itr, chunksize) - for i, g in iterator: - g_list[i] = g + iterator = pool.imap_unordered(get_ps_fun, itr, chunksize) + for i, ps in iterator: + paths_g_list[i] = ps pool.close() pool.join() # compute Gram matrix. kernel_list = [None] * len(g_list) - - def init_worker(g1_toshare, gl_toshare): - global G_g1, G_gl - G_g1 = g1_toshare - G_gl = gl_toshare + + def init_worker(p1_toshare, plist_toshare): + global G_p1, G_plist + G_p1 = p1_toshare + G_plist = plist_toshare do_fun = self._wrapper_kernel_list_do def func_assign(result, var_to_assign): var_to_assign[result[0]] = result[1] itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) - + init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + return kernel_list def _wrapper_kernel_list_do(self, itr): - return itr, self.__sp_do(G_g1, G_gl[itr]) + if self.__compute_method == 'trie' and self.__k_func is not None: + return itr, self.__kernel_do_trie(G_p1, G_plist[itr]) + elif self.__compute_method != 'trie' and self.__k_func is not None: + return itr, self.__kernel_do_naive(G_p1, G_plist[itr]) + else: + return itr, self.__kernel_do_kernelless(G_p1, G_plist[itr]) def _compute_single_kernel_series(self, g1, g2): - g1 = getSPGraph(g1, edge_weight=self.__edge_weight) - g2 = getSPGraph(g2, edge_weight=self.__edge_weight) - kernel = self.__sp_do(g1, g2) + self.__add_dummy_labels([g1] + [g2]) + if self.__compute_method == 'trie': + paths_g1 = self.__find_all_path_as_trie(g1) + paths_g2 = self.__find_all_path_as_trie(g2) + kernel = self.__kernel_do_trie(paths_g1, paths_g2) + else: + paths_g1 = self.__find_all_paths_until_length(g1) + paths_g2 = self.__find_all_paths_until_length(g2) + kernel = self.__kernel_do_naive(paths_g1, paths_g2) return kernel + + + def __kernel_do_trie(self, trie1, trie2): + """Calculate path graph kernels up to depth d between 2 graphs using trie. + + Parameters + ---------- + trie1, trie2 : list + Tries that contains all paths in 2 graphs. + k_func : function + A kernel function applied using different notions of fingerprint + similarity. + + Return + ------ + kernel : float + Path kernel up to h between 2 graphs. + """ + if self.__k_func == 'tanimoto': + # traverse all paths in graph1 and search them in graph2. Deep-first + # search is applied. + def traverseTrie1t(root, trie2, setlist, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: + setlist[1] += 1 + count2 = trie2.searchWord(pcurrent) + if count2 != 0: + setlist[0] += 1 + if node['children'] != {}: + traverseTrie1t(node, trie2, setlist, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + + # traverse all paths in graph2 and find out those that are not in + # graph1. Deep-first search is applied. + def traverseTrie2t(root, trie1, setlist, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: + # print(node['count']) + count1 = trie1.searchWord(pcurrent) + if count1 == 0: + setlist[1] += 1 + if node['children'] != {}: + traverseTrie2t(node, trie1, setlist, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + setlist = [0, 0] # intersection and union of path sets of g1, g2. + # print(trie1.root) + # print(trie2.root) + traverseTrie1t(trie1.root, trie2, setlist) + # print(setlist) + traverseTrie2t(trie2.root, trie1, setlist) + # print(setlist) + kernel = setlist[0] / setlist[1] + + elif self.__k_func == 'MinMax': # MinMax kernel + # traverse all paths in graph1 and search them in graph2. Deep-first + # search is applied. + def traverseTrie1m(root, trie2, sumlist, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: +# print(node['count']) + count1 = node['count'] + count2 = trie2.searchWord(pcurrent) + sumlist[0] += min(count1, count2) + sumlist[1] += max(count1, count2) + if node['children'] != {}: + traverseTrie1m(node, trie2, sumlist, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + # traverse all paths in graph2 and find out those that are not in + # graph1. Deep-first search is applied. + def traverseTrie2m(root, trie1, sumlist, pcurrent=[]): + for key, node in root['children'].items(): + pcurrent.append(key) + if node['isEndOfWord']: + # print(node['count']) + count1 = trie1.searchWord(pcurrent) + if count1 == 0: + sumlist[1] += node['count'] + if node['children'] != {}: + traverseTrie2m(node, trie1, sumlist, pcurrent) + else: + del pcurrent[-1] + if pcurrent != []: + del pcurrent[-1] + + sumlist = [0, 0] # sum of mins and sum of maxs +# print(trie1.root) +# print(trie2.root) + traverseTrie1m(trie1.root, trie2, sumlist) +# print(sumlist) + traverseTrie2m(trie2.root, trie1, sumlist) +# print(sumlist) + kernel = sumlist[0] / sumlist[1] + else: + raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax".') + + return kernel + + + def _wrapper_kernel_do_trie(self, itr): + i = itr[0] + j = itr[1] + return i, j, self.__kernel_do_trie(G_trie[i], G_trie[j]) + + + def __kernel_do_naive(self, paths1, paths2): + """Calculate path graph kernels up to depth d between 2 graphs naively. + + Parameters + ---------- + paths_list : list of list + List of list of paths in all graphs, where for unlabeled graphs, each + path is represented by a list of nodes; while for labeled graphs, each + path is represented by a string consists of labels of nodes and/or + edges on that path. + k_func : function + A kernel function applied using different notions of fingerprint + similarity. + + Return + ------ + kernel : float + Path kernel up to h between 2 graphs. + """ + all_paths = list(set(paths1 + paths2)) + + if self.__k_func == 'tanimoto': + length_union = len(set(paths1 + paths2)) + kernel = (len(set(paths1)) + len(set(paths2)) - + length_union) / length_union + # vector1 = [(1 if path in paths1 else 0) for path in all_paths] + # vector2 = [(1 if path in paths2 else 0) for path in all_paths] + # kernel_uv = np.dot(vector1, vector2) + # kernel = kernel_uv / (len(set(paths1)) + len(set(paths2)) - kernel_uv) + + elif self.__k_func == 'MinMax': # MinMax kernel + path_count1 = Counter(paths1) + path_count2 = Counter(paths2) + vector1 = [(path_count1[key] if (key in path_count1.keys()) else 0) + for key in all_paths] + vector2 = [(path_count2[key] if (key in path_count2.keys()) else 0) + for key in all_paths] + kernel = np.sum(np.minimum(vector1, vector2)) / \ + np.sum(np.maximum(vector1, vector2)) + else: + raise Exception('The given "k_func" cannot be recognized. Possible choices include: "tanimoto", "MinMax".') + + return kernel + + + def _wrapper_kernel_do_naive(self, itr): + i = itr[0] + j = itr[1] + return i, j, self.__kernel_do_naive(G_plist[i], G_plist[j]) + + + def __find_all_path_as_trie(self, G): + # all_path = find_all_paths_until_length(G, length, ds_attrs, + # node_label=node_label, + # edge_label=edge_label) + # ptrie = Trie() + # for path in all_path: + # ptrie.insertWord(path) + + # ptrie = Trie() + # path_l = [[n] for n in G.nodes] # paths of length l + # path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) + # for p in path_l_str: + # ptrie.insertWord(p) + # for l in range(1, length + 1): + # path_lplus1 = [] + # for path in path_l: + # for neighbor in G[path[-1]]: + # if neighbor not in path: + # tmp = path + [neighbor] + ## if tmp[::-1] not in path_lplus1: + # path_lplus1.append(tmp) + # path_l = path_lplus1[:] + # # consider labels + # path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) + # for p in path_l_str: + # ptrie.insertWord(p) + # + # print(time.time() - time1) + # print(ptrie.root) + # print() + + + # traverse all paths up to length h in a graph and construct a trie with + # them. Deep-first search is applied. Notice the reverse of each path is + # also stored to the trie. + def traverseGraph(root, ptrie, G, pcurrent=[]): + if len(pcurrent) < self.__depth + 1: + for neighbor in G[root]: + if neighbor not in pcurrent: + pcurrent.append(neighbor) + plstr = self.__paths2labelseqs([pcurrent], G) + ptrie.insertWord(plstr[0]) + traverseGraph(neighbor, ptrie, G, pcurrent) + del pcurrent[-1] + + + ptrie = Trie() + path_l = [[n] for n in G.nodes] # paths of length l + path_l_str = self.__paths2labelseqs(path_l, G) + for p in path_l_str: + ptrie.insertWord(p) + for n in G.nodes: + traverseGraph(n, ptrie, G, pcurrent=[n]) + + + # def traverseGraph(root, all_paths, length, G, ds_attrs, node_label, edge_label, + # pcurrent=[]): + # if len(pcurrent) < length + 1: + # for neighbor in G[root]: + # if neighbor not in pcurrent: + # pcurrent.append(neighbor) + # plstr = paths2labelseqs([pcurrent], G, ds_attrs, + # node_label, edge_label) + # all_paths.append(pcurrent[:]) + # traverseGraph(neighbor, all_paths, length, G, ds_attrs, + # node_label, edge_label, pcurrent) + # del pcurrent[-1] + # + # + # path_l = [[n] for n in G.nodes] # paths of length l + # all_paths = path_l[:] + # path_l_str = paths2labelseqs(path_l, G, ds_attrs, node_label, edge_label) + ## for p in path_l_str: + ## ptrie.insertWord(p) + # for n in G.nodes: + # traverseGraph(n, all_paths, length, G, ds_attrs, node_label, edge_label, + # pcurrent=[n]) + # print(ptrie.root) + return ptrie + - def _wrapper_get_sp_graphs(self, itr_item): + def _wrapper_find_all_path_as_trie(self, itr_item): g = itr_item[0] i = itr_item[1] - return i, getSPGraph(g, edge_weight=self.__edge_weight) + return i, self.__find_all_path_as_trie(g) - def __sp_do(self, g1, g2): - - kernel = 0 + # @todo: (can be removed maybe) this method find paths repetively, it could be faster. + def __find_all_paths_until_length(self, G, tolabelseqs=True): + """Find all paths no longer than a certain maximum length in a graph. A + recursive depth first search is applied. + + Parameters + ---------- + G : NetworkX graphs + The graph in which paths are searched. + length : integer + The maximum length of paths. + ds_attrs: dict + Dataset attributes. + node_label : string + Node attribute used as label. The default node label is atom. + edge_label : string + Edge attribute used as label. The default edge label is bond_type. + + Return + ------ + path : list + List of paths retrieved, where for unlabeled graphs, each path is + represented by a list of nodes; while for labeled graphs, each path is + represented by a list of strings consists of labels of nodes and/or + edges on that path. + """ + # path_l = [tuple([n]) for n in G.nodes] # paths of length l + # all_paths = path_l[:] + # for l in range(1, self.__depth + 1): + # path_l_new = [] + # for path in path_l: + # for neighbor in G[path[-1]]: + # if len(path) < 2 or neighbor != path[-2]: + # tmp = path + (neighbor, ) + # if tuple(tmp[::-1]) not in path_l_new: + # path_l_new.append(tuple(tmp)) + + # all_paths += path_l_new + # path_l = path_l_new[:] + + path_l = [[n] for n in G.nodes] # paths of length l + all_paths = [p.copy() for p in path_l] + for l in range(1, self.__depth + 1): + path_lplus1 = [] + for path in path_l: + for neighbor in G[path[-1]]: + if neighbor not in path: + tmp = path + [neighbor] + # if tmp[::-1] not in path_lplus1: + path_lplus1.append(tmp) + + all_paths += path_lplus1 + path_l = [p.copy() for p in path_lplus1] + + # for i in range(0, self.__depth + 1): + # new_paths = find_all_paths(G, i) + # if new_paths == []: + # break + # all_paths.extend(new_paths) + + # consider labels + # print(paths2labelseqs(all_paths, G, ds_attrs, node_label, edge_label)) + # print() + return (self.__paths2labelseqs(all_paths, G) if tolabelseqs else all_paths) + + + def _wrapper_find_all_paths_until_length(self, tolabelseqs, itr_item): + g = itr_item[0] + i = itr_item[1] + return i, self.__find_all_paths_until_length(g, tolabelseqs=tolabelseqs) + - # compute shortest path matrices first, method borrowed from FCSP. - vk_dict = {} # shortest path matrices dict + def __paths2labelseqs(self, plist, G): if len(self.__node_labels) > 0: - # node symb and non-synb labeled - if len(self.__node_attrs) > 0: - kn = self.__node_kernels['mix'] - for n1, n2 in product( - g1.nodes(data=True), g2.nodes(data=True)): - n1_labels = [n1[1][nl] for nl in self.__node_labels] - n2_labels = [n2[1][nl] for nl in self.__node_labels] - n1_attrs = [n1[1][na] for na in self.__node_attrs] - n2_attrs = [n2[1][na] for na in self.__node_attrs] - vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) - # node symb labeled + if len(self.__edge_labels) > 0: + path_strs = [] + for path in plist: + pths_tmp = [] + for idx, node in enumerate(path[:-1]): + pths_tmp.append(tuple(G.nodes[node][nl] for nl in self.__node_labels)) + pths_tmp.append(tuple(G[node][path[idx + 1]][el] for el in self.__edge_labels)) + pths_tmp.append(tuple(G.nodes[path[-1]][nl] for nl in self.__node_labels)) + path_strs.append(tuple(pths_tmp)) else: - kn = self.__node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_labels = [n1[1][nl] for nl in self.__node_labels] - n2_labels = [n2[1][nl] for nl in self.__node_labels] - vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) + path_strs = [] + for path in plist: + pths_tmp = [] + for node in path: + pths_tmp.append(tuple(G.nodes[node][nl] for nl in self.__node_labels)) + path_strs.append(tuple(pths_tmp)) + return path_strs else: - # node non-synb labeled - if len(self.__node_attrs) > 0: - kn = self.__node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_attrs = [n1[1][na] for na in self.__node_attrs] - n2_attrs = [n2[1][na] for na in self.__node_attrs] - vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) - # node unlabeled + if len(self.__edge_labels) > 0: + path_strs = [] + for path in plist: + if len(path) == 1: + path_strs.append(tuple()) + else: + pths_tmp = [] + for idx, node in enumerate(path[:-1]): + pths_tmp.append(tuple(G[node][path[idx + 1]][el] for el in self.__edge_labels)) + path_strs.append(tuple(pths_tmp)) + return path_strs else: - for e1, e2 in product( - g1.edges(data=True), g2.edges(data=True)): - if e1[2]['cost'] == e2[2]['cost']: - kernel += 1 - return kernel - - # compute graph kernels - if self.__ds_infos['directed']: - for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): - if e1[2]['cost'] == e2[2]['cost']: - nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1], e2[1])] - kn1 = nk11 * nk22 - kernel += kn1 - else: - for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)): - if e1[2]['cost'] == e2[2]['cost']: - # each edge walk is counted twice, starting from both its extreme nodes. - nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[( - e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[(e1[1], e2[1])] - kn1 = nk11 * nk22 - kn2 = nk12 * nk21 - kernel += kn1 + kn2 - - # # ---- exact implementation of the Fast Computation of Shortest Path Kernel (FCSP), reference [2], sadly it is slower than the current implementation - # # compute vertex kernels - # try: - # vk_mat = np.zeros((nx.number_of_nodes(g1), - # nx.number_of_nodes(g2))) - # g1nl = enumerate(g1.nodes(data=True)) - # g2nl = enumerate(g2.nodes(data=True)) - # for i1, n1 in g1nl: - # for i2, n2 in g2nl: - # vk_mat[i1][i2] = kn( - # n1[1][node_label], n2[1][node_label], - # [n1[1]['attributes']], [n2[1]['attributes']]) - - # range1 = range(0, len(edge_w_g[i])) - # range2 = range(0, len(edge_w_g[j])) - # for i1 in range1: - # x1 = edge_x_g[i][i1] - # y1 = edge_y_g[i][i1] - # w1 = edge_w_g[i][i1] - # for i2 in range2: - # x2 = edge_x_g[j][i2] - # y2 = edge_y_g[j][i2] - # w2 = edge_w_g[j][i2] - # ke = (w1 == w2) - # if ke > 0: - # kn1 = vk_mat[x1][x2] * vk_mat[y1][y2] - # kn2 = vk_mat[x1][y2] * vk_mat[y1][x2] - # kernel += kn1 + kn2 - - return kernel + return [tuple(['0' for node in path]) for path in plist] + # return [tuple([len(path)]) for path in all_paths] - def _wrapper_sp_do(self, itr): - i = itr[0] - j = itr[1] - return i, j, self.__sp_do(G_gs[i], G_gs[j]) + def __add_dummy_labels(self, Gn): + if self.__k_func is not None: + if len(self.__node_labels) == 0: + for G in Gn: + nx.set_node_attributes(G, '0', 'dummy') + self.__node_labels.append('dummy') + if len(self.__edge_labels) == 0: + for G in Gn: + nx.set_edge_attributes(G, '0', 'dummy') + self.__edge_labels.append('dummy') \ No newline at end of file diff --git a/gklearn/kernels/shortest_path.py b/gklearn/kernels/shortest_path.py index c11c2e5..1923b00 100644 --- a/gklearn/kernels/shortest_path.py +++ b/gklearn/kernels/shortest_path.py @@ -51,7 +51,7 @@ class ShortestPath(GraphKernel): else: iterator = itr for i, j in iterator: - kernel = self.__sp_do_(self._graphs[i], self._graphs[j]) + kernel = self.__sp_do(self._graphs[i], self._graphs[j]) gram_matrix[i][j] = kernel gram_matrix[j][i] = kernel diff --git a/gklearn/preimage/experiments/xp_median_preimage.py b/gklearn/preimage/experiments/xp_median_preimage.py index 7ae20ba..4fd12d2 100644 --- a/gklearn/preimage/experiments/xp_median_preimage.py +++ b/gklearn/preimage/experiments/xp_median_preimage.py @@ -13,7 +13,7 @@ from gklearn.utils import compute_gram_matrices_by_class def xp_median_preimage_9_1(): - """xp 9_1: MAO, sspkernel, using CONSTANT. + """xp 9_1: MAO, StructuralSP, using CONSTANT, symbolic only. """ # set parameters. ds_name = 'MAO' # @@ -29,7 +29,7 @@ def xp_median_preimage_9_1(): 'verbose': 2} mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - kernel_options = {'name': 'structuralspkernel', + kernel_options = {'name': 'StructuralSP', 'edge_weight': None, 'node_kernels': sub_kernels, 'edge_kernels': sub_kernels, @@ -69,7 +69,68 @@ def xp_median_preimage_9_1(): print() # generate preimages. - for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']: + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_9_2(): + """xp 9_2: MAO, PathUpToH, using CONSTANT, symbolic only. + """ + # set parameters. + ds_name = 'MAO' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [4, 4, 2, 1, 1, 1], # + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, # + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + kernel_options = {'name': 'PathUpToH', + 'depth': 2, # + 'k_func': 'MinMax', # + 'compute_method': 'trie', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'CONSTANT', # + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} # + edge_required = False # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: print('\n-------------------------------------') print('fit method:', fit_method, '\n') mpg_options['fit_method'] = fit_method @@ -77,7 +138,7 @@ def xp_median_preimage_9_1(): def xp_median_preimage_8_1(): - """xp 8_1: Monoterpenoides, sspkernel, using CONSTANT. + """xp 8_1: Monoterpenoides, StructuralSP, using CONSTANT. """ # set parameters. ds_name = 'Monoterpenoides' # @@ -93,7 +154,7 @@ def xp_median_preimage_8_1(): 'verbose': 2} mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - kernel_options = {'name': 'structuralspkernel', + kernel_options = {'name': 'StructuralSP', 'edge_weight': None, 'node_kernels': sub_kernels, 'edge_kernels': sub_kernels, @@ -133,7 +194,68 @@ def xp_median_preimage_8_1(): print() # generate preimages. - for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']: + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_8_2(): + """xp 8_2: Monoterpenoides, PathUpToH, using CONSTANT. + """ + # set parameters. + ds_name = 'Monoterpenoides' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [4, 4, 2, 1, 1, 1], # + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, # + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + kernel_options = {'name': 'PathUpToH', + 'depth': 2, # + 'k_func': 'MinMax', # + 'compute_method': 'trie', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'CONSTANT', # + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = False # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: print('\n-------------------------------------') print('fit method:', fit_method, '\n') mpg_options['fit_method'] = fit_method @@ -141,7 +263,7 @@ def xp_median_preimage_8_1(): def xp_median_preimage_7_1(): - """xp 7_1: MUTAG, sspkernel, using CONSTANT. + """xp 7_1: MUTAG, StructuralSP, using CONSTANT. """ # set parameters. ds_name = 'MUTAG' # @@ -157,7 +279,7 @@ def xp_median_preimage_7_1(): 'verbose': 2} mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - kernel_options = {'name': 'structuralspkernel', + kernel_options = {'name': 'StructuralSP', 'edge_weight': None, 'node_kernels': sub_kernels, 'edge_kernels': sub_kernels, @@ -197,7 +319,68 @@ def xp_median_preimage_7_1(): print() # generate preimages. - for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']: + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_7_2(): + """xp 7_2: MUTAG, PathUpToH, using CONSTANT. + """ + # set parameters. + ds_name = 'MUTAG' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [4, 4, 2, 1, 1, 1], # + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, # + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + kernel_options = {'name': 'PathUpToH', + 'depth': 2, # + 'k_func': 'MinMax', # + 'compute_method': 'trie', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'CONSTANT', # + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = False # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: print('\n-------------------------------------') print('fit method:', fit_method, '\n') mpg_options['fit_method'] = fit_method @@ -205,7 +388,7 @@ def xp_median_preimage_7_1(): def xp_median_preimage_6_1(): - """xp 6_1: COIL-RAG, sspkernel, using NON_SYMBOLIC. + """xp 6_1: COIL-RAG, StructuralSP, using NON_SYMBOLIC. """ # set parameters. ds_name = 'COIL-RAG' # @@ -221,7 +404,7 @@ def xp_median_preimage_6_1(): 'verbose': 2} mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - kernel_options = {'name': 'structuralspkernel', + kernel_options = {'name': 'StructuralSP', 'edge_weight': None, 'node_kernels': sub_kernels, 'edge_kernels': sub_kernels, @@ -261,20 +444,20 @@ def xp_median_preimage_6_1(): print() # generate preimages. - for fit_method in ['k-graphs', 'random', 'random', 'random']: + for fit_method in ['k-graphs'] + ['random'] * 10: print('\n-------------------------------------') print('fit method:', fit_method, '\n') mpg_options['fit_method'] = fit_method generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) - - -def xp_median_preimage_5_1(): - """xp 5_1: FRANKENSTEIN, sspkernel, using NON_SYMBOLIC. + + +def xp_median_preimage_6_2(): + """xp 6_2: COIL-RAG, ShortestPath, using NON_SYMBOLIC. """ # set parameters. - ds_name = 'FRANKENSTEIN' # + ds_name = 'COIL-RAG' # mpg_options = {'fit_method': 'k-graphs', - 'init_ecc': [3, 3, 1, 3, 3, 0], # + 'init_ecc': [3, 3, 1, 3, 3, 1], # 'ds_name': ds_name, 'parallel': True, # False 'time_limit_in_sec': 0, @@ -285,11 +468,9 @@ def xp_median_preimage_5_1(): 'verbose': 2} mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - kernel_options = {'name': 'structuralspkernel', + kernel_options = {'name': 'ShortestPath', 'edge_weight': None, 'node_kernels': sub_kernels, - 'edge_kernels': sub_kernels, - 'compute_method': 'naive', 'parallel': 'imap_unordered', # 'parallel': None, 'n_jobs': multiprocessing.cpu_count(), @@ -298,7 +479,7 @@ def xp_median_preimage_5_1(): ged_options = {'method': 'IPFP', 'initialization_method': 'RANDOM', # 'NODE' 'initial_solutions': 10, # 1 - 'edit_cost': 'NON_SYMBOLIC', + 'edit_cost': 'NON_SYMBOLIC', # 'attr_distance': 'euclidean', 'ratio_runs_from_initial_solutions': 1, 'threads': multiprocessing.cpu_count(), @@ -311,7 +492,7 @@ def xp_median_preimage_5_1(): save_results = True dir_save='../results/xp_median_preimage/' irrelevant_labels = None # - edge_required = False # + edge_required = True # # print settings. print('parameters:') @@ -325,18 +506,18 @@ def xp_median_preimage_5_1(): print() # generate preimages. - for fit_method in ['k-graphs', 'random', 'random', 'random']: + for fit_method in ['k-graphs'] + ['random'] * 10: print('\n-------------------------------------') print('fit method:', fit_method, '\n') mpg_options['fit_method'] = fit_method generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) - -def xp_median_preimage_4_1(): - """xp 4_1: COLORS-3, sspkernel, using NON_SYMBOLIC. + +def xp_median_preimage_5_1(): + """xp 5_1: FRANKENSTEIN, StructuralSP, using NON_SYMBOLIC. """ # set parameters. - ds_name = 'COLORS-3' # + ds_name = 'FRANKENSTEIN' # mpg_options = {'fit_method': 'k-graphs', 'init_ecc': [3, 3, 1, 3, 3, 0], # 'ds_name': ds_name, @@ -349,7 +530,7 @@ def xp_median_preimage_4_1(): 'verbose': 2} mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - kernel_options = {'name': 'structuralspkernel', + kernel_options = {'name': 'StructuralSP', 'edge_weight': None, 'node_kernels': sub_kernels, 'edge_kernels': sub_kernels, @@ -389,20 +570,20 @@ def xp_median_preimage_4_1(): print() # generate preimages. - for fit_method in ['k-graphs', 'random', 'random', 'random']: + for fit_method in ['k-graphs'] + ['random'] * 10: print('\n-------------------------------------') print('fit method:', fit_method, '\n') mpg_options['fit_method'] = fit_method generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + - -def xp_median_preimage_3_1(): - """xp 3_1: Fingerprint, sspkernel, using LETTER2, only node attrs. +def xp_median_preimage_4_1(): + """xp 4_1: COLORS-3, StructuralSP, using NON_SYMBOLIC. """ # set parameters. - ds_name = 'Fingerprint' # + ds_name = 'COLORS-3' # mpg_options = {'fit_method': 'k-graphs', - 'init_ecc': [0.525, 0.525, 0.001, 0.125, 0.125], # + 'init_ecc': [3, 3, 1, 3, 3, 0], # 'ds_name': ds_name, 'parallel': True, # False 'time_limit_in_sec': 0, @@ -413,7 +594,7 @@ def xp_median_preimage_3_1(): 'verbose': 2} mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - kernel_options = {'name': 'structuralspkernel', + kernel_options = {'name': 'StructuralSP', 'edge_weight': None, 'node_kernels': sub_kernels, 'edge_kernels': sub_kernels, @@ -426,7 +607,7 @@ def xp_median_preimage_3_1(): ged_options = {'method': 'IPFP', 'initialization_method': 'RANDOM', # 'NODE' 'initial_solutions': 10, # 1 - 'edit_cost': 'LETTER2', + 'edit_cost': 'NON_SYMBOLIC', 'attr_distance': 'euclidean', 'ratio_runs_from_initial_solutions': 1, 'threads': multiprocessing.cpu_count(), @@ -438,7 +619,7 @@ def xp_median_preimage_3_1(): 'refine': False} save_results = True dir_save='../results/xp_median_preimage/' - irrelevant_labels = {'edge_attrs': ['orient', 'angle']} # + irrelevant_labels = None # edge_required = False # # print settings. @@ -453,20 +634,20 @@ def xp_median_preimage_3_1(): print() # generate preimages. - for fit_method in ['k-graphs', 'random', 'random', 'random']: + for fit_method in ['k-graphs'] + ['random'] * 10: print('\n-------------------------------------') print('fit method:', fit_method, '\n') mpg_options['fit_method'] = fit_method generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) - -def xp_median_preimage_2_1(): - """xp 2_1: COIL-DEL, sspkernel, using LETTER2, only node attrs. + +def xp_median_preimage_3_2(): + """xp 3_2: Fingerprint, ShortestPath, using LETTER2, only node attrs. """ # set parameters. - ds_name = 'COIL-DEL' # + ds_name = 'Fingerprint' # mpg_options = {'fit_method': 'k-graphs', - 'init_ecc': [3, 3, 1, 3, 3], + 'init_ecc': [0.525, 0.525, 0.001, 0.125, 0.125], # 'ds_name': ds_name, 'parallel': True, # False 'time_limit_in_sec': 0, @@ -477,11 +658,9 @@ def xp_median_preimage_2_1(): 'verbose': 2} mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - kernel_options = {'name': 'structuralspkernel', + kernel_options = {'name': 'ShortestPath', 'edge_weight': None, 'node_kernels': sub_kernels, - 'edge_kernels': sub_kernels, - 'compute_method': 'naive', 'parallel': 'imap_unordered', # 'parallel': None, 'n_jobs': multiprocessing.cpu_count(), @@ -502,7 +681,8 @@ def xp_median_preimage_2_1(): 'refine': False} save_results = True dir_save='../results/xp_median_preimage/' - irrelevant_labels = {'edge_labels': ['valence']} + irrelevant_labels = {'edge_attrs': ['orient', 'angle']} # + edge_required = True # # print settings. print('parameters:') @@ -515,25 +695,21 @@ def xp_median_preimage_2_1(): print('irrelevant_labels:', irrelevant_labels) print() -# # compute gram matrices for each class a priori. -# print('Compute gram matrices for each class a priori.') -# compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save=dir_save, irrelevant_labels=irrelevant_labels) - # generate preimages. - for fit_method in ['k-graphs', 'random', 'random', 'random']: + for fit_method in ['k-graphs'] + ['random'] * 10: print('\n-------------------------------------') print('fit method:', fit_method, '\n') mpg_options['fit_method'] = fit_method - generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels) + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) -def xp_median_preimage_1_1(): - """xp 1_1: Letter-high, sspkernel. +def xp_median_preimage_3_1(): + """xp 3_1: Fingerprint, StructuralSP, using LETTER2, only node attrs. """ # set parameters. - ds_name = 'Letter-high' + ds_name = 'Fingerprint' # mpg_options = {'fit_method': 'k-graphs', - 'init_ecc': [3, 3, 1, 3, 3], + 'init_ecc': [0.525, 0.525, 0.001, 0.125, 0.125], # 'ds_name': ds_name, 'parallel': True, # False 'time_limit_in_sec': 0, @@ -544,19 +720,19 @@ def xp_median_preimage_1_1(): 'verbose': 2} mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - kernel_options = {'name': 'structuralspkernel', + kernel_options = {'name': 'StructuralSP', 'edge_weight': None, 'node_kernels': sub_kernels, 'edge_kernels': sub_kernels, 'compute_method': 'naive', - 'parallel': 'imap_unordered', -# 'parallel': None, + 'parallel': 'imap_unordered', + # 'parallel': None, 'n_jobs': multiprocessing.cpu_count(), 'normalize': True, 'verbose': 2} ged_options = {'method': 'IPFP', 'initialization_method': 'RANDOM', # 'NODE' - 'initial_solutions': 1, # 1 + 'initial_solutions': 10, # 1 'edit_cost': 'LETTER2', 'attr_distance': 'euclidean', 'ratio_runs_from_initial_solutions': 1, @@ -568,6 +744,9 @@ def xp_median_preimage_1_1(): 'verbose': 2, 'refine': False} save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = {'edge_attrs': ['orient', 'angle']} # + edge_required = False # # print settings. print('parameters:') @@ -577,40 +756,502 @@ def xp_median_preimage_1_1(): print('ged_options:', ged_options) print('mge_options:', mge_options) print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() # generate preimages. - for fit_method in ['k-graphs', 'expert', 'random', 'random', 'random']: + for fit_method in ['k-graphs'] + ['random'] * 10: print('\n-------------------------------------') print('fit method:', fit_method, '\n') mpg_options['fit_method'] = fit_method - generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save='../results/xp_median_preimage/') - - -if __name__ == "__main__": - - #### xp 1_1: Letter-high, sspkernel. - # xp_median_preimage_1_1() - - #### xp 2_1: COIL-DEL, sspkernel, using LETTER2, only node attrs. -# xp_median_preimage_2_1() - - #### xp 3_1: Fingerprint, sspkernel, using LETTER2, only node attrs. - # xp_median_preimage_3_1() - - #### xp 4_1: COLORS-3, sspkernel, using NON_SYMBOLIC. -# xp_median_preimage_4_1() - - #### xp 5_1: FRANKENSTEIN, sspkernel, using NON_SYMBOLIC. -# xp_median_preimage_5_1() - - #### xp 6_1: COIL-RAG, sspkernel, using NON_SYMBOLIC. - # xp_median_preimage_6_1() + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + - #### xp 7_1: MUTAG, sspkernel, using CONSTANT. - # xp_median_preimage_7_1() - - #### xp 8_1: Monoterpenoides, sspkernel, using CONSTANT. -# xp_median_preimage_8_1() +def xp_median_preimage_2_1(): + """xp 2_1: COIL-DEL, StructuralSP, using LETTER2, only node attrs. + """ + # set parameters. + ds_name = 'COIL-DEL' # + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [3, 3, 1, 3, 3], + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'StructuralSP', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', + # 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = {'edge_labels': ['valence']} + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + +# # compute gram matrices for each class a priori. +# print('Compute gram matrices for each class a priori.') +# compute_gram_matrices_by_class(ds_name, kernel_options, save_results=True, dir_save=dir_save, irrelevant_labels=irrelevant_labels) + + # generate preimages. + for fit_method in ['k-graphs'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels) + + +def xp_median_preimage_1_1(): + """xp 1_1: Letter-high, StructuralSP. + """ + # set parameters. + ds_name = 'Letter-high' + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [0.675, 0.675, 0.75, 0.425, 0.425], + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'StructuralSP', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', +# 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + + # generate preimages. + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save='../results/xp_median_preimage/') + + +def xp_median_preimage_1_2(): + """xp 1_2: Letter-high, ShortestPath. + """ + # set parameters. + ds_name = 'Letter-high' + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [0.675, 0.675, 0.75, 0.425, 0.425], + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'ShortestPath', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'parallel': 'imap_unordered', +# 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = True # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_10_1(): + """xp 10_1: Letter-med, StructuralSP. + """ + # set parameters. + ds_name = 'Letter-med' + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475], + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'StructuralSP', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', +# 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + + # generate preimages. + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save='../results/xp_median_preimage/') + + +def xp_median_preimage_10_2(): + """xp 10_2: Letter-med, ShortestPath. + """ + # set parameters. + ds_name = 'Letter-med' + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [0.525, 0.525, 0.75, 0.475, 0.475], + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'ShortestPath', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'parallel': 'imap_unordered', +# 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = True # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +def xp_median_preimage_11_1(): + """xp 11_1: Letter-low, StructuralSP. + """ + # set parameters. + ds_name = 'Letter-low' + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075], + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'StructuralSP', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'edge_kernels': sub_kernels, + 'compute_method': 'naive', + 'parallel': 'imap_unordered', +# 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + + # generate preimages. + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save='../results/xp_median_preimage/') + + +def xp_median_preimage_11_2(): + """xp 11_2: Letter-low, ShortestPath. + """ + # set parameters. + ds_name = 'Letter-low' + mpg_options = {'fit_method': 'k-graphs', + 'init_ecc': [0.075, 0.075, 0.25, 0.075, 0.075], + 'ds_name': ds_name, + 'parallel': True, # False + 'time_limit_in_sec': 0, + 'max_itrs': 100, + 'max_itrs_without_update': 3, + 'epsilon_residual': 0.01, + 'epsilon_ec': 0.1, + 'verbose': 2} + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + kernel_options = {'name': 'ShortestPath', + 'edge_weight': None, + 'node_kernels': sub_kernels, + 'parallel': 'imap_unordered', +# 'parallel': None, + 'n_jobs': multiprocessing.cpu_count(), + 'normalize': True, + 'verbose': 2} + ged_options = {'method': 'IPFP', + 'initialization_method': 'RANDOM', # 'NODE' + 'initial_solutions': 10, # 1 + 'edit_cost': 'LETTER2', + 'attr_distance': 'euclidean', + 'ratio_runs_from_initial_solutions': 1, + 'threads': multiprocessing.cpu_count(), + 'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'} + mge_options = {'init_type': 'MEDOID', + 'random_inits': 10, + 'time_limit': 600, + 'verbose': 2, + 'refine': False} + save_results = True + dir_save='../results/xp_median_preimage/' + irrelevant_labels = None # + edge_required = True # + + # print settings. + print('parameters:') + print('dataset name:', ds_name) + print('mpg_options:', mpg_options) + print('kernel_options:', kernel_options) + print('ged_options:', ged_options) + print('mge_options:', mge_options) + print('save_results:', save_results) + print('irrelevant_labels:', irrelevant_labels) + print() + + # generate preimages. + for fit_method in ['k-graphs', 'expert'] + ['random'] * 10: + print('\n-------------------------------------') + print('fit method:', fit_method, '\n') + mpg_options['fit_method'] = fit_method + generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required) + + +if __name__ == "__main__": + + #### xp 1_1: Letter-high, StructuralSP. + # xp_median_preimage_1_1() + + #### xp 1_2: Letter-high, ShortestPath. +# xp_median_preimage_1_2() + + #### xp 10_1: Letter-med, StructuralSP. + # xp_median_preimage_10_1() + + #### xp 10_2: Letter-med, ShortestPath. + # xp_median_preimage_10_2() + + #### xp 11_1: Letter-low, StructuralSP. + # xp_median_preimage_11_1() + + #### xp 11_2: Letter-low, ShortestPath. +# xp_median_preimage_11_2() + + #### xp 2_1: COIL-DEL, StructuralSP, using LETTER2, only node attrs. +# xp_median_preimage_2_1() + + #### xp 3_1: Fingerprint, StructuralSP, using LETTER2, only node attrs. + # xp_median_preimage_3_1() + + #### xp 3_2: Fingerprint, ShortestPath, using LETTER2, only node attrs. +# xp_median_preimage_3_2() + + #### xp 4_1: COLORS-3, StructuralSP, using NON_SYMBOLIC. +# xp_median_preimage_4_1() + + #### xp 5_1: FRANKENSTEIN, StructuralSP, using NON_SYMBOLIC. +# xp_median_preimage_5_1() + + #### xp 6_1: COIL-RAG, StructuralSP, using NON_SYMBOLIC. + # xp_median_preimage_6_1() + + #### xp 6_2: COIL-RAG, ShortestPath, using NON_SYMBOLIC. +# xp_median_preimage_6_2() + + #### xp 7_1: MUTAG, StructuralSP, using CONSTANT. + # xp_median_preimage_7_1() + + #### xp 7_2: MUTAG, PathUpToH, using CONSTANT. + xp_median_preimage_7_2() + + #### xp 8_1: Monoterpenoides, StructuralSP, using CONSTANT. +# xp_median_preimage_8_1() + + #### xp 8_2: Monoterpenoides, PathUpToH, using CONSTANT. +# xp_median_preimage_8_2() + + #### xp 9_1: MAO, StructuralSP, using CONSTANT, symbolic only. +# xp_median_preimage_9_1() - #### xp 9_1: MAO, sspkernel, using CONSTANT. - xp_median_preimage_9_1() \ No newline at end of file + #### xp 9_2: MAO, PathUpToH, using CONSTANT, symbolic only. +# xp_median_preimage_9_2() \ No newline at end of file diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py index 6878701..04efa9d 100644 --- a/gklearn/preimage/median_preimage_generator.py +++ b/gklearn/preimage/median_preimage_generator.py @@ -725,7 +725,13 @@ class MedianPreimageGenerator(PreimageGenerator): def __set_graph_kernel_by_name(self): - if self.kernel_options['name'] == 'structuralspkernel': + if self._kernel_options['name'] == 'ShortestPath': + from gklearn.kernels import ShortestPath + self._graph_kernel = ShortestPath(node_labels=self._dataset.node_labels, + node_attrs=self._dataset.node_attrs, + ds_infos=self._dataset.get_dataset_infos(keys=['directed']), + **self._kernel_options) + elif self._kernel_options['name'] == 'StructuralSP': from gklearn.kernels import StructuralSP self._graph_kernel = StructuralSP(node_labels=self._dataset.node_labels, edge_labels=self._dataset.edge_labels, @@ -733,6 +739,14 @@ class MedianPreimageGenerator(PreimageGenerator): edge_attrs=self._dataset.edge_attrs, ds_infos=self._dataset.get_dataset_infos(keys=['directed']), **self._kernel_options) + elif self._kernel_options['name'] == 'PathUpToH': + from gklearn.kernels import PathUpToH + self._graph_kernel = PathUpToH(node_labels=self._dataset.node_labels, + edge_labels=self._dataset.edge_labels, + ds_infos=self._dataset.get_dataset_infos(keys=['directed']), + **self._kernel_options) + else: + raise Exception('The graph kernel given is not defined. Possible choices include: "StructuralSP", "ShortestPath", "PathUpToH".') # def __clean_graph(self, G, node_labels=[], edge_labels=[], node_attrs=[], edge_attrs=[]): diff --git a/gklearn/tests/test_graph_kernels.py b/gklearn/tests/test_graph_kernels.py new file mode 100644 index 0000000..fbad15f --- /dev/null +++ b/gklearn/tests/test_graph_kernels.py @@ -0,0 +1,270 @@ +"""Tests of graph kernels. +""" + +import pytest +import multiprocessing + + +def chooseDataset(ds_name): + """Choose dataset according to name. + """ + from gklearn.utils import Dataset + + dataset = Dataset() + + # no node labels (and no edge labels). + if ds_name == 'Alkane': + dataset.load_predefined_dataset(ds_name) + dataset.trim_dataset(edge_required=False) + irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} + dataset.remove_labels(**irrelevant_labels) + # node symbolic labels. + elif ds_name == 'Acyclic': + dataset.load_predefined_dataset(ds_name) + dataset.trim_dataset(edge_required=False) + irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} + dataset.remove_labels(**irrelevant_labels) + # node non-symbolic labels. + elif ds_name == 'Letter-med': + dataset.load_predefined_dataset(ds_name) + dataset.trim_dataset(edge_required=False) + # node symbolic and non-symbolic labels (and edge symbolic labels). + elif ds_name == 'AIDS': + dataset.load_predefined_dataset(ds_name) + dataset.trim_dataset(edge_required=False) + # edge non-symbolic labels (no node labels). + elif ds_name == 'Fingerprint_edge': + dataset.load_predefined_dataset('Fingerprint') + dataset.trim_dataset(edge_required=True) + irrelevant_labels = {'edge_attrs': ['orient', 'angle']} + dataset.remove_labels(**irrelevant_labels) + # edge non-symbolic labels (and node non-symbolic labels). + elif ds_name == 'Fingerprint': + dataset.load_predefined_dataset(ds_name) + dataset.trim_dataset(edge_required=True) + # edge symbolic and non-symbolic labels (and node symbolic and non-symbolic labels). + elif ds_name == 'Cuneiform': + dataset.load_predefined_dataset(ds_name) + dataset.trim_dataset(edge_required=True) + + dataset.cut_graphs(range(0, 3)) + + return dataset + + +# @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) +# @pytest.mark.parametrize('weight,compute_method', [(0.01, 'geo'), (1, 'exp')]) +# #@pytest.mark.parametrize('parallel', ['imap_unordered', None]) +# def test_commonwalkkernel(ds_name, weight, compute_method): +# """Test common walk kernel. +# """ +# from gklearn.kernels.commonWalkKernel import commonwalkkernel + +# Gn, y = chooseDataset(ds_name) + +# try: +# Kmatrix, run_time, idx = commonwalkkernel(Gn, +# node_label='atom', +# edge_label='bond_type', +# weight=weight, +# compute_method=compute_method, +# # parallel=parallel, +# n_jobs=multiprocessing.cpu_count(), +# verbose=True) +# except Exception as exception: +# assert False, exception + + +# @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) +# @pytest.mark.parametrize('remove_totters', [True, False]) +# #@pytest.mark.parametrize('parallel', ['imap_unordered', None]) +# def test_marginalizedkernel(ds_name, remove_totters): +# """Test marginalized kernel. +# """ +# from gklearn.kernels.marginalizedKernel import marginalizedkernel + +# Gn, y = chooseDataset(ds_name) + +# try: +# Kmatrix, run_time = marginalizedkernel(Gn, +# node_label='atom', +# edge_label='bond_type', +# p_quit=0.5, +# n_iteration=2, +# remove_totters=remove_totters, +# # parallel=parallel, +# n_jobs=multiprocessing.cpu_count(), +# verbose=True) +# except Exception as exception: +# assert False, exception + + +# @pytest.mark.parametrize( +# 'compute_method,ds_name,sub_kernel', +# [ +# # ('sylvester', 'Alkane', None), +# # ('conjugate', 'Alkane', None), +# # ('conjugate', 'AIDS', None), +# # ('fp', 'Alkane', None), +# # ('fp', 'AIDS', None), +# ('spectral', 'Alkane', 'exp'), +# ('spectral', 'Alkane', 'geo'), +# ] +# ) +# #@pytest.mark.parametrize('parallel', ['imap_unordered', None]) +# def test_randomwalkkernel(ds_name, compute_method, sub_kernel): +# """Test random walk kernel kernel. +# """ +# from gklearn.kernels.randomWalkKernel import randomwalkkernel +# from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct +# import functools + +# Gn, y = chooseDataset(ds_name) + +# mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) +# sub_kernels = [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}] +# try: +# Kmatrix, run_time, idx = randomwalkkernel(Gn, +# compute_method=compute_method, +# weight=1e-3, +# p=None, +# q=None, +# edge_weight=None, +# node_kernels=sub_kernels, +# edge_kernels=sub_kernels, +# node_label='atom', +# edge_label='bond_type', +# sub_kernel=sub_kernel, +# # parallel=parallel, +# n_jobs=multiprocessing.cpu_count(), +# verbose=True) +# except Exception as exception: +# assert False, exception + + +@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) +@pytest.mark.parametrize('parallel', ['imap_unordered', None]) +def test_ShortestPath(ds_name, parallel): + """Test shortest path kernel. + """ + from gklearn.kernels import ShortestPath + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + + dataset = chooseDataset(ds_name) + + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + try: + graph_kernel = ShortestPath(node_labels=dataset.node_labels, + node_attrs=dataset.node_attrs, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + node_kernels=sub_kernels) + gram_matrix, run_time = graph_kernel.compute(dataset.graphs, + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + except Exception as exception: + assert False, exception + + +#@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint']) +@pytest.mark.parametrize('ds_name', ['Alkane', 'Acyclic', 'Letter-med', 'AIDS', 'Fingerprint', 'Fingerprint_edge', 'Cuneiform']) +@pytest.mark.parametrize('parallel', ['imap_unordered', None]) +def test_StructuralSP(ds_name, parallel): + """Test structural shortest path kernel. + """ + from gklearn.kernels import StructuralSP + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + + dataset = chooseDataset(ds_name) + + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + try: + graph_kernel = StructuralSP(node_labels=dataset.node_labels, + edge_labels=dataset.edge_labels, + node_attrs=dataset.node_attrs, + edge_attrs=dataset.edge_attrs, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + node_kernels=sub_kernels, + edge_kernels=sub_kernels) + gram_matrix, run_time = graph_kernel.compute(dataset.graphs, + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + except Exception as exception: + assert False, exception + + +@pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) +@pytest.mark.parametrize('parallel', ['imap_unordered', None]) +#@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto', None]) +@pytest.mark.parametrize('k_func', ['MinMax', 'tanimoto']) +@pytest.mark.parametrize('compute_method', ['trie', 'naive']) +def test_PathUpToH(ds_name, parallel, k_func, compute_method): + """Test path kernel up to length $h$. + """ + from gklearn.kernels import PathUpToH + + dataset = chooseDataset(ds_name) + + try: + graph_kernel = PathUpToH(node_labels=dataset.node_labels, + edge_labels=dataset.edge_labels, + ds_infos=dataset.get_dataset_infos(keys=['directed']), + depth=2, k_func=k_func, compute_method=compute_method) + gram_matrix, run_time = graph_kernel.compute(dataset.graphs, + parallel=parallel, n_jobs=multiprocessing.cpu_count(), verbose=True) + except Exception as exception: + assert False, exception + + +# @pytest.mark.parametrize('ds_name', ['Alkane', 'AIDS']) +# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) +# def test_treeletkernel(ds_name, parallel): +# """Test treelet kernel. +# """ +# from gklearn.kernels.treeletKernel import treeletkernel +# from gklearn.utils.kernels import polynomialkernel +# import functools + +# Gn, y = chooseDataset(ds_name) + +# pkernel = functools.partial(polynomialkernel, d=2, c=1e5) +# try: +# Kmatrix, run_time = treeletkernel(Gn, +# sub_kernel=pkernel, +# node_label='atom', +# edge_label='bond_type', +# parallel=parallel, +# n_jobs=multiprocessing.cpu_count(), +# verbose=True) +# except Exception as exception: +# assert False, exception + + +# @pytest.mark.parametrize('ds_name', ['Acyclic']) +# #@pytest.mark.parametrize('base_kernel', ['subtree', 'sp', 'edge']) +# @pytest.mark.parametrize('base_kernel', ['subtree']) +# @pytest.mark.parametrize('parallel', ['imap_unordered', None]) +# def test_weisfeilerlehmankernel(ds_name, parallel, base_kernel): +# """Test Weisfeiler-Lehman kernel. +# """ +# from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel + +# Gn, y = chooseDataset(ds_name) + +# try: +# Kmatrix, run_time = weisfeilerlehmankernel(Gn, +# node_label='atom', +# edge_label='bond_type', +# height=2, +# base_kernel=base_kernel, +# parallel=parallel, +# n_jobs=multiprocessing.cpu_count(), +# verbose=True) +# except Exception as exception: +# assert False, exception + + +if __name__ == "__main__": +# test_spkernel('Alkane', 'imap_unordered') + test_StructuralSP('Fingerprint_edge', 'imap_unordered') \ No newline at end of file diff --git a/gklearn/utils/__init__.py b/gklearn/utils/__init__.py index 78832d3..79d0a39 100644 --- a/gklearn/utils/__init__.py +++ b/gklearn/utils/__init__.py @@ -20,3 +20,4 @@ from gklearn.utils.graph_files import load_dataset, save_dataset from gklearn.utils.timer import Timer from gklearn.utils.utils import get_graph_kernel_by_name from gklearn.utils.utils import compute_gram_matrices_by_class +from gklearn.utils.trie import Trie diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index e218596..abd7edd 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -70,6 +70,13 @@ class Dataset(object): if ds_name == 'Acyclic': ds_file = current_path + '../../datasets/Acyclic/dataset_bps.ds' self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'AIDS': + ds_file = current_path + '../../datasets/AIDS/AIDS_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'Alkane': + ds_file = current_path + '../../datasets/Alkane/dataset.ds' + fn_targets = current_path + '../../datasets/Alkane/dataset_boiling_point_names.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file, filename_targets=fn_targets) elif ds_name == 'COIL-DEL': ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) @@ -79,6 +86,9 @@ class Dataset(object): elif ds_name == 'COLORS-3': ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) + elif ds_name == 'Cuneiform': + ds_file = current_path + '../../datasets/Cuneiform/Cuneiform_A.txt' + self.__graphs, self.__targets, label_names = load_dataset(ds_file) elif ds_name == 'Fingerprint': ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) @@ -89,10 +99,10 @@ class Dataset(object): ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) elif ds_name == 'Letter-low': # node non-symb - ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt' + ds_file = current_path + '../../datasets/Letter-low/Letter-low_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) elif ds_name == 'Letter-med': # node non-symb - ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt' + ds_file = current_path + '../../datasets/Letter-med/Letter-med_A.txt' self.__graphs, self.__targets, label_names = load_dataset(ds_file) elif ds_name == 'MAO': ds_file = current_path + '../../datasets/MAO/dataset.ds' diff --git a/gklearn/utils/graph_files.py b/gklearn/utils/graph_files.py index 7594110..ce07a59 100644 --- a/gklearn/utils/graph_files.py +++ b/gklearn/utils/graph_files.py @@ -569,10 +569,10 @@ def load_tud(filename): elif 'fga' in locals(): content_targets = open(fga).read().splitlines() # targets (regression) targets = [int(i) for i in content_targets] - if class_label_map is not None: - targets = [class_label_map[t] for t in targets] else: raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.') + if class_label_map is not None: + targets = [class_label_map[t] for t in targets] # create graphs and add nodes data = [nx.Graph(name=str(i)) for i in range(0, len(content_targets))]