From 543032cb0264ca4613fef0ad51fdf844355ee98c Mon Sep 17 00:00:00 2001
From: linlin
Date: Sun, 4 Oct 2020 17:40:57 +0200
Subject: [PATCH 01/13] Update Crowdin configuration file

---
 crowdin.yml | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 crowdin.yml

diff --git a/crowdin.yml b/crowdin.yml
new file mode 100644
index 0000000..4cc48fc
--- /dev/null
+++ b/crowdin.yml
@@ -0,0 +1 @@
+files: []
\ No newline at end of file

From 2bfe305595f4722e0022b29d05fc13b7100836fc Mon Sep 17 00:00:00 2001
From: linlin
Date: Sun, 4 Oct 2020 18:14:18 +0200
Subject: [PATCH 02/13] Update Crowdin configuration file

---
 crowdin.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crowdin.yml b/crowdin.yml
index 4cc48fc..47b46bc 100644
--- a/crowdin.yml
+++ b/crowdin.yml
@@ -1 +1,3 @@
-files: []
\ No newline at end of file
+files:
+  - source: README.md
+    translation: lang/%two_letters_code%/%original_file_name%

From 524d75d041168088524c3984b5a06c80d85b303e Mon Sep 17 00:00:00 2001
From: linlin
Date: Sun, 4 Oct 2020 18:36:35 +0200
Subject: [PATCH 03/13] Update Crowdin configuration file

---
 crowdin.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/crowdin.yml b/crowdin.yml
index 47b46bc..f18c249 100644
--- a/crowdin.yml
+++ b/crowdin.yml
@@ -1,3 +1,5 @@
 files:
-  - source: README.md
-    translation: lang/%two_letters_code%/%original_file_name%
+  - source: /**/
+    ignore:
+      - /datasets/
+    translation: /lang/%two_letters_code%/%original_path%/%original_file_name%

From 95d327f3d811cc707383a047ea774703bf7fe291 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Tue, 6 Oct 2020 17:14:07 +0200
Subject: [PATCH 04/13] Add entropy experiments.

---
 .../papers/PRL_2020/accuracy_diff_entropy.py | 186 ++
 .../papers/PRL_2020/runtimes_28cores.py | 12 +-
 .../papers/PRL_2020/runtimes_diff_chunksizes.py | 18 +-
 .../papers/PRL_2020/synthesized_graphs_N.py | 4 +-
 .../papers/PRL_2020/synthesized_graphs_degrees.py | 4 +-
 .../papers/PRL_2020/synthesized_graphs_num_el.py | 4 +-
 .../papers/PRL_2020/synthesized_graphs_num_nl.py | 4 +-
 .../PRL_2020/synthesized_graphs_num_nodes.py | 4 +-
 gklearn/experiments/papers/PRL_2020/utils.py | 121 ++
 gklearn/utils/dataset.py | 59 +-
 gklearn/utils/model_selection_precomputed.py | 1834 ++++++++++----------
 gklearn/utils/stats.py | 27 +
 setup.py | 2 +-
 13 files changed, 1332 insertions(+), 947 deletions(-)
 create mode 100644 gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py
 create mode 100644 gklearn/utils/stats.py

diff --git a/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py b/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py
new file mode 100644
index 0000000..c25c116
--- /dev/null
+++ b/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct 5 16:08:33 2020
+
+@author: ljia
+
+This script computes the classification accuracy of each graph kernel on datasets
+with different entropy of degree distribution.
+""" +from utils import Graph_Kernel_List, cross_validate +import numpy as np +import logging + +num_nodes = 40 +half_num_graphs = 100 + + +def generate_graphs(): +# from gklearn.utils.graph_synthesizer import GraphSynthesizer +# gsyzer = GraphSynthesizer() +# graphs = gsyzer.unified_graphs(num_graphs=1000, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=0, seed=None, directed=False) +# return graphs + import networkx as nx + + degrees11 = [5] * num_nodes +# degrees12 = [2] * num_nodes + degrees12 = [5] * num_nodes + degrees21 = list(range(1, 11)) * 6 +# degrees22 = [5 * i for i in list(range(1, 11)) * 6] + degrees22 = list(range(1, 11)) * 6 + + # method 1 + graphs11 = [nx.configuration_model(degrees11, create_using=nx.Graph) for i in range(half_num_graphs)] + graphs12 = [nx.configuration_model(degrees12, create_using=nx.Graph) for i in range(half_num_graphs)] + + # method 2: can easily generate isomorphic graphs. +# graphs11 = [nx.random_regular_graph(2, num_nodes, seed=None) for i in range(half_num_graphs)] +# graphs12 = [nx.random_regular_graph(10, num_nodes, seed=None) for i in range(half_num_graphs)] + + # Add node labels. + for g in graphs11: + for n in g.nodes(): + g.nodes[n]['atom'] = 0 + for g in graphs12: + for n in g.nodes(): + g.nodes[n]['atom'] = 1 + + graphs1 = graphs11 + graphs12 + + # method 1: the entorpy of the two classes is not the same. + graphs21 = [nx.configuration_model(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)] + graphs22 = [nx.configuration_model(degrees22, create_using=nx.Graph) for i in range(half_num_graphs)] + +# # method 2: tooo slow, and may fail. +# graphs21 = [nx.random_degree_sequence_graph(degrees21, seed=None, tries=100) for i in range(half_num_graphs)] +# graphs22 = [nx.random_degree_sequence_graph(degrees22, seed=None, tries=100) for i in range(half_num_graphs)] + +# # method 3: no randomness. +# graphs21 = [nx.havel_hakimi_graph(degrees21, create_using=None) for i in range(half_num_graphs)] +# graphs22 = [nx.havel_hakimi_graph(degrees22, create_using=None) for i in range(half_num_graphs)] + +# # method 4: +# graphs21 = [nx.configuration_model(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)] +# graphs22 = [nx.degree_sequence_tree(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)] + +# # method 5: the entorpy of the two classes is not the same. +# graphs21 = [nx.expected_degree_graph(degrees21, seed=None, selfloops=False) for i in range(half_num_graphs)] +# graphs22 = [nx.expected_degree_graph(degrees22, seed=None, selfloops=False) for i in range(half_num_graphs)] + +# # method 6: seems there is no randomness0 +# graphs21 = [nx.random_powerlaw_tree(num_nodes, gamma=3, seed=None, tries=10000) for i in range(half_num_graphs)] +# graphs22 = [nx.random_powerlaw_tree(num_nodes, gamma=3, seed=None, tries=10000) for i in range(half_num_graphs)] + + # Add node labels. + for g in graphs21: + for n in g.nodes(): + g.nodes[n]['atom'] = 0 + for g in graphs22: + for n in g.nodes(): + g.nodes[n]['atom'] = 1 + + graphs2 = graphs21 + graphs22 + +# # check for isomorphism. 
+# iso_mat1 = np.zeros((len(graphs1), len(graphs1)))
+# num1 = 0
+# num2 = 0
+# for i in range(len(graphs1)):
+# for j in range(i + 1, len(graphs1)):
+# if nx.is_isomorphic(graphs1[i], graphs1[j]):
+# iso_mat1[i, j] = 1
+# iso_mat1[j, i] = 1
+# num1 += 1
+# print('iso:', num1, ':', i, ',', j)
+# else:
+# num2 += 1
+# print('not iso:', num2, ':', i, ',', j)
+#
+# iso_mat2 = np.zeros((len(graphs2), len(graphs2)))
+# num1 = 0
+# num2 = 0
+# for i in range(len(graphs2)):
+# for j in range(i + 1, len(graphs2)):
+# if nx.is_isomorphic(graphs2[i], graphs2[j]):
+# iso_mat2[i, j] = 1
+# iso_mat2[j, i] = 1
+# num1 += 1
+# print('iso:', num1, ':', i, ',', j)
+# else:
+# num2 += 1
+# print('not iso:', num2, ':', i, ',', j)
+
+	return graphs1, graphs2
+
+
+def get_infos(graph):
+	from gklearn.utils import Dataset
+	ds = Dataset()
+	ds.load_graphs(graph)
+	infos = ds.get_dataset_infos(keys=['all_degree_entropy', 'ave_node_degree'])
+	infos['ave_degree_entropy'] = np.mean(infos['all_degree_entropy'])
+	print(infos['ave_degree_entropy'], ',', infos['ave_node_degree'])
+	return infos
+
+
+def xp_accuracy_diff_entropy():
+
+	# Generate graphs.
+	graphs1, graphs2 = generate_graphs()
+
+
+	# Compute entropy of degree distribution of the generated graphs.
+	info11 = get_infos(graphs1[0:half_num_graphs])
+	info12 = get_infos(graphs1[half_num_graphs:])
+	info21 = get_infos(graphs2[0:half_num_graphs])
+	info22 = get_infos(graphs2[half_num_graphs:])
+
+	# Run and save.
+	import pickle
+	import os
+	save_dir = 'outputs/accuracy_diff_entropy/'
+	if not os.path.exists(save_dir):
+		os.makedirs(save_dir)
+
+	accuracies = {}
+	confidences = {}
+
+	for kernel_name in Graph_Kernel_List:
+		print()
+		print('Kernel:', kernel_name)
+
+		accuracies[kernel_name] = []
+		confidences[kernel_name] = []
+		for set_i, graphs in enumerate([graphs1, graphs2]):
+			print()
+			print('Graph set', set_i)
+
+			tmp_graphs = [g.copy() for g in graphs]
+			targets = [0] * half_num_graphs + [1] * half_num_graphs
+
+			accuracy = 'error'
+			confidence = 'error'
+			try:
+				accuracy, confidence = cross_validate(tmp_graphs, targets, kernel_name, ds_name=str(set_i), output_dir=save_dir) #, n_jobs=1)
+			except Exception as exp:
+				print('An exception occurred when running this experiment:')
+				LOG_FILENAME = save_dir + 'error.txt'
+				logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+				logging.exception('\n' + kernel_name + ', ' + str(set_i) + ':')
+				print(repr(exp))
+			accuracies[kernel_name].append(accuracy)
+			confidences[kernel_name].append(confidence)
+
+			pickle.dump(accuracy, open(save_dir + 'accuracy.' + kernel_name + '.' + str(set_i) + '.pkl', 'wb'))
+			pickle.dump(confidence, open(save_dir + 'confidence.' + kernel_name + '.' + str(set_i) + '.pkl', 'wb'))
+
+	# Save all.
+ pickle.dump(accuracies, open(save_dir + 'accuracies.pkl', 'wb')) + pickle.dump(confidences, open(save_dir + 'confidences.pkl', 'wb')) + + return + + +if __name__ == '__main__': + xp_accuracy_diff_entropy() \ No newline at end of file diff --git a/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py b/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py index 4c827ce..0e25f46 100644 --- a/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py +++ b/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py @@ -21,14 +21,14 @@ def xp_runtimes_of_all_28cores(): run_times = {} - for kernel_name in Graph_Kernel_List: + for ds_name in Dataset_List: print() - print('Kernel:', kernel_name) + print('Dataset:', ds_name) - run_times[kernel_name] = [] - for ds_name in Dataset_List: + run_times[ds_name] = [] + for kernel_name in Graph_Kernel_List: print() - print('Dataset:', ds_name) + print('Kernel:', kernel_name) # get graphs. graphs, _ = load_predefined_dataset(ds_name) @@ -43,7 +43,7 @@ def xp_runtimes_of_all_28cores(): logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) - run_times[kernel_name].append(run_time) + run_times[ds_name].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.pkl', 'wb')) diff --git a/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py b/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py index 343694c..6d118d8 100644 --- a/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py +++ b/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py @@ -20,17 +20,17 @@ def xp_runtimes_diff_chunksizes(): os.makedirs(save_dir) run_times = {} - - for kernel_name in Graph_Kernel_List: + + for ds_name in Dataset_List: print() - print('Kernel:', kernel_name) - - run_times[kernel_name] = [] - for ds_name in Dataset_List: + print('Dataset:', ds_name) + + run_times[ds_name] = [] + for kernel_name in Graph_Kernel_List: print() - print('Dataset:', ds_name) + print('Kernel:', kernel_name) - run_times[kernel_name].append([]) + run_times[ds_name].append([]) for chunksize in [1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000]: print() print('Chunksize:', chunksize) @@ -48,7 +48,7 @@ def xp_runtimes_diff_chunksizes(): logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) - run_times[kernel_name][-1].append(run_time) + run_times[ds_name][-1].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.' + str(chunksize) + '.pkl', 'wb')) diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py index 36bf1bc..a7056f3 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py @@ -16,7 +16,7 @@ def generate_graphs(): return graphs -def xp_synthesied_graphs_dataset_size(): +def xp_synthesized_graphs_dataset_size(): # Generate graphs. 
graphs = generate_graphs() @@ -61,4 +61,4 @@ def xp_synthesied_graphs_dataset_size(): if __name__ == '__main__': - xp_synthesied_graphs_dataset_size() \ No newline at end of file + xp_synthesized_graphs_dataset_size() \ No newline at end of file diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py index 0562d81..2f5594d 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py @@ -16,7 +16,7 @@ def generate_graphs(degree): return graphs -def xp_synthesied_graphs_degrees(): +def xp_synthesized_graphs_degrees(): # Run and save. import pickle @@ -60,4 +60,4 @@ def xp_synthesied_graphs_degrees(): if __name__ == '__main__': - xp_synthesied_graphs_degrees() + xp_synthesized_graphs_degrees() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py index 9a8e721..51e07ba 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py @@ -16,7 +16,7 @@ def generate_graphs(num_el_alp): return graphs -def xp_synthesied_graphs_num_edge_label_alphabet(): +def xp_synthesized_graphs_num_edge_label_alphabet(): # Run and save. import pickle @@ -60,4 +60,4 @@ def xp_synthesied_graphs_num_edge_label_alphabet(): if __name__ == '__main__': - xp_synthesied_graphs_num_edge_label_alphabet() + xp_synthesized_graphs_num_edge_label_alphabet() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py index 2ab63ee..61609ba 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py @@ -16,7 +16,7 @@ def generate_graphs(num_nl_alp): return graphs -def xp_synthesied_graphs_num_node_label_alphabet(): +def xp_synthesized_graphs_num_node_label_alphabet(): # Run and save. import pickle @@ -61,4 +61,4 @@ def xp_synthesied_graphs_num_node_label_alphabet(): if __name__ == '__main__': - xp_synthesied_graphs_num_node_label_alphabet() + xp_synthesized_graphs_num_node_label_alphabet() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py index d0d6ebb..ec6557c 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py @@ -16,7 +16,7 @@ def generate_graphs(num_nodes): return graphs -def xp_synthesied_graphs_num_nodes(): +def xp_synthesized_graphs_num_nodes(): # Run and save. 
import pickle @@ -61,4 +61,4 @@ def xp_synthesied_graphs_num_nodes(): if __name__ == '__main__': - xp_synthesied_graphs_num_nodes() + xp_synthesized_graphs_num_nodes() diff --git a/gklearn/experiments/papers/PRL_2020/utils.py b/gklearn/experiments/papers/PRL_2020/utils.py index 07c82f7..99e2d20 100644 --- a/gklearn/experiments/papers/PRL_2020/utils.py +++ b/gklearn/experiments/papers/PRL_2020/utils.py @@ -6,6 +6,8 @@ Created on Tue Sep 22 11:33:28 2020 @author: ljia """ import multiprocessing +import numpy as np +from gklearn.utils import model_selection_for_precomputed_kernel Graph_Kernel_List = ['PathUpToH', 'WLSubtree', 'SylvesterEquation', 'Marginalized', 'ShortestPath', 'Treelet', 'ConjugateGradient', 'FixedPoint', 'SpectralDecomposition', 'StructuralSP', 'CommonWalk'] @@ -109,4 +111,123 @@ def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count() params['verbose'] = True results = estimator(graphs, **params) + return results[0], results[1] + + +def cross_validate(graphs, targets, kernel_name, output_dir='outputs/', ds_name='synthesized', n_jobs=multiprocessing.cpu_count()): + + param_grid = None + + if kernel_name == 'CommonWalk': + from gklearn.kernels.commonWalkKernel import commonwalkkernel + estimator = commonwalkkernel + param_grid_precomputed = [{'compute_method': ['geo'], + 'weight': np.linspace(0.01, 0.15, 15)}] + + elif kernel_name == 'Marginalized': + from gklearn.kernels.marginalizedKernel import marginalizedkernel + estimator = marginalizedkernel + param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9), + 'n_iteration': np.linspace(1, 19, 7), + 'remove_totters': [False]} + + elif kernel_name == 'SylvesterEquation': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + param_grid_precomputed = {'compute_method': ['sylvester'], +# 'weight': np.linspace(0.01, 0.10, 10)} + 'weight': np.logspace(-1, -10, num=10, base=10)} + + elif kernel_name == 'ConjugateGradient': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + param_grid_precomputed = {'compute_method': ['conjugate'], + 'node_kernels': [sub_kernel], 'edge_kernels': [sub_kernel], + 'weight': np.logspace(-1, -10, num=10, base=10)} + + elif kernel_name == 'FixedPoint': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + param_grid_precomputed = {'compute_method': ['fp'], + 'node_kernels': [sub_kernel], 'edge_kernels': [sub_kernel], + 'weight': np.logspace(-3, -10, num=8, base=10)} + + elif kernel_name == 'SpectralDecomposition': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + param_grid_precomputed = {'compute_method': ['spectral'], + 'weight': np.logspace(-1, -10, num=10, base=10), + 'sub_kernel': ['geo', 'exp']} + + elif kernel_name == 'ShortestPath': + from gklearn.kernels.spKernel import spkernel + estimator = spkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import 
functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + param_grid_precomputed = {'node_kernels': [sub_kernel]} + + elif kernel_name == 'StructuralSP': + from gklearn.kernels.structuralspKernel import structuralspkernel + estimator = structuralspkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + param_grid_precomputed = {'node_kernels': [sub_kernel], 'edge_kernels': [sub_kernel], + 'compute_method': ['naive']} + + elif kernel_name == 'PathUpToH': + from gklearn.kernels.untilHPathKernel import untilhpathkernel + estimator = untilhpathkernel + param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2], + 'k_func': ['MinMax', 'tanimoto'], # ['MinMax'], # + 'compute_method': ['trie']} # ['MinMax']} + + elif kernel_name == 'Treelet': + from gklearn.kernels.treeletKernel import treeletkernel + estimator = treeletkernel + from gklearn.utils.kernels import polynomialkernel + import functools + gkernels = [functools.partial(gaussiankernel, gamma=1 / ga) + # for ga in np.linspace(1, 10, 10)] + for ga in np.logspace(0, 10, num=11, base=10)] + pkernels = [functools.partial(polynomialkernel, d=d, c=c) for d in range(1, 11) + for c in np.logspace(0, 10, num=11, base=10)] + param_grid_precomputed = {'sub_kernel': pkernels + gkernels} + + elif kernel_name == 'WLSubtree': + from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel + estimator = weisfeilerlehmankernel + param_grid_precomputed = {'base_kernel': ['subtree'], + 'height': np.linspace(0, 10, 11)} + param_grid = {'C': np.logspace(-10, 4, num=29, base=10)} + + if param_grid is None: + param_grid = {'C': np.logspace(-10, 10, num=41, base=10)} + + results = model_selection_for_precomputed_kernel( + graphs, + estimator, + param_grid_precomputed, + param_grid, + 'classification', + NUM_TRIALS=28, + datafile_y=targets, + extra_params=None, + ds_name=ds_name, + output_dir=output_dir, + n_jobs=n_jobs, + read_gm_from_file=False, + verbose=True) + return results[0], results[1] \ No newline at end of file diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py index 7201a0d..3d68212 100644 --- a/gklearn/utils/dataset.py +++ b/gklearn/utils/dataset.py @@ -13,6 +13,7 @@ import os class Dataset(object): + def __init__(self, filename=None, filename_targets=None, **kwargs): if filename is None: self.__graphs = None @@ -180,13 +181,13 @@ class Dataset(object): # return 0 - def get_dataset_infos(self, keys=None): + def get_dataset_infos(self, keys=None, params=None): """Computes and returns the structure and property information of the graph dataset. Parameters ---------- - keys : list - List of strings which indicate which informations will be returned. The + keys : list, optional + A list of strings which indicate which informations will be returned. The possible choices includes: 'substructures': sub-structures graphs contains, including 'linear', 'non @@ -241,7 +242,15 @@ class Dataset(object): 'class_number': number of classes. Only available for classification problems. + 'all_degree_entropy': the entropy of degree distribution of each graph. + + 'ave_degree_entropy': the average entropy of degree distribution of all graphs. + All informations above will be returned if `keys` is not given. 
+ + params: dict of dict, optional + A dictinary which contains extra parameters for each possible + element in ``keys``. Return ------ @@ -276,6 +285,8 @@ class Dataset(object): 'node_attr_dim', 'edge_attr_dim', 'class_number', + 'all_degree_entropy', + 'ave_degree_entropy' ] # dataset size @@ -420,6 +431,22 @@ class Dataset(object): self.__edge_attr_dim = self.__get_edge_attr_dim() infos['edge_attr_dim'] = self.__edge_attr_dim + # entropy of degree distribution. + + if 'all_degree_entropy' in keys: + if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']): + base = params['all_degree_entropy']['base'] + else: + base = None + infos['all_degree_entropy'] = self.__compute_all_degree_entropy(base=base) + + if 'ave_degree_entropy' in keys: + if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']): + base = params['ave_degree_entropy']['base'] + else: + base = None + infos['ave_degree_entropy'] = np.mean(self.__compute_all_degree_entropy(base=base)) + return infos @@ -653,8 +680,7 @@ class Dataset(object): def __get_all_fill_factors(self): - """ - Get fill factor, the number of non-zero entries in the adjacency matrix. + """Get fill factor, the number of non-zero entries in the adjacency matrix. Returns ------- @@ -721,7 +747,30 @@ class Dataset(object): def __get_edge_attr_dim(self): return len(self.__edge_attrs) + + def __compute_all_degree_entropy(self, base=None): + """Compute the entropy of degree distribution of each graph. + + Parameters + ---------- + base : float, optional + The logarithmic base to use. The default is ``e`` (natural logarithm). + + Returns + ------- + degree_entropy : float + The calculated entropy. + """ + from gklearn.utils.stats import entropy + + degree_entropy = [] + for g in self.__graphs: + degrees = list(dict(g.degree()).values()) + en = entropy(degrees, base=base) + degree_entropy.append(en) + return degree_entropy + @property def graphs(self): diff --git a/gklearn/utils/model_selection_precomputed.py b/gklearn/utils/model_selection_precomputed.py index 1252f12..517d30a 100644 --- a/gklearn/utils/model_selection_precomputed.py +++ b/gklearn/utils/model_selection_precomputed.py @@ -22,936 +22,938 @@ from tqdm import tqdm #@profile def model_selection_for_precomputed_kernel(datafile, - estimator, - param_grid_precomputed, - param_grid, - model_type, - NUM_TRIALS=30, - datafile_y=None, - extra_params=None, - ds_name='ds-unknown', + estimator, + param_grid_precomputed, + param_grid, + model_type, + NUM_TRIALS=30, + datafile_y=None, + extra_params=None, + ds_name='ds-unknown', output_dir='outputs/', - n_jobs=1, - read_gm_from_file=False, - verbose=True): - """Perform model selection, fitting and testing for precomputed kernels - using nested CV. Print out neccessary data during the process then finally - the results. - - Parameters - ---------- - datafile : string - Path of dataset file. - estimator : function - kernel function used to estimate. This function needs to return a gram matrix. - param_grid_precomputed : dictionary - Dictionary with names (string) of parameters used to calculate gram - matrices as keys and lists of parameter settings to try as values. This - enables searching over any sequence of parameter settings. Params with - length 1 will be omitted. - param_grid : dictionary - Dictionary with names (string) of parameters used as penelties as keys - and lists of parameter settings to try as values. 
This enables - searching over any sequence of parameter settings. Params with length 1 - will be omitted. - model_type : string - Type of the problem, can be 'regression' or 'classification'. - NUM_TRIALS : integer - Number of random trials of the outer CV loop. The default is 30. - datafile_y : string - Path of file storing y data. This parameter is optional depending on - the given dataset file. - extra_params : dict - Extra parameters for loading dataset. See function gklearn.utils. - graphfiles.loadDataset for detail. - ds_name : string - Name of the dataset. - n_jobs : int - Number of jobs for parallelization. - read_gm_from_file : boolean - Whether gram matrices are loaded from a file. - - Examples - -------- - >>> import numpy as np - >>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel - >>> from gklearn.kernels.untilHPathKernel import untilhpathkernel - >>> - >>> datafile = '../datasets/MUTAG/MUTAG_A.txt' - >>> estimator = untilhpathkernel - >>> param_grid_precomputed = {’depth’: np.linspace(1, 10, 10), ’k_func’: - [’MinMax’, ’tanimoto’], ’compute_method’: [’trie’]} - >>> # ’C’ for classification problems and ’alpha’ for regression problems. - >>> param_grid = [{’C’: np.logspace(-10, 10, num=41, base=10)}, {’alpha’: - np.logspace(-10, 10, num=41, base=10)}] - >>> - >>> model_selection_for_precomputed_kernel(datafile, estimator, - param_grid_precomputed, param_grid[0], 'classification', ds_name=’MUTAG’) - """ - tqdm.monitor_interval = 0 - - output_dir += estimator.__name__ - if not os.path.exists(output_dir): - os.makedirs(output_dir) - # a string to save all the results. - str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' - str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' - - # setup the model type - model_type = model_type.lower() - if model_type != 'regression' and model_type != 'classification': - raise Exception( - 'The model type is incorrect! Please choose from regression or classification.' - ) - if verbose: - print() - print('--- This is a %s problem ---' % model_type) - str_fw += 'This is a %s problem.\n' % model_type - - # calculate gram matrices rather than read them from file. - if read_gm_from_file == False: - # Load the dataset - if verbose: - print() - print('\n1. Loading dataset from file...') - if isinstance(datafile, str): - dataset, y_all = loadDataset( - datafile, filename_y=datafile_y, extra_params=extra_params) - else: # load data directly from variable. - dataset = datafile - y_all = datafile_y - - # import matplotlib.pyplot as plt - # import networkx as nx - # nx.draw_networkx(dataset[30]) - # plt.show() - - # Grid of parameters with a discrete number of values for each. - param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) - param_list = list(ParameterGrid(param_grid)) - - gram_matrices = [ - ] # a list to store gram matrices for all param_grid_precomputed - gram_matrix_time = [ - ] # a list to store time to calculate gram matrices - param_list_pre_revised = [ - ] # list to store param grids precomputed ignoring the useless ones - - # calculate all gram matrices - if verbose: - print() - print('2. Calculating gram matrices. This could take a while...') - str_fw += '\nII. 
Gram matrices.\n\n' - tts = time.time() # start training time - nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) - for idx, params_out in enumerate(param_list_precomputed): - y = y_all[:] - params_out['n_jobs'] = n_jobs - params_out['verbose'] = verbose -# print(dataset) -# import networkx as nx -# nx.draw_networkx(dataset[1]) -# plt.show() - rtn_data = estimator(dataset[:], **params_out) - Kmatrix = rtn_data[0] - current_run_time = rtn_data[1] - # for some kernels, some graphs in datasets may not meet the - # kernels' requirements for graph structure. These graphs are trimmed. - if len(rtn_data) == 3: - idx_trim = rtn_data[2] # the index of trimmed graph list - y = [y[idxt] for idxt in idx_trim] # trim y accordingly -# Kmatrix = np.random.rand(2250, 2250) -# current_run_time = 0.1 - - # remove graphs whose kernels with themselves are zeros - # @todo: y not changed accordingly? - Kmatrix_diag = Kmatrix.diagonal().copy() - nb_g_ignore = 0 - for idxk, diag in enumerate(Kmatrix_diag): - if diag == 0: - Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) - Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) - nb_g_ignore += 1 - # normalization - # @todo: works only for undirected graph? - Kmatrix_diag = Kmatrix.diagonal().copy() - for i in range(len(Kmatrix)): - for j in range(i, len(Kmatrix)): - Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) - Kmatrix[j][i] = Kmatrix[i][j] - if verbose: - print() - if params_out == {}: - if verbose: - print('the gram matrix is: ') - str_fw += 'the gram matrix is:\n\n' - else: - if verbose: - print('the gram matrix with parameters', params_out, 'is: \n\n') - str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out - if len(Kmatrix) < 2: - nb_gm_ignore += 1 - if verbose: - print('ignored, as at most only one of all its diagonal value is non-zero.') - str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' - else: - if np.isnan(Kmatrix).any( - ): # if the matrix contains elements that are not numbers - nb_gm_ignore += 1 - if verbose: - print('ignored, as it contains elements that are not numbers.') - str_fw += 'ignored, as it contains elements that are not numbers.\n\n' - else: -# print(Kmatrix) - str_fw += np.array2string( - Kmatrix, - separator=',') + '\n\n' -# separator=',', -# threshold=np.inf, -# floatmode='unique') + '\n\n' - - fig_file_name = output_dir + '/GM[ds]' + ds_name - if params_out != {}: - fig_file_name += '[params]' + str(idx) - plt.imshow(Kmatrix) - plt.colorbar() - plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) -# plt.show() - plt.clf() - gram_matrices.append(Kmatrix) - gram_matrix_time.append(current_run_time) - param_list_pre_revised.append(params_out) - if nb_g_ignore > 0: - if verbose: - print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) - str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' 
% nb_g_ignore - if verbose: - print() - print( - '{} gram matrices are calculated, {} of which are ignored.'.format( - len(param_list_precomputed), nb_gm_ignore)) - str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) - str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' - str_fw += ''.join([ - '{}: {}\n'.format(idx, params_out) - for idx, params_out in enumerate(param_list_precomputed) - ]) - - if verbose: - print() - if len(gram_matrices) == 0: - if verbose: - print('all gram matrices are ignored, no results obtained.') - str_fw += '\nall gram matrices are ignored, no results obtained.\n\n' - else: - # save gram matrices to file. -# np.savez(output_dir + '/' + ds_name + '.gm', -# gms=gram_matrices, params=param_list_pre_revised, y=y, -# gmtime=gram_matrix_time) - if verbose: - print( - '3. Fitting and predicting using nested cross validation. This could really take a while...' - ) - - # ---- use pool.imap_unordered to parallel and track progress. ---- -# train_pref = [] -# val_pref = [] -# test_pref = [] -# def func_assign(result, var_to_assign): -# for idx, itm in enumerate(var_to_assign): -# itm.append(result[idx]) -# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) -# -# parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, -# [train_pref, val_pref, test_pref], glbv=gram_matrices, -# method='imap_unordered', n_jobs=n_jobs, chunksize=1, -# itr_desc='cross validation') - - def init_worker(gms_toshare): - global G_gms - G_gms = gms_toshare - -# gram_matrices = np.array(gram_matrices) -# gms_shape = gram_matrices.shape -# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C')) -# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape)) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,)) - trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type) - train_pref = [] - val_pref = [] - test_pref = [] -# if NUM_TRIALS < 1000 * n_jobs: -# chunksize = int(NUM_TRIALS / n_jobs) + 1 -# else: -# chunksize = 1000 - chunksize = 1 - if verbose: - iterator = tqdm(pool.imap_unordered(trial_do_partial, - range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout) - else: - iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize) - for o1, o2, o3 in iterator: - train_pref.append(o1) - val_pref.append(o2) - test_pref.append(o3) - pool.close() - pool.join() - -# # ---- use pool.map to parallel. ---- -# pool = Pool(n_jobs) -# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type) -# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) -# train_pref = [item[0] for item in result_perf] -# val_pref = [item[1] for item in result_perf] -# test_pref = [item[2] for item in result_perf] - -# # ---- direct running, normally use a single CPU core. ---- -# train_pref = [] -# val_pref = [] -# test_pref = [] -# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): -# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) -# train_pref.append(o1) -# val_pref.append(o2) -# test_pref.append(o3) -# print() - - if verbose: - print() - print('4. Getting final performance...') - str_fw += '\nIII. 
Performance.\n\n' - # averages and confidences of performances on outer trials for each combination of parameters - average_train_scores = np.mean(train_pref, axis=0) -# print('val_pref: ', val_pref[0][0]) - average_val_scores = np.mean(val_pref, axis=0) -# print('test_pref: ', test_pref[0][0]) - average_perf_scores = np.mean(test_pref, axis=0) - # sample std is used here - std_train_scores = np.std(train_pref, axis=0, ddof=1) - std_val_scores = np.std(val_pref, axis=0, ddof=1) - std_perf_scores = np.std(test_pref, axis=0, ddof=1) - - if model_type == 'regression': - best_val_perf = np.amin(average_val_scores) - else: - best_val_perf = np.amax(average_val_scores) -# print('average_val_scores: ', average_val_scores) -# print('best_val_perf: ', best_val_perf) -# print() - best_params_index = np.where(average_val_scores == best_val_perf) - # find smallest val std with best val perf. - best_val_stds = [ - std_val_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - min_val_std = np.amin(best_val_stds) - best_params_index = np.where(std_val_scores == min_val_std) - best_params_out = [ - param_list_pre_revised[i] for i in best_params_index[0] - ] - best_params_in = [param_list[i] for i in best_params_index[1]] - if verbose: - print('best_params_out: ', best_params_out) - print('best_params_in: ', best_params_in) - print() - print('best_val_perf: ', best_val_perf) - print('best_val_std: ', min_val_std) - str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out - str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in - str_fw += 'best_val_perf: %s\n' % best_val_perf - str_fw += 'best_val_std: %s\n' % min_val_std - -# print(best_params_index) -# print(best_params_index[0]) -# print(average_perf_scores) - final_performance = [ - average_perf_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - final_confidence = [ - std_perf_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - if verbose: - print('final_performance: ', final_performance) - print('final_confidence: ', final_confidence) - str_fw += 'final_performance: %s\n' % final_performance - str_fw += 'final_confidence: %s\n' % final_confidence - train_performance = [ - average_train_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - train_std = [ - std_train_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - if verbose: - print('train_performance: %s' % train_performance) - print('train_std: ', train_std) - str_fw += 'train_performance: %s\n' % train_performance - str_fw += 'train_std: %s\n\n' % train_std - - if verbose: - print() - tt_total = time.time() - tts # training time for all hyper-parameters - average_gram_matrix_time = np.mean(gram_matrix_time) - std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0 - best_gram_matrix_time = [ - gram_matrix_time[i] for i in best_params_index[0] - ] - ave_bgmt = np.mean(best_gram_matrix_time) - std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 - if verbose: - print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' - .format(average_gram_matrix_time, std_gram_matrix_time)) - print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( - ave_bgmt, std_bgmt)) - print('total training time with all hyper-param choices: 
{:.2f}s'.format( - tt_total)) - str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) - str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) - str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) - - # # save results to file - # np.savetxt(results_name_pre + 'average_train_scores.dt', - # average_train_scores) - # np.savetxt(results_name_pre + 'average_val_scores', average_val_scores) - # np.savetxt(results_name_pre + 'average_perf_scores.dt', - # average_perf_scores) - # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) - # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) - # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) - - # np.save(results_name_pre + 'best_params_index', best_params_index) - # np.save(results_name_pre + 'best_params_pre.dt', best_params_out) - # np.save(results_name_pre + 'best_params_in.dt', best_params_in) - # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) - # np.save(results_name_pre + 'best_val_std.dt', best_val_std) - # np.save(results_name_pre + 'final_performance.dt', final_performance) - # np.save(results_name_pre + 'final_confidence.dt', final_confidence) - # np.save(results_name_pre + 'train_performance.dt', train_performance) - # np.save(results_name_pre + 'train_std.dt', train_std) - - # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) - # np.save(results_name_pre + 'average_gram_matrix_time.dt', - # average_gram_matrix_time) - # np.save(results_name_pre + 'std_gram_matrix_time.dt', - # std_gram_matrix_time) - # np.save(results_name_pre + 'best_gram_matrix_time.dt', - # best_gram_matrix_time) - - # read gram matrices from file. - else: - # Grid of parameters with a discrete number of values for each. -# param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) - param_list = list(ParameterGrid(param_grid)) - - # read gram matrices from file. - if verbose: - print() - print('2. Reading gram matrices from file...') - str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n' - gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz') - gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed - gram_matrix_time = gmfile['gmtime'] # time used to compute the gram matrices - param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones - y = gmfile['y'].tolist() - - tts = time.time() # start training time -# nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) - if verbose: - print( - '3. Fitting and predicting using nested cross validation. This could really take a while...' - ) + n_jobs=1, + read_gm_from_file=False, + verbose=True): + """Perform model selection, fitting and testing for precomputed kernels + using nested CV. Print out neccessary data during the process then finally + the results. + + Parameters + ---------- + datafile : string + Path of dataset file. + estimator : function + kernel function used to estimate. This function needs to return a gram matrix. + param_grid_precomputed : dictionary + Dictionary with names (string) of parameters used to calculate gram + matrices as keys and lists of parameter settings to try as values. 
This + enables searching over any sequence of parameter settings. Params with + length 1 will be omitted. + param_grid : dictionary + Dictionary with names (string) of parameters used as penelties as keys + and lists of parameter settings to try as values. This enables + searching over any sequence of parameter settings. Params with length 1 + will be omitted. + model_type : string + Type of the problem, can be 'regression' or 'classification'. + NUM_TRIALS : integer + Number of random trials of the outer CV loop. The default is 30. + datafile_y : string + Path of file storing y data. This parameter is optional depending on + the given dataset file. + extra_params : dict + Extra parameters for loading dataset. See function gklearn.utils. + graphfiles.loadDataset for detail. + ds_name : string + Name of the dataset. + n_jobs : int + Number of jobs for parallelization. + read_gm_from_file : boolean + Whether gram matrices are loaded from a file. + + Examples + -------- + >>> import numpy as np + >>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel + >>> from gklearn.kernels.untilHPathKernel import untilhpathkernel + >>> + >>> datafile = '../datasets/MUTAG/MUTAG_A.txt' + >>> estimator = untilhpathkernel + >>> param_grid_precomputed = {’depth’: np.linspace(1, 10, 10), ’k_func’: + [’MinMax’, ’tanimoto’], ’compute_method’: [’trie’]} + >>> # ’C’ for classification problems and ’alpha’ for regression problems. + >>> param_grid = [{’C’: np.logspace(-10, 10, num=41, base=10)}, {’alpha’: + np.logspace(-10, 10, num=41, base=10)}] + >>> + >>> model_selection_for_precomputed_kernel(datafile, estimator, + param_grid_precomputed, param_grid[0], 'classification', ds_name=’MUTAG’) + """ + tqdm.monitor_interval = 0 + + output_dir += estimator.__name__ + if not os.path.exists(output_dir): + os.makedirs(output_dir) + # a string to save all the results. + str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' + str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' + + # setup the model type + model_type = model_type.lower() + if model_type != 'regression' and model_type != 'classification': + raise Exception( + 'The model type is incorrect! Please choose from regression or classification.' + ) + if verbose: + print() + print('--- This is a %s problem ---' % model_type) + str_fw += 'This is a %s problem.\n' % model_type + + # calculate gram matrices rather than read them from file. + if read_gm_from_file == False: + # Load the dataset + if verbose: + print() + print('\n1. Loading dataset from file...') + if isinstance(datafile, str): + dataset, y_all = loadDataset( + datafile, filename_y=datafile_y, extra_params=extra_params) + else: # load data directly from variable. + dataset = datafile + y_all = datafile_y + + # import matplotlib.pyplot as plt + # import networkx as nx + # nx.draw_networkx(dataset[30]) + # plt.show() + + # Grid of parameters with a discrete number of values for each. 
+ param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) + param_list = list(ParameterGrid(param_grid)) + + gram_matrices = [ + ] # a list to store gram matrices for all param_grid_precomputed + gram_matrix_time = [ + ] # a list to store time to calculate gram matrices + param_list_pre_revised = [ + ] # list to store param grids precomputed ignoring the useless ones + + # calculate all gram matrices + if verbose: + print() + print('2. Calculating gram matrices. This could take a while...') + str_fw += '\nII. Gram matrices.\n\n' + tts = time.time() # start training time + nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) + for idx, params_out in enumerate(param_list_precomputed): + y = y_all[:] + params_out['n_jobs'] = n_jobs + params_out['verbose'] = verbose +# print(dataset) +# import networkx as nx +# nx.draw_networkx(dataset[1]) +# plt.show() + rtn_data = estimator(dataset[:], **params_out) + Kmatrix = rtn_data[0] + current_run_time = rtn_data[1] + # for some kernels, some graphs in datasets may not meet the + # kernels' requirements for graph structure. These graphs are trimmed. + if len(rtn_data) == 3: + idx_trim = rtn_data[2] # the index of trimmed graph list + y = [y[idxt] for idxt in idx_trim] # trim y accordingly +# Kmatrix = np.random.rand(2250, 2250) +# current_run_time = 0.1 + + # remove graphs whose kernels with themselves are zeros + # @todo: y not changed accordingly? + Kmatrix_diag = Kmatrix.diagonal().copy() + nb_g_ignore = 0 + for idxk, diag in enumerate(Kmatrix_diag): + if diag == 0: + Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) + Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) + nb_g_ignore += 1 + # normalization + # @todo: works only for undirected graph? + Kmatrix_diag = Kmatrix.diagonal().copy() + for i in range(len(Kmatrix)): + for j in range(i, len(Kmatrix)): + Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) + Kmatrix[j][i] = Kmatrix[i][j] + if verbose: + print() + if params_out == {}: + if verbose: + print('the gram matrix is: ') + str_fw += 'the gram matrix is:\n\n' + else: + if verbose: + print('the gram matrix with parameters', params_out, 'is: \n\n') + str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out + if len(Kmatrix) < 2: + nb_gm_ignore += 1 + if verbose: + print('ignored, as at most only one of all its diagonal value is non-zero.') + str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' + else: + if np.isnan(Kmatrix).any( + ): # if the matrix contains elements that are not numbers + nb_gm_ignore += 1 + if verbose: + print('ignored, as it contains elements that are not numbers.') + str_fw += 'ignored, as it contains elements that are not numbers.\n\n' + else: +# print(Kmatrix) + str_fw += np.array2string( + Kmatrix, + separator=',') + '\n\n' +# separator=',', +# threshold=np.inf, +# floatmode='unique') + '\n\n' + + fig_file_name = output_dir + '/GM[ds]' + ds_name + if params_out != {}: + fig_file_name += '[params]' + str(idx) + plt.imshow(Kmatrix) + plt.colorbar() + plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) +# plt.show() + plt.clf() + gram_matrices.append(Kmatrix) + gram_matrix_time.append(current_run_time) + param_list_pre_revised.append(params_out) + if nb_g_ignore > 0: + if verbose: + print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' 
% nb_g_ignore) + str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore + if verbose: + print() + print( + '{} gram matrices are calculated, {} of which are ignored.'.format( + len(param_list_precomputed), nb_gm_ignore)) + str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) + str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' + str_fw += ''.join([ + '{}: {}\n'.format(idx, params_out) + for idx, params_out in enumerate(param_list_precomputed) + ]) + + if verbose: + print() + if len(gram_matrices) == 0: + if verbose: + print('all gram matrices are ignored, no results obtained.') + str_fw += '\nall gram matrices are ignored, no results obtained.\n\n' + else: + # save gram matrices to file. +# np.savez(output_dir + '/' + ds_name + '.gm', +# gms=gram_matrices, params=param_list_pre_revised, y=y, +# gmtime=gram_matrix_time) + if verbose: + print( + '3. Fitting and predicting using nested cross validation. This could really take a while...' + ) + + # ---- use pool.imap_unordered to parallel and track progress. ---- +# train_pref = [] +# val_pref = [] +# test_pref = [] +# def func_assign(result, var_to_assign): +# for idx, itm in enumerate(var_to_assign): +# itm.append(result[idx]) +# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) +# +# parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, +# [train_pref, val_pref, test_pref], glbv=gram_matrices, +# method='imap_unordered', n_jobs=n_jobs, chunksize=1, +# itr_desc='cross validation') + + def init_worker(gms_toshare): + global G_gms + G_gms = gms_toshare + +# gram_matrices = np.array(gram_matrices) +# gms_shape = gram_matrices.shape +# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C')) +# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape)) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,)) + trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type) + train_pref = [] + val_pref = [] + test_pref = [] +# if NUM_TRIALS < 1000 * n_jobs: +# chunksize = int(NUM_TRIALS / n_jobs) + 1 +# else: +# chunksize = 1000 + chunksize = 1 + if verbose: + iterator = tqdm(pool.imap_unordered(trial_do_partial, + range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout) + else: + iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize) + for o1, o2, o3 in iterator: + train_pref.append(o1) + val_pref.append(o2) + test_pref.append(o3) + pool.close() + pool.join() + +# # ---- use pool.map to parallel. ---- +# pool = Pool(n_jobs) +# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type) +# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) +# train_pref = [item[0] for item in result_perf] +# val_pref = [item[1] for item in result_perf] +# test_pref = [item[2] for item in result_perf] + +# # ---- direct running, normally use a single CPU core. ---- +# train_pref = [] +# val_pref = [] +# test_pref = [] +# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): +# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) +# train_pref.append(o1) +# val_pref.append(o2) +# test_pref.append(o3) +# print() + + if verbose: + print() + print('4. 
Getting final performance...') + str_fw += '\nIII. Performance.\n\n' + # averages and confidences of performances on outer trials for each combination of parameters + average_train_scores = np.mean(train_pref, axis=0) +# print('val_pref: ', val_pref[0][0]) + average_val_scores = np.mean(val_pref, axis=0) +# print('test_pref: ', test_pref[0][0]) + average_perf_scores = np.mean(test_pref, axis=0) + # sample std is used here + std_train_scores = np.std(train_pref, axis=0, ddof=1) + std_val_scores = np.std(val_pref, axis=0, ddof=1) + std_perf_scores = np.std(test_pref, axis=0, ddof=1) + + if model_type == 'regression': + best_val_perf = np.amin(average_val_scores) + else: + best_val_perf = np.amax(average_val_scores) +# print('average_val_scores: ', average_val_scores) +# print('best_val_perf: ', best_val_perf) +# print() + best_params_index = np.where(average_val_scores == best_val_perf) + # find smallest val std with best val perf. + best_val_stds = [ + std_val_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + min_val_std = np.amin(best_val_stds) + best_params_index = np.where(std_val_scores == min_val_std) + best_params_out = [ + param_list_pre_revised[i] for i in best_params_index[0] + ] + best_params_in = [param_list[i] for i in best_params_index[1]] + if verbose: + print('best_params_out: ', best_params_out) + print('best_params_in: ', best_params_in) + print() + print('best_val_perf: ', best_val_perf) + print('best_val_std: ', min_val_std) + str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out + str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in + str_fw += 'best_val_perf: %s\n' % best_val_perf + str_fw += 'best_val_std: %s\n' % min_val_std + +# print(best_params_index) +# print(best_params_index[0]) +# print(average_perf_scores) + final_performance = [ + average_perf_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + final_confidence = [ + std_perf_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + if verbose: + print('final_performance: ', final_performance) + print('final_confidence: ', final_confidence) + str_fw += 'final_performance: %s\n' % final_performance + str_fw += 'final_confidence: %s\n' % final_confidence + train_performance = [ + average_train_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + train_std = [ + std_train_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + if verbose: + print('train_performance: %s' % train_performance) + print('train_std: ', train_std) + str_fw += 'train_performance: %s\n' % train_performance + str_fw += 'train_std: %s\n\n' % train_std + + if verbose: + print() + tt_total = time.time() - tts # training time for all hyper-parameters + average_gram_matrix_time = np.mean(gram_matrix_time) + std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0 + best_gram_matrix_time = [ + gram_matrix_time[i] for i in best_params_index[0] + ] + ave_bgmt = np.mean(best_gram_matrix_time) + std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 + if verbose: + print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' + .format(average_gram_matrix_time, std_gram_matrix_time)) + print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( + ave_bgmt, std_bgmt)) + 
print('total training time with all hyper-param choices: {:.2f}s'.format( + tt_total)) + str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) + str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) + str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) + + # # save results to file + # np.savetxt(results_name_pre + 'average_train_scores.dt', + # average_train_scores) + # np.savetxt(results_name_pre + 'average_val_scores', average_val_scores) + # np.savetxt(results_name_pre + 'average_perf_scores.dt', + # average_perf_scores) + # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) + # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) + # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) + + # np.save(results_name_pre + 'best_params_index', best_params_index) + # np.save(results_name_pre + 'best_params_pre.dt', best_params_out) + # np.save(results_name_pre + 'best_params_in.dt', best_params_in) + # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) + # np.save(results_name_pre + 'best_val_std.dt', best_val_std) + # np.save(results_name_pre + 'final_performance.dt', final_performance) + # np.save(results_name_pre + 'final_confidence.dt', final_confidence) + # np.save(results_name_pre + 'train_performance.dt', train_performance) + # np.save(results_name_pre + 'train_std.dt', train_std) + + # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) + # np.save(results_name_pre + 'average_gram_matrix_time.dt', + # average_gram_matrix_time) + # np.save(results_name_pre + 'std_gram_matrix_time.dt', + # std_gram_matrix_time) + # np.save(results_name_pre + 'best_gram_matrix_time.dt', + # best_gram_matrix_time) + + # read gram matrices from file. + else: + # Grid of parameters with a discrete number of values for each. +# param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) + param_list = list(ParameterGrid(param_grid)) + + # read gram matrices from file. + if verbose: + print() + print('2. Reading gram matrices from file...') + str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n' + gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz') + gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed + gram_matrix_time = gmfile['gmtime'] # time used to compute the gram matrices + param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones + y = gmfile['y'].tolist() + + tts = time.time() # start training time +# nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) + if verbose: + print( + '3. Fitting and predicting using nested cross validation. This could really take a while...' + ) - # ---- use pool.imap_unordered to parallel and track progress. 
---- - def init_worker(gms_toshare): - global G_gms - G_gms = gms_toshare - - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,)) - trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type) - train_pref = [] - val_pref = [] - test_pref = [] - chunksize = 1 - if verbose: - iterator = tqdm(pool.imap_unordered(trial_do_partial, - range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout) - else: - iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize) - for o1, o2, o3 in iterator: - train_pref.append(o1) - val_pref.append(o2) - test_pref.append(o3) - pool.close() - pool.join() - - # # ---- use pool.map to parallel. ---- - # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) - # train_pref = [item[0] for item in result_perf] - # val_pref = [item[1] for item in result_perf] - # test_pref = [item[2] for item in result_perf] - - # # ---- use joblib.Parallel to parallel and track progress. ---- - # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) - # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS)) - # train_pref = [item[0] for item in result_perf] - # val_pref = [item[1] for item in result_perf] - # test_pref = [item[2] for item in result_perf] - -# # ---- direct running, normally use a single CPU core. ---- -# train_pref = [] -# val_pref = [] -# test_pref = [] -# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): -# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) -# train_pref.append(o1) -# val_pref.append(o2) -# test_pref.append(o3) - - if verbose: - print() - print('4. Getting final performance...') - str_fw += '\nIII. Performance.\n\n' - # averages and confidences of performances on outer trials for each combination of parameters - average_train_scores = np.mean(train_pref, axis=0) - average_val_scores = np.mean(val_pref, axis=0) - average_perf_scores = np.mean(test_pref, axis=0) - # sample std is used here - std_train_scores = np.std(train_pref, axis=0, ddof=1) - std_val_scores = np.std(val_pref, axis=0, ddof=1) - std_perf_scores = np.std(test_pref, axis=0, ddof=1) - - if model_type == 'regression': - best_val_perf = np.amin(average_val_scores) - else: - best_val_perf = np.amax(average_val_scores) - best_params_index = np.where(average_val_scores == best_val_perf) - # find smallest val std with best val perf. 
- best_val_stds = [ - std_val_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - min_val_std = np.amin(best_val_stds) - best_params_index = np.where(std_val_scores == min_val_std) - best_params_out = [ - param_list_pre_revised[i] for i in best_params_index[0] - ] - best_params_in = [param_list[i] for i in best_params_index[1]] - if verbose: - print('best_params_out: ', best_params_out) - print('best_params_in: ', best_params_in) - print() - print('best_val_perf: ', best_val_perf) - print('best_val_std: ', min_val_std) - str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out - str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in - str_fw += 'best_val_perf: %s\n' % best_val_perf - str_fw += 'best_val_std: %s\n' % min_val_std - - final_performance = [ - average_perf_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - final_confidence = [ - std_perf_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - if verbose: - print('final_performance: ', final_performance) - print('final_confidence: ', final_confidence) - str_fw += 'final_performance: %s\n' % final_performance - str_fw += 'final_confidence: %s\n' % final_confidence - train_performance = [ - average_train_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - train_std = [ - std_train_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - if verbose: - print('train_performance: %s' % train_performance) - print('train_std: ', train_std) - str_fw += 'train_performance: %s\n' % train_performance - str_fw += 'train_std: %s\n\n' % train_std - - if verbose: - print() - average_gram_matrix_time = np.mean(gram_matrix_time) - std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0 - best_gram_matrix_time = [ - gram_matrix_time[i] for i in best_params_index[0] - ] - ave_bgmt = np.mean(best_gram_matrix_time) - std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 - if verbose: - print( - 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' - .format(average_gram_matrix_time, std_gram_matrix_time)) - print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( - ave_bgmt, std_bgmt)) - tt_poster = time.time() - tts # training time with hyper-param choices who did not participate in calculation of gram matrices - if verbose: - print( - 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s'.format( - tt_poster)) - print('total training time with all hyper-param choices: {:.2f}s'.format( - tt_poster + np.sum(gram_matrix_time))) -# str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) -# str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) - str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster) - - # open file to save all results for this dataset. - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # print out results as table. 
- str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores, - std_val_scores, average_perf_scores, std_perf_scores, - average_train_scores, std_train_scores, gram_matrix_time, - model_type, verbose) - - # open file to save all results for this dataset. - if not os.path.exists(output_dir + '/' + ds_name + '.output.txt'): - with open(output_dir + '/' + ds_name + '.output.txt', 'w') as f: - f.write(str_fw) - else: - with open(output_dir + '/' + ds_name + '.output.txt', 'r+') as f: - content = f.read() - f.seek(0, 0) - f.write(str_fw + '\n\n\n' + content) + # ---- use pool.imap_unordered to parallel and track progress. ---- + def init_worker(gms_toshare): + global G_gms + G_gms = gms_toshare + + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,)) + trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type) + train_pref = [] + val_pref = [] + test_pref = [] + chunksize = 1 + if verbose: + iterator = tqdm(pool.imap_unordered(trial_do_partial, + range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout) + else: + iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize) + for o1, o2, o3 in iterator: + train_pref.append(o1) + val_pref.append(o2) + test_pref.append(o3) + pool.close() + pool.join() + + # # ---- use pool.map to parallel. ---- + # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) + # train_pref = [item[0] for item in result_perf] + # val_pref = [item[1] for item in result_perf] + # test_pref = [item[2] for item in result_perf] + + # # ---- use joblib.Parallel to parallel and track progress. ---- + # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type) + # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS)) + # train_pref = [item[0] for item in result_perf] + # val_pref = [item[1] for item in result_perf] + # test_pref = [item[2] for item in result_perf] + +# # ---- direct running, normally use a single CPU core. ---- +# train_pref = [] +# val_pref = [] +# test_pref = [] +# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): +# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) +# train_pref.append(o1) +# val_pref.append(o2) +# test_pref.append(o3) + + if verbose: + print() + print('4. Getting final performance...') + str_fw += '\nIII. Performance.\n\n' + # averages and confidences of performances on outer trials for each combination of parameters + average_train_scores = np.mean(train_pref, axis=0) + average_val_scores = np.mean(val_pref, axis=0) + average_perf_scores = np.mean(test_pref, axis=0) + # sample std is used here + std_train_scores = np.std(train_pref, axis=0, ddof=1) + std_val_scores = np.std(val_pref, axis=0, ddof=1) + std_perf_scores = np.std(test_pref, axis=0, ddof=1) + + if model_type == 'regression': + best_val_perf = np.amin(average_val_scores) + else: + best_val_perf = np.amax(average_val_scores) + best_params_index = np.where(average_val_scores == best_val_perf) + # find smallest val std with best val perf. 
+ best_val_stds = [ + std_val_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + min_val_std = np.amin(best_val_stds) + best_params_index = np.where(std_val_scores == min_val_std) + best_params_out = [ + param_list_pre_revised[i] for i in best_params_index[0] + ] + best_params_in = [param_list[i] for i in best_params_index[1]] + if verbose: + print('best_params_out: ', best_params_out) + print('best_params_in: ', best_params_in) + print() + print('best_val_perf: ', best_val_perf) + print('best_val_std: ', min_val_std) + str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out + str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in + str_fw += 'best_val_perf: %s\n' % best_val_perf + str_fw += 'best_val_std: %s\n' % min_val_std + + final_performance = [ + average_perf_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + final_confidence = [ + std_perf_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + if verbose: + print('final_performance: ', final_performance) + print('final_confidence: ', final_confidence) + str_fw += 'final_performance: %s\n' % final_performance + str_fw += 'final_confidence: %s\n' % final_confidence + train_performance = [ + average_train_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + train_std = [ + std_train_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + if verbose: + print('train_performance: %s' % train_performance) + print('train_std: ', train_std) + str_fw += 'train_performance: %s\n' % train_performance + str_fw += 'train_std: %s\n\n' % train_std + + if verbose: + print() + average_gram_matrix_time = np.mean(gram_matrix_time) + std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0 + best_gram_matrix_time = [ + gram_matrix_time[i] for i in best_params_index[0] + ] + ave_bgmt = np.mean(best_gram_matrix_time) + std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 + if verbose: + print( + 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' + .format(average_gram_matrix_time, std_gram_matrix_time)) + print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( + ave_bgmt, std_bgmt)) + tt_poster = time.time() - tts # training time with hyper-param choices who did not participate in calculation of gram matrices + if verbose: + print( + 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s'.format( + tt_poster)) + print('total training time with all hyper-param choices: {:.2f}s'.format( + tt_poster + np.sum(gram_matrix_time))) +# str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) +# str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) + str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster) + + # open file to save all results for this dataset. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # print out results as table. 
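
The table referred to by the comment above is assembled by printResultsInTable (defined further down) using tabulate over an OrderedDict with a fixed column order. A minimal standalone version of the same pattern, with made-up entries:

from collections import OrderedDict
from tabulate import tabulate

table_dict = {                                   # hypothetical results
    'valid_perf': ['0.83±0.02', '0.79±0.04'],
    'params': [{'C': '1.00e+01'}, {'C': '1.00e+02'}],
    'test_perf': ['0.81±0.03', '0.77±0.05'],
}
keyorder = ['params', 'valid_perf', 'test_perf']
print(tabulate(OrderedDict(sorted(table_dict.items(),
                                  key=lambda i: keyorder.index(i[0]))),
               headers='keys'))
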
+ str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores, + std_val_scores, average_perf_scores, std_perf_scores, + average_train_scores, std_train_scores, gram_matrix_time, + model_type, verbose) + + # open file to save all results for this dataset. + if not os.path.exists(output_dir + '/' + ds_name + '.output.txt'): + with open(output_dir + '/' + ds_name + '.output.txt', 'w') as f: + f.write(str_fw) + else: + with open(output_dir + '/' + ds_name + '.output.txt', 'r+') as f: + content = f.read() + f.seek(0, 0) + f.write(str_fw + '\n\n\n' + content) + + return final_performance, final_confidence def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level -# # get gram matrices from global variables. -# gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C') - - # Arrays to store scores - train_pref = np.zeros((len(param_list_pre_revised), len(param_list))) - val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) - test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) - - # randomness added to seeds of split function below. "high" is "size" times - # 10 so that at least 10 different random output will be yielded. Remove - # these lines if identical outputs is required. - rdm_out = np.random.RandomState(seed=None) - rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10, - size=len(param_list_pre_revised)) -# print(trial, rdm_seed_out_l) -# print() - # loop for each outer param tuple - for index_out, params_out in enumerate(param_list_pre_revised): - # get gram matrices from global variables. -# gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]] -# gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C') - gm_now = gram_matrices[index_out].copy() - - # split gram matrix and y to app and test sets. - indices = range(len(y)) - # The argument "random_state" in function "train_test_split" can not be - # set to None, because it will use RandomState instance used by - # np.random, which is possible for multiple subprocesses to inherit the - # same seed if they forked at the same time, leading to identical - # random variates for different subprocesses. Instead, we use "trial" - # and "index_out" parameters to generate different seeds for different - # trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add - # randomness into seeds, so that it yields a different output every - # time the program is run. To yield identical outputs every time, - # remove the second line below. Same method is used to the "KFold" - # function in the inner loop. 
- rdm_seed_out = (trial + 1) * (index_out + 1) - rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1) -# print(trial, rdm_seed_out) - X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( - gm_now, y, indices, test_size=0.1, - random_state=rdm_seed_out, shuffle=True) -# print(trial, idx_app, idx_test) -# print() - X_app = X_app[:, idx_app] - X_test = X_test[:, idx_app] - y_app = np.array(y_app) - y_test = np.array(y_test) - - rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10, - size=len(param_list)) - # loop for each inner param tuple - for index_in, params_in in enumerate(param_list): -# if trial == 0: -# print(index_out, index_in) -# print('params_in: ', params_in) -# st = time.time() - rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1) -# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in) - rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1) -# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in) - inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in) - current_train_perf = [] - current_valid_perf = [] - current_test_perf = [] - - # For regression use the Kernel Ridge method -# try: - if model_type == 'regression': - kr = KernelRidge(kernel='precomputed', **params_in) - # loop for each split on validation set level - # validation set level - for train_index, valid_index in inner_cv.split(X_app): -# print("train_index, valid_index: ", trial, index_in, train_index, valid_index) -# if trial == 0: -# print('train_index: ', train_index) -# print('valid_index: ', valid_index) -# print('idx_test: ', idx_test) -# print('y_app[train_index]: ', y_app[train_index]) -# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) -# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) - kr.fit(X_app[train_index, :][:, train_index], - y_app[train_index]) - - # predict on the train, validation and test set - y_pred_train = kr.predict( - X_app[train_index, :][:, train_index]) - y_pred_valid = kr.predict( - X_app[valid_index, :][:, train_index]) -# if trial == 0: -# print('y_pred_valid: ', y_pred_valid) -# print() - y_pred_test = kr.predict( - X_test[:, train_index]) - - # root mean squared errors - current_train_perf.append( - np.sqrt( - mean_squared_error( - y_app[train_index], y_pred_train))) - current_valid_perf.append( - np.sqrt( - mean_squared_error( - y_app[valid_index], y_pred_valid))) -# if trial == 0: -# print(mean_squared_error( -# y_app[valid_index], y_pred_valid)) - current_test_perf.append( - np.sqrt( - mean_squared_error( - y_test, y_pred_test))) - # For clcassification use SVM - else: - svc = SVC(kernel='precomputed', cache_size=200, - verbose=False, **params_in) - # loop for each split on validation set level - # validation set level - for train_index, valid_index in inner_cv.split(X_app): -# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) -# if trial == 0: -# print('train_index: ', train_index) -# print('valid_index: ', valid_index) -# print('idx_test: ', idx_test) -# print('y_app[train_index]: ', y_app[train_index]) -# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) -# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) - svc.fit(X_app[train_index, :][:, train_index], - y_app[train_index]) - - # predict on the train, validation and test set - y_pred_train = svc.predict( - X_app[train_index, :][:, train_index]) - 
y_pred_valid = svc.predict( - X_app[valid_index, :][:, train_index]) - y_pred_test = svc.predict( - X_test[:, train_index]) - - # root mean squared errors - current_train_perf.append( - accuracy_score(y_app[train_index], - y_pred_train)) - current_valid_perf.append( - accuracy_score(y_app[valid_index], - y_pred_valid)) - current_test_perf.append( - accuracy_score(y_test, y_pred_test)) -# except ValueError: -# print(sys.exc_info()[0]) -# print(params_out, params_in) - - # average performance on inner splits - train_pref[index_out][index_in] = np.mean( - current_train_perf) - val_pref[index_out][index_in] = np.mean( - current_valid_perf) - test_pref[index_out][index_in] = np.mean( - current_test_perf) -# print(time.time() - st) -# if trial == 0: -# print('val_pref: ', val_pref) -# print('test_pref: ', test_pref) - - return train_pref, val_pref, test_pref +# # get gram matrices from global variables. +# gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C') + + # Arrays to store scores + train_pref = np.zeros((len(param_list_pre_revised), len(param_list))) + val_pref = np.zeros((len(param_list_pre_revised), len(param_list))) + test_pref = np.zeros((len(param_list_pre_revised), len(param_list))) + + # randomness added to seeds of split function below. "high" is "size" times + # 10 so that at least 10 different random output will be yielded. Remove + # these lines if identical outputs is required. + rdm_out = np.random.RandomState(seed=None) + rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10, + size=len(param_list_pre_revised)) +# print(trial, rdm_seed_out_l) +# print() + # loop for each outer param tuple + for index_out, params_out in enumerate(param_list_pre_revised): + # get gram matrices from global variables. +# gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]] +# gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C') + gm_now = gram_matrices[index_out].copy() + + # split gram matrix and y to app and test sets. + indices = range(len(y)) + # The argument "random_state" in function "train_test_split" can not be + # set to None, because it will use RandomState instance used by + # np.random, which is possible for multiple subprocesses to inherit the + # same seed if they forked at the same time, leading to identical + # random variates for different subprocesses. Instead, we use "trial" + # and "index_out" parameters to generate different seeds for different + # trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add + # randomness into seeds, so that it yields a different output every + # time the program is run. To yield identical outputs every time, + # remove the second line below. Same method is used to the "KFold" + # function in the inner loop. 
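
The seeding scheme described in the comment above, shown in isolation with hypothetical loop indices: the deterministic part comes from the trial and outer-parameter indices, and a per-process RandomState adds the random offset.

import numpy as np

trial, index_out = 3, 1                          # hypothetical loop indices
rdm_out = np.random.RandomState(seed=None)       # per-process source of randomness
rdm_seed_out_l = rdm_out.uniform(high=50, size=5)

rdm_seed_out = (trial + 1) * (index_out + 1)
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
print(rdm_seed_out)                              # seed passed to train_test_split / KFold
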
+ rdm_seed_out = (trial + 1) * (index_out + 1) + rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1) +# print(trial, rdm_seed_out) + X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split( + gm_now, y, indices, test_size=0.1, + random_state=rdm_seed_out, shuffle=True) +# print(trial, idx_app, idx_test) +# print() + X_app = X_app[:, idx_app] + X_test = X_test[:, idx_app] + y_app = np.array(y_app) + y_test = np.array(y_test) + + rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10, + size=len(param_list)) + # loop for each inner param tuple + for index_in, params_in in enumerate(param_list): +# if trial == 0: +# print(index_out, index_in) +# print('params_in: ', params_in) +# st = time.time() + rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1) +# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in) + rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1) +# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in) + inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in) + current_train_perf = [] + current_valid_perf = [] + current_test_perf = [] + + # For regression use the Kernel Ridge method +# try: + if model_type == 'regression': + kr = KernelRidge(kernel='precomputed', **params_in) + # loop for each split on validation set level + # validation set level + for train_index, valid_index in inner_cv.split(X_app): +# print("train_index, valid_index: ", trial, index_in, train_index, valid_index) +# if trial == 0: +# print('train_index: ', train_index) +# print('valid_index: ', valid_index) +# print('idx_test: ', idx_test) +# print('y_app[train_index]: ', y_app[train_index]) +# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) +# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) + kr.fit(X_app[train_index, :][:, train_index], + y_app[train_index]) + + # predict on the train, validation and test set + y_pred_train = kr.predict( + X_app[train_index, :][:, train_index]) + y_pred_valid = kr.predict( + X_app[valid_index, :][:, train_index]) +# if trial == 0: +# print('y_pred_valid: ', y_pred_valid) +# print() + y_pred_test = kr.predict( + X_test[:, train_index]) + + # root mean squared errors + current_train_perf.append( + np.sqrt( + mean_squared_error( + y_app[train_index], y_pred_train))) + current_valid_perf.append( + np.sqrt( + mean_squared_error( + y_app[valid_index], y_pred_valid))) +# if trial == 0: +# print(mean_squared_error( +# y_app[valid_index], y_pred_valid)) + current_test_perf.append( + np.sqrt( + mean_squared_error( + y_test, y_pred_test))) + # For clcassification use SVM + else: + svc = SVC(kernel='precomputed', cache_size=200, + verbose=False, **params_in) + # loop for each split on validation set level + # validation set level + for train_index, valid_index in inner_cv.split(X_app): +# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index]) +# if trial == 0: +# print('train_index: ', train_index) +# print('valid_index: ', valid_index) +# print('idx_test: ', idx_test) +# print('y_app[train_index]: ', y_app[train_index]) +# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index]) +# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index]) + svc.fit(X_app[train_index, :][:, train_index], + y_app[train_index]) + + # predict on the train, validation and test set + y_pred_train = svc.predict( + X_app[train_index, :][:, train_index]) + 
y_pred_valid = svc.predict( + X_app[valid_index, :][:, train_index]) + y_pred_test = svc.predict( + X_test[:, train_index]) + + # root mean squared errors + current_train_perf.append( + accuracy_score(y_app[train_index], + y_pred_train)) + current_valid_perf.append( + accuracy_score(y_app[valid_index], + y_pred_valid)) + current_test_perf.append( + accuracy_score(y_test, y_pred_test)) +# except ValueError: +# print(sys.exc_info()[0]) +# print(params_out, params_in) + + # average performance on inner splits + train_pref[index_out][index_in] = np.mean( + current_train_perf) + val_pref[index_out][index_in] = np.mean( + current_valid_perf) + test_pref[index_out][index_in] = np.mean( + current_test_perf) +# print(time.time() - st) +# if trial == 0: +# print('val_pref: ', val_pref) +# print('test_pref: ', test_pref) + + return train_pref, val_pref, test_pref def parallel_trial_do(param_list_pre_revised, param_list, y, model_type, trial): - train_pref, val_pref, test_pref = trial_do(param_list_pre_revised, - param_list, G_gms, y, - model_type, trial) - return train_pref, val_pref, test_pref + train_pref, val_pref, test_pref = trial_do(param_list_pre_revised, + param_list, G_gms, y, + model_type, trial) + return train_pref, val_pref, test_pref def compute_gram_matrices(dataset, y, estimator, param_list_precomputed, - output_dir, ds_name, - n_jobs=1, str_fw='', verbose=True): - gram_matrices = [ - ] # a list to store gram matrices for all param_grid_precomputed - gram_matrix_time = [ - ] # a list to store time to calculate gram matrices - param_list_pre_revised = [ - ] # list to store param grids precomputed ignoring the useless ones - - nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) - for idx, params_out in enumerate(param_list_precomputed): - params_out['n_jobs'] = n_jobs -# print(dataset) -# import networkx as nx -# nx.draw_networkx(dataset[1]) -# plt.show() - rtn_data = estimator(dataset[:], **params_out) - Kmatrix = rtn_data[0] - current_run_time = rtn_data[1] - # for some kernels, some graphs in datasets may not meet the - # kernels' requirements for graph structure. These graphs are trimmed. 
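
The double loop a few lines below divides each Gram-matrix entry by the geometric mean of the corresponding diagonal entries, giving every graph unit self-similarity. The same normalization in vectorized form, on a made-up matrix:

import numpy as np

K = np.array([[4.0, 2.0, 1.0],
              [2.0, 9.0, 3.0],
              [1.0, 3.0, 1.0]])                  # hypothetical unnormalized Gram matrix
d = np.sqrt(np.diag(K))
K_norm = K / np.outer(d, d)                      # K[i, j] / sqrt(K[i, i] * K[j, j])
print(np.diag(K_norm))                           # all ones after normalization
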
- if len(rtn_data) == 3: - idx_trim = rtn_data[2] # the index of trimmed graph list - y = [y[idxt] for idxt in idx_trim] # trim y accordingly - - Kmatrix_diag = Kmatrix.diagonal().copy() - # remove graphs whose kernels with themselves are zeros - nb_g_ignore = 0 - for idxk, diag in enumerate(Kmatrix_diag): - if diag == 0: - Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) - Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) - nb_g_ignore += 1 - # normalization - for i in range(len(Kmatrix)): - for j in range(i, len(Kmatrix)): - Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) - Kmatrix[j][i] = Kmatrix[i][j] - - if verbose: - print() - if params_out == {}: - if verbose: - print('the gram matrix is: ') - str_fw += 'the gram matrix is:\n\n' - else: - if verbose: - print('the gram matrix with parameters', params_out, 'is: ') - str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out - if len(Kmatrix) < 2: - nb_gm_ignore += 1 - if verbose: - print('ignored, as at most only one of all its diagonal value is non-zero.') - str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' - else: - if np.isnan(Kmatrix).any( - ): # if the matrix contains elements that are not numbers - nb_gm_ignore += 1 - if verbose: - print('ignored, as it contains elements that are not numbers.') - str_fw += 'ignored, as it contains elements that are not numbers.\n\n' - else: -# print(Kmatrix) - str_fw += np.array2string( - Kmatrix, - separator=',') + '\n\n' -# separator=',', -# threshold=np.inf, -# floatmode='unique') + '\n\n' - - fig_file_name = output_dir + '/GM[ds]' + ds_name - if params_out != {}: - fig_file_name += '[params]' + str(idx) - plt.imshow(Kmatrix) - plt.colorbar() - plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) -# plt.show() - plt.clf() - gram_matrices.append(Kmatrix) - gram_matrix_time.append(current_run_time) - param_list_pre_revised.append(params_out) - if nb_g_ignore > 0: - if verbose: - print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) - str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' 
% nb_g_ignore - if verbose: - print() - print( - '{} gram matrices are calculated, {} of which are ignored.'.format( - len(param_list_precomputed), nb_gm_ignore)) - str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) - str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' - str_fw += ''.join([ - '{}: {}\n'.format(idx, params_out) - for idx, params_out in enumerate(param_list_precomputed) - ]) - - return gram_matrices, gram_matrix_time, param_list_pre_revised, y, str_fw + output_dir, ds_name, + n_jobs=1, str_fw='', verbose=True): + gram_matrices = [ + ] # a list to store gram matrices for all param_grid_precomputed + gram_matrix_time = [ + ] # a list to store time to calculate gram matrices + param_list_pre_revised = [ + ] # list to store param grids precomputed ignoring the useless ones + + nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) + for idx, params_out in enumerate(param_list_precomputed): + params_out['n_jobs'] = n_jobs +# print(dataset) +# import networkx as nx +# nx.draw_networkx(dataset[1]) +# plt.show() + rtn_data = estimator(dataset[:], **params_out) + Kmatrix = rtn_data[0] + current_run_time = rtn_data[1] + # for some kernels, some graphs in datasets may not meet the + # kernels' requirements for graph structure. These graphs are trimmed. + if len(rtn_data) == 3: + idx_trim = rtn_data[2] # the index of trimmed graph list + y = [y[idxt] for idxt in idx_trim] # trim y accordingly + + Kmatrix_diag = Kmatrix.diagonal().copy() + # remove graphs whose kernels with themselves are zeros + nb_g_ignore = 0 + for idxk, diag in enumerate(Kmatrix_diag): + if diag == 0: + Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) + Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) + nb_g_ignore += 1 + # normalization + for i in range(len(Kmatrix)): + for j in range(i, len(Kmatrix)): + Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) + Kmatrix[j][i] = Kmatrix[i][j] + + if verbose: + print() + if params_out == {}: + if verbose: + print('the gram matrix is: ') + str_fw += 'the gram matrix is:\n\n' + else: + if verbose: + print('the gram matrix with parameters', params_out, 'is: ') + str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out + if len(Kmatrix) < 2: + nb_gm_ignore += 1 + if verbose: + print('ignored, as at most only one of all its diagonal value is non-zero.') + str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' + else: + if np.isnan(Kmatrix).any( + ): # if the matrix contains elements that are not numbers + nb_gm_ignore += 1 + if verbose: + print('ignored, as it contains elements that are not numbers.') + str_fw += 'ignored, as it contains elements that are not numbers.\n\n' + else: +# print(Kmatrix) + str_fw += np.array2string( + Kmatrix, + separator=',') + '\n\n' +# separator=',', +# threshold=np.inf, +# floatmode='unique') + '\n\n' + + fig_file_name = output_dir + '/GM[ds]' + ds_name + if params_out != {}: + fig_file_name += '[params]' + str(idx) + plt.imshow(Kmatrix) + plt.colorbar() + plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) +# plt.show() + plt.clf() + gram_matrices.append(Kmatrix) + gram_matrix_time.append(current_run_time) + param_list_pre_revised.append(params_out) + if nb_g_ignore > 0: + if verbose: + print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' 
% nb_g_ignore) + str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore + if verbose: + print() + print( + '{} gram matrices are calculated, {} of which are ignored.'.format( + len(param_list_precomputed), nb_gm_ignore)) + str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) + str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' + str_fw += ''.join([ + '{}: {}\n'.format(idx, params_out) + for idx, params_out in enumerate(param_list_precomputed) + ]) + + return gram_matrices, gram_matrix_time, param_list_pre_revised, y, str_fw def read_gram_matrices_from_file(output_dir, ds_name): - gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz') - gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed - param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones - y = gmfile['y'].tolist() - return gram_matrices, param_list_pre_revised, y + gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz') + gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed + param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones + y = gmfile['y'].tolist() + return gram_matrices, param_list_pre_revised, y def printResultsInTable(param_list, param_list_pre_revised, average_val_scores, - std_val_scores, average_perf_scores, std_perf_scores, - average_train_scores, std_train_scores, gram_matrix_time, - model_type, verbose): - from collections import OrderedDict - from tabulate import tabulate - table_dict = {} - if model_type == 'regression': - for param_in in param_list: - param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) - else: - for param_in in param_list: - param_in['C'] = '{:.2e}'.format(param_in['C']) - table_dict['params'] = [{**param_out, **param_in} - for param_in in param_list for param_out in param_list_pre_revised] - table_dict['gram_matrix_time'] = [ - '{:.2f}'.format(gram_matrix_time[index_out]) - for param_in in param_list - for index_out, _ in enumerate(param_list_pre_revised) - ] - table_dict['valid_perf'] = [ - '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], - std_val_scores[index_out][index_in]) - for index_in, _ in enumerate(param_list) - for index_out, _ in enumerate(param_list_pre_revised) - ] - table_dict['test_perf'] = [ - '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], - std_perf_scores[index_out][index_in]) - for index_in, _ in enumerate(param_list) - for index_out, _ in enumerate(param_list_pre_revised) - ] - table_dict['train_perf'] = [ - '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], - std_train_scores[index_out][index_in]) - for index_in, _ in enumerate(param_list) - for index_out, _ in enumerate(param_list_pre_revised) - ] - - keyorder = [ - 'params', 'train_perf', 'valid_perf', 'test_perf', - 'gram_matrix_time' - ] - if verbose: - print() - tb_print = tabulate(OrderedDict(sorted(table_dict.items(), - key=lambda i: keyorder.index(i[0]))), headers='keys') -# print(tb_print) - return 'table of performance v.s. 
hyper-params:\n\n%s\n\n' % tb_print \ No newline at end of file + std_val_scores, average_perf_scores, std_perf_scores, + average_train_scores, std_train_scores, gram_matrix_time, + model_type, verbose): + from collections import OrderedDict + from tabulate import tabulate + table_dict = {} + if model_type == 'regression': + for param_in in param_list: + param_in['alpha'] = '{:.2e}'.format(param_in['alpha']) + else: + for param_in in param_list: + param_in['C'] = '{:.2e}'.format(param_in['C']) + table_dict['params'] = [{**param_out, **param_in} + for param_in in param_list for param_out in param_list_pre_revised] + table_dict['gram_matrix_time'] = [ + '{:.2f}'.format(gram_matrix_time[index_out]) + for param_in in param_list + for index_out, _ in enumerate(param_list_pre_revised) + ] + table_dict['valid_perf'] = [ + '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], + std_val_scores[index_out][index_in]) + for index_in, _ in enumerate(param_list) + for index_out, _ in enumerate(param_list_pre_revised) + ] + table_dict['test_perf'] = [ + '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], + std_perf_scores[index_out][index_in]) + for index_in, _ in enumerate(param_list) + for index_out, _ in enumerate(param_list_pre_revised) + ] + table_dict['train_perf'] = [ + '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], + std_train_scores[index_out][index_in]) + for index_in, _ in enumerate(param_list) + for index_out, _ in enumerate(param_list_pre_revised) + ] + + keyorder = [ + 'params', 'train_perf', 'valid_perf', 'test_perf', + 'gram_matrix_time' + ] + if verbose: + print() + tb_print = tabulate(OrderedDict(sorted(table_dict.items(), + key=lambda i: keyorder.index(i[0]))), headers='keys') +# print(tb_print) + return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print \ No newline at end of file diff --git a/gklearn/utils/stats.py b/gklearn/utils/stats.py new file mode 100644 index 0000000..d51cf48 --- /dev/null +++ b/gklearn/utils/stats.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Oct 5 15:12:41 2020 + +@author: ljia +""" +from collections import Counter +from scipy import stats + + +def entropy(labels, base=None): + """Calculate the entropy of a distribution for given list of labels. + + Parameters + ---------- + labels : list + Given list of labels. + base : float, optional + The logarithmic base to use. The default is ``e`` (natural logarithm). + + Returns + ------- + float + The calculated entropy. + """ + return stats.entropy(list(Counter(labels).values()), base=base) \ No newline at end of file diff --git a/setup.py b/setup.py index ea10603..cd45970 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ with open('requirements_pypi.txt') as fp: setuptools.setup( name="graphkit-learn", - version="0.2.0", + version="0.2.1b1", author="Linlin Jia", author_email="linlin.jia@insa-rouen.fr", description="A Python library for graph kernels, graph edit distances, and graph pre-images", From 64e76fe1a0cd5a0a443a3421d36217113a005831 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 9 Oct 2020 10:29:02 +0200 Subject: [PATCH 05/13] Add math module. 
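
A quick standalone illustration of the entropy helper introduced in gklearn/utils/stats.py above, applied to an assumed list of labels (here a small degree sequence):

from collections import Counter
from scipy import stats

def entropy(labels, base=None):
    return stats.entropy(list(Counter(labels).values()), base=base)

degrees = [1, 2, 2, 3, 3, 3]                     # assumed example labels
print(entropy(degrees))                          # natural-log entropy of the label distribution
print(entropy(degrees, base=2))                  # the same, in bits
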
--- gklearn/utils/math.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 gklearn/utils/math.py diff --git a/gklearn/utils/math.py b/gklearn/utils/math.py new file mode 100644 index 0000000..395946b --- /dev/null +++ b/gklearn/utils/math.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Oct 7 14:43:36 2020 + +@author: ljia +""" + +def rounder(x, decimals): + """Round, where 5 is rounded up. + + Parameters + ---------- + x : float + The number to be rounded. + decimals : int + Decimals to which ``x'' is rounded. + + Returns + ------- + string + The rounded number. + """ + x_strs = str(x).split('.') + if len(x_strs) == 2: + before = x_strs[0] + after = x_strs[1] + if len(after) > decimals: + if int(after[decimals]) >= 5: + after0s = '' + for c in after: + if c == '0': + after0s += '0' + elif c != '0': + break + if len(after0s) == decimals: + after0s = after0s[:-1] + after = after0s + str(int(after[0:decimals]) + 1)[-decimals:] + else: + after = after[0:decimals] + elif len(after) < decimals: + after += '0' * (decimals - len(after)) + return before + '.' + after + + elif len(x_strs) == 1: + return x_strs[0] + + +if __name__ == '__main__': + x = 1.0075333616 + y = rounder(x, 2) + print(y) \ No newline at end of file From 1fd81996493574e87e17d179586d8676542c17eb Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 9 Oct 2020 10:30:37 +0200 Subject: [PATCH 06/13] Update experiments. --- gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py | 4 ++-- gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py | 2 +- gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py | 2 +- gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py | 2 +- gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py | 2 +- gklearn/experiments/papers/PRL_2020/utils.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py index a7056f3..891ae4c 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py @@ -43,7 +43,7 @@ def xp_synthesized_graphs_dataset_size(): run_time = 'error' try: - gram_matrix, run_time = compute_graph_kernel(sub_graphs, kernel_name, n_jobs=1) + gram_matrix, run_time = compute_graph_kernel(sub_graphs, kernel_name) except Exception as exp: print('An exception occured when running this experiment:') LOG_FILENAME = save_dir + 'error.txt' @@ -61,4 +61,4 @@ def xp_synthesized_graphs_dataset_size(): if __name__ == '__main__': - xp_synthesized_graphs_dataset_size() \ No newline at end of file + xp_synthesized_graphs_dataset_size() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py index 2f5594d..f005172 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py @@ -42,7 +42,7 @@ def xp_synthesized_graphs_degrees(): # Compute Gram matrix. 
run_time = 'error' try: - gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) + gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name) except Exception as exp: print('An exception occured when running this experiment:') LOG_FILENAME = save_dir + 'error.txt' diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py index 51e07ba..8e35c74 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py @@ -42,7 +42,7 @@ def xp_synthesized_graphs_num_edge_label_alphabet(): # Compute Gram matrix. run_time = 'error' try: - gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) + gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name) except Exception as exp: print('An exception occured when running this experiment:') LOG_FILENAME = save_dir + 'error.txt' diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py index 61609ba..51e1382 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py @@ -42,7 +42,7 @@ def xp_synthesized_graphs_num_node_label_alphabet(): # Compute Gram matrix. run_time = 'error' try: - gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) + gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name) except Exception as exp: run_times[kernel_name].append('error') print('An exception occured when running this experiment:') diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py index ec6557c..f63c404 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py @@ -42,7 +42,7 @@ def xp_synthesized_graphs_num_nodes(): # Compute Gram matrix. run_time = 'error' try: - gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name, n_jobs=1) + gram_matrix, run_time = compute_graph_kernel(graphs, kernel_name) except Exception as exp: run_times[kernel_name].append('error') print('An exception occured when running this experiment:') diff --git a/gklearn/experiments/papers/PRL_2020/utils.py b/gklearn/experiments/papers/PRL_2020/utils.py index 99e2d20..84fa624 100644 --- a/gklearn/experiments/papers/PRL_2020/utils.py +++ b/gklearn/experiments/papers/PRL_2020/utils.py @@ -62,7 +62,7 @@ def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count() import functools mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} - params = {'compute_method': 'fp', 'weight': 1e-3, 'node_kernels': sub_kernel, 'edge_kernels': sub_kernel} + params = {'compute_method': 'fp', 'weight': 1e-4, 'node_kernels': sub_kernel, 'edge_kernels': sub_kernel} elif kernel_name == 'SpectralDecomposition': from gklearn.kernels.randomWalkKernel import randomwalkkernel From 921ea4a67a41198685c3e810c0445b79e7baab72 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 15 Oct 2020 15:37:36 +0200 Subject: [PATCH 07/13] Update docs: experiments. 
--- docs/source/experiments.rst | 5 +- docs/source/figures/all_ave_gm_times.svg | 45 +- docs/source/figures/all_test_accuracy.svg | 1454 +++++++++++++++-------------- 3 files changed, 764 insertions(+), 740 deletions(-) diff --git a/docs/source/experiments.rst b/docs/source/experiments.rst index de137c0..7d8d477 100644 --- a/docs/source/experiments.rst +++ b/docs/source/experiments.rst @@ -7,15 +7,14 @@ A two-layer nested cross-validation (CV) is applied to select and evaluate model The machine used to execute the experiments is a cluster with 28 CPU cores of Intel(R) Xeon(R) E5-2680 v4 @ 2.40GHz, 252GB memory, and 64-bit operating system CentOS Linux release 7.3.1611. All results were run with Python 3.5.2. -The figure below exhibits accuracies achieved by graph kernels implemented in `graphkit-learn` library. Each row corresponds to a dataset and each column to a graph kernel. Accuracies are in percentage for classification and in terms of errors of boiling points for regression (Alkane and -Acyclic datasets). Red color indicates a worse result and green a better one. Gray cells with the “inf” marker indicate that the computation of the graph kernel on the dataset is neglected due to much higher consumption of computational resources than other kernels. +The figure below exhibits accuracies achieved by graph kernels implemented in `graphkit-learn` library, in terms of regression error (the upper table) and classification rate (the lower table). Red color indicates the worse results and dark green the best ones. Gray cells with the “inf” marker indicate that the computation of the graph kernel on the dataset is omitted due to much higher consumption of computational resources than other kernels. .. image:: figures/all_test_accuracy.svg :width: 600 :alt: accuracies The figure below displays computational time consumed to compute Gram matrices of each graph -kernels (in :math:`log10` of seconds) on each dataset. Colors have the same meaning as in the figure above. +kernels (in :math:`log10` of seconds) on each dataset. Color legends have the same meaning as in the figure above. .. 
image:: figures/all_ave_gm_times.svg
    :width: 600
diff --git a/docs/source/figures/all_ave_gm_times.svg b/docs/source/figures/all_ave_gm_times.svg
index d102164..7b20aa0 100644
--- a/docs/source/figures/all_ave_gm_times.svg
+++ b/docs/source/figures/all_ave_gm_times.svg
[SVG markup diff omitted: only re-rendered path/coordinate data of the Gram-matrix computation time figure changed.]
diff --git a/docs/source/figures/all_test_accuracy.svg b/docs/source/figures/all_test_accuracy.svg
index 96b052c..a1f13af 100644
--- a/docs/source/figures/all_test_accuracy.svg
+++ b/docs/source/figures/all_test_accuracy.svg
[SVG markup diff omitted: only re-rendered path/coordinate data of the test accuracy figure changed.]

From f3a3441b449e0f3c35b55046630051819ad5c607 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Thu, 15 Oct 2020 15:39:40 +0200
Subject: [PATCH 08/13] Update exps: remove self loops.

---
 gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py b/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py
index c25c116..0ababc3 100644
--- a/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py
+++ b/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py
@@ -34,6 +34,11 @@ def generate_graphs():
 	graphs11 = [nx.configuration_model(degrees11, create_using=nx.Graph) for i in range(half_num_graphs)]
 	graphs12 = [nx.configuration_model(degrees12, create_using=nx.Graph) for i in range(half_num_graphs)]
 	
+	for g in graphs11:
+		g.remove_edges_from(nx.selfloop_edges(g))
+	for g in graphs12:
+		g.remove_edges_from(nx.selfloop_edges(g))
+	
 	# method 2: can easily generate isomorphic graphs.
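
Context for the lines added above: nx.configuration_model pairs half-edges at random, so it can produce self-loops (and parallel edges, which the nx.Graph cast already collapses); the patch strips the self-loops before node labels are attached. A minimal check of that behavior with the same degree sequence as degrees21:

import networkx as nx

degrees = list(range(1, 11)) * 6                 # same degree sequence as degrees21
g = nx.configuration_model(degrees, create_using=nx.Graph)
print(nx.number_of_selfloops(g))                 # usually non-zero before cleaning
g.remove_edges_from(nx.selfloop_edges(g))
print(nx.number_of_selfloops(g))                 # 0 after removal
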
# graphs11 = [nx.random_regular_graph(2, num_nodes, seed=None) for i in range(half_num_graphs)] # graphs12 = [nx.random_regular_graph(10, num_nodes, seed=None) for i in range(half_num_graphs)] @@ -51,6 +56,11 @@ def generate_graphs(): # method 1: the entorpy of the two classes is not the same. graphs21 = [nx.configuration_model(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)] graphs22 = [nx.configuration_model(degrees22, create_using=nx.Graph) for i in range(half_num_graphs)] + + for g in graphs21: + g.remove_edges_from(nx.selfloop_edges(g)) + for g in graphs22: + g.remove_edges_from(nx.selfloop_edges(g)) # # method 2: tooo slow, and may fail. # graphs21 = [nx.random_degree_sequence_graph(degrees21, seed=None, tries=100) for i in range(half_num_graphs)] From e6b92b752b0ab5a38d4fa7a3d91bc51e402e1aca Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 15 Oct 2020 15:42:19 +0200 Subject: [PATCH 09/13] Add the function compute_vertex_kernels. --- gklearn/utils/utils.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/gklearn/utils/utils.py b/gklearn/utils/utils.py index c32169d..66c92a8 100644 --- a/gklearn/utils/utils.py +++ b/gklearn/utils/utils.py @@ -565,6 +565,86 @@ def compute_distance_matrix(gram_matrix): return dis_mat, dis_max, dis_min, dis_mean +# @todo: use it in ShortestPath. +def compute_vertex_kernels(g1, g2, node_kernels, node_labels=[], node_attrs=[]): + """Compute kernels between each pair of vertices in two graphs. + + Parameters + ---------- + g1, g2 : NetworkX graph + The kernels bewteen pairs of vertices in these two graphs are computed. + node_kernels : dict + A dictionary of kernel functions for nodes, including 3 items: 'symb' + for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix' + for both labels. The first 2 functions take two node labels as + parameters, and the 'mix' function takes 4 parameters, a symbolic and a + non-symbolic label for each the two nodes. Each label is in form of 2-D + dimension array (n_samples, n_features). Each function returns a number + as the kernel value. Ignored when nodes are unlabeled. This argument + is designated to conjugate gradient method and fixed-point iterations. + node_labels : list, optional + The list of the name strings of the node labels. The default is []. + node_attrs : list, optional + The list of the name strings of the node attributes. The default is []. + + Returns + ------- + vk_dict : dict + Vertex kernels keyed by vertices. + + Notes + ----- + This function is used by ``gklearn.kernels.FixedPoint'' and + ``gklearn.kernels.StructuralSP''. The method is borrowed from FCSP [1]. + + References + ---------- + .. [1] Lifan Xu, Wei Wang, M Alvarez, John Cavazos, and Dongping Zhang. + Parallelization of shortest path graph kernels on multi-core cpus and gpus. + Proceedings of the Programmability Issues for Heterogeneous Multicores + (MultiProg), Vienna, Austria, 2014. 
+ """ + vk_dict = {} # shortest path matrices dict + if len(node_labels) > 0: + # node symb and non-synb labeled + if len(node_attrs) > 0: + kn = node_kernels['mix'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_labels = [n1[1][nl] for nl in node_labels] + n2_labels = [n2[1][nl] for nl in node_labels] + n1_attrs = [n1[1][na] for na in node_attrs] + n2_attrs = [n2[1][na] for na in node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) + # node symb labeled + else: + kn = node_kernels['symb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_labels = [n1[1][nl] for nl in node_labels] + n2_labels = [n2[1][nl] for nl in node_labels] + vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) + else: + # node non-synb labeled + if len(node_attrs) > 0: + kn = node_kernels['nsymb'] + for n1 in g1.nodes(data=True): + for n2 in g2.nodes(data=True): + n1_attrs = [n1[1][na] for na in node_attrs] + n2_attrs = [n2[1][na] for na in node_attrs] + vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) + # node unlabeled + else: + pass # @todo: add edge weights. +# for e1 in g1.edges(data=True): +# for e2 in g2.edges(data=True): +# if e1[2]['cost'] == e2[2]['cost']: +# kernel += 1 +# return kernel + + return vk_dict + + def dummy_node(): """ /*! From 07aa31bbbf302a025aff312a7277893bfdcc23a6 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 15 Oct 2020 16:10:47 +0200 Subject: [PATCH 10/13] Add the ConjugateGradient class. --- gklearn/kernels/__init__.py | 7 +- gklearn/kernels/conjugate_gradient.py | 322 ++++++++++++++++++++++++++++++++++ gklearn/kernels/random_walk.py | 82 +++------ gklearn/kernels/random_walk_meta.py | 86 +++++++++ 4 files changed, 435 insertions(+), 62 deletions(-) create mode 100644 gklearn/kernels/conjugate_gradient.py create mode 100644 gklearn/kernels/random_walk_meta.py diff --git a/gklearn/kernels/__init__.py b/gklearn/kernels/__init__.py index 7b15d70..5740c77 100644 --- a/gklearn/kernels/__init__.py +++ b/gklearn/kernels/__init__.py @@ -1,5 +1,5 @@ # -*-coding:utf-8 -*- -"""gklearn - kernels module +"""gklearn - graph kernels module """ # info @@ -10,9 +10,12 @@ __date__ = "November 2018" from gklearn.kernels.graph_kernel import GraphKernel from gklearn.kernels.common_walk import CommonWalk from gklearn.kernels.marginalized import Marginalized -from gklearn.kernels.random_walk import RandomWalk +from gklearn.kernels.random_walk_meta import RandomWalkMeta from gklearn.kernels.sylvester_equation import SylvesterEquation +from gklearn.kernels.conjugate_gradient import ConjugateGradient +from gklearn.kernels.fixed_point import FixedPoint from gklearn.kernels.spectral_decomposition import SpectralDecomposition +from gklearn.kernels.random_walk import RandomWalk from gklearn.kernels.shortest_path import ShortestPath from gklearn.kernels.structural_sp import StructuralSP from gklearn.kernels.path_up_to_h import PathUpToH diff --git a/gklearn/kernels/conjugate_gradient.py b/gklearn/kernels/conjugate_gradient.py new file mode 100644 index 0000000..73cac4c --- /dev/null +++ b/gklearn/kernels/conjugate_gradient.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Aug 20 16:09:51 2020 + +@author: ljia + +@references: + + [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. 
+""" + +import sys +from tqdm import tqdm +import numpy as np +import networkx as nx +from scipy.sparse import identity +from scipy.sparse.linalg import cg +from gklearn.utils.parallel import parallel_gm, parallel_me +from gklearn.kernels import RandomWalkMeta +from gklearn.utils.utils import compute_vertex_kernels + + +class ConjugateGradient(RandomWalkMeta): + + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._node_kernels = kwargs.get('node_kernels', None) + self._edge_kernels = kwargs.get('edge_kernels', None) + self._node_labels = kwargs.get('node_labels', []) + self._edge_labels = kwargs.get('edge_labels', []) + self._node_attrs = kwargs.get('node_attrs', []) + self._edge_attrs = kwargs.get('edge_attrs', []) + + + def _compute_gm_series(self): + self._check_edge_weight(self._graphs, self._verbose) + self._check_graphs(self._graphs) + + lmda = self._weight + + # Compute Gram matrix. + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + # Reindex nodes using consecutive integers for the convenience of kernel computation. + if self._verbose >= 2: + iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout) + else: + iterator = self._graphs + self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] + + if self._p is None and self._q is None: # p and q are uniform distributions as default. + + from itertools import combinations_with_replacement + itr = combinations_with_replacement(range(0, len(self._graphs)), 2) + if self._verbose >= 2: + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) + else: + iterator = itr + + for i, j in iterator: + kernel = self.__kernel_do(self._graphs[i], self._graphs[j], lmda) + gram_matrix[i][j] = kernel + gram_matrix[j][i] = kernel + + else: # @todo + pass + + return gram_matrix + + + def _compute_gm_imap_unordered(self): + self._check_edge_weight(self._graphs, self._verbose) + self._check_graphs(self._graphs) + + # Compute Gram matrix. + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + + # @todo: parallel this. + # Reindex nodes using consecutive integers for the convenience of kernel computation. + if self._verbose >= 2: + iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout) + else: + iterator = self._graphs + self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] + + if self._p is None and self._q is None: # p and q are uniform distributions as default. + + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + + do_fun = self._wrapper_kernel_do + + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + + else: # @todo + pass + + return gram_matrix + + + def _compute_kernel_list_series(self, g1, g_list): + self._check_edge_weight(g_list + [g1], self._verbose) + self._check_graphs(g_list + [g1]) + + lmda = self._weight + + # compute kernel list. + kernel_list = [None] * len(g_list) + + # Reindex nodes using consecutive integers for the convenience of kernel computation. 
+ g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') + if self._verbose >= 2: + iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout) + else: + iterator = g_list + g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] + + if self._p is None and self._q is None: # p and q are uniform distributions as default. + + if self._verbose >= 2: + iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) + else: + iterator = range(len(g_list)) + + for i in iterator: + kernel = self.__kernel_do(g1, g_list[i], lmda) + kernel_list[i] = kernel + + else: # @todo + pass + + return kernel_list + + + def _compute_kernel_list_imap_unordered(self, g1, g_list): + self._check_edge_weight(g_list + [g1], self._verbose) + self._check_graphs(g_list + [g1]) + + # compute kernel list. + kernel_list = [None] * len(g_list) + + # Reindex nodes using consecutive integers for the convenience of kernel computation. + g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') + # @todo: parallel this. + if self._verbose >= 2: + iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout) + else: + iterator = g_list + g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] + + if self._p is None and self._q is None: # p and q are uniform distributions as default. + + def init_worker(g1_toshare, g_list_toshare): + global G_g1, G_g_list + G_g1 = g1_toshare + G_g_list = g_list_toshare + + do_fun = self._wrapper_kernel_list_do + + def func_assign(result, var_to_assign): + var_to_assign[result[0]] = result[1] + itr = range(len(g_list)) + len_itr = len(g_list) + parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, + init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', + n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) + + else: # @todo + pass + + return kernel_list + + + def _wrapper_kernel_list_do(self, itr): + return itr, self._kernel_do(G_g1, G_g_list[itr], self._weight) + + + def _compute_single_kernel_series(self, g1, g2): + self._check_edge_weight([g1] + [g2], self._verbose) + self._check_graphs([g1] + [g2]) + + lmda = self._weight + + # Reindex nodes using consecutive integers for the convenience of kernel computation. + g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') + g2 = nx.convert_node_labels_to_integers(g2, first_label=0, label_attribute='label_orignal') + + if self._p is None and self._q is None: # p and q are uniform distributions as default. + kernel = self.__kernel_do(g1, g2, lmda) + + else: # @todo + pass + + return kernel + + + def __kernel_do(self, g1, g2, lmda): + + # Frist, compute kernels between all pairs of nodes using the method borrowed + # from FCSP. It is faster than directly computing all edge kernels + # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the + # graphs compared, which is the most case we went though. For very + # sparse graphs, this would be slow. + vk_dict = self._compute_vertex_kernels(g1, g2) + + # Compute the weight matrix of the direct product graph. + w_times, w_dim = self._compute_weight_matrix(g1, g2, vk_dict) + # use uniform distribution if there is no prior knowledge. 
+		p_times_uni = 1 / w_dim
+		A = identity(w_times.shape[0]) - w_times * lmda
+		b = np.full((w_dim, 1), p_times_uni)
+		x, _ = cg(A, b)
+		# use uniform distribution if there is no prior knowledge.
+		q_times = np.full((1, w_dim), p_times_uni)
+		return np.dot(q_times, x)
+
+
+	def _wrapper_kernel_do(self, itr):
+		i = itr[0]
+		j = itr[1]
+		return i, j, self.__kernel_do(G_gn[i], G_gn[j], self._weight)
+
+
+	def _func_fp(self, x, p_times, lmda, w_times):
+		# One fixed-point update: x <- p + lmda * W * x.
+		return p_times + lmda * np.dot(w_times, x)
+
+
+	def _compute_vertex_kernels(self, g1, g2):
+		"""Compute vertex kernels between vertices of two graphs.
+		"""
+		return compute_vertex_kernels(g1, g2, self._node_kernels, node_labels=self._node_labels, node_attrs=self._node_attrs)
+
+
+	# @todo: move if out to make it faster.
+	# @todo: node/edge kernels use direct function rather than dicts.
+	def _compute_weight_matrix(self, g1, g2, vk_dict):
+		"""Compute the weight matrix of the direct product graph.
+		"""
+		# Define edge kernels.
+		def compute_ek_11(e1, e2, ke):
+			e1_labels = [e1[2][el] for el in self._edge_labels]
+			e2_labels = [e2[2][el] for el in self._edge_labels]
+			e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
+			e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
+			return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
+
+		def compute_ek_10(e1, e2, ke):
+			e1_labels = [e1[2][el] for el in self._edge_labels]
+			e2_labels = [e2[2][el] for el in self._edge_labels]
+			return ke(e1_labels, e2_labels)
+
+		def compute_ek_01(e1, e2, ke):
+			e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
+			e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
+			return ke(e1_attrs, e2_attrs)
+
+		def compute_ek_00(e1, e2, ke):
+			return 1
+
+		# Select the proper edge kernel.
+		if len(self._edge_labels) > 0:
+			# edge symb and non-symb labeled
+			if len(self._edge_attrs) > 0:
+				ke = self._edge_kernels['mix']
+				ek_temp = compute_ek_11
+			# edge symb labeled
+			else:
+				ke = self._edge_kernels['symb']
+				ek_temp = compute_ek_10
+		else:
+			# edge non-symb labeled
+			if len(self._edge_attrs) > 0:
+				ke = self._edge_kernels['nsymb']
+				ek_temp = compute_ek_01
+			# edge unlabeled
+			else:
+				ke = None
+				ek_temp = compute_ek_00 # @todo: check how much slower is this.
+
+		# Compute the weight matrix.
+ w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2) + w_times = np.zeros((w_dim, w_dim)) + + if vk_dict: # node labeled + if self._ds_infos['directed']: + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] * ek_temp(e1, e2, ke) * vk_dict[(e1[1], e2[1])] + else: # undirected + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] * ek_temp(e1, e2, ke) * vk_dict[(e1[1], e2[1])] + vk_dict[(e1[0], e2[1])] * ek_temp(e1, e2, ke) * vk_dict[(e1[1], e2[0])] + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + else: # node unlabeled + if self._ds_infos['directed']: + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp(e1, e2, ke) + else: # undirected + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp(e1, e2, ke) + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + + return w_times, w_dim diff --git a/gklearn/kernels/random_walk.py b/gklearn/kernels/random_walk.py index f2d0961..1bee342 100644 --- a/gklearn/kernels/random_walk.py +++ b/gklearn/kernels/random_walk.py @@ -10,85 +10,47 @@ Created on Wed Aug 19 16:55:17 2020 [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. 
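As a usage sketch for the dispatching front-end defined just below: only the constructor arguments visible in this patch are used, and the weight value is arbitrary, for illustration only.

	from gklearn.kernels import RandomWalk

	# compute_method selects the backend class: 'sylvester', 'conjugate',
	# 'fp' or 'spectral' ('kon' is not implemented and raises).
	graph_kernel = RandomWalk(compute_method='sylvester',
	                          weight=1e-3,
	                          ds_infos={'directed': False})
	# The Gram matrix is then obtained through the shared GraphKernel
	# front-end (see graph_kernel.py further below).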
""" -import sys -from tqdm import tqdm -import numpy as np -import networkx as nx -from gklearn.utils import SpecialLabel -from gklearn.utils.parallel import parallel_gm, parallel_me -from gklearn.utils.utils import direct_product_graph -from gklearn.kernels import GraphKernel +from gklearn.kernels import SylvesterEquation, ConjugateGradient, FixedPoint, SpectralDecomposition -class RandomWalk(GraphKernel): +class RandomWalk(SylvesterEquation, ConjugateGradient, FixedPoint, SpectralDecomposition): def __init__(self, **kwargs): - GraphKernel.__init__(self) self._compute_method = kwargs.get('compute_method', None) - self._weight = kwargs.get('weight', 1) - self._p = kwargs.get('p', None) - self._q = kwargs.get('q', None) - self._edge_weight = kwargs.get('edge_weight', None) - self._ds_infos = kwargs.get('ds_infos', {}) + self._compute_method = self._compute_method.lower() - self._compute_method = self.__compute_method.lower() + if self._compute_method == 'sylvester': + self._parent = SylvesterEquation + elif self._compute_method == 'conjugate': + self._parent = ConjugateGradient + elif self._compute_method == 'fp': + self._parent = FixedPoint + elif self._compute_method == 'spectral': + self._parent = SpectralDecomposition + elif self._compute_method == 'kon': + raise Exception('This computing method is not completed yet.') + else: + raise Exception('This computing method does not exist. The possible choices inlcude: "sylvester", "conjugate", "fp", "spectral".') + + self._parent.__init__(self, **kwargs) def _compute_gm_series(self): - pass + return self._parent._compute_gm_series(self) def _compute_gm_imap_unordered(self): - pass + return self._parent._compute_gm_imap_unordered(self) def _compute_kernel_list_series(self, g1, g_list): - pass + return self._parent._compute_kernel_list_series(self, g1, g_list) def _compute_kernel_list_imap_unordered(self, g1, g_list): - pass + return self._parent._compute_kernel_list_imap_unordered(self, g1, g_list) def _compute_single_kernel_series(self, g1, g2): - pass - - - def _check_graphs(self, Gn): - # remove graphs with no edges, as no walk can be found in their structures, - # so the weight matrix between such a graph and itself might be zero. - for g in Gn: - if nx.number_of_edges(g) == 0: - raise Exception('Graphs must contain edges to construct weight matrices.') - - - def _check_edge_weight(self, G0, verbose): - eweight = None - if self._edge_weight == None: - if verbose >= 2: - print('\n None edge weight is specified. Set all weight to 1.\n') - else: - try: - some_weight = list(nx.get_edge_attributes(G0, self._edge_weight).values())[0] - if isinstance(some_weight, float) or isinstance(some_weight, int): - eweight = self._edge_weight - else: - if verbose >= 2: - print('\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' % self._edge_weight) - except: - if verbose >= 2: - print('\n Edge weight with name "%s" is not found in the edge attributes. 
Set all weight to 1.\n' % self._edge_weight) - - self._edge_weight = eweight - - - def _add_dummy_labels(self, Gn): - if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY): - for i in range(len(Gn)): - nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) - self.__node_labels = [SpecialLabel.DUMMY] - if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY): - for i in range(len(Gn)): - nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) - self.__edge_labels = [SpecialLabel.DUMMY] \ No newline at end of file + return self._parent._compute_single_kernel_series(self, g1, g2) \ No newline at end of file diff --git a/gklearn/kernels/random_walk_meta.py b/gklearn/kernels/random_walk_meta.py new file mode 100644 index 0000000..f67f33e --- /dev/null +++ b/gklearn/kernels/random_walk_meta.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Aug 19 16:55:17 2020 + +@author: ljia + +@references: + + [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010. +""" + +import networkx as nx +from gklearn.utils import SpecialLabel +from gklearn.kernels import GraphKernel + + +class RandomWalkMeta(GraphKernel): + + + def __init__(self, **kwargs): + GraphKernel.__init__(self) + self._weight = kwargs.get('weight', 1) + self._p = kwargs.get('p', None) + self._q = kwargs.get('q', None) + self._edge_weight = kwargs.get('edge_weight', None) + self._ds_infos = kwargs.get('ds_infos', {}) + + + def _compute_gm_series(self): + pass + + + def _compute_gm_imap_unordered(self): + pass + + + def _compute_kernel_list_series(self, g1, g_list): + pass + + + def _compute_kernel_list_imap_unordered(self, g1, g_list): + pass + + + def _compute_single_kernel_series(self, g1, g2): + pass + + + def _check_graphs(self, Gn): + # remove graphs with no edges, as no walk can be found in their structures, + # so the weight matrix between such a graph and itself might be zero. + for g in Gn: + if nx.number_of_edges(g) == 0: + raise Exception('Graphs must contain edges to construct weight matrices.') + + + def _check_edge_weight(self, G0, verbose): + eweight = None + if self._edge_weight is None: + if verbose >= 2: + print('\n None edge weight is specified. Set all weight to 1.\n') + else: + try: + some_weight = list(nx.get_edge_attributes(G0, self._edge_weight).values())[0] + if isinstance(some_weight, float) or isinstance(some_weight, int): + eweight = self._edge_weight + else: + if verbose >= 2: + print('\n Edge weight with name %s is not float or integer. Set all weight to 1.\n' % self._edge_weight) + except: + if verbose >= 2: + print('\n Edge weight with name "%s" is not found in the edge attributes. 
Set all weight to 1.\n' % self._edge_weight) + + self._edge_weight = eweight + + + def _add_dummy_labels(self, Gn): + if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY): + for i in range(len(Gn)): + nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) + self.__node_labels = [SpecialLabel.DUMMY] + if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY): + for i in range(len(Gn)): + nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) + self.__edge_labels = [SpecialLabel.DUMMY] \ No newline at end of file From 7237aa846c87885662f80de2935db66dd0de7ecd Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 15 Oct 2020 16:11:59 +0200 Subject: [PATCH 11/13] Update random walk kernels. --- gklearn/kernels/fixed_point.py | 360 ++++++++++++++++++------------ gklearn/kernels/spectral_decomposition.py | 46 ++-- gklearn/kernels/sylvester_equation.py | 60 ++--- 3 files changed, 271 insertions(+), 195 deletions(-) diff --git a/gklearn/kernels/fixed_point.py b/gklearn/kernels/fixed_point.py index 4eeeb8a..b4193e8 100644 --- a/gklearn/kernels/fixed_point.py +++ b/gklearn/kernels/fixed_point.py @@ -14,61 +14,56 @@ import sys from tqdm import tqdm import numpy as np import networkx as nx -from control import dlyap +from scipy import optimize from gklearn.utils.parallel import parallel_gm, parallel_me -from gklearn.kernels import RandomWalk +from gklearn.kernels import RandomWalkMeta +from gklearn.utils.utils import compute_vertex_kernels -class FixedPoint(RandomWalk): + +class FixedPoint(RandomWalkMeta): def __init__(self, **kwargs): - RandomWalk.__init__(self, **kwargs) + super().__init__(**kwargs) + self._node_kernels = kwargs.get('node_kernels', None) + self._edge_kernels = kwargs.get('edge_kernels', None) + self._node_labels = kwargs.get('node_labels', []) + self._edge_labels = kwargs.get('edge_labels', []) + self._node_attrs = kwargs.get('node_attrs', []) + self._edge_attrs = kwargs.get('edge_attrs', []) def _compute_gm_series(self): - self._check_edge_weight(self._graphs) + self._check_edge_weight(self._graphs, self._verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: - import warnings - warnings.warn('All labels are ignored.') lmda = self._weight - # compute Gram matrix. + # Compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - if self._q == None: - # don't normalize adjacency matrices if q is a uniform vector. Note - # A_wave_list actually contains the transposes of the adjacency matrices. + # Reindex nodes using consecutive integers for the convenience of kernel computation. + if self._verbose >= 2: + iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout) + else: + iterator = self._graphs + self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] + + if self._p is None and self._q is None: # p and q are uniform distributions as default. 
+ + from itertools import combinations_with_replacement + itr = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout) + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) else: - iterator = self._graphs - A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] - # # normalized adjacency matrices - # A_wave_list = [] - # for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout): - # A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose() - # norm = A_tilde.sum(axis=0) - # norm[norm == 0] = 1 - # A_wave_list.append(A_tilde / norm) - - if self._p == None: # p is uniform distribution as default. - from itertools import combinations_with_replacement - itr = combinations_with_replacement(range(0, len(self._graphs)), 2) - if self._verbose >= 2: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) - else: - iterator = itr - - for i, j in iterator: - kernel = self.__kernel_do(A_wave_list[i], A_wave_list[j], lmda) - gram_matrix[i][j] = kernel - gram_matrix[j][i] = kernel - - else: # @todo - pass + iterator = itr + + for i, j in iterator: + kernel = self.__kernel_do(self._graphs[i], self._graphs[j], lmda) + gram_matrix[i][j] = kernel + gram_matrix[j][i] = kernel + else: # @todo pass @@ -76,36 +71,31 @@ class FixedPoint(RandomWalk): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs) + self._check_edge_weight(self._graphs, self._verbose) self._check_graphs(self._graphs) - if self._verbose >= 2: - import warnings - warnings.warn('All labels are ignored.') - # compute Gram matrix. - gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) + # Compute Gram matrix. + gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - if self._q == None: - # don't normalize adjacency matrices if q is a uniform vector. Note - # A_wave_list actually contains the transposes of the adjacency matrices. - if self._verbose >= 2: - iterator = tqdm(self._graphs, desc='compute adjacency matrices', file=sys.stdout) - else: - iterator = self._graphs - A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? - - if self._p == None: # p is uniform distribution as default. - def init_worker(A_wave_list_toshare): - global G_A_wave_list - G_A_wave_list = A_wave_list_toshare - - do_fun = self._wrapper_kernel_do - - parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, - glbv=(A_wave_list,), n_jobs=self._n_jobs, verbose=self._verbose) - - else: # @todo - pass + # @todo: parallel this. + # Reindex nodes using consecutive integers for the convenience of kernel computation. + if self._verbose >= 2: + iterator = tqdm(self._graphs, desc='Reindex vertices', file=sys.stdout) + else: + iterator = self._graphs + self._graphs = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] + + if self._p is None and self._q is None: # p and q are uniform distributions as default. 
+ + def init_worker(gn_toshare): + global G_gn + G_gn = gn_toshare + + do_fun = self._wrapper_kernel_do + + parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, + glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) + else: # @todo pass @@ -113,39 +103,33 @@ class FixedPoint(RandomWalk): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1]) + self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: - import warnings - warnings.warn('All labels are ignored.') lmda = self._weight # compute kernel list. kernel_list = [None] * len(g_list) + + # Reindex nodes using consecutive integers for the convenience of kernel computation. + g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') + if self._verbose >= 2: + iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout) + else: + iterator = g_list + g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] - if self._q == None: - # don't normalize adjacency matrices if q is a uniform vector. Note - # A_wave_list actually contains the transposes of the adjacency matrices. - A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() + if self._p is None and self._q is None: # p and q are uniform distributions as default. + if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='compute adjacency matrices', file=sys.stdout) + iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) else: iterator = range(len(g_list)) - A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] - - if self._p == None: # p is uniform distribution as default. - if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) - else: - iterator = range(len(g_list)) - - for i in iterator: - kernel = self.__kernel_do(A_wave_1, A_wave_list[i], lmda) - kernel_list[i] = kernel - - else: # @todo - pass + + for i in iterator: + kernel = self.__kernel_do(g1, g_list[i], lmda) + kernel_list[i] = kernel + else: # @todo pass @@ -153,43 +137,38 @@ class FixedPoint(RandomWalk): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1]) + self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) - if self._verbose >= 2: - import warnings - warnings.warn('All labels are ignored.') # compute kernel list. kernel_list = [None] * len(g_list) - if self._q == None: - # don't normalize adjacency matrices if q is a uniform vector. Note - # A_wave_list actually contains the transposes of the adjacency matrices. - A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() - if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='compute adjacency matrices', file=sys.stdout) - else: - iterator = range(len(g_list)) - A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? + # Reindex nodes using consecutive integers for the convenience of kernel computation. + g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') + # @todo: parallel this. 
+ if self._verbose >= 2: + iterator = tqdm(g_list, desc='Reindex vertices', file=sys.stdout) + else: + iterator = g_list + g_list = [nx.convert_node_labels_to_integers(g, first_label=0, label_attribute='label_orignal') for g in iterator] + + if self._p is None and self._q is None: # p and q are uniform distributions as default. - if self._p == None: # p is uniform distribution as default. - def init_worker(A_wave_1_toshare, A_wave_list_toshare): - global G_A_wave_1, G_A_wave_list - G_A_wave_1 = A_wave_1_toshare - G_A_wave_list = A_wave_list_toshare + def init_worker(g1_toshare, g_list_toshare): + global G_g1, G_g_list + G_g1 = g1_toshare + G_g_list = g_list_toshare - do_fun = self._wrapper_kernel_list_do - - def func_assign(result, var_to_assign): - var_to_assign[result[0]] = result[1] - itr = range(len(g_list)) - len_itr = len(g_list) - parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + do_fun = self._wrapper_kernel_list_do + + def func_assign(result, var_to_assign): + var_to_assign[result[0]] = result[1] + itr = range(len(g_list)) + len_itr = len(g_list) + parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, + init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', + n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) - else: # @todo - pass else: # @todo pass @@ -197,49 +176,146 @@ class FixedPoint(RandomWalk): def _wrapper_kernel_list_do(self, itr): - return itr, self._kernel_do(G_A_wave_1, G_A_wave_list[itr], self._weight) + return itr, self._kernel_do(G_g1, G_g_list[itr], self._weight) def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2]) + self._check_edge_weight([g1] + [g2], self._verbose) self._check_graphs([g1] + [g2]) - if self._verbose >= 2: - import warnings - warnings.warn('All labels are ignored.') lmda = self._weight - if self._q == None: - # don't normalize adjacency matrices if q is a uniform vector. Note - # A_wave_list actually contains the transposes of the adjacency matrices. - A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() - A_wave_2 = nx.adjacency_matrix(g2, self._edge_weight).todense().transpose() - if self._p == None: # p is uniform distribution as default. - kernel = self.__kernel_do(A_wave_1, A_wave_2, lmda) - else: # @todo - pass + # Reindex nodes using consecutive integers for the convenience of kernel computation. + g1 = nx.convert_node_labels_to_integers(g1, first_label=0, label_attribute='label_orignal') + g2 = nx.convert_node_labels_to_integers(g2, first_label=0, label_attribute='label_orignal') + + if self._p is None and self._q is None: # p and q are uniform distributions as default. + kernel = self.__kernel_do(g1, g2, lmda) + else: # @todo pass return kernel - def __kernel_do(self, A_wave1, A_wave2, lmda): + def __kernel_do(self, g1, g2, lmda): - S = lmda * A_wave2 - T_t = A_wave1 + # Frist, compute kernels between all pairs of nodes using the method borrowed + # from FCSP. It is faster than directly computing all edge kernels + # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the + # graphs compared, which is the most case we went though. For very + # sparse graphs, this would be slow. + vk_dict = self._compute_vertex_kernels(g1, g2) + + # Compute the weight matrix of the direct product graph. 
+		w_times, w_dim = self._compute_weight_matrix(g1, g2, vk_dict)
 		# use uniform distribution if there is no prior knowledge.
-		nb_pd = len(A_wave1) * len(A_wave2)
-		p_times_uni = 1 / nb_pd
-		M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
-		X = dlyap(S, T_t, M0)
-		X = np.reshape(X, (-1, 1), order='F')
+		p_times_uni = 1 / w_dim
+		p_times = np.full((w_dim, 1), p_times_uni)
+		x = optimize.fixed_point(self._func_fp, p_times, args=(p_times, lmda, w_times), xtol=1e-06, maxiter=1000)
 		# use uniform distribution if there is no prior knowledge.
-		q_times = np.full((1, nb_pd), p_times_uni)
-		return np.dot(q_times, X)
+		q_times = np.full((1, w_dim), p_times_uni)
+		return np.dot(q_times, x)
 
 
 	def _wrapper_kernel_do(self, itr):
 		i = itr[0]
 		j = itr[1]
-		return i, j, self.__kernel_do(G_A_wave_list[i], G_A_wave_list[j], self._weight)
\ No newline at end of file
+		return i, j, self.__kernel_do(G_gn[i], G_gn[j], self._weight)
+
+
+	def _func_fp(self, x, p_times, lmda, w_times):
+		# One fixed-point update: x <- p + lmda * W * x.
+		return p_times + lmda * np.dot(w_times, x)
+
+
+	def _compute_vertex_kernels(self, g1, g2):
+		"""Compute vertex kernels between vertices of two graphs.
+		"""
+		return compute_vertex_kernels(g1, g2, self._node_kernels, node_labels=self._node_labels, node_attrs=self._node_attrs)
+
+
+	# @todo: move if out to make it faster.
+	# @todo: node/edge kernels use direct function rather than dicts.
+	def _compute_weight_matrix(self, g1, g2, vk_dict):
+		"""Compute the weight matrix of the direct product graph.
+		"""
+		# Define edge kernels.
+		def compute_ek_11(e1, e2, ke):
+			e1_labels = [e1[2][el] for el in self._edge_labels]
+			e2_labels = [e2[2][el] for el in self._edge_labels]
+			e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
+			e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
+			return ke(e1_labels, e2_labels, e1_attrs, e2_attrs)
+
+		def compute_ek_10(e1, e2, ke):
+			e1_labels = [e1[2][el] for el in self._edge_labels]
+			e2_labels = [e2[2][el] for el in self._edge_labels]
+			return ke(e1_labels, e2_labels)
+
+		def compute_ek_01(e1, e2, ke):
+			e1_attrs = [e1[2][ea] for ea in self._edge_attrs]
+			e2_attrs = [e2[2][ea] for ea in self._edge_attrs]
+			return ke(e1_attrs, e2_attrs)
+
+		def compute_ek_00(e1, e2, ke):
+			return 1
+
+		# Select the proper edge kernel.
+		if len(self._edge_labels) > 0:
+			# edge symb and non-symb labeled
+			if len(self._edge_attrs) > 0:
+				ke = self._edge_kernels['mix']
+				ek_temp = compute_ek_11
+			# edge symb labeled
+			else:
+				ke = self._edge_kernels['symb']
+				ek_temp = compute_ek_10
+		else:
+			# edge non-symb labeled
+			if len(self._edge_attrs) > 0:
+				ke = self._edge_kernels['nsymb']
+				ek_temp = compute_ek_01
+			# edge unlabeled
+			else:
+				ke = None
+				ek_temp = compute_ek_00 # @todo: check how much slower is this.
+
+		# Compute the weight matrix.
+ w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2) + w_times = np.zeros((w_dim, w_dim)) + + if vk_dict: # node labeled + if self._ds_infos['directed']: + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] * ek_temp(e1, e2, ke) * vk_dict[(e1[1], e2[1])] + else: # undirected + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = vk_dict[(e1[0], e2[0])] * ek_temp(e1, e2, ke) * vk_dict[(e1[1], e2[1])] + vk_dict[(e1[0], e2[1])] * ek_temp(e1, e2, ke) * vk_dict[(e1[1], e2[0])] + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + else: # node unlabeled + if self._ds_infos['directed']: + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp(e1, e2, ke) + else: # undirected + for e1 in g1.edges(data=True): + for e2 in g2.edges(data=True): + w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0], e1[1] * nx.number_of_nodes(g2) + e2[1]) + w_times[w_idx] = ek_temp(e1, e2, ke) + w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]] + w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1], e1[1] * nx.number_of_nodes(g2) + e2[0]) + w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]] + w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]] + + return w_times, w_dim diff --git a/gklearn/kernels/spectral_decomposition.py b/gklearn/kernels/spectral_decomposition.py index 5509ee6..7efc005 100644 --- a/gklearn/kernels/spectral_decomposition.py +++ b/gklearn/kernels/spectral_decomposition.py @@ -16,19 +16,19 @@ import numpy as np import networkx as nx from scipy.sparse import kron from gklearn.utils.parallel import parallel_gm, parallel_me -from gklearn.kernels import RandomWalk +from gklearn.kernels import RandomWalkMeta -class SpectralDecomposition(RandomWalk): +class SpectralDecomposition(RandomWalkMeta): def __init__(self, **kwargs): - RandomWalk.__init__(self, **kwargs) + super().__init__(**kwargs) self._sub_kernel = kwargs.get('sub_kernel', None) def _compute_gm_series(self): - self._check_edge_weight(self._graphs) + self._check_edge_weight(self._graphs, self._verbose) self._check_graphs(self._graphs) if self._verbose >= 2: import warnings @@ -37,7 +37,7 @@ class SpectralDecomposition(RandomWalk): # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - if self._q == None: + if self._q is None: # precompute the spectral decomposition of each graph. P_list = [] D_list = [] @@ -54,14 +54,14 @@ class SpectralDecomposition(RandomWalk): P_list.append(ev) # P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs? - if self._p == None: # p is uniform distribution as default. + if self._p is None: # p is uniform distribution as default. 
q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in self._graphs] # q_T_list = [q.T for q in q_list] from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) else: iterator = itr @@ -79,7 +79,7 @@ class SpectralDecomposition(RandomWalk): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs) + self._check_edge_weight(self._graphs, self._verbose) self._check_graphs(self._graphs) if self._verbose >= 2: import warnings @@ -88,7 +88,7 @@ class SpectralDecomposition(RandomWalk): # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - if self._q == None: + if self._q is None: # precompute the spectral decomposition of each graph. P_list = [] D_list = [] @@ -104,7 +104,7 @@ class SpectralDecomposition(RandomWalk): D_list.append(ew) P_list.append(ev) # @todo: parallel? - if self._p == None: # p is uniform distribution as default. + if self._p is None: # p is uniform distribution as default. q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in self._graphs] # @todo: parallel? def init_worker(q_T_list_toshare, P_list_toshare, D_list_toshare): @@ -126,7 +126,7 @@ class SpectralDecomposition(RandomWalk): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1]) + self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) if self._verbose >= 2: import warnings @@ -135,16 +135,16 @@ class SpectralDecomposition(RandomWalk): # compute kernel list. kernel_list = [None] * len(g_list) - if self._q == None: + if self._q is None: # precompute the spectral decomposition of each graph. A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() D1, P1 = np.linalg.eig(A1) P_list = [] D_list = [] if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='spectral decompose', file=sys.stdout) + iterator = tqdm(g_list, desc='spectral decompose', file=sys.stdout) else: - iterator = range(len(g_list)) + iterator = g_list for G in iterator: # don't normalize adjacency matrices if q is a uniform vector. Note # A actually is the transpose of the adjacency matrix. @@ -153,11 +153,11 @@ class SpectralDecomposition(RandomWalk): D_list.append(ew) P_list.append(ev) - if self._p == None: # p is uniform distribution as default. + if self._p is None: # p is uniform distribution as default. q_T1 = 1 / nx.number_of_nodes(g1) q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) else: iterator = range(len(g_list)) @@ -174,7 +174,7 @@ class SpectralDecomposition(RandomWalk): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1]) + self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) if self._verbose >= 2: import warnings @@ -183,7 +183,7 @@ class SpectralDecomposition(RandomWalk): # compute kernel list. kernel_list = [None] * len(g_list) - if self._q == None: + if self._q is None: # precompute the spectral decomposition of each graph. 
A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() D1, P1 = np.linalg.eig(A1) @@ -201,7 +201,7 @@ class SpectralDecomposition(RandomWalk): D_list.append(ew) P_list.append(ev) # @todo: parallel? - if self._p == None: # p is uniform distribution as default. + if self._p is None: # p is uniform distribution as default. q_T1 = 1 / nx.number_of_nodes(g1) q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in g_list] # @todo: parallel? @@ -221,7 +221,7 @@ class SpectralDecomposition(RandomWalk): itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(q_T1, P1, D1, q_T_list, P_list, D_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) else: # @todo pass @@ -236,20 +236,20 @@ class SpectralDecomposition(RandomWalk): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2]) + self._check_edge_weight([g1] + [g2], self._verbose) self._check_graphs([g1] + [g2]) if self._verbose >= 2: import warnings warnings.warn('All labels are ignored. Only works for undirected graphs.') - if self._q == None: + if self._q is None: # precompute the spectral decomposition of each graph. A1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() D1, P1 = np.linalg.eig(A1) A2 = nx.adjacency_matrix(g2, self._edge_weight).todense().transpose() D2, P2 = np.linalg.eig(A2) - if self._p == None: # p is uniform distribution as default. + if self._p is None: # p is uniform distribution as default. q_T1 = 1 / nx.number_of_nodes(g1) q_T2 = 1 / nx.number_of_nodes(g2) kernel = self.__kernel_do(q_T1, q_T2, P1, P2, D1, D2, self._weight, self._sub_kernel) diff --git a/gklearn/kernels/sylvester_equation.py b/gklearn/kernels/sylvester_equation.py index 3879b59..bf9cb2d 100644 --- a/gklearn/kernels/sylvester_equation.py +++ b/gklearn/kernels/sylvester_equation.py @@ -16,18 +16,18 @@ import numpy as np import networkx as nx from control import dlyap from gklearn.utils.parallel import parallel_gm, parallel_me -from gklearn.kernels import RandomWalk +from gklearn.kernels import RandomWalkMeta -class SylvesterEquation(RandomWalk): +class SylvesterEquation(RandomWalkMeta): def __init__(self, **kwargs): - RandomWalk.__init__(self, **kwargs) + super().__init__(**kwargs) def _compute_gm_series(self): - self._check_edge_weight(self._graphs) + self._check_edge_weight(self._graphs, self._verbose) self._check_graphs(self._graphs) if self._verbose >= 2: import warnings @@ -38,7 +38,7 @@ class SylvesterEquation(RandomWalk): # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - if self._q == None: + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. if self._verbose >= 2: @@ -54,16 +54,16 @@ class SylvesterEquation(RandomWalk): # norm[norm == 0] = 1 # A_wave_list.append(A_tilde / norm) - if self._p == None: # p is uniform distribution as default. + if self._p is None: # p is uniform distribution as default. 
from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) else: iterator = itr for i, j in iterator: - kernel = self.__kernel_do(A_wave_list[i], A_wave_list[j], lmda) + kernel = self._kernel_do(A_wave_list[i], A_wave_list[j], lmda) gram_matrix[i][j] = kernel gram_matrix[j][i] = kernel @@ -76,7 +76,7 @@ class SylvesterEquation(RandomWalk): def _compute_gm_imap_unordered(self): - self._check_edge_weight(self._graphs) + self._check_edge_weight(self._graphs, self._verbose) self._check_graphs(self._graphs) if self._verbose >= 2: import warnings @@ -85,7 +85,7 @@ class SylvesterEquation(RandomWalk): # compute Gram matrix. gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) - if self._q == None: + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. if self._verbose >= 2: @@ -94,7 +94,7 @@ class SylvesterEquation(RandomWalk): iterator = self._graphs A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? - if self._p == None: # p is uniform distribution as default. + if self._p is None: # p is uniform distribution as default. def init_worker(A_wave_list_toshare): global G_A_wave_list G_A_wave_list = A_wave_list_toshare @@ -113,7 +113,7 @@ class SylvesterEquation(RandomWalk): def _compute_kernel_list_series(self, g1, g_list): - self._check_edge_weight(g_list + [g1]) + self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) if self._verbose >= 2: import warnings @@ -124,24 +124,24 @@ class SylvesterEquation(RandomWalk): # compute kernel list. kernel_list = [None] * len(g_list) - if self._q == None: + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='compute adjacency matrices', file=sys.stdout) + iterator = tqdm(g_list, desc='compute adjacency matrices', file=sys.stdout) else: - iterator = range(len(g_list)) + iterator = g_list A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] - if self._p == None: # p is uniform distribution as default. + if self._p is None: # p is uniform distribution as default. if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) else: iterator = range(len(g_list)) for i in iterator: - kernel = self.__kernel_do(A_wave_1, A_wave_list[i], lmda) + kernel = self._kernel_do(A_wave_1, A_wave_list[i], lmda) kernel_list[i] = kernel else: # @todo @@ -153,7 +153,7 @@ class SylvesterEquation(RandomWalk): def _compute_kernel_list_imap_unordered(self, g1, g_list): - self._check_edge_weight(g_list + [g1]) + self._check_edge_weight(g_list + [g1], self._verbose) self._check_graphs(g_list + [g1]) if self._verbose >= 2: import warnings @@ -162,17 +162,17 @@ class SylvesterEquation(RandomWalk): # compute kernel list. 
kernel_list = [None] * len(g_list) - if self._q == None: + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='compute adjacency matrices', file=sys.stdout) + iterator = tqdm(g_list, desc='compute adjacency matrices', file=sys.stdout) else: - iterator = range(len(g_list)) + iterator = g_list A_wave_list = [nx.adjacency_matrix(G, self._edge_weight).todense().transpose() for G in iterator] # @todo: parallel? - if self._p == None: # p is uniform distribution as default. + if self._p is None: # p is uniform distribution as default. def init_worker(A_wave_1_toshare, A_wave_list_toshare): global G_A_wave_1, G_A_wave_list G_A_wave_1 = A_wave_1_toshare @@ -186,7 +186,7 @@ class SylvesterEquation(RandomWalk): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(A_wave_1, A_wave_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) else: # @todo pass @@ -201,7 +201,7 @@ class SylvesterEquation(RandomWalk): def _compute_single_kernel_series(self, g1, g2): - self._check_edge_weight([g1] + [g2]) + self._check_edge_weight([g1] + [g2], self._verbose) self._check_graphs([g1] + [g2]) if self._verbose >= 2: import warnings @@ -209,13 +209,13 @@ class SylvesterEquation(RandomWalk): lmda = self._weight - if self._q == None: + if self._q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. A_wave_1 = nx.adjacency_matrix(g1, self._edge_weight).todense().transpose() A_wave_2 = nx.adjacency_matrix(g2, self._edge_weight).todense().transpose() - if self._p == None: # p is uniform distribution as default. - kernel = self.__kernel_do(A_wave_1, A_wave_2, lmda) + if self._p is None: # p is uniform distribution as default. + kernel = self._kernel_do(A_wave_1, A_wave_2, lmda) else: # @todo pass else: # @todo @@ -224,7 +224,7 @@ class SylvesterEquation(RandomWalk): return kernel - def __kernel_do(self, A_wave1, A_wave2, lmda): + def _kernel_do(self, A_wave1, A_wave2, lmda): S = lmda * A_wave2 T_t = A_wave1 @@ -242,4 +242,4 @@ class SylvesterEquation(RandomWalk): def _wrapper_kernel_do(self, itr): i = itr[0] j = itr[1] - return i, j, self.__kernel_do(G_A_wave_list[i], G_A_wave_list[j], self._weight) \ No newline at end of file + return i, j, self._kernel_do(G_A_wave_list[i], G_A_wave_list[j], self._weight) \ No newline at end of file From 320964dd169b6b7155e42824bca8c99aa6147983 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 15 Oct 2020 16:13:10 +0200 Subject: [PATCH 12/13] Update comments, minor bugs for graph kernels. 
--- gklearn/kernels/commonWalkKernel.py | 24 ++++----- gklearn/kernels/common_walk.py | 14 ++--- gklearn/kernels/graph_kernel.py | 4 +- gklearn/kernels/marginalized.py | 22 ++++---- gklearn/kernels/marginalizedKernel.py | 32 ++++++------ gklearn/kernels/path_up_to_h.py | 16 +++--- gklearn/kernels/randomWalkKernel.py | 86 +++++++++++++++---------------- gklearn/kernels/shortest_path.py | 6 +-- gklearn/kernels/spKernel.py | 10 ++-- gklearn/kernels/structural_sp.py | 45 +++------------- gklearn/kernels/structuralspKernel.py | 26 +++++----- gklearn/kernels/treelet.py | 16 +++--- gklearn/kernels/treeletKernel.py | 16 +++--- gklearn/kernels/untilHPathKernel.py | 34 ++++++------ gklearn/kernels/weisfeilerLehmanKernel.py | 34 ++++++------ gklearn/kernels/weisfeiler_lehman.py | 26 +++++----- gklearn/utils/parallel.py | 2 +- 17 files changed, 190 insertions(+), 223 deletions(-) diff --git a/gklearn/kernels/commonWalkKernel.py b/gklearn/kernels/commonWalkKernel.py index a5f9cb1..fb6bd10 100644 --- a/gklearn/kernels/commonWalkKernel.py +++ b/gklearn/kernels/commonWalkKernel.py @@ -30,15 +30,15 @@ def commonwalkkernel(*args, n_jobs=None, chunksize=None, verbose=True): - """Calculate common walk graph kernels between graphs. + """Compute common walk graph kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. + Two graphs between which the kernel is computed. node_label : string Node attribute used as symbolic label. The default node label is 'atom'. edge_label : string @@ -133,7 +133,7 @@ def commonwalkkernel(*args, # # for i, j, kernel in tqdm( # pool.imap_unordered(do_partial, itr, chunksize), -# desc='calculating kernels', +# desc='computing kernels', # file=sys.stdout): # Kmatrix[i][j] = kernel # Kmatrix[j][i] = kernel @@ -145,14 +145,14 @@ def commonwalkkernel(*args, # # direct product graph method - exponential # itr = combinations_with_replacement(range(0, len(Gn)), 2) # if compute_method == 'exp': -# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): +# for i, j in tqdm(itr, desc='Computing kernels', file=sys.stdout): # Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label, # edge_label, weight) # Kmatrix[j][i] = Kmatrix[i][j] # # # direct product graph method - geometric # elif compute_method == 'geo': -# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): +# for i, j in tqdm(itr, desc='Computing kernels', file=sys.stdout): # Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label, # edge_label, weight) # Kmatrix[j][i] = Kmatrix[i][j] @@ -161,7 +161,7 @@ def commonwalkkernel(*args, # # search all paths use brute force. # elif compute_method == 'brute': # n = int(n) -# # get all paths of all graphs before calculating kernels to save time, but this may cost a lot of memory for large dataset. +# # get all paths of all graphs before computing kernels to save time, but this may cost a lot of memory for large dataset. # all_walks = [ # find_all_walks_until_length(Gn[i], n, node_label, edge_label) # for i in range(0, len(Gn)) @@ -185,13 +185,13 @@ def commonwalkkernel(*args, def _commonwalkkernel_exp(g1, g2, node_label, edge_label, beta): - """Calculate walk graph kernels up to n between 2 graphs using exponential + """Compute walk graph kernels up to n between 2 graphs using exponential series. 
Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. node_label : string Node attribute used as label. edge_label : string @@ -259,13 +259,13 @@ def wrapper_cw_exp(node_label, edge_label, beta, itr): def _commonwalkkernel_geo(g1, g2, node_label, edge_label, gamma): - """Calculate common walk graph kernels up to n between 2 graphs using + """Compute common walk graph kernels up to n between 2 graphs using geometric series. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. node_label : string Node attribute used as label. edge_label : string @@ -304,7 +304,7 @@ def _commonwalkkernel_brute(walks1, node_label='atom', edge_label='bond_type', labeled=True): - """Calculate walk graph kernels up to n between 2 graphs. + """Compute walk graph kernels up to n between 2 graphs. Parameters ---------- diff --git a/gklearn/kernels/common_walk.py b/gklearn/kernels/common_walk.py index 0aeb3ee..6372200 100644 --- a/gklearn/kernels/common_walk.py +++ b/gklearn/kernels/common_walk.py @@ -46,7 +46,7 @@ class CommonWalk(GraphKernel): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) else: iterator = itr @@ -102,7 +102,7 @@ class CommonWalk(GraphKernel): # compute kernel list. kernel_list = [None] * len(g_list) if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) else: iterator = range(len(g_list)) @@ -148,7 +148,7 @@ class CommonWalk(GraphKernel): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=_init_worker_list, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) return kernel_list @@ -179,13 +179,13 @@ class CommonWalk(GraphKernel): def __kernel_do_exp(self, g1, g2, beta): - """Calculate common walk graph kernel between 2 graphs using exponential + """Compute common walk graph kernel between 2 graphs using exponential series. Parameters ---------- g1, g2 : NetworkX graphs - Graphs between which the kernels are calculated. + Graphs between which the kernels are computed. beta : integer Weight. @@ -231,13 +231,13 @@ class CommonWalk(GraphKernel): def __kernel_do_geo(self, g1, g2, gamma): - """Calculate common walk graph kernel between 2 graphs using geometric + """Compute common walk graph kernel between 2 graphs using geometric series. Parameters ---------- g1, g2 : NetworkX graphs - Graphs between which the kernels are calculated. + Graphs between which the kernels are computed. gamma : integer Weight. 
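For intuition, the geometric series summed by __kernel_do_geo has a closed form on the direct product graph; a small self-contained sketch with a toy adjacency matrix (not gklearn code):

	import numpy as np

	# W: adjacency matrix of a toy direct product graph. The geometric
	# common-walk value sum_n gamma^n * (1^T W^n 1) equals
	# 1^T (I - gamma * W)^{-1} 1 whenever gamma < 1 / rho(W).
	W = np.array([[0., 1., 0.],
	              [1., 0., 1.],
	              [0., 1., 0.]])
	gamma = 0.2
	ones = np.ones(W.shape[0])
	k = ones @ np.linalg.solve(np.eye(W.shape[0]) - gamma * W, ones)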
diff --git a/gklearn/kernels/graph_kernel.py b/gklearn/kernels/graph_kernel.py index 7c6afde..a8dbd32 100644 --- a/gklearn/kernels/graph_kernel.py +++ b/gklearn/kernels/graph_kernel.py @@ -104,7 +104,7 @@ class GraphKernel(object): if self._parallel == 'imap_unordered': gram_matrix = self._compute_gm_imap_unordered() - elif self._parallel == None: + elif self._parallel is None: gram_matrix = self._compute_gm_series() else: raise Exception('Parallel mode is not set correctly.') @@ -130,7 +130,7 @@ class GraphKernel(object): if self._parallel == 'imap_unordered': kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) - elif self._parallel == None: + elif self._parallel is None: kernel_list = self._compute_kernel_list_series(g1, g_list) else: raise Exception('Parallel mode is not set correctly.') diff --git a/gklearn/kernels/marginalized.py b/gklearn/kernels/marginalized.py index 6910468..499d51b 100644 --- a/gklearn/kernels/marginalized.py +++ b/gklearn/kernels/marginalized.py @@ -59,7 +59,7 @@ class Marginalized(GraphKernel): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) else: iterator = itr for i, j in iterator: @@ -119,7 +119,7 @@ class Marginalized(GraphKernel): # compute kernel list. kernel_list = [None] * len(g_list) if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) else: iterator = range(len(g_list)) for i in iterator: @@ -165,7 +165,7 @@ class Marginalized(GraphKernel): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) return kernel_list @@ -184,12 +184,12 @@ class Marginalized(GraphKernel): def __kernel_do(self, g1, g2): - """Calculate marginalized graph kernel between 2 graphs. + """Compute marginalized graph kernel between 2 graphs. Parameters ---------- g1, g2 : NetworkX graphs - 2 graphs between which the kernel is calculated. + 2 graphs between which the kernel is computed. 
Return ------ @@ -212,12 +212,12 @@ class Marginalized(GraphKernel): # # matrix to save all the R_inf for all pairs of nodes # R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) # - # # calculate R_inf with a simple interative method + # # Compute R_inf with a simple interative method # for i in range(1, n_iteration): # R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2]) # R_inf_new.fill(r1) # - # # calculate R_inf for each pair of nodes + # # Compute R_inf for each pair of nodes # for node1 in g1.nodes(data=True): # neighbor_n1 = g1[node1[0]] # # the transition probability distribution in the random walks @@ -243,7 +243,7 @@ class Marginalized(GraphKernel): # neighbor2] # ref [1] equation (8) # R_inf[:] = R_inf_new # - # # add elements of R_inf up and calculate kernel + # # add elements of R_inf up and compute kernel # for node1 in g1.nodes(data=True): # for node2 in g2.nodes(data=True): # s = p_init_G1 * p_init_G2 * deltakernel( @@ -288,11 +288,11 @@ class Marginalized(GraphKernel): deltakernel(tuple(g1.nodes[neighbor1][nl] for nl in self.__node_labels), tuple(g2.nodes[neighbor2][nl] for nl in self.__node_labels)) * \ deltakernel(tuple(neighbor_n1[neighbor1][el] for el in self.__edge_labels), tuple(neighbor_n2[neighbor2][el] for el in self.__edge_labels)) - # calculate R_inf with a simple interative method + # Compute R_inf with a simple interative method for i in range(2, self.__n_iteration + 1): R_inf_old = R_inf.copy() - # calculate R_inf for each pair of nodes + # Compute R_inf for each pair of nodes for node1 in g1.nodes(): neighbor_n1 = g1[node1] # the transition probability distribution in the random walks @@ -309,7 +309,7 @@ class Marginalized(GraphKernel): (t_dict[(node1, node2, neighbor1, neighbor2)] * \ R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) - # add elements of R_inf up and calculate kernel + # add elements of R_inf up and compute kernel. for (n1, n2), value in R_inf.items(): s = p_init_G1 * p_init_G2 * deltakernel(tuple(g1.nodes[n1][nl] for nl in self.__node_labels), tuple(g2.nodes[n2][nl] for nl in self.__node_labels)) kernel += s * value # ref [1] equation (6) diff --git a/gklearn/kernels/marginalizedKernel.py b/gklearn/kernels/marginalizedKernel.py index 950f1a6..b6d7fb0 100644 --- a/gklearn/kernels/marginalizedKernel.py +++ b/gklearn/kernels/marginalizedKernel.py @@ -39,15 +39,15 @@ def marginalizedkernel(*args, n_jobs=None, chunksize=None, verbose=True): - """Calculate marginalized graph kernels between graphs. + """Compute marginalized graph kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. + Two graphs between which the kernel is computed. node_label : string Node attribute used as symbolic label. The default node label is 'atom'. @@ -59,7 +59,7 @@ def marginalizedkernel(*args, The termination probability in the random walks generating step. n_iteration : integer - Time of iterations to calculate R_inf. + Time of iterations to compute R_inf. remove_totters : boolean Whether to remove totterings by method introduced in [2]. 
The default @@ -83,11 +83,11 @@ def marginalizedkernel(*args, Gn, attr_names=['node_labeled', 'edge_labeled', 'is_directed'], node_label=node_label, edge_label=edge_label) - if not ds_attrs['node_labeled'] or node_label == None: + if not ds_attrs['node_labeled'] or node_label is None: node_label = 'atom' for G in Gn: nx.set_node_attributes(G, '0', 'atom') - if not ds_attrs['edge_labeled'] or edge_label == None: + if not ds_attrs['edge_labeled'] or edge_label is None: edge_label = 'bond_type' for G in Gn: nx.set_edge_attributes(G, '0', 'bond_type') @@ -133,7 +133,7 @@ def marginalizedkernel(*args, # # ---- direct running, normally use single CPU core. ---- ## pbar = tqdm( ## total=(1 + len(Gn)) * len(Gn) / 2, -## desc='calculating kernels', +## desc='Computing kernels', ## file=sys.stdout) # for i in range(0, len(Gn)): # for j in range(i, len(Gn)): @@ -152,12 +152,12 @@ def marginalizedkernel(*args, def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): - """Calculate marginalized graph kernel between 2 graphs. + """Compute marginalized graph kernel between 2 graphs. Parameters ---------- G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. + 2 graphs between which the kernel is computed. node_label : string node attribute used as label. edge_label : string @@ -165,7 +165,7 @@ def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): p_quit : integer the termination probability in the random walks generating step. n_iteration : integer - time of iterations to calculate R_inf. + time of iterations to compute R_inf. Return ------ @@ -188,12 +188,12 @@ def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): # # matrix to save all the R_inf for all pairs of nodes # R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) # -# # calculate R_inf with a simple interative method +# # Compute R_inf with a simple interative method # for i in range(1, n_iteration): # R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2]) # R_inf_new.fill(r1) # -# # calculate R_inf for each pair of nodes +# # Compute R_inf for each pair of nodes # for node1 in g1.nodes(data=True): # neighbor_n1 = g1[node1[0]] # # the transition probability distribution in the random walks @@ -219,7 +219,7 @@ def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): # neighbor2] # ref [1] equation (8) # R_inf[:] = R_inf_new # -# # add elements of R_inf up and calculate kernel +# # add elements of R_inf up and compute kernel. # for node1 in g1.nodes(data=True): # for node2 in g2.nodes(data=True): # s = p_init_G1 * p_init_G2 * deltakernel( @@ -267,11 +267,11 @@ def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): neighbor_n1[neighbor1][edge_label], neighbor_n2[neighbor2][edge_label]) - # calculate R_inf with a simple interative method + # Compute R_inf with a simple interative method for i in range(2, n_iteration + 1): R_inf_old = R_inf.copy() - # calculate R_inf for each pair of nodes + # Compute R_inf for each pair of nodes for node1 in g1.nodes(): neighbor_n1 = g1[node1] # the transition probability distribution in the random walks @@ -288,7 +288,7 @@ def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration): (t_dict[(node1, node2, neighbor1, neighbor2)] * \ R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) - # add elements of R_inf up and calculate kernel + # add elements of R_inf up and compute kernel. 
for (n1, n2), value in R_inf.items(): s = p_init_G1 * p_init_G2 * deltakernel( g1.nodes[n1][node_label], g2.nodes[n2][node_label]) diff --git a/gklearn/kernels/path_up_to_h.py b/gklearn/kernels/path_up_to_h.py index 1c8b5e2..d8cc387 100644 --- a/gklearn/kernels/path_up_to_h.py +++ b/gklearn/kernels/path_up_to_h.py @@ -24,7 +24,7 @@ from gklearn.kernels import GraphKernel from gklearn.utils import Trie -class PathUpToH(GraphKernel): # @todo: add function for k_func == None +class PathUpToH(GraphKernel): # @todo: add function for k_func is None def __init__(self, **kwargs): GraphKernel.__init__(self) @@ -43,7 +43,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None itr_kernel = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: iterator_ps = tqdm(range(0, len(self._graphs)), desc='getting paths', file=sys.stdout) - iterator_kernel = tqdm(itr_kernel, desc='calculating kernels', file=sys.stdout) + iterator_kernel = tqdm(itr_kernel, desc='Computing kernels', file=sys.stdout) else: iterator_ps = range(0, len(self._graphs)) iterator_kernel = itr_kernel @@ -69,7 +69,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None def _compute_gm_imap_unordered(self): self.__add_dummy_labels(self._graphs) - # get all paths of all graphs before calculating kernels to save time, + # get all paths of all graphs before computing kernels to save time, # but this may cost a lot of memory for large datasets. pool = Pool(self._n_jobs) itr = zip(self._graphs, range(0, len(self._graphs))) @@ -123,7 +123,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None if self._verbose >= 2: iterator_ps = tqdm(g_list, desc='getting paths', file=sys.stdout) - iterator_kernel = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + iterator_kernel = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) else: iterator_ps = g_list iterator_kernel = range(len(g_list)) @@ -149,7 +149,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None def _compute_kernel_list_imap_unordered(self, g1, g_list): self.__add_dummy_labels(g_list + [g1]) - # get all paths of all graphs before calculating kernels to save time, + # get all paths of all graphs before computing kernels to save time, # but this may cost a lot of memory for large datasets. pool = Pool(self._n_jobs) itr = zip(g_list, range(0, len(g_list))) @@ -190,7 +190,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(paths_g1, paths_g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) return kernel_list @@ -218,7 +218,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None def __kernel_do_trie(self, trie1, trie2): - """Calculate path graph kernels up to depth d between 2 graphs using trie. + """Compute path graph kernels up to depth d between 2 graphs using trie. Parameters ---------- @@ -335,7 +335,7 @@ class PathUpToH(GraphKernel): # @todo: add function for k_func == None def __kernel_do_naive(self, paths1, paths2): - """Calculate path graph kernels up to depth d between 2 graphs naively. 
+ """Compute path graph kernels up to depth d between 2 graphs naively. Parameters ---------- diff --git a/gklearn/kernels/randomWalkKernel.py b/gklearn/kernels/randomWalkKernel.py index 346bc98..65bf63c 100644 --- a/gklearn/kernels/randomWalkKernel.py +++ b/gklearn/kernels/randomWalkKernel.py @@ -37,15 +37,15 @@ def randomwalkkernel(*args, n_jobs=None, chunksize=None, verbose=True): - """Calculate random walk graph kernels. + """Compute random walk graph kernels. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. + Two graphs between which the kernel is computed. compute_method : string Method used to compute kernel. The Following choices are @@ -125,7 +125,7 @@ def randomwalkkernel(*args, Gn = [g.copy() for g in Gn] eweight = None - if edge_weight == None: + if edge_weight is None: if verbose: print('\n None edge weight specified. Set all weight to 1.\n') else: @@ -212,12 +212,12 @@ def randomwalkkernel(*args, ############################################################################### def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, chunksize, verbose=True): - """Calculate walk graph kernels up to n between 2 graphs using Sylvester method. + """Compute walk graph kernels up to n between 2 graphs using Sylvester method. Parameters ---------- G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. + Graphs between which the kernel is computed. node_label : string node attribute used as label. edge_label : string @@ -230,7 +230,7 @@ def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, chunksize, verbose=True """ Kmatrix = np.zeros((len(Gn), len(Gn))) - if q == None: + if q is None: # don't normalize adjacency matrices if q is a uniform vector. Note # A_wave_list actually contains the transposes of the adjacency matrices. A_wave_list = [ @@ -245,7 +245,7 @@ def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, chunksize, verbose=True # norm = A_tilde.sum(axis=0) # norm[norm == 0] = 1 # A_wave_list.append(A_tilde / norm) - if p == None: # p is uniform distribution as default. + if p is None: # p is uniform distribution as default. def init_worker(Awl_toshare): global G_Awl G_Awl = Awl_toshare @@ -255,7 +255,7 @@ def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, chunksize, verbose=True # pbar = tqdm( # total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', +# desc='Computing kernels', # file=sys.stdout) # for i in range(0, len(Gn)): # for j in range(i, len(Gn)): @@ -300,12 +300,12 @@ def _se_do(A_wave1, A_wave2, lmda): ############################################################################### def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, node_label, edge_label, eweight, n_jobs, chunksize, verbose=True): - """Calculate walk graph kernels up to n between 2 graphs using conjugate method. + """Compute walk graph kernels up to n between 2 graphs using conjugate method. Parameters ---------- G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. + Graphs between which the kernel is computed. node_label : string node attribute used as label. 
edge_label : string @@ -321,14 +321,14 @@ def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1: # # this is faster from unlabeled graphs. @todo: why? -# if q == None: +# if q is None: # # don't normalize adjacency matrices if q is a uniform vector. Note # # A_wave_list actually contains the transposes of the adjacency matrices. # A_wave_list = [ # nx.adjacency_matrix(G, eweight).todense().transpose() for G in # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) # ] -# if p == None: # p is uniform distribution as default. +# if p is None: # p is uniform distribution as default. # def init_worker(Awl_toshare): # global G_Awl # G_Awl = Awl_toshare @@ -336,23 +336,23 @@ def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, # parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, # glbv=(A_wave_list,), n_jobs=n_jobs) # else: - # reindex nodes using consecutive integers for convenience of kernel calculation. + # reindex nodes using consecutive integers for convenience of kernel computation. Gn = [nx.convert_node_labels_to_integers( g, first_label=0, label_attribute='label_orignal') for g in (tqdm( Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)] - if p == None and q == None: # p and q are uniform distributions as default. + if p is None and q is None: # p and q are uniform distributions as default. def init_worker(gn_toshare): global G_gn G_gn = gn_toshare - do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels, + do_partial = partial(wrapper_cg_labeled_do, ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda) parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) # pbar = tqdm( # total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', +# desc='Computing kernels', # file=sys.stdout) # for i in range(0, len(Gn)): # for j in range(i, len(Gn)): @@ -382,24 +382,24 @@ def _cg_unlabled_do(A_wave1, A_wave2, lmda): return np.dot(q_times, x) -def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, +def wrapper_cg_labeled_do(ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda, itr): i = itr[0] j = itr[1] - return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, + return i, j, _cg_labeled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda) -def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, +def _cg_labeled_do(g1, g2, ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda): - # Frist, compute kernels between all pairs of nodes, method borrowed + # Frist, compute kernels between all pairs of nodes using the method borrowed # from FCSP. It is faster than directly computing all edge kernels # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the # graphs compared, which is the most case we went though. For very # sparse graphs, this would be slow. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label) - # Compute weight matrix of the direct product graph. + # Compute the weight matrix of the direct product graph. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label) # use uniform distribution if there is no prior knowledge. 
@@ -415,12 +415,12 @@ def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label, ############################################################################### def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, node_label, edge_label, eweight, n_jobs, chunksize, verbose=True): - """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method. + """Compute walk graph kernels up to n between 2 graphs using Fixed-Point method. Parameters ---------- G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. + Graphs between which the kernel is computed. node_label : string node attribute used as label. edge_label : string @@ -438,17 +438,17 @@ def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \ # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1: # # this is faster from unlabeled graphs. @todo: why? -# if q == None: +# if q is None: # # don't normalize adjacency matrices if q is a uniform vector. Note # # A_wave_list actually contains the transposes of the adjacency matrices. # A_wave_list = [ # nx.adjacency_matrix(G, eweight).todense().transpose() for G in # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) # ] -# if p == None: # p is uniform distribution as default. +# if p is None: # p is uniform distribution as default. # pbar = tqdm( # total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', +# desc='Computing kernels', # file=sys.stdout) # for i in range(0, len(Gn)): # for j in range(i, len(Gn)): @@ -464,33 +464,33 @@ def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels, # Kmatrix[j][i] = Kmatrix[i][j] # pbar.update(1) # else: - # reindex nodes using consecutive integers for convenience of kernel calculation. + # reindex nodes using consecutive integers for the convenience of kernel computation. Gn = [nx.convert_node_labels_to_integers( g, first_label=0, label_attribute='label_orignal') for g in (tqdm( Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)] - if p == None and q == None: # p and q are uniform distributions as default. + if p is None and q is None: # p and q are uniform distributions as default. def init_worker(gn_toshare): global G_gn G_gn = gn_toshare - do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels, + do_partial = partial(wrapper_fp_labeled_do, ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda) parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) return Kmatrix -def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels, +def wrapper_fp_labeled_do(ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda, itr): i = itr[0] j = itr[1] - return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, + return i, j, _fp_labeled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda) -def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label, +def _fp_labeled_do(g1, g2, ds_attrs, node_kernels, node_label, edge_kernels, edge_label, lmda): - # Frist, compute kernels between all pairs of nodes, method borrowed + # Frist, compute kernels between all pairs of nodes using the method borrowed # from FCSP. It is faster than directly computing all edge kernels # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the # graphs compared, which is the most case we went though. 
For very @@ -519,13 +519,13 @@ def func_fp(x, p_times, lmda, w_times): ############################################################################### def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, chunksize, verbose=True): - """Calculate walk graph kernels up to n between 2 unlabeled graphs using + """Compute walk graph kernels up to n between 2 unlabeled graphs using spectral decomposition method. Labels will be ignored. Parameters ---------- G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. + Graphs between which the kernel is computed. node_label : string node attribute used as label. edge_label : string @@ -538,7 +538,7 @@ def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, chunk """ Kmatrix = np.zeros((len(Gn), len(Gn))) - if q == None: + if q is None: # precompute the spectral decomposition of each graph. P_list = [] D_list = [] @@ -552,7 +552,7 @@ def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, chunk P_list.append(ev) # P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs? - if p == None: # p is uniform distribution as default. + if p is None: # p is uniform distribution as default. q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn] # q_T_list = [q.T for q in q_list] def init_worker(q_T_toshare, P_toshare, D_toshare): @@ -568,7 +568,7 @@ def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, chunk # pbar = tqdm( # total=(1 + len(Gn)) * len(Gn) / 2, -# desc='calculating kernels', +# desc='Computing kernels', # file=sys.stdout) # for i in range(0, len(Gn)): # for j in range(i, len(Gn)): @@ -605,12 +605,12 @@ def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel): ############################################################################### def _randomwalkkernel_kron(G1, G2, node_label, edge_label): - """Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method. + """Compute walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method. Parameters ---------- G1, G2 : NetworkX graph - Graphs between which the kernel is calculated. + Graphs between which the kernel is computed. node_label : string node attribute used as label. edge_label : string @@ -692,8 +692,8 @@ def computeVK(g1, g2, ds_attrs, node_kernels, node_label): def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label): - '''Compute weight matrix of the direct product graph. - ''' + """Compute the weight matrix of the direct product graph. + """ w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2) w_times = np.zeros((w_dim, w_dim)) if vk_dict: # node labeled diff --git a/gklearn/kernels/shortest_path.py b/gklearn/kernels/shortest_path.py index 1923b00..b068e6e 100644 --- a/gklearn/kernels/shortest_path.py +++ b/gklearn/kernels/shortest_path.py @@ -47,7 +47,7 @@ class ShortestPath(GraphKernel): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) else: iterator = itr for i, j in iterator: @@ -102,7 +102,7 @@ class ShortestPath(GraphKernel): # compute kernel list. 
kernel_list = [None] * len(g_list) if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) else: iterator = range(len(g_list)) for i in iterator: @@ -145,7 +145,7 @@ class ShortestPath(GraphKernel): itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) return kernel_list diff --git a/gklearn/kernels/spKernel.py b/gklearn/kernels/spKernel.py index b48a905..eaf59df 100644 --- a/gklearn/kernels/spKernel.py +++ b/gklearn/kernels/spKernel.py @@ -29,15 +29,15 @@ def spkernel(*args, n_jobs=None, chunksize=None, verbose=True): - """Calculate shortest-path kernels between graphs. + """Compute shortest-path kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. + Two graphs between which the kernel is computed. node_label : string Node attribute used as label. The default node label is atom. @@ -179,7 +179,7 @@ def spkernel(*args, # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels) # itr = combinations_with_replacement(range(0, len(Gn)), 2) # for i, j, kernel in tqdm( - # pool.map(do_partial, itr), desc='calculating kernels', + # pool.map(do_partial, itr), desc='Computing kernels', # file=sys.stdout): # Kmatrix[i][j] = kernel # Kmatrix[j][i] = kernel @@ -202,7 +202,7 @@ def spkernel(*args, # # ---- direct running, normally use single CPU core. ---- # from itertools import combinations_with_replacement # itr = combinations_with_replacement(range(0, len(Gn)), 2) -# for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout): +# for i, j in tqdm(itr, desc='Computing kernels', file=sys.stdout): # kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels) # Kmatrix[i][j] = kernel # Kmatrix[j][i] = kernel diff --git a/gklearn/kernels/structural_sp.py b/gklearn/kernels/structural_sp.py index 4b9fb26..254f2cc 100644 --- a/gklearn/kernels/structural_sp.py +++ b/gklearn/kernels/structural_sp.py @@ -18,7 +18,7 @@ from tqdm import tqdm # import networkx as nx import numpy as np from gklearn.utils.parallel import parallel_gm, parallel_me -from gklearn.utils.utils import get_shortest_paths +from gklearn.utils.utils import get_shortest_paths, compute_vertex_kernels from gklearn.kernels import GraphKernel @@ -57,7 +57,7 @@ class StructuralSP(GraphKernel): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) else: iterator = itr if self.__compute_method == 'trie': @@ -135,7 +135,7 @@ class StructuralSP(GraphKernel): # compute kernel list. 
kernel_list = [None] * len(g_list) if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) else: iterator = range(len(g_list)) if self.__compute_method == 'trie': @@ -193,7 +193,7 @@ class StructuralSP(GraphKernel): itr = range(len(g_list)) len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, - init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + init_worker=init_worker, glbv=(sp1, splist, g1, g_list), method='imap_unordered', n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) return kernel_list @@ -273,7 +273,7 @@ class StructuralSP(GraphKernel): if len(p1) == len(p2): kernel += 1 try: - kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average + kernel = kernel / (len(spl1) * len(spl2)) # Compute mean average except ZeroDivisionError: print(spl1, spl2) print(g1.nodes(data=True)) @@ -318,40 +318,7 @@ class StructuralSP(GraphKernel): def __get_all_node_kernels(self, g1, g2): - # compute shortest path matrices, method borrowed from FCSP. - vk_dict = {} # shortest path matrices dict - if len(self.__node_labels) > 0: - # node symb and non-synb labeled - if len(self.__node_attrs) > 0: - kn = self.__node_kernels['mix'] - for n1, n2 in product(g1.nodes(data=True), g2.nodes(data=True)): - n1_labels = [n1[1][nl] for nl in self.__node_labels] - n2_labels = [n2[1][nl] for nl in self.__node_labels] - n1_attrs = [n1[1][na] for na in self.__node_attrs] - n2_attrs = [n2[1][na] for na in self.__node_attrs] - vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels, n1_attrs, n2_attrs) - # node symb labeled - else: - kn = self.__node_kernels['symb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_labels = [n1[1][nl] for nl in self.__node_labels] - n2_labels = [n2[1][nl] for nl in self.__node_labels] - vk_dict[(n1[0], n2[0])] = kn(n1_labels, n2_labels) - else: - # node non-synb labeled - if len(self.__node_attrs) > 0: - kn = self.__node_kernels['nsymb'] - for n1 in g1.nodes(data=True): - for n2 in g2.nodes(data=True): - n1_attrs = [n1[1][na] for na in self.__node_attrs] - n2_attrs = [n2[1][na] for na in self.__node_attrs] - vk_dict[(n1[0], n2[0])] = kn(n1_attrs, n2_attrs) - # node unlabeled - else: - pass - - return vk_dict + return compute_vertex_kernels(g1, g2, self._node_kernels, node_labels=self._node_labels, node_attrs=self._node_attrs) def __get_all_edge_kernels(self, g1, g2): diff --git a/gklearn/kernels/structuralspKernel.py b/gklearn/kernels/structuralspKernel.py index fb8dbf9..cfafc8c 100644 --- a/gklearn/kernels/structuralspKernel.py +++ b/gklearn/kernels/structuralspKernel.py @@ -37,15 +37,15 @@ def structuralspkernel(*args, n_jobs=None, chunksize=None, verbose=True): - """Calculate mean average structural shortest path kernels between graphs. + """Compute mean average structural shortest path kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. + Two graphs between which the kernel is computed. node_label : string Node attribute used as label. The default node label is atom. 
@@ -215,7 +215,7 @@ def structuralspkernel(*args, from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(Gn)), 2) if verbose: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) else: iterator = itr if compute_method == 'trie': @@ -241,7 +241,7 @@ def structuralspkernel(*args, # combinations_with_replacement(splist, 2), # combinations_with_replacement(range(0, len(Gn)), 2)) # for i, j, kernel in tqdm( -# pool.map(do_partial, itr), desc='calculating kernels', +# pool.map(do_partial, itr), desc='Computing kernels', # file=sys.stdout): # Kmatrix[i][j] = kernel # Kmatrix[j][i] = kernel @@ -263,7 +263,7 @@ def structuralspkernel(*args, # with closing(Pool(n_jobs)) as pool: # for i, j, kernel in tqdm( # pool.imap_unordered(do_partial, itr, 1000), -# desc='calculating kernels', +# desc='Computing kernels', # file=sys.stdout): # Kmatrix[i][j] = kernel # Kmatrix[j][i] = kernel @@ -335,7 +335,7 @@ def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label, if len(p1) == len(p2): kernel += 1 try: - kernel = kernel / (len(spl1) * len(spl2)) # calculate mean average + kernel = kernel / (len(spl1) * len(spl2)) # Compute mean average except ZeroDivisionError: print(spl1, spl2) print(g1.nodes(data=True)) @@ -429,7 +429,7 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, # # compute graph kernels # traverseBothTrie(trie1[0].root, trie2[0], kernel) # -# kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average +# kernel = kernel[0] / (trie1[1] * trie2[1]) # Compute mean average # # traverse all paths in graph1. Deep-first search is applied. # def traverseBothTrie(root, trie2, kernel, vk_dict, ek_dict, pcurrent=[]): @@ -485,7 +485,7 @@ def ssp_do_trie(g1, g2, trie1, trie2, ds_attrs, node_label, edge_label, else: traverseBothTrieu(trie1[0].root, trie2[0], kernel, vk_dict, ek_dict) - kernel = kernel[0] / (trie1[1] * trie2[1]) # calculate mean average + kernel = kernel[0] / (trie1[1] * trie2[1]) # Compute mean average return kernel @@ -781,9 +781,9 @@ def get_shortest_paths(G, weight, directed): Parameters ---------- G : NetworkX graphs - The graphs whose paths are calculated. + The graphs whose paths are computed. weight : string/None - edge attribute used as weight to calculate the shortest path. + edge attribute used as weight to compute the shortest path. directed: boolean Whether graph is directed. @@ -822,9 +822,9 @@ def get_sps_as_trie(G, weight, directed): Parameters ---------- G : NetworkX graphs - The graphs whose paths are calculated. + The graphs whose paths are computed. weight : string/None - edge attribute used as weight to calculate the shortest path. + edge attribute used as weight to compute the shortest path. directed: boolean Whether graph is directed. diff --git a/gklearn/kernels/treelet.py b/gklearn/kernels/treelet.py index c3204ec..61ffd47 100644 --- a/gklearn/kernels/treelet.py +++ b/gklearn/kernels/treelet.py @@ -39,7 +39,7 @@ class Treelet(GraphKernel): def _compute_gm_series(self): self.__add_dummy_labels(self._graphs) - # get all canonical keys of all graphs before calculating kernels to save + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. 
canonkeys = [] if self._verbose >= 2: @@ -55,7 +55,7 @@ class Treelet(GraphKernel): from itertools import combinations_with_replacement itr = combinations_with_replacement(range(0, len(self._graphs)), 2) if self._verbose >= 2: - iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) + iterator = tqdm(itr, desc='Computing kernels', file=sys.stdout) else: iterator = itr for i, j in iterator: @@ -69,7 +69,7 @@ class Treelet(GraphKernel): def _compute_gm_imap_unordered(self): self.__add_dummy_labels(self._graphs) - # get all canonical keys of all graphs before calculating kernels to save + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. pool = Pool(self._n_jobs) itr = zip(self._graphs, range(0, len(self._graphs))) @@ -105,7 +105,7 @@ class Treelet(GraphKernel): def _compute_kernel_list_series(self, g1, g_list): self.__add_dummy_labels(g_list + [g1]) - # get all canonical keys of all graphs before calculating kernels to save + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. canonkeys_1 = self.__get_canonkeys(g1) canonkeys_list = [] @@ -119,7 +119,7 @@ class Treelet(GraphKernel): # compute kernel list. kernel_list = [None] * len(g_list) if self._verbose >= 2: - iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) + iterator = tqdm(range(len(g_list)), desc='Computing kernels', file=sys.stdout) else: iterator = range(len(g_list)) for i in iterator: @@ -132,7 +132,7 @@ class Treelet(GraphKernel): def _compute_kernel_list_imap_unordered(self, g1, g_list): self.__add_dummy_labels(g_list + [g1]) - # get all canonical keys of all graphs before calculating kernels to save + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. canonkeys_1 = self.__get_canonkeys(g1) canonkeys_list = [[] for _ in range(len(g_list))] @@ -167,7 +167,7 @@ class Treelet(GraphKernel): len_itr = len(g_list) parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, init_worker=init_worker, glbv=(canonkeys_1, canonkeys_list), method='imap_unordered', - n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) + n_jobs=self._n_jobs, itr_desc='Computing kernels', verbose=self._verbose) return kernel_list @@ -185,7 +185,7 @@ class Treelet(GraphKernel): def __kernel_do(self, canonkey1, canonkey2): - """Calculate treelet graph kernel between 2 graphs. + """Compute treelet graph kernel between 2 graphs. Parameters ---------- diff --git a/gklearn/kernels/treeletKernel.py b/gklearn/kernels/treeletKernel.py index 809a623..14577ff 100644 --- a/gklearn/kernels/treeletKernel.py +++ b/gklearn/kernels/treeletKernel.py @@ -29,15 +29,15 @@ def treeletkernel(*args, n_jobs=None, chunksize=None, verbose=True): - """Calculate treelet graph kernels between graphs. + """Compute treelet graph kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. + Two graphs between which the kernel is computed. sub_kernel : function The sub-kernel between 2 real number vectors. Each vector counts the @@ -89,7 +89,7 @@ def treeletkernel(*args, # ---- use pool.imap_unordered to parallel and track progress. 
---- if parallel == 'imap_unordered': - # get all canonical keys of all graphs before calculating kernels to save + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. pool = Pool(n_jobs) itr = zip(Gn, range(0, len(Gn))) @@ -120,8 +120,8 @@ def treeletkernel(*args, glbv=(canonkeys,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) # ---- do not use parallelization. ---- - elif parallel == None: - # get all canonical keys of all graphs before calculating kernels to save + elif parallel is None: + # get all canonical keys of all graphs before computing kernels to save # time, but this may cost a lot of memory for large dataset. canonkeys = [] for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout) if verbose else Gn): @@ -148,7 +148,7 @@ def treeletkernel(*args, def _treeletkernel_do(canonkey1, canonkey2, sub_kernel): - """Calculate treelet graph kernel between 2 graphs. + """Compute treelet graph kernel between 2 graphs. Parameters ---------- @@ -210,7 +210,7 @@ def get_canonkeys(G, node_label, edge_label, labeled, is_directed): # n-star patterns patterns['3star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3] - patterns['4star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4] + patterns['4star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4] # @todo: check self loop. patterns['5star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5] # n-star patterns canonkey['6'] = len(patterns['3star']) diff --git a/gklearn/kernels/untilHPathKernel.py b/gklearn/kernels/untilHPathKernel.py index 9bac28b..62c8626 100644 --- a/gklearn/kernels/untilHPathKernel.py +++ b/gklearn/kernels/untilHPathKernel.py @@ -34,15 +34,15 @@ def untilhpathkernel(*args, n_jobs=None, chunksize=None, verbose=True): - """Calculate path graph kernels up to depth/hight h between graphs. + """Compute path graph kernels up to depth/hight h between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. + Two graphs between which the kernel is computed. node_label : string Node attribute used as label. The default node label is atom. @@ -91,7 +91,7 @@ def untilhpathkernel(*args, attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled', 'edge_attr_dim', 'is_directed'], node_label=node_label, edge_label=edge_label) - if k_func != None: + if k_func is not None: if not ds_attrs['node_labeled']: for G in Gn: nx.set_node_attributes(G, '0', 'atom') @@ -103,7 +103,7 @@ def untilhpathkernel(*args, if parallel == 'imap_unordered': # ---- use pool.imap_unordered to parallel and track progress. ---- - # get all paths of all graphs before calculating kernels to save time, + # get all paths of all graphs before computing kernels to save time, # but this may cost a lot of memory for large datasets. 
pool = Pool(n_jobs) itr = zip(Gn, range(0, len(Gn))) @@ -113,10 +113,10 @@ def untilhpathkernel(*args, else: chunksize = 100 all_paths = [[] for _ in range(len(Gn))] - if compute_method == 'trie' and k_func != None: + if compute_method == 'trie' and k_func is not None: getps_partial = partial(wrapper_find_all_path_as_trie, depth, ds_attrs, node_label, edge_label) - elif compute_method != 'trie' and k_func != None: + elif compute_method != 'trie' and k_func is not None: getps_partial = partial(wrapper_find_all_paths_until_length, depth, ds_attrs, node_label, edge_label, True) else: @@ -133,9 +133,9 @@ def untilhpathkernel(*args, pool.join() # for g in Gn: -# if compute_method == 'trie' and k_func != None: +# if compute_method == 'trie' and k_func is not None: # find_all_path_as_trie(g, depth, ds_attrs, node_label, edge_label) -# elif compute_method != 'trie' and k_func != None: +# elif compute_method != 'trie' and k_func is not None: # find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label) # else: # find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label, False) @@ -155,14 +155,14 @@ def untilhpathkernel(*args, ## all_paths[i] = ps ## print(time.time() - ttt) - if compute_method == 'trie' and k_func != None: + if compute_method == 'trie' and k_func is not None: def init_worker(trie_toshare): global G_trie G_trie = trie_toshare do_partial = partial(wrapper_uhpath_do_trie, k_func) parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) - elif compute_method != 'trie' and k_func != None: + elif compute_method != 'trie' and k_func is not None: def init_worker(plist_toshare): global G_plist G_plist = plist_toshare @@ -177,7 +177,7 @@ def untilhpathkernel(*args, parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) - elif parallel == None: + elif parallel is None: # from pympler import asizeof # ---- direct running, normally use single CPU core. ---- # print(asizeof.asized(all_paths, detail=1).format()) @@ -195,7 +195,7 @@ def untilhpathkernel(*args, # print(sizeof_allpaths) pbar = tqdm( total=((len(Gn) + 1) * len(Gn) / 2), - desc='calculating kernels', + desc='Computing kernels', file=sys.stdout) for i in range(0, len(Gn)): for j in range(i, len(Gn)): @@ -217,7 +217,7 @@ def untilhpathkernel(*args, # print(sizeof_allpaths) pbar = tqdm( total=((len(Gn) + 1) * len(Gn) / 2), - desc='calculating kernels', + desc='Computing kernels', file=sys.stdout) for i in range(0, len(Gn)): for j in range(i, len(Gn)): @@ -236,7 +236,7 @@ def untilhpathkernel(*args, def _untilhpathkernel_do_trie(trie1, trie2, k_func): - """Calculate path graph kernels up to depth d between 2 graphs using trie. + """Compute path graph kernels up to depth d between 2 graphs using trie. Parameters ---------- @@ -351,7 +351,7 @@ def wrapper_uhpath_do_trie(k_func, itr): def _untilhpathkernel_do_naive(paths1, paths2, k_func): - """Calculate path graph kernels up to depth d between 2 graphs naively. + """Compute path graph kernels up to depth d between 2 graphs naively. Parameters ---------- @@ -400,7 +400,7 @@ def wrapper_uhpath_do_naive(k_func, itr): def _untilhpathkernel_do_kernelless(paths1, paths2, k_func): - """Calculate path graph kernels up to depth d between 2 graphs naively. + """Compute path graph kernels up to depth d between 2 graphs naively. 
Parameters ---------- diff --git a/gklearn/kernels/weisfeilerLehmanKernel.py b/gklearn/kernels/weisfeilerLehmanKernel.py index 222f5c5..469dcd8 100644 --- a/gklearn/kernels/weisfeilerLehmanKernel.py +++ b/gklearn/kernels/weisfeilerLehmanKernel.py @@ -32,15 +32,15 @@ def weisfeilerlehmankernel(*args, n_jobs=None, chunksize=None, verbose=True): - """Calculate Weisfeiler-Lehman kernels between graphs. + """Compute Weisfeiler-Lehman kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. G1, G2 : NetworkX graphs - Two graphs between which the kernel is calculated. + Two graphs between which the kernel is computed. node_label : string Node attribute used as label. The default node label is atom. @@ -115,12 +115,12 @@ def weisfeilerlehmankernel(*args, def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksize, verbose): - """Calculate Weisfeiler-Lehman kernels between graphs. + """Compute Weisfeiler-Lehman kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. node_label : string node attribute used as label. edge_label : string @@ -146,7 +146,7 @@ def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksiz # number of occurence of each label in G all_num_of_each_label.append(dict(Counter(labels_ori))) - # calculate subtree kernel with the 0th iteration and add it to the final kernel + # Compute subtree kernel with the 0th iteration and add it to the final kernel compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, False) # iterate each height @@ -255,7 +255,7 @@ def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, chunksiz # all_labels_ori.update(labels_comp) all_num_of_each_label.append(dict(Counter(labels_comp))) - # calculate subtree kernel with h iterations and add it to the final kernel + # Compute subtree kernel with h iterations and add it to the final kernel compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, chunksize, False) return Kmatrix @@ -316,7 +316,7 @@ def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix) parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, glbv=(all_num_of_each_label,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose) - elif parallel == None: + elif parallel is None: for i in range(len(Kmatrix)): for j in range(i, len(Kmatrix)): Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i], @@ -345,12 +345,12 @@ def wrapper_compute_subtree_kernel(Kmatrix, itr): def _wl_spkernel_do(Gn, node_label, edge_label, height): - """Calculate Weisfeiler-Lehman shortest path kernels between graphs. + """Compute Weisfeiler-Lehman shortest path kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. node_label : string node attribute used as label. 
edge_label : string @@ -413,7 +413,7 @@ def _wl_spkernel_do(Gn, node_label, edge_label, height): for node in G.nodes(data = True): node[1][node_label] = set_compressed[set_multisets[node[0]]] - # calculate subtree kernel with h iterations and add it to the final kernel + # Compute subtree kernel with h iterations and add it to the final kernel for i in range(0, len(Gn)): for j in range(i, len(Gn)): for e1 in Gn[i].edges(data = True): @@ -427,12 +427,12 @@ def _wl_spkernel_do(Gn, node_label, edge_label, height): def _wl_edgekernel_do(Gn, node_label, edge_label, height): - """Calculate Weisfeiler-Lehman edge kernels between graphs. + """Compute Weisfeiler-Lehman edge kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. node_label : string node attribute used as label. edge_label : string @@ -491,7 +491,7 @@ def _wl_edgekernel_do(Gn, node_label, edge_label, height): for node in G.nodes(data = True): node[1][node_label] = set_compressed[set_multisets[node[0]]] - # calculate subtree kernel with h iterations and add it to the final kernel + # Compute subtree kernel with h iterations and add it to the final kernel for i in range(0, len(Gn)): for j in range(i, len(Gn)): for e1 in Gn[i].edges(data = True): @@ -504,12 +504,12 @@ def _wl_edgekernel_do(Gn, node_label, edge_label, height): def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel): - """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs. + """Compute Weisfeiler-Lehman kernels based on user-defined kernel between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. node_label : string node attribute used as label. edge_label : string @@ -564,7 +564,7 @@ def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel): for node in G.nodes(data = True): node[1][node_label] = set_compressed[set_multisets[node[0]]] - # calculate kernel with h iterations and add it to the final kernel + # Compute kernel with h iterations and add it to the final kernel Kmatrix += base_kernel(Gn, node_label, edge_label) return Kmatrix diff --git a/gklearn/kernels/weisfeiler_lehman.py b/gklearn/kernels/weisfeiler_lehman.py index f5f4145..8ab7634 100644 --- a/gklearn/kernels/weisfeiler_lehman.py +++ b/gklearn/kernels/weisfeiler_lehman.py @@ -125,12 +125,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge def __subtree_kernel_do(self, Gn): - """Calculate Weisfeiler-Lehman kernels between graphs. + """Compute Weisfeiler-Lehman kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. Return ------ @@ -152,7 +152,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge # number of occurence of each label in G all_num_of_each_label.append(dict(Counter(labels_ori))) - # calculate subtree kernel with the 0th iteration and add it to the final kernel. + # Compute subtree kernel with the 0th iteration and add it to the final kernel. 
self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn) # iterate each height @@ -198,7 +198,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge # all_labels_ori.update(labels_comp) all_num_of_each_label.append(dict(Counter(labels_comp))) - # calculate subtree kernel with h iterations and add it to the final kernel + # Compute subtree kernel with h iterations and add it to the final kernel self.__compute_gram_matrix(gram_matrix, all_num_of_each_label, Gn) return gram_matrix @@ -244,12 +244,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge def _wl_spkernel_do(Gn, node_label, edge_label, height): - """Calculate Weisfeiler-Lehman shortest path kernels between graphs. + """Compute Weisfeiler-Lehman shortest path kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. node_label : string node attribute used as label. edge_label : string @@ -312,7 +312,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge for node in G.nodes(data = True): node[1][node_label] = set_compressed[set_multisets[node[0]]] - # calculate subtree kernel with h iterations and add it to the final kernel + # Compute subtree kernel with h iterations and add it to the final kernel for i in range(0, len(Gn)): for j in range(i, len(Gn)): for e1 in Gn[i].edges(data = True): @@ -326,12 +326,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge def _wl_edgekernel_do(Gn, node_label, edge_label, height): - """Calculate Weisfeiler-Lehman edge kernels between graphs. + """Compute Weisfeiler-Lehman edge kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. node_label : string node attribute used as label. edge_label : string @@ -390,7 +390,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge for node in G.nodes(data = True): node[1][node_label] = set_compressed[set_multisets[node[0]]] - # calculate subtree kernel with h iterations and add it to the final kernel + # Compute subtree kernel with h iterations and add it to the final kernel for i in range(0, len(Gn)): for j in range(i, len(Gn)): for e1 in Gn[i].edges(data = True): @@ -403,12 +403,12 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel): - """Calculate Weisfeiler-Lehman kernels based on user-defined kernel between graphs. + """Compute Weisfeiler-Lehman kernels based on user-defined kernel between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are computed. node_label : string node attribute used as label. 
edge_label : string @@ -463,7 +463,7 @@ class WeisfeilerLehman(GraphKernel): # @todo: total parallelization and sp, edge for node in G.nodes(data = True): node[1][node_label] = set_compressed[set_multisets[node[0]]] - # calculate kernel with h iterations and add it to the final kernel + # Compute kernel with h iterations and add it to the final kernel gram_matrix += base_kernel(Gn, node_label, edge_label) return gram_matrix diff --git a/gklearn/utils/parallel.py b/gklearn/utils/parallel.py index 71bb47c..a1862c0 100644 --- a/gklearn/utils/parallel.py +++ b/gklearn/utils/parallel.py @@ -63,4 +63,4 @@ def parallel_gm(func, Kmatrix, Gn, init_worker=None, glbv=None, len_itr = int(len(Gn) * (len(Gn) + 1) / 2) parallel_me(func, func_assign, Kmatrix, itr, len_itr=len_itr, init_worker=init_worker, glbv=glbv, method=method, n_jobs=n_jobs, - chunksize=chunksize, itr_desc='calculating kernels', verbose=verbose) \ No newline at end of file + chunksize=chunksize, itr_desc='Computing kernels', verbose=verbose) \ No newline at end of file From 0f8d4af2dc2a63348249d8a2c309925d3d8c3ec4 Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Thu, 15 Oct 2020 16:13:29 +0200 Subject: [PATCH 13/13] Update version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cd45970..ee51e5c 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ with open('requirements_pypi.txt') as fp: setuptools.setup( name="graphkit-learn", - version="0.2.1b1", + version="0.2.1", author="Linlin Jia", author_email="linlin.jia@insa-rouen.fr", description="A Python library for graph kernels, graph edit distances, and graph pre-images",
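As a quick sanity check of the renaming and docstring changes above, a minimal usage sketch of the functional marginalized-kernel API is given below. It is illustrative only: the parameter names ('atom', 'bond_type', p_quit, n_iteration, remove_totters, n_jobs, verbose) are taken from the docstrings touched in these patches, while the import path and the exact shape of the return value are assumptions not confirmed by the diffs.

import networkx as nx
from gklearn.kernels.marginalizedKernel import marginalizedkernel

# Two small graphs carrying the default 'atom'/'bond_type' labels that the
# kernels above fall back to when a dataset is unlabeled.
g1 = nx.path_graph(4)
g2 = nx.cycle_graph(5)
for g in (g1, g2):
    nx.set_node_attributes(g, '0', 'atom')
    nx.set_edge_attributes(g, '0', 'bond_type')

# Keyword arguments follow the documented parameters; the returned object is
# assumed to contain the Gram matrix, which is not shown in the diffs.
result = marginalizedkernel([g1, g2],
                            node_label='atom',
                            edge_label='bond_type',
                            p_quit=0.5,
                            n_iteration=10,
                            remove_totters=False,
                            n_jobs=1,
                            verbose=False)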