diff --git a/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py b/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py
new file mode 100644
index 0000000..c25c116
--- /dev/null
+++ b/gklearn/experiments/papers/PRL_2020/accuracy_diff_entropy.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct 5 16:08:33 2020
+
+@author: ljia
+
+This script computes the classification accuracy of each graph kernel on
+datasets with different entropies of degree distribution.
+"""
+from utils import Graph_Kernel_List, cross_validate
+import numpy as np
+import logging
+
+num_nodes = 40
+half_num_graphs = 100
+
+
+def generate_graphs():
+#	from gklearn.utils.graph_synthesizer import GraphSynthesizer
+#	gsyzer = GraphSynthesizer()
+#	graphs = gsyzer.unified_graphs(num_graphs=1000, num_nodes=20, num_edges=40, num_node_labels=0, num_edge_labels=0, seed=None, directed=False)
+#	return graphs
+	import networkx as nx
+
+	degrees11 = [5] * num_nodes
+#	degrees12 = [2] * num_nodes
+	degrees12 = [5] * num_nodes
+	degrees21 = list(range(1, 11)) * 6
+#	degrees22 = [5 * i for i in list(range(1, 11)) * 6]
+	degrees22 = list(range(1, 11)) * 6
+
+	# method 1
+	graphs11 = [nx.configuration_model(degrees11, create_using=nx.Graph) for i in range(half_num_graphs)]
+	graphs12 = [nx.configuration_model(degrees12, create_using=nx.Graph) for i in range(half_num_graphs)]
+
+	# method 2: can easily generate isomorphic graphs.
+#	graphs11 = [nx.random_regular_graph(2, num_nodes, seed=None) for i in range(half_num_graphs)]
+#	graphs12 = [nx.random_regular_graph(10, num_nodes, seed=None) for i in range(half_num_graphs)]
+
+	# Add node labels.
+	for g in graphs11:
+		for n in g.nodes():
+			g.nodes[n]['atom'] = 0
+	for g in graphs12:
+		for n in g.nodes():
+			g.nodes[n]['atom'] = 1
+
+	graphs1 = graphs11 + graphs12
+
+	# method 1: the entropy of the two classes is not the same.
+	graphs21 = [nx.configuration_model(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)]
+	graphs22 = [nx.configuration_model(degrees22, create_using=nx.Graph) for i in range(half_num_graphs)]
+
+#	# method 2: too slow, and may fail.
+#	graphs21 = [nx.random_degree_sequence_graph(degrees21, seed=None, tries=100) for i in range(half_num_graphs)]
+#	graphs22 = [nx.random_degree_sequence_graph(degrees22, seed=None, tries=100) for i in range(half_num_graphs)]
+
+#	# method 3: no randomness.
+#	graphs21 = [nx.havel_hakimi_graph(degrees21, create_using=None) for i in range(half_num_graphs)]
+#	graphs22 = [nx.havel_hakimi_graph(degrees22, create_using=None) for i in range(half_num_graphs)]
+
+#	# method 4:
+#	graphs21 = [nx.configuration_model(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)]
+#	graphs22 = [nx.degree_sequence_tree(degrees21, create_using=nx.Graph) for i in range(half_num_graphs)]
+
+#	# method 5: the entropy of the two classes is not the same.
+#	graphs21 = [nx.expected_degree_graph(degrees21, seed=None, selfloops=False) for i in range(half_num_graphs)]
+#	graphs22 = [nx.expected_degree_graph(degrees22, seed=None, selfloops=False) for i in range(half_num_graphs)]
+
+#	# method 6: seems there is no randomness.
+#	graphs21 = [nx.random_powerlaw_tree(num_nodes, gamma=3, seed=None, tries=10000) for i in range(half_num_graphs)]
+#	graphs22 = [nx.random_powerlaw_tree(num_nodes, gamma=3, seed=None, tries=10000) for i in range(half_num_graphs)]
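With the active settings, the two datasets are built to differ in the entropy of their degree distributions, while the two classes inside each dataset share the same degree sequence and differ only in node labels: `degrees11`/`degrees12` give entropy 0, and `degrees21`/`degrees22` give entropy ln(10). A quick standalone check (an editorial sketch, not part of this patch; it assumes nothing beyond numpy):

```python
import numpy as np

def degree_entropy(degrees):
	# Shannon entropy (base e) of the empirical degree distribution.
	_, counts = np.unique(degrees, return_counts=True)
	p = counts / counts.sum()
	return float(-np.sum(p * np.log(p)))

print(degree_entropy([5] * 40))                # 0.0: every node has degree 5
print(degree_entropy(list(range(1, 11)) * 6))  # ~2.303 = ln(10): uniform over degrees 1..10
```

+
+	# Add node labels.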
+	for g in graphs21:
+		for n in g.nodes():
+			g.nodes[n]['atom'] = 0
+	for g in graphs22:
+		for n in g.nodes():
+			g.nodes[n]['atom'] = 1
+
+	graphs2 = graphs21 + graphs22
+
+#	# check for isomorphism.
+#	iso_mat1 = np.zeros((len(graphs1), len(graphs1)))
+#	num1 = 0
+#	num2 = 0
+#	for i in range(len(graphs1)):
+#		for j in range(i + 1, len(graphs1)):
+#			if nx.is_isomorphic(graphs1[i], graphs1[j]):
+#				iso_mat1[i, j] = 1
+#				iso_mat1[j, i] = 1
+#				num1 += 1
+#				print('iso:', num1, ':', i, ',', j)
+#			else:
+#				num2 += 1
+#				print('not iso:', num2, ':', i, ',', j)
+#
+#	iso_mat2 = np.zeros((len(graphs2), len(graphs2)))
+#	num1 = 0
+#	num2 = 0
+#	for i in range(len(graphs2)):
+#		for j in range(i + 1, len(graphs2)):
+#			if nx.is_isomorphic(graphs2[i], graphs2[j]):
+#				iso_mat2[i, j] = 1
+#				iso_mat2[j, i] = 1
+#				num1 += 1
+#				print('iso:', num1, ':', i, ',', j)
+#			else:
+#				num2 += 1
+#				print('not iso:', num2, ':', i, ',', j)
+
+	return graphs1, graphs2
+
+
+def get_infos(graph):
+	from gklearn.utils import Dataset
+	ds = Dataset()
+	ds.load_graphs(graph)
+	infos = ds.get_dataset_infos(keys=['all_degree_entropy', 'ave_node_degree'])
+	infos['ave_degree_entropy'] = np.mean(infos['all_degree_entropy'])
+	print(infos['ave_degree_entropy'], ',', infos['ave_node_degree'])
+	return infos
+
+
+def xp_accuracy_diff_entropy():
+
+	# Generate graphs.
+	graphs1, graphs2 = generate_graphs()
+
+
+	# Compute entropy of degree distribution of the generated graphs.
+	info11 = get_infos(graphs1[0:half_num_graphs])
+	info12 = get_infos(graphs1[half_num_graphs:])
+	info21 = get_infos(graphs2[0:half_num_graphs])
+	info22 = get_infos(graphs2[half_num_graphs:])
+
+	# Run and save.
+	import pickle
+	import os
+	save_dir = 'outputs/accuracy_diff_entropy/'
+	if not os.path.exists(save_dir):
+		os.makedirs(save_dir)
+
+	accuracies = {}
+	confidences = {}
+
+	for kernel_name in Graph_Kernel_List:
+		print()
+		print('Kernel:', kernel_name)
+
+		accuracies[kernel_name] = []
+		confidences[kernel_name] = []
+		for set_i, graphs in enumerate([graphs1, graphs2]):
+			print()
+			print('Graph set', set_i)
+
+			tmp_graphs = [g.copy() for g in graphs]
+			targets = [0] * half_num_graphs + [1] * half_num_graphs
+
+			accuracy = 'error'
+			confidence = 'error'
+			try:
+				accuracy, confidence = cross_validate(tmp_graphs, targets, kernel_name, ds_name=str(set_i), output_dir=save_dir) #, n_jobs=1)
+			except Exception as exp:
+				print('An exception occurred when running this experiment:')
+				LOG_FILENAME = save_dir + 'error.txt'
+				logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
+				logging.exception('\n' + kernel_name + ', ' + str(set_i) + ':')
+				print(repr(exp))
+			accuracies[kernel_name].append(accuracy)
+			confidences[kernel_name].append(confidence)
+
+			pickle.dump(accuracy, open(save_dir + 'accuracy.' + kernel_name + '.' + str(set_i) + '.pkl', 'wb'))
+			pickle.dump(confidence, open(save_dir + 'confidence.' + kernel_name + '.' + str(set_i) + '.pkl', 'wb'))
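Once a run finishes, the aggregated results written just below (`accuracies.pkl`) can be reloaded to compare each kernel across the two datasets; a small editorial sketch (not part of the patch), assuming the output layout produced by this script:

```python
import pickle

# Each kernel maps to [accuracy on set 0, accuracy on set 1];
# set 0 has degree-distribution entropy 0, set 1 has entropy ln(10).
with open('outputs/accuracy_diff_entropy/accuracies.pkl', 'rb') as f:
	accuracies = pickle.load(f)
for kernel_name, accs in accuracies.items():
	print(kernel_name, '| entropy 0:', accs[0], '| entropy ln(10):', accs[1])
```

+
+	# Save all.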
+ pickle.dump(accuracies, open(save_dir + 'accuracies.pkl', 'wb')) + pickle.dump(confidences, open(save_dir + 'confidences.pkl', 'wb')) + + return + + +if __name__ == '__main__': + xp_accuracy_diff_entropy() \ No newline at end of file diff --git a/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py b/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py index 4c827ce..0e25f46 100644 --- a/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py +++ b/gklearn/experiments/papers/PRL_2020/runtimes_28cores.py @@ -21,14 +21,14 @@ def xp_runtimes_of_all_28cores(): run_times = {} - for kernel_name in Graph_Kernel_List: + for ds_name in Dataset_List: print() - print('Kernel:', kernel_name) + print('Dataset:', ds_name) - run_times[kernel_name] = [] - for ds_name in Dataset_List: + run_times[ds_name] = [] + for kernel_name in Graph_Kernel_List: print() - print('Dataset:', ds_name) + print('Kernel:', kernel_name) # get graphs. graphs, _ = load_predefined_dataset(ds_name) @@ -43,7 +43,7 @@ def xp_runtimes_of_all_28cores(): logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) - run_times[kernel_name].append(run_time) + run_times[ds_name].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.pkl', 'wb')) diff --git a/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py b/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py index 343694c..6d118d8 100644 --- a/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py +++ b/gklearn/experiments/papers/PRL_2020/runtimes_diff_chunksizes.py @@ -20,17 +20,17 @@ def xp_runtimes_diff_chunksizes(): os.makedirs(save_dir) run_times = {} - - for kernel_name in Graph_Kernel_List: + + for ds_name in Dataset_List: print() - print('Kernel:', kernel_name) - - run_times[kernel_name] = [] - for ds_name in Dataset_List: + print('Dataset:', ds_name) + + run_times[ds_name] = [] + for kernel_name in Graph_Kernel_List: print() - print('Dataset:', ds_name) + print('Kernel:', kernel_name) - run_times[kernel_name].append([]) + run_times[ds_name].append([]) for chunksize in [1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000]: print() print('Chunksize:', chunksize) @@ -48,7 +48,7 @@ def xp_runtimes_diff_chunksizes(): logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) logging.exception('') print(repr(exp)) - run_times[kernel_name][-1].append(run_time) + run_times[ds_name][-1].append(run_time) pickle.dump(run_time, open(save_dir + 'run_time.' + kernel_name + '.' + ds_name + '.' + str(chunksize) + '.pkl', 'wb')) diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py index 36bf1bc..a7056f3 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_N.py @@ -16,7 +16,7 @@ def generate_graphs(): return graphs -def xp_synthesied_graphs_dataset_size(): +def xp_synthesized_graphs_dataset_size(): # Generate graphs. 
graphs = generate_graphs() @@ -61,4 +61,4 @@ def xp_synthesied_graphs_dataset_size(): if __name__ == '__main__': - xp_synthesied_graphs_dataset_size() \ No newline at end of file + xp_synthesized_graphs_dataset_size() \ No newline at end of file diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py index 0562d81..2f5594d 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_degrees.py @@ -16,7 +16,7 @@ def generate_graphs(degree): return graphs -def xp_synthesied_graphs_degrees(): +def xp_synthesized_graphs_degrees(): # Run and save. import pickle @@ -60,4 +60,4 @@ def xp_synthesied_graphs_degrees(): if __name__ == '__main__': - xp_synthesied_graphs_degrees() + xp_synthesized_graphs_degrees() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py index 9a8e721..51e07ba 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_el.py @@ -16,7 +16,7 @@ def generate_graphs(num_el_alp): return graphs -def xp_synthesied_graphs_num_edge_label_alphabet(): +def xp_synthesized_graphs_num_edge_label_alphabet(): # Run and save. import pickle @@ -60,4 +60,4 @@ def xp_synthesied_graphs_num_edge_label_alphabet(): if __name__ == '__main__': - xp_synthesied_graphs_num_edge_label_alphabet() + xp_synthesized_graphs_num_edge_label_alphabet() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py index 2ab63ee..61609ba 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nl.py @@ -16,7 +16,7 @@ def generate_graphs(num_nl_alp): return graphs -def xp_synthesied_graphs_num_node_label_alphabet(): +def xp_synthesized_graphs_num_node_label_alphabet(): # Run and save. import pickle @@ -61,4 +61,4 @@ def xp_synthesied_graphs_num_node_label_alphabet(): if __name__ == '__main__': - xp_synthesied_graphs_num_node_label_alphabet() + xp_synthesized_graphs_num_node_label_alphabet() diff --git a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py index d0d6ebb..ec6557c 100644 --- a/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py +++ b/gklearn/experiments/papers/PRL_2020/synthesized_graphs_num_nodes.py @@ -16,7 +16,7 @@ def generate_graphs(num_nodes): return graphs -def xp_synthesied_graphs_num_nodes(): +def xp_synthesized_graphs_num_nodes(): # Run and save. 
import pickle @@ -61,4 +61,4 @@ def xp_synthesied_graphs_num_nodes(): if __name__ == '__main__': - xp_synthesied_graphs_num_nodes() + xp_synthesized_graphs_num_nodes() diff --git a/gklearn/experiments/papers/PRL_2020/utils.py b/gklearn/experiments/papers/PRL_2020/utils.py index 07c82f7..99e2d20 100644 --- a/gklearn/experiments/papers/PRL_2020/utils.py +++ b/gklearn/experiments/papers/PRL_2020/utils.py @@ -6,6 +6,8 @@ Created on Tue Sep 22 11:33:28 2020 @author: ljia """ import multiprocessing +import numpy as np +from gklearn.utils import model_selection_for_precomputed_kernel Graph_Kernel_List = ['PathUpToH', 'WLSubtree', 'SylvesterEquation', 'Marginalized', 'ShortestPath', 'Treelet', 'ConjugateGradient', 'FixedPoint', 'SpectralDecomposition', 'StructuralSP', 'CommonWalk'] @@ -109,4 +111,123 @@ def compute_graph_kernel(graphs, kernel_name, n_jobs=multiprocessing.cpu_count() params['verbose'] = True results = estimator(graphs, **params) + return results[0], results[1] + + +def cross_validate(graphs, targets, kernel_name, output_dir='outputs/', ds_name='synthesized', n_jobs=multiprocessing.cpu_count()): + + param_grid = None + + if kernel_name == 'CommonWalk': + from gklearn.kernels.commonWalkKernel import commonwalkkernel + estimator = commonwalkkernel + param_grid_precomputed = [{'compute_method': ['geo'], + 'weight': np.linspace(0.01, 0.15, 15)}] + + elif kernel_name == 'Marginalized': + from gklearn.kernels.marginalizedKernel import marginalizedkernel + estimator = marginalizedkernel + param_grid_precomputed = {'p_quit': np.linspace(0.1, 0.9, 9), + 'n_iteration': np.linspace(1, 19, 7), + 'remove_totters': [False]} + + elif kernel_name == 'SylvesterEquation': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + param_grid_precomputed = {'compute_method': ['sylvester'], +# 'weight': np.linspace(0.01, 0.10, 10)} + 'weight': np.logspace(-1, -10, num=10, base=10)} + + elif kernel_name == 'ConjugateGradient': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + param_grid_precomputed = {'compute_method': ['conjugate'], + 'node_kernels': [sub_kernel], 'edge_kernels': [sub_kernel], + 'weight': np.logspace(-1, -10, num=10, base=10)} + + elif kernel_name == 'FixedPoint': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import functools + mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel) + sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel} + param_grid_precomputed = {'compute_method': ['fp'], + 'node_kernels': [sub_kernel], 'edge_kernels': [sub_kernel], + 'weight': np.logspace(-3, -10, num=8, base=10)} + + elif kernel_name == 'SpectralDecomposition': + from gklearn.kernels.randomWalkKernel import randomwalkkernel + estimator = randomwalkkernel + param_grid_precomputed = {'compute_method': ['spectral'], + 'weight': np.logspace(-1, -10, num=10, base=10), + 'sub_kernel': ['geo', 'exp']} + + elif kernel_name == 'ShortestPath': + from gklearn.kernels.spKernel import spkernel + estimator = spkernel + from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct + import 
functools
+		mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+		sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
+		param_grid_precomputed = {'node_kernels': [sub_kernel]}
+
+	elif kernel_name == 'StructuralSP':
+		from gklearn.kernels.structuralspKernel import structuralspkernel
+		estimator = structuralspkernel
+		from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct
+		import functools
+		mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
+		sub_kernel = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
+		param_grid_precomputed = {'node_kernels': [sub_kernel], 'edge_kernels': [sub_kernel],
+								  'compute_method': ['naive']}
+
+	elif kernel_name == 'PathUpToH':
+		from gklearn.kernels.untilHPathKernel import untilhpathkernel
+		estimator = untilhpathkernel
+		param_grid_precomputed = {'depth': np.linspace(1, 10, 10), # [2]
+								  'k_func': ['MinMax', 'tanimoto'], # ['MinMax']
+								  'compute_method': ['trie']}
+
+	elif kernel_name == 'Treelet':
+		from gklearn.kernels.treeletKernel import treeletkernel
+		estimator = treeletkernel
+		from gklearn.utils.kernels import gaussiankernel, polynomialkernel
+		import functools
+		gkernels = [functools.partial(gaussiankernel, gamma=1 / ga)
+#					for ga in np.linspace(1, 10, 10)]
+					for ga in np.logspace(0, 10, num=11, base=10)]
+		pkernels = [functools.partial(polynomialkernel, d=d, c=c) for d in range(1, 11)
+					for c in np.logspace(0, 10, num=11, base=10)]
+		param_grid_precomputed = {'sub_kernel': pkernels + gkernels}
+
+	elif kernel_name == 'WLSubtree':
+		from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
+		estimator = weisfeilerlehmankernel
+		param_grid_precomputed = {'base_kernel': ['subtree'],
+								  'height': np.linspace(0, 10, 11)}
+		param_grid = {'C': np.logspace(-10, 4, num=29, base=10)}
+
+	if param_grid is None:
+		param_grid = {'C': np.logspace(-10, 10, num=41, base=10)}
+
+	results = model_selection_for_precomputed_kernel(
+			graphs,
+			estimator,
+			param_grid_precomputed,
+			param_grid,
+			'classification',
+			NUM_TRIALS=28,
+			datafile_y=targets,
+			extra_params=None,
+			ds_name=ds_name,
+			output_dir=output_dir,
+			n_jobs=n_jobs,
+			read_gm_from_file=False,
+			verbose=True)
+	return results[0], results[1]
\ No newline at end of file
diff --git a/gklearn/utils/dataset.py b/gklearn/utils/dataset.py
index 7201a0d..3d68212 100644
--- a/gklearn/utils/dataset.py
+++ b/gklearn/utils/dataset.py
@@ -13,6 +13,7 @@ import os
 
 
 class Dataset(object):
+
 	def __init__(self, filename=None, filename_targets=None, **kwargs):
 		if filename is None:
 			self.__graphs = None
@@ -180,13 +181,13 @@ class Dataset(object):
 # 			return 0
 
 
-	def get_dataset_infos(self, keys=None):
+	def get_dataset_infos(self, keys=None, params=None):
 		"""Computes and returns the structure and property information of
 		the graph dataset.
 
 		Parameters
 		----------
-		keys : list
-			List of strings which indicate which informations will be returned. The
+		keys : list, optional
+			A list of strings indicating which information will be returned. The
 			possible choices includes:
 
 			'substructures': sub-structures graphs contains, including 'linear', 'non
 
 			'class_number': number of classes. Only available for classification problems.
 
+			'all_degree_entropy': the entropy of the degree distribution of each graph.
+
+			'ave_degree_entropy': the average entropy of the degree distributions of all graphs.
+
 			All informations above will be returned if `keys` is not given.
+
+		params: dict of dict, optional
+			A dictionary which contains extra parameters for each possible
+			element in ``keys``.
 
 		Return
 		------
@@ -276,6 +285,8 @@ class Dataset(object):
 				'node_attr_dim',
 				'edge_attr_dim',
 				'class_number',
+				'all_degree_entropy',
+				'ave_degree_entropy'
 			]
 
 		# dataset size
@@ -420,6 +431,22 @@ class Dataset(object):
 			self.__edge_attr_dim = self.__get_edge_attr_dim()
 			infos['edge_attr_dim'] = self.__edge_attr_dim
 
+		# entropy of degree distribution.
+
+		if 'all_degree_entropy' in keys:
+			if params is not None and ('all_degree_entropy' in params) and ('base' in params['all_degree_entropy']):
+				base = params['all_degree_entropy']['base']
+			else:
+				base = None
+			infos['all_degree_entropy'] = self.__compute_all_degree_entropy(base=base)
+
+		if 'ave_degree_entropy' in keys:
+			if params is not None and ('ave_degree_entropy' in params) and ('base' in params['ave_degree_entropy']):
+				base = params['ave_degree_entropy']['base']
+			else:
+				base = None
+			infos['ave_degree_entropy'] = np.mean(self.__compute_all_degree_entropy(base=base))
+
 		return infos
 
 
@@ -653,8 +680,7 @@ class Dataset(object):
 
 
 	def __get_all_fill_factors(self):
-		"""
-		Get fill factor, the number of non-zero entries in the adjacency matrix.
+		"""Get fill factor, the number of non-zero entries in the adjacency matrix.
 
 		Returns
 		-------
@@ -721,7 +747,30 @@ class Dataset(object):
 
 	def __get_edge_attr_dim(self):
 		return len(self.__edge_attrs)
+
+
+	def __compute_all_degree_entropy(self, base=None):
+		"""Compute the entropy of the degree distribution of each graph.
+
+		Parameters
+		----------
+		base : float, optional
+			The logarithmic base to use. The default is ``e`` (natural logarithm).
+
+		Returns
+		-------
+		degree_entropy : list
+			The calculated entropy of the degree distribution of each graph.
+		"""
+		from gklearn.utils.stats import entropy
+
+		degree_entropy = []
+		for g in self.__graphs:
+			degrees = list(dict(g.degree()).values())
+			en = entropy(degrees, base=base)
+			degree_entropy.append(en)
+		return degree_entropy
+
 
 	@property
 	def graphs(self):
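The new `params` argument threads per-key options through to the individual computations; in this patch only the entropy base is wired up. A usage sketch (editorial, not part of the patch; `graphs` stands for any list of networkx graphs, mirroring `get_infos` above):

```python
from gklearn.utils import Dataset

ds = Dataset()
ds.load_graphs(graphs)  # graphs: a list of networkx graphs
infos = ds.get_dataset_infos(
	keys=['all_degree_entropy', 'ave_degree_entropy'],
	params={'all_degree_entropy': {'base': 2},  # entropy in bits instead of nats
	        'ave_degree_entropy': {'base': 2}})
print(infos['ave_degree_entropy'])
```

diff --git a/gklearn/utils/model_selection_precomputed.py b/gklearn/utils/model_selection_precomputed.py
index 1252f12..517d30a 100644
--- a/gklearn/utils/model_selection_precomputed.py
+++ b/gklearn/utils/model_selection_precomputed.py
@@ -22,936 +22,938 @@ from tqdm import tqdm
 
 #@profile
 def model_selection_for_precomputed_kernel(datafile,
-                                           estimator,
-                                           param_grid_precomputed,
-                                           param_grid,
-                                           model_type,
-                                           NUM_TRIALS=30,
-                                           datafile_y=None,
-                                           extra_params=None,
-                                           ds_name='ds-unknown',
                                            output_dir='outputs/',
-                                           n_jobs=1,
-                                           read_gm_from_file=False,
-                                           verbose=True):
-    """Perform model selection, fitting and testing for precomputed kernels
-    using nested CV. Print out neccessary data during the process then finally
-    the results.
-
-    Parameters
-    ----------
-    datafile : string
-        Path of dataset file.
-    estimator : function
-        kernel function used to estimate. This function needs to return a gram matrix.
-    param_grid_precomputed : dictionary
-        Dictionary with names (string) of parameters used to calculate gram
-        matrices as keys and lists of parameter settings to try as values.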
This enables - searching over any sequence of parameter settings. Params with length 1 - will be omitted. - model_type : string - Type of the problem, can be 'regression' or 'classification'. - NUM_TRIALS : integer - Number of random trials of the outer CV loop. The default is 30. - datafile_y : string - Path of file storing y data. This parameter is optional depending on - the given dataset file. - extra_params : dict - Extra parameters for loading dataset. See function gklearn.utils. - graphfiles.loadDataset for detail. - ds_name : string - Name of the dataset. - n_jobs : int - Number of jobs for parallelization. - read_gm_from_file : boolean - Whether gram matrices are loaded from a file. - - Examples - -------- - >>> import numpy as np - >>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel - >>> from gklearn.kernels.untilHPathKernel import untilhpathkernel - >>> - >>> datafile = '../datasets/MUTAG/MUTAG_A.txt' - >>> estimator = untilhpathkernel - >>> param_grid_precomputed = {’depth’: np.linspace(1, 10, 10), ’k_func’: - [’MinMax’, ’tanimoto’], ’compute_method’: [’trie’]} - >>> # ’C’ for classification problems and ’alpha’ for regression problems. - >>> param_grid = [{’C’: np.logspace(-10, 10, num=41, base=10)}, {’alpha’: - np.logspace(-10, 10, num=41, base=10)}] - >>> - >>> model_selection_for_precomputed_kernel(datafile, estimator, - param_grid_precomputed, param_grid[0], 'classification', ds_name=’MUTAG’) - """ - tqdm.monitor_interval = 0 - - output_dir += estimator.__name__ - if not os.path.exists(output_dir): - os.makedirs(output_dir) - # a string to save all the results. - str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n' - str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n' - - # setup the model type - model_type = model_type.lower() - if model_type != 'regression' and model_type != 'classification': - raise Exception( - 'The model type is incorrect! Please choose from regression or classification.' - ) - if verbose: - print() - print('--- This is a %s problem ---' % model_type) - str_fw += 'This is a %s problem.\n' % model_type - - # calculate gram matrices rather than read them from file. - if read_gm_from_file == False: - # Load the dataset - if verbose: - print() - print('\n1. Loading dataset from file...') - if isinstance(datafile, str): - dataset, y_all = loadDataset( - datafile, filename_y=datafile_y, extra_params=extra_params) - else: # load data directly from variable. - dataset = datafile - y_all = datafile_y - - # import matplotlib.pyplot as plt - # import networkx as nx - # nx.draw_networkx(dataset[30]) - # plt.show() - - # Grid of parameters with a discrete number of values for each. - param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) - param_list = list(ParameterGrid(param_grid)) - - gram_matrices = [ - ] # a list to store gram matrices for all param_grid_precomputed - gram_matrix_time = [ - ] # a list to store time to calculate gram matrices - param_list_pre_revised = [ - ] # list to store param grids precomputed ignoring the useless ones - - # calculate all gram matrices - if verbose: - print() - print('2. Calculating gram matrices. This could take a while...') - str_fw += '\nII. 
Gram matrices.\n\n' - tts = time.time() # start training time - nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) - for idx, params_out in enumerate(param_list_precomputed): - y = y_all[:] - params_out['n_jobs'] = n_jobs - params_out['verbose'] = verbose -# print(dataset) -# import networkx as nx -# nx.draw_networkx(dataset[1]) -# plt.show() - rtn_data = estimator(dataset[:], **params_out) - Kmatrix = rtn_data[0] - current_run_time = rtn_data[1] - # for some kernels, some graphs in datasets may not meet the - # kernels' requirements for graph structure. These graphs are trimmed. - if len(rtn_data) == 3: - idx_trim = rtn_data[2] # the index of trimmed graph list - y = [y[idxt] for idxt in idx_trim] # trim y accordingly -# Kmatrix = np.random.rand(2250, 2250) -# current_run_time = 0.1 - - # remove graphs whose kernels with themselves are zeros - # @todo: y not changed accordingly? - Kmatrix_diag = Kmatrix.diagonal().copy() - nb_g_ignore = 0 - for idxk, diag in enumerate(Kmatrix_diag): - if diag == 0: - Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) - Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) - nb_g_ignore += 1 - # normalization - # @todo: works only for undirected graph? - Kmatrix_diag = Kmatrix.diagonal().copy() - for i in range(len(Kmatrix)): - for j in range(i, len(Kmatrix)): - Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) - Kmatrix[j][i] = Kmatrix[i][j] - if verbose: - print() - if params_out == {}: - if verbose: - print('the gram matrix is: ') - str_fw += 'the gram matrix is:\n\n' - else: - if verbose: - print('the gram matrix with parameters', params_out, 'is: \n\n') - str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out - if len(Kmatrix) < 2: - nb_gm_ignore += 1 - if verbose: - print('ignored, as at most only one of all its diagonal value is non-zero.') - str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' - else: - if np.isnan(Kmatrix).any( - ): # if the matrix contains elements that are not numbers - nb_gm_ignore += 1 - if verbose: - print('ignored, as it contains elements that are not numbers.') - str_fw += 'ignored, as it contains elements that are not numbers.\n\n' - else: -# print(Kmatrix) - str_fw += np.array2string( - Kmatrix, - separator=',') + '\n\n' -# separator=',', -# threshold=np.inf, -# floatmode='unique') + '\n\n' - - fig_file_name = output_dir + '/GM[ds]' + ds_name - if params_out != {}: - fig_file_name += '[params]' + str(idx) - plt.imshow(Kmatrix) - plt.colorbar() - plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) -# plt.show() - plt.clf() - gram_matrices.append(Kmatrix) - gram_matrix_time.append(current_run_time) - param_list_pre_revised.append(params_out) - if nb_g_ignore > 0: - if verbose: - print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore) - str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' 
% nb_g_ignore - if verbose: - print() - print( - '{} gram matrices are calculated, {} of which are ignored.'.format( - len(param_list_precomputed), nb_gm_ignore)) - str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) - str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' - str_fw += ''.join([ - '{}: {}\n'.format(idx, params_out) - for idx, params_out in enumerate(param_list_precomputed) - ]) - - if verbose: - print() - if len(gram_matrices) == 0: - if verbose: - print('all gram matrices are ignored, no results obtained.') - str_fw += '\nall gram matrices are ignored, no results obtained.\n\n' - else: - # save gram matrices to file. -# np.savez(output_dir + '/' + ds_name + '.gm', -# gms=gram_matrices, params=param_list_pre_revised, y=y, -# gmtime=gram_matrix_time) - if verbose: - print( - '3. Fitting and predicting using nested cross validation. This could really take a while...' - ) - - # ---- use pool.imap_unordered to parallel and track progress. ---- -# train_pref = [] -# val_pref = [] -# test_pref = [] -# def func_assign(result, var_to_assign): -# for idx, itm in enumerate(var_to_assign): -# itm.append(result[idx]) -# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) -# -# parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, -# [train_pref, val_pref, test_pref], glbv=gram_matrices, -# method='imap_unordered', n_jobs=n_jobs, chunksize=1, -# itr_desc='cross validation') - - def init_worker(gms_toshare): - global G_gms - G_gms = gms_toshare - -# gram_matrices = np.array(gram_matrices) -# gms_shape = gram_matrices.shape -# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C')) -# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape)) - pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,)) - trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type) - train_pref = [] - val_pref = [] - test_pref = [] -# if NUM_TRIALS < 1000 * n_jobs: -# chunksize = int(NUM_TRIALS / n_jobs) + 1 -# else: -# chunksize = 1000 - chunksize = 1 - if verbose: - iterator = tqdm(pool.imap_unordered(trial_do_partial, - range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout) - else: - iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize) - for o1, o2, o3 in iterator: - train_pref.append(o1) - val_pref.append(o2) - test_pref.append(o3) - pool.close() - pool.join() - -# # ---- use pool.map to parallel. ---- -# pool = Pool(n_jobs) -# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type) -# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) -# train_pref = [item[0] for item in result_perf] -# val_pref = [item[1] for item in result_perf] -# test_pref = [item[2] for item in result_perf] - -# # ---- direct running, normally use a single CPU core. ---- -# train_pref = [] -# val_pref = [] -# test_pref = [] -# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): -# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) -# train_pref.append(o1) -# val_pref.append(o2) -# test_pref.append(o3) -# print() - - if verbose: - print() - print('4. Getting final performance...') - str_fw += '\nIII. 
Performance.\n\n' - # averages and confidences of performances on outer trials for each combination of parameters - average_train_scores = np.mean(train_pref, axis=0) -# print('val_pref: ', val_pref[0][0]) - average_val_scores = np.mean(val_pref, axis=0) -# print('test_pref: ', test_pref[0][0]) - average_perf_scores = np.mean(test_pref, axis=0) - # sample std is used here - std_train_scores = np.std(train_pref, axis=0, ddof=1) - std_val_scores = np.std(val_pref, axis=0, ddof=1) - std_perf_scores = np.std(test_pref, axis=0, ddof=1) - - if model_type == 'regression': - best_val_perf = np.amin(average_val_scores) - else: - best_val_perf = np.amax(average_val_scores) -# print('average_val_scores: ', average_val_scores) -# print('best_val_perf: ', best_val_perf) -# print() - best_params_index = np.where(average_val_scores == best_val_perf) - # find smallest val std with best val perf. - best_val_stds = [ - std_val_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - min_val_std = np.amin(best_val_stds) - best_params_index = np.where(std_val_scores == min_val_std) - best_params_out = [ - param_list_pre_revised[i] for i in best_params_index[0] - ] - best_params_in = [param_list[i] for i in best_params_index[1]] - if verbose: - print('best_params_out: ', best_params_out) - print('best_params_in: ', best_params_in) - print() - print('best_val_perf: ', best_val_perf) - print('best_val_std: ', min_val_std) - str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out - str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in - str_fw += 'best_val_perf: %s\n' % best_val_perf - str_fw += 'best_val_std: %s\n' % min_val_std - -# print(best_params_index) -# print(best_params_index[0]) -# print(average_perf_scores) - final_performance = [ - average_perf_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - final_confidence = [ - std_perf_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - if verbose: - print('final_performance: ', final_performance) - print('final_confidence: ', final_confidence) - str_fw += 'final_performance: %s\n' % final_performance - str_fw += 'final_confidence: %s\n' % final_confidence - train_performance = [ - average_train_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - train_std = [ - std_train_scores[value][best_params_index[1][idx]] - for idx, value in enumerate(best_params_index[0]) - ] - if verbose: - print('train_performance: %s' % train_performance) - print('train_std: ', train_std) - str_fw += 'train_performance: %s\n' % train_performance - str_fw += 'train_std: %s\n\n' % train_std - - if verbose: - print() - tt_total = time.time() - tts # training time for all hyper-parameters - average_gram_matrix_time = np.mean(gram_matrix_time) - std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0 - best_gram_matrix_time = [ - gram_matrix_time[i] for i in best_params_index[0] - ] - ave_bgmt = np.mean(best_gram_matrix_time) - std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 - if verbose: - print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' - .format(average_gram_matrix_time, std_gram_matrix_time)) - print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( - ave_bgmt, std_bgmt)) - print('total training time with all hyper-param choices: 
{:.2f}s'.format(
-                tt_total))
-        str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
-        str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
-        str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)
-
-    # # save results to file
-    # np.savetxt(results_name_pre + 'average_train_scores.dt',
-    #            average_train_scores)
-    # np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
-    # np.savetxt(results_name_pre + 'average_perf_scores.dt',
-    #            average_perf_scores)
-    # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
-    # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
-    # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
-
-    # np.save(results_name_pre + 'best_params_index', best_params_index)
-    # np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
-    # np.save(results_name_pre + 'best_params_in.dt', best_params_in)
-    # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
-    # np.save(results_name_pre + 'best_val_std.dt', best_val_std)
-    # np.save(results_name_pre + 'final_performance.dt', final_performance)
-    # np.save(results_name_pre + 'final_confidence.dt', final_confidence)
-    # np.save(results_name_pre + 'train_performance.dt', train_performance)
-    # np.save(results_name_pre + 'train_std.dt', train_std)
-
-    # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
-    # np.save(results_name_pre + 'average_gram_matrix_time.dt',
-    #         average_gram_matrix_time)
-    # np.save(results_name_pre + 'std_gram_matrix_time.dt',
-    #         std_gram_matrix_time)
-    # np.save(results_name_pre + 'best_gram_matrix_time.dt',
-    #         best_gram_matrix_time)
-
-    # read gram matrices from file.
-    else:
-        # Grid of parameters with a discrete number of values for each.
-#        param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
-        param_list = list(ParameterGrid(param_grid))
-
-        # read gram matrices from file.
-        if verbose:
-            print()
-            print('2. Reading gram matrices from file...')
-        str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n'
-        gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz')
-        gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed
-        gram_matrix_time = gmfile['gmtime'] # time used to compute the gram matrices
-        param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones
-        y = gmfile['y'].tolist()
-
-        tts = time.time() # start training time
-#        nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
-        if verbose:
-            print(
-                '3. Fitting and predicting using nested cross validation. This could really take a while...'
-            )
+                                           n_jobs=1,
+                                           read_gm_from_file=False,
+                                           verbose=True):
+	"""Perform model selection, fitting and testing for precomputed kernels
+	using nested CV. Print out necessary data during the process then finally
+	the results.
+
+	Parameters
+	----------
+	datafile : string
+		Path of dataset file.
+	estimator : function
+		kernel function used to estimate. This function needs to return a gram matrix.
+	param_grid_precomputed : dictionary
+		Dictionary with names (string) of parameters used to calculate gram
+		matrices as keys and lists of parameter settings to try as values. This
+		enables searching over any sequence of parameter settings. Params with
+		length 1 will be omitted.
+	param_grid : dictionary
+		Dictionary with names (string) of parameters used as penalties as keys
+		and lists of parameter settings to try as values. This enables
+		searching over any sequence of parameter settings. Params with length 1
+		will be omitted.
+	model_type : string
+		Type of the problem, can be 'regression' or 'classification'.
+	NUM_TRIALS : integer
+		Number of random trials of the outer CV loop. The default is 30.
+	datafile_y : string
+		Path of file storing y data. This parameter is optional depending on
+		the given dataset file.
+	extra_params : dict
+		Extra parameters for loading dataset. See function gklearn.utils.
+		graphfiles.loadDataset for detail.
+	ds_name : string
+		Name of the dataset.
+	n_jobs : int
+		Number of jobs for parallelization.
+	read_gm_from_file : boolean
+		Whether gram matrices are loaded from a file.
+
+	Examples
+	--------
+	>>> import numpy as np
+	>>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
+	>>> from gklearn.kernels.untilHPathKernel import untilhpathkernel
+	>>>
+	>>> datafile = '../datasets/MUTAG/MUTAG_A.txt'
+	>>> estimator = untilhpathkernel
+	>>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func':
+			['MinMax', 'tanimoto'], 'compute_method': ['trie']}
+	>>> # 'C' for classification problems and 'alpha' for regression problems.
+	>>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha':
+			np.logspace(-10, 10, num=41, base=10)}]
+	>>>
+	>>> model_selection_for_precomputed_kernel(datafile, estimator,
+			param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG')
+	"""
+	tqdm.monitor_interval = 0
+
+	output_dir += estimator.__name__
+	if not os.path.exists(output_dir):
+		os.makedirs(output_dir)
+	# a string to save all the results.
+	str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
+	str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
+
+	# setup the model type
+	model_type = model_type.lower()
+	if model_type != 'regression' and model_type != 'classification':
+		raise Exception(
+			'The model type is incorrect! Please choose from regression or classification.'
+		)
+	if verbose:
+		print()
+		print('--- This is a %s problem ---' % model_type)
+	str_fw += 'This is a %s problem.\n' % model_type
+
+	# calculate gram matrices rather than read them from file.
+	if read_gm_from_file == False:
+		# Load the dataset
+		if verbose:
+			print()
+			print('\n1. Loading dataset from file...')
+		if isinstance(datafile, str):
+			dataset, y_all = loadDataset(
+				datafile, filename_y=datafile_y, extra_params=extra_params)
+		else: # load data directly from variable.
+			dataset = datafile
+			y_all = datafile_y
+
+#		import matplotlib.pyplot as plt
+#		import networkx as nx
+#		nx.draw_networkx(dataset[30])
+#		plt.show()
+
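Both branches of this function expand their grids with scikit-learn's `ParameterGrid`; a small editorial sketch (illustrative values, not from the patch) of what that expansion yields:

```python
from sklearn.model_selection import ParameterGrid

grid = {'compute_method': ['geo'], 'weight': [0.01, 0.08, 0.15]}
print(list(ParameterGrid(grid)))
# [{'compute_method': 'geo', 'weight': 0.01},
#  {'compute_method': 'geo', 'weight': 0.08},
#  {'compute_method': 'geo', 'weight': 0.15}]
```

+		# Grid of parameters with a discrete number of values for each.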
+ param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) + param_list = list(ParameterGrid(param_grid)) + + gram_matrices = [ + ] # a list to store gram matrices for all param_grid_precomputed + gram_matrix_time = [ + ] # a list to store time to calculate gram matrices + param_list_pre_revised = [ + ] # list to store param grids precomputed ignoring the useless ones + + # calculate all gram matrices + if verbose: + print() + print('2. Calculating gram matrices. This could take a while...') + str_fw += '\nII. Gram matrices.\n\n' + tts = time.time() # start training time + nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) + for idx, params_out in enumerate(param_list_precomputed): + y = y_all[:] + params_out['n_jobs'] = n_jobs + params_out['verbose'] = verbose +# print(dataset) +# import networkx as nx +# nx.draw_networkx(dataset[1]) +# plt.show() + rtn_data = estimator(dataset[:], **params_out) + Kmatrix = rtn_data[0] + current_run_time = rtn_data[1] + # for some kernels, some graphs in datasets may not meet the + # kernels' requirements for graph structure. These graphs are trimmed. + if len(rtn_data) == 3: + idx_trim = rtn_data[2] # the index of trimmed graph list + y = [y[idxt] for idxt in idx_trim] # trim y accordingly +# Kmatrix = np.random.rand(2250, 2250) +# current_run_time = 0.1 + + # remove graphs whose kernels with themselves are zeros + # @todo: y not changed accordingly? + Kmatrix_diag = Kmatrix.diagonal().copy() + nb_g_ignore = 0 + for idxk, diag in enumerate(Kmatrix_diag): + if diag == 0: + Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0) + Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) + nb_g_ignore += 1 + # normalization + # @todo: works only for undirected graph? + Kmatrix_diag = Kmatrix.diagonal().copy() + for i in range(len(Kmatrix)): + for j in range(i, len(Kmatrix)): + Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) + Kmatrix[j][i] = Kmatrix[i][j] + if verbose: + print() + if params_out == {}: + if verbose: + print('the gram matrix is: ') + str_fw += 'the gram matrix is:\n\n' + else: + if verbose: + print('the gram matrix with parameters', params_out, 'is: \n\n') + str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out + if len(Kmatrix) < 2: + nb_gm_ignore += 1 + if verbose: + print('ignored, as at most only one of all its diagonal value is non-zero.') + str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n' + else: + if np.isnan(Kmatrix).any( + ): # if the matrix contains elements that are not numbers + nb_gm_ignore += 1 + if verbose: + print('ignored, as it contains elements that are not numbers.') + str_fw += 'ignored, as it contains elements that are not numbers.\n\n' + else: +# print(Kmatrix) + str_fw += np.array2string( + Kmatrix, + separator=',') + '\n\n' +# separator=',', +# threshold=np.inf, +# floatmode='unique') + '\n\n' + + fig_file_name = output_dir + '/GM[ds]' + ds_name + if params_out != {}: + fig_file_name += '[params]' + str(idx) + plt.imshow(Kmatrix) + plt.colorbar() + plt.savefig(fig_file_name + '.eps', format='eps', dpi=300) +# plt.show() + plt.clf() + gram_matrices.append(Kmatrix) + gram_matrix_time.append(current_run_time) + param_list_pre_revised.append(params_out) + if nb_g_ignore > 0: + if verbose: + print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' 
% nb_g_ignore) + str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore + if verbose: + print() + print( + '{} gram matrices are calculated, {} of which are ignored.'.format( + len(param_list_precomputed), nb_gm_ignore)) + str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore) + str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n' + str_fw += ''.join([ + '{}: {}\n'.format(idx, params_out) + for idx, params_out in enumerate(param_list_precomputed) + ]) + + if verbose: + print() + if len(gram_matrices) == 0: + if verbose: + print('all gram matrices are ignored, no results obtained.') + str_fw += '\nall gram matrices are ignored, no results obtained.\n\n' + else: + # save gram matrices to file. +# np.savez(output_dir + '/' + ds_name + '.gm', +# gms=gram_matrices, params=param_list_pre_revised, y=y, +# gmtime=gram_matrix_time) + if verbose: + print( + '3. Fitting and predicting using nested cross validation. This could really take a while...' + ) + + # ---- use pool.imap_unordered to parallel and track progress. ---- +# train_pref = [] +# val_pref = [] +# test_pref = [] +# def func_assign(result, var_to_assign): +# for idx, itm in enumerate(var_to_assign): +# itm.append(result[idx]) +# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type) +# +# parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign, +# [train_pref, val_pref, test_pref], glbv=gram_matrices, +# method='imap_unordered', n_jobs=n_jobs, chunksize=1, +# itr_desc='cross validation') + + def init_worker(gms_toshare): + global G_gms + G_gms = gms_toshare + +# gram_matrices = np.array(gram_matrices) +# gms_shape = gram_matrices.shape +# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C')) +# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape)) + pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,)) + trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type) + train_pref = [] + val_pref = [] + test_pref = [] +# if NUM_TRIALS < 1000 * n_jobs: +# chunksize = int(NUM_TRIALS / n_jobs) + 1 +# else: +# chunksize = 1000 + chunksize = 1 + if verbose: + iterator = tqdm(pool.imap_unordered(trial_do_partial, + range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout) + else: + iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize) + for o1, o2, o3 in iterator: + train_pref.append(o1) + val_pref.append(o2) + test_pref.append(o3) + pool.close() + pool.join() + +# # ---- use pool.map to parallel. ---- +# pool = Pool(n_jobs) +# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type) +# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS)) +# train_pref = [item[0] for item in result_perf] +# val_pref = [item[1] for item in result_perf] +# test_pref = [item[2] for item in result_perf] + +# # ---- direct running, normally use a single CPU core. ---- +# train_pref = [] +# val_pref = [] +# test_pref = [] +# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout): +# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i) +# train_pref.append(o1) +# val_pref.append(o2) +# test_pref.append(o3) +# print() + + if verbose: + print() + print('4. 
Getting final performance...') + str_fw += '\nIII. Performance.\n\n' + # averages and confidences of performances on outer trials for each combination of parameters + average_train_scores = np.mean(train_pref, axis=0) +# print('val_pref: ', val_pref[0][0]) + average_val_scores = np.mean(val_pref, axis=0) +# print('test_pref: ', test_pref[0][0]) + average_perf_scores = np.mean(test_pref, axis=0) + # sample std is used here + std_train_scores = np.std(train_pref, axis=0, ddof=1) + std_val_scores = np.std(val_pref, axis=0, ddof=1) + std_perf_scores = np.std(test_pref, axis=0, ddof=1) + + if model_type == 'regression': + best_val_perf = np.amin(average_val_scores) + else: + best_val_perf = np.amax(average_val_scores) +# print('average_val_scores: ', average_val_scores) +# print('best_val_perf: ', best_val_perf) +# print() + best_params_index = np.where(average_val_scores == best_val_perf) + # find smallest val std with best val perf. + best_val_stds = [ + std_val_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + min_val_std = np.amin(best_val_stds) + best_params_index = np.where(std_val_scores == min_val_std) + best_params_out = [ + param_list_pre_revised[i] for i in best_params_index[0] + ] + best_params_in = [param_list[i] for i in best_params_index[1]] + if verbose: + print('best_params_out: ', best_params_out) + print('best_params_in: ', best_params_in) + print() + print('best_val_perf: ', best_val_perf) + print('best_val_std: ', min_val_std) + str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out + str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in + str_fw += 'best_val_perf: %s\n' % best_val_perf + str_fw += 'best_val_std: %s\n' % min_val_std + +# print(best_params_index) +# print(best_params_index[0]) +# print(average_perf_scores) + final_performance = [ + average_perf_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + final_confidence = [ + std_perf_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + if verbose: + print('final_performance: ', final_performance) + print('final_confidence: ', final_confidence) + str_fw += 'final_performance: %s\n' % final_performance + str_fw += 'final_confidence: %s\n' % final_confidence + train_performance = [ + average_train_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + train_std = [ + std_train_scores[value][best_params_index[1][idx]] + for idx, value in enumerate(best_params_index[0]) + ] + if verbose: + print('train_performance: %s' % train_performance) + print('train_std: ', train_std) + str_fw += 'train_performance: %s\n' % train_performance + str_fw += 'train_std: %s\n\n' % train_std + + if verbose: + print() + tt_total = time.time() - tts # training time for all hyper-parameters + average_gram_matrix_time = np.mean(gram_matrix_time) + std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0 + best_gram_matrix_time = [ + gram_matrix_time[i] for i in best_params_index[0] + ] + ave_bgmt = np.mean(best_gram_matrix_time) + std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0 + if verbose: + print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s' + .format(average_gram_matrix_time, std_gram_matrix_time)) + print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format( + ave_bgmt, std_bgmt)) + 
print('total training time with all hyper-param choices: {:.2f}s'.format( + tt_total)) + str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time) + str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt) + str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total) + + # # save results to file + # np.savetxt(results_name_pre + 'average_train_scores.dt', + # average_train_scores) + # np.savetxt(results_name_pre + 'average_val_scores', average_val_scores) + # np.savetxt(results_name_pre + 'average_perf_scores.dt', + # average_perf_scores) + # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores) + # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores) + # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores) + + # np.save(results_name_pre + 'best_params_index', best_params_index) + # np.save(results_name_pre + 'best_params_pre.dt', best_params_out) + # np.save(results_name_pre + 'best_params_in.dt', best_params_in) + # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf) + # np.save(results_name_pre + 'best_val_std.dt', best_val_std) + # np.save(results_name_pre + 'final_performance.dt', final_performance) + # np.save(results_name_pre + 'final_confidence.dt', final_confidence) + # np.save(results_name_pre + 'train_performance.dt', train_performance) + # np.save(results_name_pre + 'train_std.dt', train_std) + + # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time) + # np.save(results_name_pre + 'average_gram_matrix_time.dt', + # average_gram_matrix_time) + # np.save(results_name_pre + 'std_gram_matrix_time.dt', + # std_gram_matrix_time) + # np.save(results_name_pre + 'best_gram_matrix_time.dt', + # best_gram_matrix_time) + + # read gram matrices from file. + else: + # Grid of parameters with a discrete number of values for each. +# param_list_precomputed = list(ParameterGrid(param_grid_precomputed)) + param_list = list(ParameterGrid(param_grid)) + + # read gram matrices from file. + if verbose: + print() + print('2. Reading gram matrices from file...') + str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n' + gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz') + gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed + gram_matrix_time = gmfile['gmtime'] # time used to compute the gram matrices + param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones + y = gmfile['y'].tolist() + + tts = time.time() # start training time +# nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN) + if verbose: + print( + '3. Fitting and predicting using nested cross validation. This could really take a while...' + ) - # ---- use pool.imap_unordered to parallel and track progress. 
-    def init_worker(gms_toshare):
-        global G_gms
-        G_gms = gms_toshare
-
-    pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
-    trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
-    train_pref = []
-    val_pref = []
-    test_pref = []
-    chunksize = 1
-    if verbose:
-        iterator = tqdm(pool.imap_unordered(trial_do_partial,
-                        range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout)
-    else:
-        iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize)
-    for o1, o2, o3 in iterator:
-        train_pref.append(o1)
-        val_pref.append(o2)
-        test_pref.append(o3)
-    pool.close()
-    pool.join()
-
-#    # ---- use pool.map to parallelize. ----
-#    result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
-#    train_pref = [item[0] for item in result_perf]
-#    val_pref = [item[1] for item in result_perf]
-#    test_pref = [item[2] for item in result_perf]
-
-#    # ---- use joblib.Parallel to parallelize and track progress. ----
-#    trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
-#    result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
-#    train_pref = [item[0] for item in result_perf]
-#    val_pref = [item[1] for item in result_perf]
-#    test_pref = [item[2] for item in result_perf]
-
-#    # ---- direct running, normally using a single CPU core. ----
-#    train_pref = []
-#    val_pref = []
-#    test_pref = []
-#    for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
-#        o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
-#        train_pref.append(o1)
-#        val_pref.append(o2)
-#        test_pref.append(o3)
-
-    if verbose:
-        print()
-        print('4. Getting final performance...')
-    str_fw += '\nIII. Performance.\n\n'
-    # averages and confidences of performances on outer trials for each combination of parameters
-    average_train_scores = np.mean(train_pref, axis=0)
-    average_val_scores = np.mean(val_pref, axis=0)
-    average_perf_scores = np.mean(test_pref, axis=0)
-    # sample std is used here
-    std_train_scores = np.std(train_pref, axis=0, ddof=1)
-    std_val_scores = np.std(val_pref, axis=0, ddof=1)
-    std_perf_scores = np.std(test_pref, axis=0, ddof=1)
-
-    if model_type == 'regression':
-        best_val_perf = np.amin(average_val_scores)
-    else:
-        best_val_perf = np.amax(average_val_scores)
-    best_params_index = np.where(average_val_scores == best_val_perf)
-    # find smallest val std with best val perf.
-    best_val_stds = [
-        std_val_scores[value][best_params_index[1][idx]]
-        for idx, value in enumerate(best_params_index[0])
-    ]
-    min_val_std = np.amin(best_val_stds)
-    best_params_index = np.where(std_val_scores == min_val_std)
-    best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]]
-    best_params_in = [param_list[i] for i in best_params_index[1]]
-    if verbose:
-        print('best_params_out: ', best_params_out)
-        print('best_params_in: ', best_params_in)
-        print()
-        print('best_val_perf: ', best_val_perf)
-        print('best_val_std: ', min_val_std)
-    str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
-    str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
-    str_fw += 'best_val_perf: %s\n' % best_val_perf
-    str_fw += 'best_val_std: %s\n' % min_val_std
-
-    final_performance = [
-        average_perf_scores[value][best_params_index[1][idx]]
-        for idx, value in enumerate(best_params_index[0])
-    ]
-    final_confidence = [
-        std_perf_scores[value][best_params_index[1][idx]]
-        for idx, value in enumerate(best_params_index[0])
-    ]
-    if verbose:
-        print('final_performance: ', final_performance)
-        print('final_confidence: ', final_confidence)
-    str_fw += 'final_performance: %s\n' % final_performance
-    str_fw += 'final_confidence: %s\n' % final_confidence
-    train_performance = [
-        average_train_scores[value][best_params_index[1][idx]]
-        for idx, value in enumerate(best_params_index[0])
-    ]
-    train_std = [
-        std_train_scores[value][best_params_index[1][idx]]
-        for idx, value in enumerate(best_params_index[0])
-    ]
-    if verbose:
-        print('train_performance: %s' % train_performance)
-        print('train_std: ', train_std)
-    str_fw += 'train_performance: %s\n' % train_performance
-    str_fw += 'train_std: %s\n\n' % train_std
-
-    if verbose:
-        print()
-    average_gram_matrix_time = np.mean(gram_matrix_time)
-    std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0
-    best_gram_matrix_time = [gram_matrix_time[i] for i in best_params_index[0]]
-    ave_bgmt = np.mean(best_gram_matrix_time)
-    std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0
-    if verbose:
-        print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
-              .format(average_gram_matrix_time, std_gram_matrix_time))
-        print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt))
-    tt_poster = time.time() - tts  # training time with hyper-param choices that did not participate in calculation of gram matrices
-    if verbose:
-        print('training time with hyper-param choices that did not participate in calculation of gram matrices: {:.2f}s'.format(tt_poster))
-        print('total training time with all hyper-param choices: {:.2f}s'.format(tt_poster + np.sum(gram_matrix_time)))
-#    str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
-#    str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
-    str_fw += 'training time with hyper-param choices that did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)
-
-    # open file to save all results for this dataset.
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
-    # print out results as table.
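The block above selects hyper-parameters in two passes: first the best mean validation score (minimum for regression, where scores are RMSEs; maximum for classification, where they are accuracies), then the smallest validation std among the tied candidates. A condensed sketch of that rule (function name illustrative):

import numpy as np

def select_best_params(average_val_scores, std_val_scores, model_type):
	# Pass 1: best mean validation score over the (outer, inner) grid.
	pick = np.amin if model_type == 'regression' else np.amax
	best_val_perf = pick(average_val_scores)
	idx = np.where(average_val_scores == best_val_perf)
	# Pass 2: smallest validation std among the tied cells.
	best_val_stds = [std_val_scores[o][i] for o, i in zip(idx[0], idx[1])]
	min_val_std = np.amin(best_val_stds)
	return np.where(std_val_scores == min_val_std), best_val_perf, min_val_std

Note that the second np.where scans the whole std matrix again, as the code above does, so in principle it can match a cell outside the tied optimum; restricting the search to the tied indices would be a stricter tie-break.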
-    str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
-                                  std_val_scores, average_perf_scores, std_perf_scores,
-                                  average_train_scores, std_train_scores, gram_matrix_time,
-                                  model_type, verbose)
-
-    # open file to save all results for this dataset.
-    if not os.path.exists(output_dir + '/' + ds_name + '.output.txt'):
-        with open(output_dir + '/' + ds_name + '.output.txt', 'w') as f:
-            f.write(str_fw)
-    else:
-        with open(output_dir + '/' + ds_name + '.output.txt', 'r+') as f:
-            content = f.read()
-            f.seek(0, 0)
-            f.write(str_fw + '\n\n\n' + content)
+	# ---- use pool.imap_unordered to parallelize and track progress. ----
+	def init_worker(gms_toshare):
+		global G_gms
+		G_gms = gms_toshare
+
+	pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
+	trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
+	train_pref = []
+	val_pref = []
+	test_pref = []
+	chunksize = 1
+	if verbose:
+		iterator = tqdm(pool.imap_unordered(trial_do_partial,
+						range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout)
+	else:
+		iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize)
+	for o1, o2, o3 in iterator:
+		train_pref.append(o1)
+		val_pref.append(o2)
+		test_pref.append(o3)
+	pool.close()
+	pool.join()
+
+#	# ---- use pool.map to parallelize. ----
+#	result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
+#	train_pref = [item[0] for item in result_perf]
+#	val_pref = [item[1] for item in result_perf]
+#	test_pref = [item[2] for item in result_perf]
+
+#	# ---- use joblib.Parallel to parallelize and track progress. ----
+#	trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
+#	result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
+#	train_pref = [item[0] for item in result_perf]
+#	val_pref = [item[1] for item in result_perf]
+#	test_pref = [item[2] for item in result_perf]
+
+#	# ---- direct running, normally using a single CPU core. ----
+#	train_pref = []
+#	val_pref = []
+#	test_pref = []
+#	for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
+#		o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
+#		train_pref.append(o1)
+#		val_pref.append(o2)
+#		test_pref.append(o3)
+
+	if verbose:
+		print()
+		print('4. Getting final performance...')
+	str_fw += '\nIII. Performance.\n\n'
+	# averages and confidences of performances on outer trials for each combination of parameters
+	average_train_scores = np.mean(train_pref, axis=0)
+	average_val_scores = np.mean(val_pref, axis=0)
+	average_perf_scores = np.mean(test_pref, axis=0)
+	# sample std is used here
+	std_train_scores = np.std(train_pref, axis=0, ddof=1)
+	std_val_scores = np.std(val_pref, axis=0, ddof=1)
+	std_perf_scores = np.std(test_pref, axis=0, ddof=1)
+
+	if model_type == 'regression':
+		best_val_perf = np.amin(average_val_scores)
+	else:
+		best_val_perf = np.amax(average_val_scores)
+	best_params_index = np.where(average_val_scores == best_val_perf)
+	# find smallest val std with best val perf.
+	best_val_stds = [
+		std_val_scores[value][best_params_index[1][idx]]
+		for idx, value in enumerate(best_params_index[0])
+	]
+	min_val_std = np.amin(best_val_stds)
+	best_params_index = np.where(std_val_scores == min_val_std)
+	best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]]
+	best_params_in = [param_list[i] for i in best_params_index[1]]
+	if verbose:
+		print('best_params_out: ', best_params_out)
+		print('best_params_in: ', best_params_in)
+		print()
+		print('best_val_perf: ', best_val_perf)
+		print('best_val_std: ', min_val_std)
+	str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
+	str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
+	str_fw += 'best_val_perf: %s\n' % best_val_perf
+	str_fw += 'best_val_std: %s\n' % min_val_std
+
+	final_performance = [
+		average_perf_scores[value][best_params_index[1][idx]]
+		for idx, value in enumerate(best_params_index[0])
+	]
+	final_confidence = [
+		std_perf_scores[value][best_params_index[1][idx]]
+		for idx, value in enumerate(best_params_index[0])
+	]
+	if verbose:
+		print('final_performance: ', final_performance)
+		print('final_confidence: ', final_confidence)
+	str_fw += 'final_performance: %s\n' % final_performance
+	str_fw += 'final_confidence: %s\n' % final_confidence
+	train_performance = [
+		average_train_scores[value][best_params_index[1][idx]]
+		for idx, value in enumerate(best_params_index[0])
+	]
+	train_std = [
+		std_train_scores[value][best_params_index[1][idx]]
+		for idx, value in enumerate(best_params_index[0])
+	]
+	if verbose:
+		print('train_performance: %s' % train_performance)
+		print('train_std: ', train_std)
+	str_fw += 'train_performance: %s\n' % train_performance
+	str_fw += 'train_std: %s\n\n' % train_std
+
+	if verbose:
+		print()
+	average_gram_matrix_time = np.mean(gram_matrix_time)
+	std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0
+	best_gram_matrix_time = [gram_matrix_time[i] for i in best_params_index[0]]
+	ave_bgmt = np.mean(best_gram_matrix_time)
+	std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0
+	if verbose:
+		print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
+			  .format(average_gram_matrix_time, std_gram_matrix_time))
+		print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(ave_bgmt, std_bgmt))
+	tt_poster = time.time() - tts  # training time with hyper-param choices that did not participate in calculation of gram matrices
+	if verbose:
+		print('training time with hyper-param choices that did not participate in calculation of gram matrices: {:.2f}s'.format(tt_poster))
+		print('total training time with all hyper-param choices: {:.2f}s'.format(tt_poster + np.sum(gram_matrix_time)))
+#	str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
+#	str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
+	str_fw += 'training time with hyper-param choices that did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)
+
+	# open file to save all results for this dataset.
+	if not os.path.exists(output_dir):
+		os.makedirs(output_dir)
+
+	# print out results as table.
+	str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
+								  std_val_scores, average_perf_scores, std_perf_scores,
+								  average_train_scores, std_train_scores, gram_matrix_time,
+								  model_type, verbose)
+
+	# open file to save all results for this dataset.
+	if not os.path.exists(output_dir + '/' + ds_name + '.output.txt'):
+		with open(output_dir + '/' + ds_name + '.output.txt', 'w') as f:
+			f.write(str_fw)
+	else:
+		with open(output_dir + '/' + ds_name + '.output.txt', 'r+') as f:
+			content = f.read()
+			f.seek(0, 0)
+			f.write(str_fw + '\n\n\n' + content)
+
+	return final_performance, final_confidence


def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial):  # Test set level
-#    # get gram matrices from global variables.
-#    gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')
-
-    # Arrays to store scores
-    train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
-    val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
-    test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
-
-    # randomness added to seeds of the split function below. "high" is "size"
-    # times 10 so that at least 10 different random outputs will be yielded.
-    # Remove these lines if identical outputs are required.
-    rdm_out = np.random.RandomState(seed=None)
-    rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
-                                     size=len(param_list_pre_revised))
-#    print(trial, rdm_seed_out_l)
-#    print()
-    # loop for each outer param tuple
-    for index_out, params_out in enumerate(param_list_pre_revised):
-        # get gram matrices from global variables.
-#        gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
-#        gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
-        gm_now = gram_matrices[index_out].copy()
-
-        # split gram matrix and y into app and test sets.
-        indices = range(len(y))
-        # The argument "random_state" of "train_test_split" cannot be set to
-        # None, because it would use the RandomState instance of np.random,
-        # and multiple subprocesses may inherit the same seed if they forked
-        # at the same time, leading to identical random variates in different
-        # subprocesses. Instead, the "trial" and "index_out" parameters are
-        # used to generate different seeds for different trials/subprocesses
-        # and outer loops. "rdm_seed_out_l" adds randomness into the seeds, so
-        # that a different output is yielded every time the program is run. To
-        # yield identical outputs every time, remove the second line below.
-        # The same method is used for the "KFold" function in the inner loop.
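The comment above describes a two-part seed: a deterministic factor from the trial and loop indices (so forked subprocesses that inherit the same global RandomState still diverge) plus a per-process random offset (so repeated runs differ). A sketch of the pattern (helper name illustrative):

def mixed_seed(trial, index, random_offset):
	# Deterministic part, distinct across most (trial, index) pairs.
	seed = (trial + 1) * (index + 1)
	# Random part; drop this line for reproducible outputs. The modulus keeps
	# the seed in the range accepted by NumPy/scikit-learn.
	return (seed + int(random_offset)) % (2 ** 32 - 1)

The product (trial + 1) * (index + 1) alone is not collision-free (e.g. trials 1 and 2 with indices 2 and 1 both give 6), which is another reason the random offset helps decorrelate the splits.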
-        rdm_seed_out = (trial + 1) * (index_out + 1)
-        rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
-#        print(trial, rdm_seed_out)
-        X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
-            gm_now, y, indices, test_size=0.1,
-            random_state=rdm_seed_out, shuffle=True)
-#        print(trial, idx_app, idx_test)
-#        print()
-        X_app = X_app[:, idx_app]
-        X_test = X_test[:, idx_app]
-        y_app = np.array(y_app)
-        y_test = np.array(y_test)
-
-        rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
-                                        size=len(param_list))
-        # loop for each inner param tuple
-        for index_in, params_in in enumerate(param_list):
-#            if trial == 0:
-#                print(index_out, index_in)
-#                print('params_in: ', params_in)
-#            st = time.time()
-            rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
-#            print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
-            rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
-#            print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
-            inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
-            current_train_perf = []
-            current_valid_perf = []
-            current_test_perf = []
-
-            # For regression use the Kernel Ridge method
-#            try:
-            if model_type == 'regression':
-                kr = KernelRidge(kernel='precomputed', **params_in)
-                # loop for each split on validation set level
-                # validation set level
-                for train_index, valid_index in inner_cv.split(X_app):
-#                    print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
-#                    if trial == 0:
-#                        print('train_index: ', train_index)
-#                        print('valid_index: ', valid_index)
-#                        print('idx_test: ', idx_test)
-#                        print('y_app[train_index]: ', y_app[train_index])
-#                        print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
-#                        print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
-                    kr.fit(X_app[train_index, :][:, train_index], y_app[train_index])
-
-                    # predict on the train, validation and test set
-                    y_pred_train = kr.predict(X_app[train_index, :][:, train_index])
-                    y_pred_valid = kr.predict(X_app[valid_index, :][:, train_index])
-#                    if trial == 0:
-#                        print('y_pred_valid: ', y_pred_valid)
-#                        print()
-                    y_pred_test = kr.predict(X_test[:, train_index])
-
-                    # root mean squared errors
-                    current_train_perf.append(np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
-                    current_valid_perf.append(np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
-#                    if trial == 0:
-#                        print(mean_squared_error(y_app[valid_index], y_pred_valid))
-                    current_test_perf.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
-            # For classification use SVM
-            else:
-                svc = SVC(kernel='precomputed', cache_size=200, verbose=False, **params_in)
-                # loop for each split on validation set level
-                # validation set level
-                for train_index, valid_index in inner_cv.split(X_app):
-#                    np.savez("bug.npy", X_app[train_index, :][:, train_index], y_app[train_index])
-#                    if trial == 0:
-#                        print('train_index: ', train_index)
-#                        print('valid_index: ', valid_index)
-#                        print('idx_test: ', idx_test)
-#                        print('y_app[train_index]: ', y_app[train_index])
-#                        print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
-#                        print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
-                    svc.fit(X_app[train_index, :][:, train_index], y_app[train_index])
-
-                    # predict on the train, validation and test set
-                    y_pred_train = svc.predict(X_app[train_index, :][:, train_index])
-                    y_pred_valid = svc.predict(X_app[valid_index, :][:, train_index])
-                    y_pred_test = svc.predict(X_test[:, train_index])
-
-                    # accuracy scores
-                    current_train_perf.append(accuracy_score(y_app[train_index], y_pred_train))
-                    current_valid_perf.append(accuracy_score(y_app[valid_index], y_pred_valid))
-                    current_test_perf.append(accuracy_score(y_test, y_pred_test))
-#            except ValueError:
-#                print(sys.exc_info()[0])
-#                print(params_out, params_in)
-
-            # average performance on inner splits
-            train_pref[index_out][index_in] = np.mean(current_train_perf)
-            val_pref[index_out][index_in] = np.mean(current_valid_perf)
-            test_pref[index_out][index_in] = np.mean(current_test_perf)
-#            print(time.time() - st)
-#    if trial == 0:
-#        print('val_pref: ', val_pref)
-#        print('test_pref: ', test_pref)
-
-    return train_pref, val_pref, test_pref
+#	# get gram matrices from global variables.
+#	gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')
+
+	# Arrays to store scores
+	train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+	val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+	test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+
+	# randomness added to seeds of the split function below. "high" is "size"
+	# times 10 so that at least 10 different random outputs will be yielded.
+	# Remove these lines if identical outputs are required.
+	rdm_out = np.random.RandomState(seed=None)
+	rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
+									 size=len(param_list_pre_revised))
+#	print(trial, rdm_seed_out_l)
+#	print()
+	# loop for each outer param tuple
+	for index_out, params_out in enumerate(param_list_pre_revised):
+		# get gram matrices from global variables.
+#		gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
+#		gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
+		gm_now = gram_matrices[index_out].copy()
+
+		# split gram matrix and y into app and test sets.
+		indices = range(len(y))
+		# The argument "random_state" of "train_test_split" cannot be set to
+		# None, because it would use the RandomState instance of np.random,
+		# and multiple subprocesses may inherit the same seed if they forked
+		# at the same time, leading to identical random variates in different
+		# subprocesses. Instead, the "trial" and "index_out" parameters are
+		# used to generate different seeds for different trials/subprocesses
+		# and outer loops. "rdm_seed_out_l" adds randomness into the seeds, so
+		# that a different output is yielded every time the program is run. To
+		# yield identical outputs every time, remove the second line below.
+		# The same method is used for the "KFold" function in the inner loop.
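With a precomputed kernel, train_test_split in the surrounding code separates rows, after which only the columns of the app (training) samples may be kept: scikit-learn expects a precomputed kernel of shape (n_samples, n_train). A sketch of that slicing (function name illustrative; 90/10 split as in the code around it):

import numpy as np
from sklearn.model_selection import train_test_split

def split_precomputed_kernel(K, y, seed):
	indices = range(len(y))
	X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
		K, y, indices, test_size=0.1, random_state=seed, shuffle=True)
	# Rows are already split; keep only the columns of the training samples.
	X_app = X_app[:, idx_app]
	X_test = X_test[:, idx_app]
	return X_app, X_test, np.array(y_app), np.array(y_test), idx_app, idx_test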
+		rdm_seed_out = (trial + 1) * (index_out + 1)
+		rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
+#		print(trial, rdm_seed_out)
+		X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
+			gm_now, y, indices, test_size=0.1,
+			random_state=rdm_seed_out, shuffle=True)
+#		print(trial, idx_app, idx_test)
+#		print()
+		X_app = X_app[:, idx_app]
+		X_test = X_test[:, idx_app]
+		y_app = np.array(y_app)
+		y_test = np.array(y_test)
+
+		rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
+										size=len(param_list))
+		# loop for each inner param tuple
+		for index_in, params_in in enumerate(param_list):
+#			if trial == 0:
+#				print(index_out, index_in)
+#				print('params_in: ', params_in)
+#			st = time.time()
+			rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
+#			print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
+			rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
+#			print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
+			inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
+			current_train_perf = []
+			current_valid_perf = []
+			current_test_perf = []
+
+			# For regression use the Kernel Ridge method
+#			try:
+			if model_type == 'regression':
+				kr = KernelRidge(kernel='precomputed', **params_in)
+				# loop for each split on validation set level
+				# validation set level
+				for train_index, valid_index in inner_cv.split(X_app):
+#					print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
+#					if trial == 0:
+#						print('train_index: ', train_index)
+#						print('valid_index: ', valid_index)
+#						print('idx_test: ', idx_test)
+#						print('y_app[train_index]: ', y_app[train_index])
+#						print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
+#						print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
+					kr.fit(X_app[train_index, :][:, train_index], y_app[train_index])
+
+					# predict on the train, validation and test set
+					y_pred_train = kr.predict(X_app[train_index, :][:, train_index])
+					y_pred_valid = kr.predict(X_app[valid_index, :][:, train_index])
+#					if trial == 0:
+#						print('y_pred_valid: ', y_pred_valid)
+#						print()
+					y_pred_test = kr.predict(X_test[:, train_index])
+
+					# root mean squared errors
+					current_train_perf.append(np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
+					current_valid_perf.append(np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
+#					if trial == 0:
+#						print(mean_squared_error(y_app[valid_index], y_pred_valid))
+					current_test_perf.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
+			# For classification use SVM
+			else:
+				svc = SVC(kernel='precomputed', cache_size=200, verbose=False, **params_in)
+				# loop for each split on validation set level
+				# validation set level
+				for train_index, valid_index in inner_cv.split(X_app):
+#					np.savez("bug.npy", X_app[train_index, :][:, train_index], y_app[train_index])
+#					if trial == 0:
+#						print('train_index: ', train_index)
+#						print('valid_index: ', valid_index)
+#						print('idx_test: ', idx_test)
+#						print('y_app[train_index]: ', y_app[train_index])
+#						print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
+#						print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
+					svc.fit(X_app[train_index, :][:, train_index], y_app[train_index])
+
+					# predict on the train, validation and test set
+					y_pred_train = svc.predict(X_app[train_index, :][:, train_index])
+					y_pred_valid = svc.predict(X_app[valid_index, :][:, train_index])
+					y_pred_test = svc.predict(X_test[:, train_index])
+
+					# accuracy scores
+					current_train_perf.append(accuracy_score(y_app[train_index], y_pred_train))
+					current_valid_perf.append(accuracy_score(y_app[valid_index], y_pred_valid))
+					current_test_perf.append(accuracy_score(y_test, y_pred_test))
+#			except ValueError:
+#				print(sys.exc_info()[0])
+#				print(params_out, params_in)
+
+			# average performance on inner splits
+			train_pref[index_out][index_in] = np.mean(current_train_perf)
+			val_pref[index_out][index_in] = np.mean(current_valid_perf)
+			test_pref[index_out][index_in] = np.mean(current_test_perf)
+#			print(time.time() - st)
+#	if trial == 0:
+#		print('val_pref: ', val_pref)
+#		print('test_pref: ', test_pref)
+
+	return train_pref, val_pref, test_pref


def parallel_trial_do(param_list_pre_revised, param_list, y, model_type, trial):
-    train_pref, val_pref, test_pref = trial_do(param_list_pre_revised,
-                                               param_list, G_gms, y,
-                                               model_type, trial)
-    return train_pref, val_pref, test_pref
+	train_pref, val_pref, test_pref = trial_do(param_list_pre_revised,
+											   param_list, G_gms, y,
+											   model_type, trial)
+	return train_pref, val_pref, test_pref


def compute_gram_matrices(dataset, y, estimator, param_list_precomputed,
-                          output_dir, ds_name,
-                          n_jobs=1, str_fw='', verbose=True):
-    gram_matrices = []  # a list to store gram matrices for all param_grid_precomputed
-    gram_matrix_time = []  # a list to store time to calculate gram matrices
-    param_list_pre_revised = []  # list to store param grids precomputed, ignoring the useless ones
-
-    nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
-    for idx, params_out in enumerate(param_list_precomputed):
-        params_out['n_jobs'] = n_jobs
-#        print(dataset)
-#        import networkx as nx
-#        nx.draw_networkx(dataset[1])
-#        plt.show()
-        rtn_data = estimator(dataset[:], **params_out)
-        Kmatrix = rtn_data[0]
-        current_run_time = rtn_data[1]
-        # for some kernels, some graphs in datasets may not meet the
-        # kernels' requirements for graph structure. These graphs are trimmed.
-        if len(rtn_data) == 3:
-            idx_trim = rtn_data[2]  # the index of trimmed graph list
-            y = [y[idxt] for idxt in idx_trim]  # trim y accordingly
-
-        Kmatrix_diag = Kmatrix.diagonal().copy()
-        # remove graphs whose kernels with themselves are zeros
-        nb_g_ignore = 0
-        for idxk, diag in enumerate(Kmatrix_diag):
-            if diag == 0:
-                Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
-                Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
-                nb_g_ignore += 1
-        # normalization
-        for i in range(len(Kmatrix)):
-            for j in range(i, len(Kmatrix)):
-                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
-                Kmatrix[j][i] = Kmatrix[i][j]
-
-        if verbose:
-            print()
-        if params_out == {}:
-            if verbose:
-                print('the gram matrix is: ')
-            str_fw += 'the gram matrix is:\n\n'
-        else:
-            if verbose:
-                print('the gram matrix with parameters', params_out, 'is: ')
-            str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
-        if len(Kmatrix) < 2:
-            nb_gm_ignore += 1
-            if verbose:
-                print('ignored, as at most one of its diagonal values is non-zero.')
-            str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
-        else:
-            if np.isnan(Kmatrix).any():  # if the matrix contains elements that are not numbers
-                nb_gm_ignore += 1
-                if verbose:
-                    print('ignored, as it contains elements that are not numbers.')
-                str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
-            else:
-#                print(Kmatrix)
-                str_fw += np.array2string(Kmatrix, separator=',') + '\n\n'
-#                    separator=',',
-#                    threshold=np.inf,
-#                    floatmode='unique') + '\n\n'
-
-                fig_file_name = output_dir + '/GM[ds]' + ds_name
-                if params_out != {}:
-                    fig_file_name += '[params]' + str(idx)
-                plt.imshow(Kmatrix)
-                plt.colorbar()
-                plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
-#                plt.show()
-                plt.clf()
-        gram_matrices.append(Kmatrix)
-        gram_matrix_time.append(current_run_time)
-        param_list_pre_revised.append(params_out)
-        if nb_g_ignore > 0:
-            if verbose:
-                print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
-            str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
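The normalization loop above rescales the kernel so that every self-similarity becomes 1 (cosine normalization). A vectorized equivalent, assuming the zero-diagonal rows were already removed:

import numpy as np

def normalize_gram(K):
	# K'[i, j] = K[i, j] / sqrt(K[i, i] * K[j, j]); the result has a unit
	# diagonal. Requires strictly positive diagonal entries.
	d = np.sqrt(np.diag(K))
	return K / np.outer(d, d)

One caveat about the loop above: Kmatrix_diag is captured before zero-diagonal rows are deleted, so if any graph was removed the remaining indices shift and the divisors no longer line up with the surviving entries; recomputing the diagonal after deletion, as this sketch does, avoids that.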
-    if verbose:
-        print()
-        print('{} gram matrices are calculated, {} of which are ignored.'.format(
-            len(param_list_precomputed), nb_gm_ignore))
-    str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
-    str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
-    str_fw += ''.join([
-        '{}: {}\n'.format(idx, params_out)
-        for idx, params_out in enumerate(param_list_precomputed)
-    ])
-
-    return gram_matrices, gram_matrix_time, param_list_pre_revised, y, str_fw
+						  output_dir, ds_name,
+						  n_jobs=1, str_fw='', verbose=True):
+	gram_matrices = []  # a list to store gram matrices for all param_grid_precomputed
+	gram_matrix_time = []  # a list to store time to calculate gram matrices
+	param_list_pre_revised = []  # list to store param grids precomputed, ignoring the useless ones
+
+	nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
+	for idx, params_out in enumerate(param_list_precomputed):
+		params_out['n_jobs'] = n_jobs
+#		print(dataset)
+#		import networkx as nx
+#		nx.draw_networkx(dataset[1])
+#		plt.show()
+		rtn_data = estimator(dataset[:], **params_out)
+		Kmatrix = rtn_data[0]
+		current_run_time = rtn_data[1]
+		# for some kernels, some graphs in datasets may not meet the
+		# kernels' requirements for graph structure. These graphs are trimmed.
+		if len(rtn_data) == 3:
+			idx_trim = rtn_data[2]  # the index of trimmed graph list
+			y = [y[idxt] for idxt in idx_trim]  # trim y accordingly
+
+		Kmatrix_diag = Kmatrix.diagonal().copy()
+		# remove graphs whose kernels with themselves are zeros
+		nb_g_ignore = 0
+		for idxk, diag in enumerate(Kmatrix_diag):
+			if diag == 0:
+				Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
+				Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
+				nb_g_ignore += 1
+		# normalization
+		for i in range(len(Kmatrix)):
+			for j in range(i, len(Kmatrix)):
+				Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
+				Kmatrix[j][i] = Kmatrix[i][j]
+
+		if verbose:
+			print()
+		if params_out == {}:
+			if verbose:
+				print('the gram matrix is: ')
+			str_fw += 'the gram matrix is:\n\n'
+		else:
+			if verbose:
+				print('the gram matrix with parameters', params_out, 'is: ')
+			str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
+		if len(Kmatrix) < 2:
+			nb_gm_ignore += 1
+			if verbose:
+				print('ignored, as at most one of its diagonal values is non-zero.')
+			str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
+		else:
+			if np.isnan(Kmatrix).any():  # if the matrix contains elements that are not numbers
+				nb_gm_ignore += 1
+				if verbose:
+					print('ignored, as it contains elements that are not numbers.')
+				str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
+			else:
+#				print(Kmatrix)
+				str_fw += np.array2string(Kmatrix, separator=',') + '\n\n'
+#					separator=',',
+#					threshold=np.inf,
+#					floatmode='unique') + '\n\n'
+
+				fig_file_name = output_dir + '/GM[ds]' + ds_name
+				if params_out != {}:
+					fig_file_name += '[params]' + str(idx)
+				plt.imshow(Kmatrix)
+				plt.colorbar()
+				plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
+#				plt.show()
+				plt.clf()
+		gram_matrices.append(Kmatrix)
+		gram_matrix_time.append(current_run_time)
+		param_list_pre_revised.append(params_out)
+		if nb_g_ignore > 0:
+			if verbose:
+				print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
+			str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
+	if verbose:
+		print()
+		print('{} gram matrices are calculated, {} of which are ignored.'.format(
+			len(param_list_precomputed), nb_gm_ignore))
+	str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
+	str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
+	str_fw += ''.join([
+		'{}: {}\n'.format(idx, params_out)
+		for idx, params_out in enumerate(param_list_precomputed)
+	])
+
+	return gram_matrices, gram_matrix_time, param_list_pre_revised, y, str_fw


def read_gram_matrices_from_file(output_dir, ds_name):
-    gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz')
-    gram_matrices = gmfile['gms']  # a list to store gram matrices for all param_grid_precomputed
-    param_list_pre_revised = gmfile['params']  # list to store param grids precomputed, ignoring the useless ones
-    y = gmfile['y'].tolist()
-    return gram_matrices, param_list_pre_revised, y
+	gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz')
+	gram_matrices = gmfile['gms']  # a list to store gram matrices for all param_grid_precomputed
+	param_list_pre_revised = gmfile['params']  # list to store param grids precomputed, ignoring the useless ones
+	y = gmfile['y'].tolist()
+	return gram_matrices, param_list_pre_revised, y


def printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
-                        std_val_scores, average_perf_scores, std_perf_scores,
-                        average_train_scores, std_train_scores, gram_matrix_time,
-                        model_type, verbose):
-    from collections import OrderedDict
-    from tabulate import tabulate
-    table_dict = {}
-    if model_type == 'regression':
-        for param_in in param_list:
-            param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
-    else:
-        for param_in in param_list:
-            param_in['C'] = '{:.2e}'.format(param_in['C'])
-    table_dict['params'] = [{**param_out, **param_in}
-                            for param_in in param_list for param_out in param_list_pre_revised]
-    table_dict['gram_matrix_time'] = [
-        '{:.2f}'.format(gram_matrix_time[index_out])
-        for param_in in param_list
-        for index_out, _ in enumerate(param_list_pre_revised)
-    ]
-    table_dict['valid_perf'] = [
-        '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
-                               std_val_scores[index_out][index_in])
-        for index_in, _ in enumerate(param_list)
-        for index_out, _ in enumerate(param_list_pre_revised)
-    ]
-    table_dict['test_perf'] = [
-        '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
-                               std_perf_scores[index_out][index_in])
-        for index_in, _ in enumerate(param_list)
-        for index_out, _ in enumerate(param_list_pre_revised)
-    ]
-    table_dict['train_perf'] = [
-        '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
-                               std_train_scores[index_out][index_in])
-        for index_in, _ in enumerate(param_list)
-        for index_out, _ in enumerate(param_list_pre_revised)
-    ]
-
-    keyorder = [
-        'params', 'train_perf', 'valid_perf', 'test_perf', 'gram_matrix_time'
-    ]
-    if verbose:
-        print()
-    tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
-                        key=lambda i: keyorder.index(i[0]))), headers='keys')
-#    print(tb_print)
-    return 'table of performance vs. hyper-params:\n\n%s\n\n' % tb_print
\ No newline at end of file
+						std_val_scores, average_perf_scores, std_perf_scores,
+						average_train_scores, std_train_scores, gram_matrix_time,
+						model_type, verbose):
+	from collections import OrderedDict
+	from tabulate import tabulate
+	table_dict = {}
+	if model_type == 'regression':
+		for param_in in param_list:
+			param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
+	else:
+		for param_in in param_list:
+			param_in['C'] = '{:.2e}'.format(param_in['C'])
+	table_dict['params'] = [{**param_out, **param_in}
+							for param_in in param_list for param_out in param_list_pre_revised]
+	table_dict['gram_matrix_time'] = [
+		'{:.2f}'.format(gram_matrix_time[index_out])
+		for param_in in param_list
+		for index_out, _ in enumerate(param_list_pre_revised)
+	]
+	table_dict['valid_perf'] = [
+		'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
+							   std_val_scores[index_out][index_in])
+		for index_in, _ in enumerate(param_list)
+		for index_out, _ in enumerate(param_list_pre_revised)
+	]
+	table_dict['test_perf'] = [
+		'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
+							   std_perf_scores[index_out][index_in])
+		for index_in, _ in enumerate(param_list)
+		for index_out, _ in enumerate(param_list_pre_revised)
+	]
+	table_dict['train_perf'] = [
+		'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
+							   std_train_scores[index_out][index_in])
+		for index_in, _ in enumerate(param_list)
+		for index_out, _ in enumerate(param_list_pre_revised)
+	]
+
+	keyorder = [
+		'params', 'train_perf', 'valid_perf', 'test_perf', 'gram_matrix_time'
+	]
+	if verbose:
+		print()
+	tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
+						key=lambda i: keyorder.index(i[0]))), headers='keys')
+#	print(tb_print)
+	return 'table of performance vs. hyper-params:\n\n%s\n\n' % tb_print
\ No newline at end of file
diff --git a/gklearn/utils/stats.py b/gklearn/utils/stats.py
new file mode 100644
index 0000000..d51cf48
--- /dev/null
+++ b/gklearn/utils/stats.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct  5 15:12:41 2020
+
+@author: ljia
+"""
+from collections import Counter
+from scipy import stats
+
+
+def entropy(labels, base=None):
+	"""Calculate the entropy of a distribution for a given list of labels.
+
+	Parameters
+	----------
+	labels : list
+		Given list of labels.
+	base : float, optional
+		The logarithmic base to use. The default is ``e`` (natural logarithm).
+
+	Returns
+	-------
+	float
+		The calculated entropy.
+	"""
+	return stats.entropy(list(Counter(labels).values()), base=base)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index ea10603..cd45970 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ with open('requirements_pypi.txt') as fp:
 setuptools.setup(
     name="graphkit-learn",
-    version="0.2.0",
+    version="0.2.1b1",
     author="Linlin Jia",
     author_email="linlin.jia@insa-rouen.fr",
     description="A Python library for graph kernels, graph edit distances, and graph pre-images",
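A usage sketch of the new gklearn.utils.stats.entropy helper (example values illustrative): it counts label frequencies with Counter and hands the counts to scipy.stats.entropy, which normalizes them to probabilities, so it accepts any list of hashable labels, such as a graph's degree sequence.

from gklearn.utils.stats import entropy

# Four distinct labels, uniformly distributed -> maximal entropy log(4).
print(entropy([1, 2, 3, 4]))          # ≈ 1.386
# Skewed distribution -> lower entropy; base=2 reports it in bits.
print(entropy([0, 0, 0, 1], base=2))  # ≈ 0.811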