From ed170105357449d07d5becf701dac7ff0b8029c1 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Sat, 22 May 2021 11:59:05 +0200
Subject: [PATCH] [Major Feature][Todo] Add model_learning module, including a
 new NestedCV class and a Workflow class for graph kernel computation.

---
 gklearn/model_learning/__init__.py  |  14 +
 gklearn/model_learning/nested_cv.py | 714 ++++++++++++++++++++++++++++++++++++
 gklearn/model_learning/workflow.py  | 109 ++++++
 3 files changed, 837 insertions(+)
 create mode 100644 gklearn/model_learning/__init__.py
 create mode 100644 gklearn/model_learning/nested_cv.py
 create mode 100644 gklearn/model_learning/workflow.py

diff --git a/gklearn/model_learning/__init__.py b/gklearn/model_learning/__init__.py
new file mode 100644
index 0000000..d4c6aaa
--- /dev/null
+++ b/gklearn/model_learning/__init__.py
@@ -0,0 +1,14 @@
+# -*-coding:utf-8 -*-
+"""
+model learning.
+"""
+
+# info
+__version__ = "0.2"
+__author__ = "Linlin Jia"
+__date__ = "November 2020"
+
+
+from gklearn.model_learning.nested_cv import NestedCV
+from gklearn.model_learning.workflow import Workflow
+from gklearn.model_learning.parameters import dichotomous_permutation
\ No newline at end of file
diff --git a/gklearn/model_learning/nested_cv.py b/gklearn/model_learning/nested_cv.py
new file mode 100644
index 0000000..92bc8f9
--- /dev/null
+++ b/gklearn/model_learning/nested_cv.py
@@ -0,0 +1,714 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov 27 18:59:28 2020
+
+@author: ljia
+"""
+import os
+import datetime
+import time
+import sys
+from tqdm import tqdm
+from multiprocessing import Pool, Array
+from functools import partial
+import numpy as np
+from matplotlib import pyplot as plt
+from sklearn.model_selection import KFold, train_test_split, ParameterGrid
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score, mean_squared_error
+
+
+class NestedCV(object):
+	"""Perform model selection, fitting and testing for precomputed kernels
+	using nested CV. Print out necessary data during the process, and finally
+	the results.
+
+	Parameters
+	----------
+	dataset : gklearn.dataset.Dataset
+		The dataset on which the model is built.
+	estimator : function
+		Kernel function used for estimation. This function needs to return a
+		gram matrix.
+	param_grid_precomputed : dictionary
+		Dictionary with names (string) of parameters used to calculate gram
+		matrices as keys and lists of parameter settings to try as values. This
+		enables searching over any sequence of parameter settings. Params with
+		length 1 will be omitted.
+	param_grid : dictionary
+		Dictionary with names (string) of parameters used as penalties as keys
+		and lists of parameter settings to try as values. This enables
+		searching over any sequence of parameter settings. Params with length 1
+		will be omitted.
+	model_type : string
+		Type of the problem, can be 'regression' or 'classification'. If None,
+		it is taken from the task type of the dataset.
+	num_trials : integer
+		Number of random trials of the outer CV loop. The default is 30.
+	output_dir : string
+		Directory to save the results. The default is 'outputs/' plus the name
+		of the estimator.
+	n_jobs : int
+		Number of jobs for parallelization.
+	save_gms : boolean
+		Whether to save gram matrices. The default is True.
+	save_gm_figs : boolean
+		Whether to draw and save figures of gram matrices. The default is
+		False.
+	logging : boolean
+		Whether to log results to a file. The default is True.
+	verbose : boolean
+		Whether to print out the progress. The default is True.
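+
+	Notes
+	-----
+	Each outer trial splits the data into an app set (90%) and a test set
+	(10%); an inner 10-fold cross validation on the app set then scores every
+	combination of the two parameter grids (see trial_do below).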
+
+	Examples
+	--------
+	>>> import numpy as np
+	>>> from gklearn.dataset import Dataset
+	>>> from gklearn.model_learning import NestedCV
+	>>> from gklearn.kernels.untilHPathKernel import untilhpathkernel
+	>>>
+	>>> dataset = Dataset('MUTAG')
+	>>> estimator = untilhpathkernel
+	>>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func':
+			['MinMax', 'tanimoto'], 'compute_method': ['trie']}
+	>>> # 'C' for classification problems and 'alpha' for regression problems.
+	>>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha':
+			np.logspace(-10, 10, num=41, base=10)}]
+	>>>
+	>>> cv = NestedCV(dataset, estimator, param_grid_precomputed,
+			param_grid[0], 'classification')
+	"""
+
+	def __init__(self, dataset, estimator, param_grid_precomputed=None, param_grid=None, model_type=None, num_trials=30, output_dir=None, n_jobs=1, save_gms=True, save_gm_figs=False, logging=True, verbose=True, **kwargs):
+		tqdm.monitor_interval = 0
+		self._ds = dataset
+		self._estimator = estimator
+		self._num_trials = num_trials
+		self._n_jobs = n_jobs
+		self._save_gms = save_gms
+		self._save_gm_figs = save_gm_figs
+		self._logging = logging
+		self._verbose = verbose
+		self._kwargs = kwargs
+
+		# Set dataset name.
+		if self._ds._ds_name is None:
+			self._ds_name = 'ds-unknown'
+		else:
+			self._ds_name = self._ds._ds_name
+
+		# The output directory.
+		if output_dir is None:
+			self._output_dir = os.path.join('outputs/', estimator.__name__)
+		else:
+			self._output_dir = output_dir
+		os.makedirs(self._output_dir, exist_ok=True)
+
+		# Set up the model type.
+		if model_type is None:
+			self._model_type = dataset._task_type
+		else:
+			self._model_type = model_type.lower()
+		if self._model_type != 'regression' and self._model_type != 'classification':
+			raise Exception('The model type is incorrect! Please choose from regression or classification.')
+
+		# @todo: Set param_grid_precomputed and param_grid.
+		self._param_grid_precomputed = param_grid_precomputed
+		self._param_grid = param_grid
+
+		if self._verbose:
+			print()
+			print('--- This is a %s problem ---' % self._model_type)
+		# A string to save all the results.
+		if self._logging:
+			self._str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
+			self._str_fw += '# This file contains results of ' + self._estimator.__name__ + ' on dataset ' + self._ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
+			self._str_fw += 'This is a %s problem.\n' % self._model_type
+
+		self.run()
+
+
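+	# run() drives the whole pipeline: fit() (currently a placeholder), gram
+	# matrix computation over param_grid_precomputed, nested CV over
+	# param_grid and, when logging is enabled, the writing of all collected
+	# results to the output file.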
+	def run(self):
+		self.fit()
+		self.compute_gram_matrices()
+		if len(self._gram_matrices) == 0:
+			if self._verbose:
+				print('All gram matrices are ignored, no results obtained.')
+			if self._logging:
+				self._str_fw += '\nAll gram matrices are ignored, no results obtained.\n\n'
+			self._final_performance = None
+			self._final_confidence = None
+		else:
+			self.do_cv()
+
+			# Print out results as table.
+			if self._logging:
+				self._str_fw += self.printResultsInTable(self._param_list, self._param_list_pre_revised, self._average_val_scores, self._std_val_scores, self._average_perf_scores, self._std_perf_scores, self._average_train_scores, self._std_train_scores, self._gram_matrix_time, self._model_type, self._verbose)
+
+		# Open the file to save all results for this dataset. If the file
+		# already exists, the new results are prepended to its content.
+		if self._logging:
+			fn_out = self._output_dir + '/' + self._ds_name + '.output.txt'
+			if not os.path.exists(fn_out):
+				with open(fn_out, 'w') as f:
+					f.write(self._str_fw)
+			else:
+				with open(fn_out, 'r+') as f:
+					content = f.read()
+					f.seek(0, 0)
+					f.write(self._str_fw + '\n\n\n' + content)
+
+		return self._final_performance, self._final_confidence
+
+
+	def fit(self):
+		return
+
+
+	def compute_gram_matrices(self):
+		"""Compute all gram matrices.
+
+		Returns
+		-------
+		None.
+
+		"""
+		# Grid of parameters with a discrete number of values for each.
+		self._param_list_precomputed = list(ParameterGrid(self._param_grid_precomputed))
+		self._param_list = list(ParameterGrid(self._param_grid))
+
+		self._gram_matrices = [
+		]  # a list to store gram matrices for all param_grid_precomputed
+		self._gram_matrix_time = [
+		]  # a list to store time to calculate gram matrices
+		self._param_list_pre_revised = [
+		]  # list to store param grids precomputed ignoring the useless ones
+
+		if self._verbose:
+			print()
+			print('\n1. Computing gram matrices. This could take a while...')
+		if self._logging:
+			self._str_fw += '\nI. Gram matrices.\n\n'
+		self._tts = time.time()  # start training time
+		nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
+		for idx, params_out in enumerate(self._param_list_precomputed):
+			y = self._ds.targets[:]
+			params_out['n_jobs'] = self._n_jobs
+			params_out['verbose'] = self._verbose
+#			print(dataset)
+#			import networkx as nx
+#			nx.draw_networkx(dataset[1])
+#			plt.show()
+			rtn_data = self._estimator(self._ds.graphs[:], **params_out)  # @todo: Attention! this will not copy the graphs.
+			Kmatrix = rtn_data[0]
+			current_run_time = rtn_data[1]
+			# For some kernels, some graphs in datasets may not meet the
+			# kernels' requirements for graph structure. These graphs are trimmed.
+			if len(rtn_data) == 3:
+				idx_trim = rtn_data[2]  # the index of trimmed graph list
+				y = [y[idxt] for idxt in idx_trim]  # trim y accordingly
+#			Kmatrix = np.random.rand(2250, 2250)
+#			current_run_time = 0.1
+
+			# Remove graphs whose kernels with themselves are zeros.
+			# @todo: y not changed accordingly?
+			Kmatrix_diag = Kmatrix.diagonal().copy()
+			nb_g_ignore = 0
+			for idxk, diag in enumerate(Kmatrix_diag):
+				if diag == 0:
+					Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
+					Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
+					nb_g_ignore += 1
+
+			# Normalization.
+			# @todo: works only for undirected graph?
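+			# The loop below applies cosine normalization:
+			#     Kmatrix[i][j] <- Kmatrix[i][j] / sqrt(Kmatrix[i][i] * Kmatrix[j][j]),
+			# so that all diagonal entries of the gram matrix become 1.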
+			Kmatrix_diag = Kmatrix.diagonal().copy()
+			for i in range(len(Kmatrix)):
+				for j in range(i, len(Kmatrix)):
+					Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
+					Kmatrix[j][i] = Kmatrix[i][j]
+			if self._verbose:
+				print()
+
+			if params_out == {}:
+				if self._verbose:
+					print('the gram matrix is: ')
+				if self._logging:
+					self._str_fw += 'the gram matrix is:\n\n'
+			else:
+				if self._verbose:
+					print('the gram matrix with parameters', params_out, 'is: \n\n')
+				if self._logging:
+					self._str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
+
+			if len(Kmatrix) < 2:
+				nb_gm_ignore += 1
+				if self._verbose:
+					print('ignored, as at most one of its diagonal values is non-zero.')
+				if self._logging:
+					self._str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
+			else:
+				if np.isnan(Kmatrix).any(
+				):  # if the matrix contains elements that are not numbers
+					nb_gm_ignore += 1
+					if self._verbose:
+						print('ignored, as it contains elements that are not numbers.')
+					if self._logging:
+						self._str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
+				else:
+#					print(Kmatrix)
+					if self._logging:
+						self._str_fw += np.array2string(
+								Kmatrix,
+								separator=',') + '\n\n'
+#								separator=',',
+#								threshold=np.inf,
+#								floatmode='unique') + '\n\n'
+
+					# Draw and save Gram matrix figures.
+					if self._save_gm_figs:
+						fig_file_name = self._output_dir + '/GM[ds]' + self._ds_name
+						if params_out != {}:
+							fig_file_name += '[params]' + str(idx)
+						plt.imshow(Kmatrix)
+						plt.colorbar()
+						plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
+#						plt.show()
+						plt.clf()
+
+					self._gram_matrices.append(Kmatrix)
+					self._gram_matrix_time.append(current_run_time)
+					self._param_list_pre_revised.append(params_out)
+
+					if nb_g_ignore > 0:
+						if self._verbose:
+							print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
+						if self._logging:
+							self._str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
+
+		if self._verbose:
+			print()
+			print('{} gram matrices are calculated, {} of which are ignored.'.format(len(self._param_list_precomputed), nb_gm_ignore))
+		if self._logging:
+			self._str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(self._param_list_precomputed), nb_gm_ignore)
+			self._str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
+			self._str_fw += ''.join(['{}: {}\n'.format(idx, params_out) for idx, params_out in enumerate(self._param_list_precomputed)])
+
+
+	def do_cv(self):
+#		save gram matrices to file.
+#		np.savez(output_dir + '/' + ds_name + '.gm',
+#				 gms=gram_matrices, params=param_list_pre_revised, y=y,
+#				 gmtime=gram_matrix_time)
+		if self._verbose:
+			print('2. Fitting and predicting using nested cross validation. This could really take a while...')
+
+		# ---- use pool.imap_unordered to parallelize and track progress. ----
+#		train_pref = []
+#		val_pref = []
+#		test_pref = []
+#		def func_assign(result, var_to_assign):
+#			for idx, itm in enumerate(var_to_assign):
+#				itm.append(result[idx])
+#		trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
+#
+#		parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign,
+#					[train_pref, val_pref, test_pref], glbv=gram_matrices,
+#					method='imap_unordered', n_jobs=n_jobs, chunksize=1,
+#					itr_desc='cross validation')
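+
+		# The gram matrices are passed to each worker process once, through
+		# the pool initializer below, and read back via the module-level
+		# global G_gms; this avoids re-pickling the matrices for every trial.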
+		def init_worker(gms_toshare):
+			global G_gms
+			G_gms = gms_toshare
+
+#		gram_matrices = np.array(gram_matrices)
+#		gms_shape = gram_matrices.shape
+#		gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
+#		pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))
+		pool = Pool(processes=self._n_jobs, initializer=init_worker, initargs=(self._gram_matrices,))
+		trial_do_partial = partial(self._parallel_trial_do, self._param_list_pre_revised, self._param_list, self._ds.targets[:], self._model_type)  # @todo: maybe self._ds.targets[:] should be y.
+		train_pref = []
+		val_pref = []
+		test_pref = []
+#		if NUM_TRIALS < 1000 * n_jobs:
+#			chunksize = int(NUM_TRIALS / n_jobs) + 1
+#		else:
+#			chunksize = 1000
+		chunksize = 1
+		if self._verbose:
+			iterator = tqdm(pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize), desc='cross validation', file=sys.stdout)
+		else:
+			iterator = pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize)
+		for o1, o2, o3 in iterator:
+			train_pref.append(o1)
+			val_pref.append(o2)
+			test_pref.append(o3)
+		pool.close()
+		pool.join()
+
+#		# ---- use pool.map to parallelize. ----
+#		pool = Pool(n_jobs)
+#		trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type)
+#		result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
+#		train_pref = [item[0] for item in result_perf]
+#		val_pref = [item[1] for item in result_perf]
+#		test_pref = [item[2] for item in result_perf]
+
+#		# ---- direct running, normally use a single CPU core. ----
+#		train_pref = []
+#		val_pref = []
+#		test_pref = []
+#		for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
+#			o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
+#			train_pref.append(o1)
+#			val_pref.append(o2)
+#			test_pref.append(o3)
+#		print()
+
+		if self._verbose:
+			print()
+			print('3. Getting final performance...')
+		if self._logging:
+			self._str_fw += '\nII. Performance.\n\n'
+
+		# Averages and confidences of performances on outer trials for each
+		# combination of parameters.
+		self._average_train_scores = np.mean(train_pref, axis=0)
+#		print('val_pref: ', val_pref[0][0])
+		self._average_val_scores = np.mean(val_pref, axis=0)
+#		print('test_pref: ', test_pref[0][0])
+		self._average_perf_scores = np.mean(test_pref, axis=0)
+		# Sample std is used here.
+		self._std_train_scores = np.std(train_pref, axis=0, ddof=1)
+		self._std_val_scores = np.std(val_pref, axis=0, ddof=1)
+		self._std_perf_scores = np.std(test_pref, axis=0, ddof=1)
+
+		if self._model_type == 'regression':
+			best_val_perf = np.amin(self._average_val_scores)
+		else:
+			best_val_perf = np.amax(self._average_val_scores)
+#		print('average_val_scores: ', self._average_val_scores)
+#		print('best_val_perf: ', best_val_perf)
+#		print()
+		best_params_index = np.where(self._average_val_scores == best_val_perf)
+		# Find smallest val std with best val perf.
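+		# best_params_index holds the (row, column) positions of all entries
+		# that reach the best mean validation score; rows index the
+		# gram-matrix settings and columns the penalty settings.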
+		best_val_stds = [
+				self._std_val_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+		min_val_std = np.amin(best_val_stds)
+		best_params_index = np.where(self._std_val_scores == min_val_std)
+		best_params_out = [self._param_list_pre_revised[i] for i in best_params_index[0]]
+		best_params_in = [self._param_list[i] for i in best_params_index[1]]
+
+		if self._verbose:
+			print('best_params_out: ', best_params_out)
+			print('best_params_in: ', best_params_in)
+			print()
+			print('best_val_perf: ', best_val_perf)
+			print('best_val_std: ', min_val_std)
+		if self._logging:
+			self._str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
+			self._str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
+			self._str_fw += 'best_val_perf: %s\n' % best_val_perf
+			self._str_fw += 'best_val_std: %s\n' % min_val_std
+
+#		print(best_params_index)
+#		print(best_params_index[0])
+#		print(self._average_perf_scores)
+		self._final_performance = [
+				self._average_perf_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+		self._final_confidence = [
+				self._std_perf_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+
+		if self._verbose:
+			print('final_performance: ', self._final_performance)
+			print('final_confidence: ', self._final_confidence)
+		if self._logging:
+			self._str_fw += 'final_performance: %s\n' % self._final_performance
+			self._str_fw += 'final_confidence: %s\n' % self._final_confidence
+
+		train_performance = [
+				self._average_train_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+		train_std = [
+				self._std_train_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+
+		if self._verbose:
+			print('train_performance: %s' % train_performance)
+			print('train_std: ', train_std)
+		if self._logging:
+			self._str_fw += 'train_performance: %s\n' % train_performance
+			self._str_fw += 'train_std: %s\n\n' % train_std
+
+		if self._verbose:
+			print()
+
+		tt_total = time.time() - self._tts  # training time for all hyper-parameters
+		average_gram_matrix_time = np.mean(self._gram_matrix_time)
+		std_gram_matrix_time = np.std(self._gram_matrix_time, ddof=1) if len(self._gram_matrix_time) > 1 else 0
+		best_gram_matrix_time = [self._gram_matrix_time[i] for i in best_params_index[0]]
+		ave_bgmt = np.mean(best_gram_matrix_time)
+		std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0
+
+		if self._verbose:
+			print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
+				  .format(average_gram_matrix_time, std_gram_matrix_time))
+			print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
+				  ave_bgmt, std_bgmt))
+			print('total training time with all hyper-param choices: {:.2f}s'.format(
+				  tt_total))
+		if self._logging:
+			self._str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
+			self._str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
+			self._str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)
+
+#		# save results to file
+#		np.savetxt(results_name_pre + 'average_train_scores.dt',
+#				   average_train_scores)
+#		np.savetxt(results_name_pre + 'average_val_scores', self._average_val_scores)
+#		np.savetxt(results_name_pre + 'average_perf_scores.dt',
+#				   average_perf_scores)
+#		np.savetxt(results_name_pre + 'std_train_scores.dt', self._std_train_scores)
+#		np.savetxt(results_name_pre + 'std_val_scores.dt', self._std_val_scores)
+#		np.savetxt(results_name_pre + 'std_perf_scores.dt', self._std_perf_scores)
+
+#		np.save(results_name_pre + 'best_params_index', best_params_index)
+#		np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
+#		np.save(results_name_pre + 'best_params_in.dt', best_params_in)
+#		np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
+#		np.save(results_name_pre + 'best_val_std.dt', best_val_std)
+#		np.save(results_name_pre + 'final_performance.dt', self._final_performance)
+#		np.save(results_name_pre + 'final_confidence.dt', self._final_confidence)
+#		np.save(results_name_pre + 'train_performance.dt', train_performance)
+#		np.save(results_name_pre + 'train_std.dt', train_std)
+
+#		np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
+#		np.save(results_name_pre + 'average_gram_matrix_time.dt',
+#				average_gram_matrix_time)
+#		np.save(results_name_pre + 'std_gram_matrix_time.dt',
+#				std_gram_matrix_time)
+#		np.save(results_name_pre + 'best_gram_matrix_time.dt',
+#				best_gram_matrix_time)
+
+
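+	# Each call to trial_do runs one outer trial: the gram matrix is split
+	# into an app set (90%) and a test set (10%), and a 10-fold CV on the app
+	# set scores every combination of precomputed and penalty parameters.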
+	def trial_do(self, param_list_pre_revised, param_list, gram_matrices, y, model_type, trial):  # Test set level.
+
+#		# get gram matrices from global variables.
+#		gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')
+
+		# Arrays to store scores.
+		train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+		val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+		test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+
+		# Randomness is added to the seeds of the split function below. "high"
+		# is "size" times 10 so that at least 10 different random outputs will
+		# be yielded. Remove these lines if identical outputs are required.
+		rdm_out = np.random.RandomState(seed=None)
+		rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
+										 size=len(param_list_pre_revised))
+#		print(trial, rdm_seed_out_l)
+#		print()
+		# loop for each outer param tuple
+		for index_out, params_out in enumerate(param_list_pre_revised):
+			# get gram matrices from global variables.
+#			gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
+#			gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
+			gm_now = gram_matrices[index_out].copy()
+
+			# Split gram matrix and y to app and test sets.
+			indices = range(len(y))
+			# The argument "random_state" in the function "train_test_split"
+			# cannot be set to None, because it would use the RandomState
+			# instance used by np.random, and it is possible for multiple
+			# subprocesses to inherit the same seed if they forked at the same
+			# time, leading to identical random variates for different
+			# subprocesses. Instead, the "trial" and "index_out" parameters
+			# are used to generate different seeds for different
+			# trials/subprocesses and outer loops. "rdm_seed_out_l" is used to
+			# add randomness into seeds, so that a different output is yielded
+			# every time the program is run. To yield identical outputs every
+			# time, remove the second line below. The same method is used for
+			# the "KFold" function in the inner loop.
+			rdm_seed_out = (trial + 1) * (index_out + 1)
+			rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
+#			print(trial, rdm_seed_out)
+			X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
+					gm_now, y, indices, test_size=0.1,
+					random_state=rdm_seed_out, shuffle=True)
+#			print(trial, idx_app, idx_test)
+#			print()
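+			# For a precomputed kernel, each row of X_app / X_test holds the
+			# similarities between one split sample and all samples; keeping
+			# only the columns in idx_app makes X_app a square (n_app x n_app)
+			# kernel matrix and X_test an (n_test x n_app) matrix of
+			# test-vs-app similarities.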
+			X_app = X_app[:, idx_app]
+			X_test = X_test[:, idx_app]
+			y_app = np.array(y_app)
+			y_test = np.array(y_test)
+
+			rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
+											size=len(param_list))
+			# loop for each inner param tuple
+			for index_in, params_in in enumerate(param_list):
+#				if trial == 0:
+#					print(index_out, index_in)
+#					print('params_in: ', params_in)
+#				st = time.time()
+				rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
+#				print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
+				rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
+#				print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
+				inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
+				current_train_perf = []
+				current_valid_perf = []
+				current_test_perf = []
+
+				# For regression use the Kernel Ridge method.
+#				try:
+				if self._model_type == 'regression':
+					kr = KernelRidge(kernel='precomputed', **params_in)
+					# loop for each split on validation set level
+					# validation set level
+					for train_index, valid_index in inner_cv.split(X_app):
+#						print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
+#						if trial == 0:
+#							print('train_index: ', train_index)
+#							print('valid_index: ', valid_index)
+#							print('idx_test: ', idx_test)
+#							print('y_app[train_index]: ', y_app[train_index])
+#							print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
+#							print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
+						kr.fit(X_app[train_index, :][:, train_index],
+							   y_app[train_index])
+
+						# predict on the train, validation and test set
+						y_pred_train = kr.predict(
+								X_app[train_index, :][:, train_index])
+						y_pred_valid = kr.predict(
+								X_app[valid_index, :][:, train_index])
+#						if trial == 0:
+#							print('y_pred_valid: ', y_pred_valid)
+#							print()
+						y_pred_test = kr.predict(
+								X_test[:, train_index])
+
+						# root mean squared errors
+						current_train_perf.append(
+								np.sqrt(
+										mean_squared_error(
+												y_app[train_index], y_pred_train)))
+						current_valid_perf.append(
+								np.sqrt(
+										mean_squared_error(
+												y_app[valid_index], y_pred_valid)))
+#						if trial == 0:
+#							print(mean_squared_error(
+#									y_app[valid_index], y_pred_valid))
+						current_test_perf.append(
+								np.sqrt(
+										mean_squared_error(
+												y_test, y_pred_test)))
+				# For classification use SVM.
+				else:
+					svc = SVC(kernel='precomputed', cache_size=200,
+							  verbose=False, **params_in)
+					# loop for each split on validation set level
+					# validation set level
+					for train_index, valid_index in inner_cv.split(X_app):
+#						np.savez("bug.npy", X_app[train_index, :][:, train_index], y_app[train_index])
+#						if trial == 0:
+#							print('train_index: ', train_index)
+#							print('valid_index: ', valid_index)
+#							print('idx_test: ', idx_test)
+#							print('y_app[train_index]: ', y_app[train_index])
+#							print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
+#							print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
+						svc.fit(X_app[train_index, :][:, train_index],
+								y_app[train_index])
+
+						# predict on the train, validation and test set
+						y_pred_train = svc.predict(
+								X_app[train_index, :][:, train_index])
+						y_pred_valid = svc.predict(
+								X_app[valid_index, :][:, train_index])
+						y_pred_test = svc.predict(
+								X_test[:, train_index])
+
+						# accuracy scores
+						current_train_perf.append(
+								accuracy_score(y_app[train_index],
+											   y_pred_train))
+						current_valid_perf.append(
+								accuracy_score(y_app[valid_index],
+											   y_pred_valid))
+						current_test_perf.append(
+								accuracy_score(y_test, y_pred_test))
+#				except ValueError:
+#					print(sys.exc_info()[0])
+#					print(params_out, params_in)
+
+				# average performance on inner splits
+				train_pref[index_out][index_in] = np.mean(
+						current_train_perf)
+				val_pref[index_out][index_in] = np.mean(
+						current_valid_perf)
+				test_pref[index_out][index_in] = np.mean(
+						current_test_perf)
+#				print(time.time() - st)
+#		if trial == 0:
+#			print('val_pref: ', val_pref)
+#			print('test_pref: ', test_pref)
+
+		return train_pref, val_pref, test_pref
+
+
+	def _parallel_trial_do(self, param_list_pre_revised, param_list, y, model_type, trial):
+		train_pref, val_pref, test_pref = self.trial_do(param_list_pre_revised,
+														param_list, G_gms, y,
+														model_type, trial)
+		return train_pref, val_pref, test_pref
+
+
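+	# printResultsInTable merges each pair of outer/inner parameter settings
+	# into one row and tabulates the train/valid/test scores (mean±std over
+	# trials) together with the gram matrix computation times, using the
+	# tabulate package.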
+	def printResultsInTable(self, param_list, param_list_pre_revised, average_val_scores,
+							std_val_scores, average_perf_scores, std_perf_scores,
+							average_train_scores, std_train_scores, gram_matrix_time,
+							model_type, verbose):
+		from collections import OrderedDict
+		from tabulate import tabulate
+		table_dict = {}
+		if model_type == 'regression':
+			for param_in in param_list:
+				param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
+		else:
+			for param_in in param_list:
+				param_in['C'] = '{:.2e}'.format(param_in['C'])
+		table_dict['params'] = [{**param_out, **param_in}
+								for param_in in param_list for param_out in param_list_pre_revised]
+		table_dict['gram_matrix_time'] = [
+				'{:.2f}'.format(gram_matrix_time[index_out])
+				for param_in in param_list
+				for index_out, _ in enumerate(param_list_pre_revised)
+		]
+		table_dict['valid_perf'] = [
+				'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
+									   std_val_scores[index_out][index_in])
+				for index_in, _ in enumerate(param_list)
+				for index_out, _ in enumerate(param_list_pre_revised)
+		]
+		table_dict['test_perf'] = [
+				'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
+									   std_perf_scores[index_out][index_in])
+				for index_in, _ in enumerate(param_list)
+				for index_out, _ in enumerate(param_list_pre_revised)
+		]
+		table_dict['train_perf'] = [
+				'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
+									   std_train_scores[index_out][index_in])
+				for index_in, _ in enumerate(param_list)
+				for index_out, _ in enumerate(param_list_pre_revised)
+		]
+
+		keyorder = [
+				'params', 'train_perf', 'valid_perf', 'test_perf',
+				'gram_matrix_time'
+		]
+		if verbose:
+			print()
+		tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
+											   key=lambda i: keyorder.index(i[0]))), headers='keys')
+#		print(tb_print)
+		return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
\ No newline at end of file
diff --git a/gklearn/model_learning/workflow.py b/gklearn/model_learning/workflow.py
new file mode 100644
index 0000000..7f1be1d
--- /dev/null
+++ b/gklearn/model_learning/workflow.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov 27 19:33:51 2020
+
+@author: ljia
+"""
+import os
+import numpy as np
+import pickle
+from gklearn.dataset import Dataset
+from gklearn.model_learning import NestedCV
+from gklearn.kernels import GRAPH_KERNELS
+
+class Workflow(object):
+
+
+	def __init__(self, **kwargs):
+		self._job_prefix = kwargs.get('job_prefix', 'gktask')
+		self._max_num_running_tasks = kwargs.get('max_num_running_tasks', np.inf)
+		self._root_dir = kwargs.get('root_dir', 'outputs/')
+
+
+	def run(self, tasks):
+		### Check inputs.
+		if self._check_inputs(tasks):
+			self._tasks = tasks
+		else:
+			raise ValueError('The input "tasks" is not correct.')
+
+
+		### Sort tasks.
+		self.sort_tasks_by_complexity()
+
+
+		### The main process.
+		complete = False
+		while not complete:
+
+			self.get_running_tasks()
+
+			if self._num_running_tasks < self._max_num_running_tasks:
+
+				### Load results from table.
+				self.load_results_from_table()
+
+				for task in self._tasks:
+					state = self.get_task_state(task)
+					if state != 'complete' and state != 'running':
+						self.run_task(task)
+
+					if self._num_running_tasks >= self._max_num_running_tasks:
+						break
+
+				### Save results.
+				self.save_results()  # @todo: not implemented yet.
+
+			complete = self.check_completeness()  # @todo: not implemented yet.
+
+#			sleep()
+
+
+	def _check_inputs(self, tasks):
+		if not isinstance(tasks, list):
+			return False
+		else:
+			for i in tasks:
+				if 'kernel' not in i or 'dataset' not in i:
+					return False
+		return True
+
+
+	def sort_tasks_by_complexity(self):
+		return
+
+
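+	# Running tasks are discovered by querying the job scheduler: this
+	# assumes jobs are submitted through SLURM, whose squeue command lists
+	# the current user's jobs; the job-name prefix set in __init__ marks the
+	# jobs that belong to this workflow.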
+	def get_running_tasks(self):
+		command = 'squeue --user $USER --format "%.50j" --noheader'
+		stream = os.popen(command)
+		output = stream.readlines()
+		running_tasks = [o for o in output if o.strip().startswith(self._job_prefix)]
+		self._num_running_tasks = len(running_tasks)
+
+
+	def load_results_from_table(self):
+		pass
+
+
+	def get_task_state(self, task):
+		task_dir = os.path.join(self._root_dir, task['kernel'] + '.' + task['dataset'] + '/')
+		fn_summary = os.path.join(task_dir, 'results_summary.pkl')
+		if os.path.isfile(fn_summary):
+			with open(fn_summary, 'rb') as f:
+				output = pickle.load(f)
+			state = output['state']
+			return state
+		else:
+			return 'unstarted'
+
+
+	def run_task(self, task):
+		ds_name = task['dataset']
+		k_name = task['kernel']
+
+		# Get dataset.
+		ds = Dataset(ds_name)
+		graph_kernel = GRAPH_KERNELS[k_name]
+
+		# Start CV.
+		results = NestedCV(ds, graph_kernel)
\ No newline at end of file