
[Major Feature][Todo] Add model_learning module, including a new NestedCV class and a Workflow class for graph kernel computation.

v0.2.x
jajupmochi 4 years ago
parent
commit
ed17010535
3 changed files with 837 additions and 0 deletions
  1. +14 -0 gklearn/model_learning/__init__.py
  2. +714 -0 gklearn/model_learning/nested_cv.py
  3. +109 -0 gklearn/model_learning/workflow.py

+ 14 - 0 gklearn/model_learning/__init__.py

@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
"""
model learning.
"""

# info
__version__ = "0.2"
__author__ = "Linlin Jia"
__date__ = "November 2020"


from gklearn.model_learning.nested_cv import NestedCV
from gklearn.model_learning.workflow import Workflow
from gklearn.model_learning.parameters import dichotomous_permutation

+ 714 - 0 gklearn/model_learning/nested_cv.py

@@ -0,0 +1,714 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 18:59:28 2020

@author: ljia
"""
import os
import datetime
import time
import sys
from tqdm import tqdm
from multiprocessing import Pool, Array
from functools import partial
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, train_test_split, ParameterGrid
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error


class NestedCV(object):
"""Perform model selection, fitting and testing for precomputed kernels
using nested CV. Print out neccessary data during the process then finally
the results.

Parameters
----------
datafile : string
Path of dataset file.
estimator : function
Kernel function used as the estimator; it must return a gram matrix.
param_grid_precomputed : dictionary
Dictionary with names (string) of parameters used to calculate gram
matrices as keys and lists of parameter settings to try as values. This
enables searching over any sequence of parameter settings. Params with
length 1 will be omitted.
param_grid : dictionary
Dictionary with names (string) of parameters used as penalties as keys
and lists of parameter settings to try as values. This enables
searching over any sequence of parameter settings. Params with length 1
will be omitted.
model_type : string
Type of the problem, can be 'regression' or 'classification'.
num_trials : integer
Number of random trials of the outer CV loop. The default is 30.
datafile_y : string
Path of file storing y data. This parameter is optional depending on
the given dataset file.
extra_params : dict
Extra parameters for loading the dataset. See function
gklearn.utils.graphfiles.loadDataset for details.
ds_name : string
Name of the dataset.
n_jobs : int
Number of jobs for parallelization.
read_gm_from_file : boolean
Whether gram matrices are loaded from a file.

Examples
--------
>>> import numpy as np
>>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
>>> from gklearn.kernels.untilHPathKernel import untilhpathkernel
>>>
>>> datafile = '../datasets/MUTAG/MUTAG_A.txt'
>>> estimator = untilhpathkernel
>>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func':
...     ['MinMax', 'tanimoto'], 'compute_method': ['trie']}
>>> # 'C' for classification problems and 'alpha' for regression problems.
>>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha':
...     np.logspace(-10, 10, num=41, base=10)}]
>>>
>>> model_selection_for_precomputed_kernel(datafile, estimator,
...     param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG')
"""
def __init__(self, dataset, estimator, param_grid_precomputed=None, param_grid=None, model_type=None, num_trials=30, output_dir=None, n_jobs=1, save_gms=True, save_gm_figs=False, logging=True, verbose=True, **kwargs):
tqdm.monitor_interval = 0
self._ds = dataset
self._estimator = estimator
self._num_trials = num_trials
self._n_jobs = n_jobs
self._save_gms = save_gms
self._save_gm_figs = save_gm_figs
self._logging = logging
self._verbose = verbose
self._kwargs = kwargs

# Set dataset name.
if self._ds._ds_name is None:
self._ds_name = 'ds-unknown'
else:
self._ds_name = self._ds._ds_name

# The output directory.
if output_dir is None:
self._output_dir = os.path.join('outputs/', estimator.__name__)
else:
self._output_dir = output_dir
os.makedirs(self._output_dir, exist_ok=True)

# Setup the model type.
if model_type is None:
self._model_type = dataset._task_type
else:
self._model_type = model_type.lower()
if self._model_type != 'regression' and self._model_type != 'classification':
raise ValueError('The model type is incorrect! Please choose "regression" or "classification".')

# @todo: Set param_grid_precomputed and param_grid.
self._param_grid_precomputed = param_grid_precomputed
self._param_grid = param_grid

if self._verbose:
print()
print('--- This is a %s problem ---' % self._model_type)
# A string to save all the results.
if self._logging:
self._str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
self._str_fw += '# This file contains results of ' + self._estimator.__name__ + ' on dataset ' + self._ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
self._str_fw += 'This is a %s problem.\n' % self._model_type

self.run()


def run(self):
# Defaults, in case all gram matrices are ignored below.
self._final_performance, self._final_confidence = None, None
self.fit()
self.compute_gram_matrices()
if len(self._gram_matrices) == 0:
if self._verbose:
print('All gram matrices are ignored, no results obtained.')
if self._logging:
self._str_fw += '\nAll gram matrices are ignored, no results obtained.\n\n'
else:
self.do_cv()

# print out results as table.
if self._logging:
self._str_fw += self.printResultsInTable(self._param_list, self._param_list_pre_revised, self._average_val_scores, self._std_val_scores, self._average_perf_scores, self._std_perf_scores, self._average_train_scores, self._std_train_scores, self._gram_matrix_time, self._model_type, self._verbose)

# open file to save all results for this dataset.
if self._logging:
fn_out = os.path.join(self._output_dir, self._ds_name + '.output.txt')
if not os.path.exists(fn_out):
with open(fn_out, 'w') as f:
f.write(self._str_fw)
else:
with open(fn_out, 'r+') as f:
content = f.read()
f.seek(0, 0)
f.write(self._str_fw + '\n\n\n' + content)

return self._final_performance, self._final_confidence


def fit(self):
return


def compute_gram_matrices(self):
"""Compute all gram matrices.

Returns
-------
None.

"""
# Grid of parameters with a discrete number of values for each.
self._param_list_precomputed = list(ParameterGrid(self._param_grid_precomputed))
self._param_list = list(ParameterGrid(self._param_grid))

self._gram_matrices = [] # a list to store gram matrices for all param_grid_precomputed
self._gram_matrix_time = [] # a list to store time to calculate gram matrices
self._param_list_pre_revised = [] # a list to store precomputed param grids, ignoring the useless ones

if self._verbose:
print()
print('\n1. Computing gram matrices. This could take a while...')
if self._logging:
self._str_fw += '\nI. Gram matrices.\n\n'
self._tts = time.time() # start training time
nb_gm_ignore = 0 # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
for idx, params_out in enumerate(self._param_list_precomputed):
y = self._ds.targets[:]
params_out['n_jobs'] = self._n_jobs
params_out['verbose'] = self._verbose
# print(dataset)
# import networkx as nx
# nx.draw_networkx(dataset[1])
# plt.show()
rtn_data = self._estimator(self._ds.graphs[:], **params_out) # @todo: Attention! this will not copy the graphs.
Kmatrix = rtn_data[0]
current_run_time = rtn_data[1]
# for some kernels, some graphs in datasets may not meet the
# kernels' requirements for graph structure. These graphs are trimmed.
if len(rtn_data) == 3:
idx_trim = rtn_data[2] # the index of trimmed graph list
y = [y[idxt] for idxt in idx_trim] # trim y accordingly
# Kmatrix = np.random.rand(2250, 2250)
# current_run_time = 0.1

# remove graphs whose kernels with themselves are zeros
# @todo: y not changed accordingly?
Kmatrix_diag = Kmatrix.diagonal().copy()
nb_g_ignore = 0
for idxk, diag in enumerate(Kmatrix_diag):
if diag == 0:
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
nb_g_ignore += 1

# normalization
# @todo: works only for undirected graph?
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
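# The loop above performs cosine normalization: Kmatrix[i][j] is divided
# by sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]). A vectorized sketch of the
# same step (an equivalent rewrite, not part of this commit) would be:
# d = np.sqrt(Kmatrix_diag)
# Kmatrix /= np.outer(d, d)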
if self._verbose:
print()

if params_out == {}:
if self._verbose:
print('the gram matrix is: ')
if self._logging:
self._str_fw += 'the gram matrix is:\n\n'
else:
if self._verbose:
print('the gram matrix with parameters', params_out, 'is: \n\n')
if self._logging:
self._str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out

if len(Kmatrix) < 2:
nb_gm_ignore += 1
if self._verbose:
print('ignored, as at most one of its diagonal values is non-zero.')
if self._logging:
self._str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
else:
if np.isnan(Kmatrix).any(): # if the matrix contains elements that are not numbers
nb_gm_ignore += 1
if self._verbose:
print('ignored, as it contains elements that are not numbers.')
if self._logging:
self._str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
else:
# print(Kmatrix)
if self._logging:
self._str_fw += np.array2string(
Kmatrix,
separator=',') + '\n\n'
# separator=',',
# threshold=np.inf,
# floatmode='unique') + '\n\n'

# Draw and save Gram matrix figures.
if self._save_gm_figs:
fig_file_name = self._output_dir + '/GM[ds]' + self._ds_name
if params_out != {}:
fig_file_name += '[params]' + str(idx)
plt.imshow(Kmatrix)
plt.colorbar()
plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
# plt.show()
plt.clf()

self._gram_matrices.append(Kmatrix)
self._gram_matrix_time.append(current_run_time)
self._param_list_pre_revised.append(params_out)

if nb_g_ignore > 0:
if self._verbose:
print(', where %d graphs are ignored as their kernel values with themselves are zero.' % nb_g_ignore)
if self._logging:
self._str_fw += ', where %d graphs are ignored as their kernel values with themselves are zero.' % nb_g_ignore

if self._verbose:
print()
print('{} gram matrices are calculated, {} of which are ignored.'.format(len(self._param_list_precomputed), nb_gm_ignore))
if self._logging:
self._str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(self._param_list_precomputed), nb_gm_ignore)
self._str_fw += 'serial numbers of gram matrix figures and their corresponding parameter settings:\n\n'
self._str_fw += ''.join(['{}: {}\n'.format(idx, params_out) for idx, params_out in enumerate(self._param_list_precomputed)])


def do_cv(self):
# save gram matrices to file.
# np.savez(output_dir + '/' + ds_name + '.gm',
# gms=gram_matrices, params=param_list_pre_revised, y=y,
# gmtime=gram_matrix_time)
if self._verbose:
print('2. Fitting and predicting using nested cross validation. This could really take a while...')

# ---- use pool.imap_unordered to parallel and track progress. ----
# train_pref = []
# val_pref = []
# test_pref = []
# def func_assign(result, var_to_assign):
# for idx, itm in enumerate(var_to_assign):
# itm.append(result[idx])
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
#
# parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign,
# [train_pref, val_pref, test_pref], glbv=gram_matrices,
# method='imap_unordered', n_jobs=n_jobs, chunksize=1,
# itr_desc='cross validation')

def init_worker(gms_toshare):
global G_gms
G_gms = gms_toshare

# gram_matrices = np.array(gram_matrices)
# gms_shape = gram_matrices.shape
# gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
# pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))
pool = Pool(processes=self._n_jobs, initializer=init_worker, initargs=(self._gram_matrices,))
trial_do_partial = partial(self._parallel_trial_do, self._param_list_pre_revised, self._param_list, self._ds.targets[:], self._model_type) # @todo: maybe self._ds.targets[:] should be y.
train_pref = []
val_pref = []
test_pref = []
# if NUM_TRIALS < 1000 * n_jobs:
# chunksize = int(NUM_TRIALS / n_jobs) + 1
# else:
# chunksize = 1000
chunksize = 1
if self._verbose:
iterator = tqdm(pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize), desc='cross validation', file=sys.stdout)
else:
iterator = pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize)
for o1, o2, o3 in iterator:
train_pref.append(o1)
val_pref.append(o2)
test_pref.append(o3)
pool.close()
pool.join()

# # ---- use pool.map to parallel. ----
# pool = Pool(n_jobs)
# trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type)
# result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
# train_pref = [item[0] for item in result_perf]
# val_pref = [item[1] for item in result_perf]
# test_pref = [item[2] for item in result_perf]

# # ---- direct running, normally use a single CPU core. ----
# train_pref = []
# val_pref = []
# test_pref = []
# for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
# o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
# train_pref.append(o1)
# val_pref.append(o2)
# test_pref.append(o3)
# print()

if self._verbose:
print()
print('3. Getting final performance...')
if self._logging:
self._str_fw += '\nII. Performance.\n\n'

# averages and confidences of performances on outer trials for each combination of parameters
self._average_train_scores = np.mean(train_pref, axis=0)
# print('val_pref: ', val_pref[0][0])
self._average_val_scores = np.mean(val_pref, axis=0)
# print('test_pref: ', test_pref[0][0])
self._average_perf_scores = np.mean(test_pref, axis=0)
# sample std is used here
self._std_train_scores = np.std(train_pref, axis=0, ddof=1)
self._std_val_scores = np.std(val_pref, axis=0, ddof=1)
self._std_perf_scores = np.std(test_pref, axis=0, ddof=1)

if self._model_type == 'regression':
best_val_perf = np.amin(self._average_val_scores)
else:
best_val_perf = np.amax(self._average_val_scores)
# print('average_val_scores: ', self._average_val_scores)
# print('best_val_perf: ', best_val_perf)
# print()
best_params_index = np.where(self._average_val_scores == best_val_perf)
# find smallest val std with best val perf.
best_val_stds = [
self._std_val_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
min_val_std = np.amin(best_val_stds)
best_params_index = np.where(self._std_val_scores == min_val_std)
best_params_out = [self._param_list_pre_revised[i] for i in best_params_index[0]]
best_params_in = [self._param_list[i] for i in best_params_index[1]]
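# In short: keep the (params_out, params_in) pairs whose mean validation
# score is best, then break ties by the smallest validation std; the test
# performance below is reported for the selected pair(s).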

if self._verbose:
print('best_params_out: ', best_params_out)
print('best_params_in: ', best_params_in)
print()
print('best_val_perf: ', best_val_perf)
print('best_val_std: ', min_val_std)
if self._logging:
self._str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
self._str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
self._str_fw += 'best_val_perf: %s\n' % best_val_perf
self._str_fw += 'best_val_std: %s\n' % min_val_std

# print(best_params_index)
# print(best_params_index[0])
# print(self._average_perf_scores)
self._final_performance = [
self._average_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
self._final_confidence = [
self._std_perf_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]

if self._verbose:
print('final_performance: ', self._final_performance)
print('final_confidence: ', self._final_confidence)
if self._logging:
self._str_fw += 'final_performance: %s\n' % self._final_performance
self._str_fw += 'final_confidence: %s\n' % self._final_confidence

train_performance = [
self._average_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]
train_std = [
self._std_train_scores[value][best_params_index[1][idx]]
for idx, value in enumerate(best_params_index[0])
]

if self._verbose:
print('train_performance: %s' % train_performance)
print('train_std: ', train_std)
if self._logging:
self._str_fw += 'train_performance: %s\n' % train_performance
self._str_fw += 'train_std: %s\n\n' % train_std

if self._verbose:
print()

tt_total = time.time() - self._tts # training time for all hyper-parameters
average_gram_matrix_time = np.mean(self._gram_matrix_time)
std_gram_matrix_time = np.std(self._gram_matrix_time, ddof=1) if len(self._gram_matrix_time) > 1 else 0
best_gram_matrix_time = [self._gram_matrix_time[i] for i in best_params_index[0]]
ave_bgmt = np.mean(best_gram_matrix_time)
std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0

if self._verbose:
print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
.format(average_gram_matrix_time, std_gram_matrix_time))
print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
ave_bgmt, std_bgmt))
print('total training time with all hyper-param choices: {:.2f}s'.format(
tt_total))
if self._logging:
self._str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
self._str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
self._str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)

# # save results to file
# np.savetxt(results_name_pre + 'average_train_scores.dt',
# average_train_scores)
# np.savetxt(results_name_pre + 'average_val_scores', self._average_val_scores)
# np.savetxt(results_name_pre + 'average_perf_scores.dt',
# average_perf_scores)
# np.savetxt(results_name_pre + 'std_train_scores.dt', self._std_train_scores)
# np.savetxt(results_name_pre + 'std_val_scores.dt', self._std_val_scores)
# np.savetxt(results_name_pre + 'std_perf_scores.dt', self._std_perf_scores)

# np.save(results_name_pre + 'best_params_index', best_params_index)
# np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
# np.save(results_name_pre + 'best_params_in.dt', best_params_in)
# np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
# np.save(results_name_pre + 'best_val_std.dt', best_val_std)
# np.save(results_name_pre + 'final_performance.dt', self._final_performance)
# np.save(results_name_pre + 'final_confidence.dt', self._final_confidence)
# np.save(results_name_pre + 'train_performance.dt', train_performance)
# np.save(results_name_pre + 'train_std.dt', train_std)

# np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
# np.save(results_name_pre + 'average_gram_matrix_time.dt',
# average_gram_matrix_time)
# np.save(results_name_pre + 'std_gram_matrix_time.dt',
# std_gram_matrix_time)
# np.save(results_name_pre + 'best_gram_matrix_time.dt',
# best_gram_matrix_time)


def _trial_do(self, param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level

# # get gram matrices from global variables.
# gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')

# Arrays to store scores
train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))

# randomness added to seeds of split function below. "high" is "size" times
# 10 so that at least 10 different random outputs will be yielded. Remove
# these lines if identical outputs are required.
rdm_out = np.random.RandomState(seed=None)
rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
size=len(param_list_pre_revised))
# print(trial, rdm_seed_out_l)
# print()
# loop for each outer param tuple
for index_out, params_out in enumerate(param_list_pre_revised):
# get gram matrices from global variables.
# gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
# gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
gm_now = gram_matrices[index_out].copy()

# split gram matrix and y to app and test sets.
indices = range(len(y))
# The argument "random_state" in function "train_test_split" can not be
# set to None, because it will use RandomState instance used by
# np.random, which is possible for multiple subprocesses to inherit the
# same seed if they forked at the same time, leading to identical
# random variates for different subprocesses. Instead, we use "trial"
# and "index_out" parameters to generate different seeds for different
# trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add
# randomness into seeds, so that it yields a different output every
# time the program is run. To yield identical outputs every time,
# remove the second line below. Same method is used to the "KFold"
# function in the inner loop.
rdm_seed_out = (trial + 1) * (index_out + 1)
rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
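# The modulo keeps the seed within the range accepted for random states
# by numpy/scikit-learn, i.e. integers in [0, 2**32 - 1].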
# print(trial, rdm_seed_out)
X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
gm_now, y, indices, test_size=0.1,
random_state=rdm_seed_out, shuffle=True)
# print(trial, idx_app, idx_test)
# print()
X_app = X_app[:, idx_app]
X_test = X_test[:, idx_app]
y_app = np.array(y_app)
y_test = np.array(y_test)

rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
size=len(param_list))
# loop for each inner param tuple
for index_in, params_in in enumerate(param_list):
# if trial == 0:
# print(index_out, index_in)
# print('params_in: ', params_in)
# st = time.time()
rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
# print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
# print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
current_train_perf = []
current_valid_perf = []
current_test_perf = []

# For regression use the Kernel Ridge method
# try:
if self._model_type == 'regression':
kr = KernelRidge(kernel='precomputed', **params_in)
# loop for each split on validation set level
for train_index, valid_index in inner_cv.split(X_app):
# print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
kr.fit(X_app[train_index, :][:, train_index],
y_app[train_index])

# predict on the train, validation and test set
y_pred_train = kr.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = kr.predict(
X_app[valid_index, :][:, train_index])
# if trial == 0:
# print('y_pred_valid: ', y_pred_valid)
# print()
y_pred_test = kr.predict(
X_test[:, train_index])

# root mean squared errors
current_train_perf.append(
np.sqrt(
mean_squared_error(
y_app[train_index], y_pred_train)))
current_valid_perf.append(
np.sqrt(
mean_squared_error(
y_app[valid_index], y_pred_valid)))
# if trial == 0:
# print(mean_squared_error(
# y_app[valid_index], y_pred_valid))
current_test_perf.append(
np.sqrt(
mean_squared_error(
y_test, y_pred_test)))
# For classification use SVM
else:
svc = SVC(kernel='precomputed', cache_size=200,
verbose=False, **params_in)
# loop for each split on validation set level
for train_index, valid_index in inner_cv.split(X_app):
# np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index])
# if trial == 0:
# print('train_index: ', train_index)
# print('valid_index: ', valid_index)
# print('idx_test: ', idx_test)
# print('y_app[train_index]: ', y_app[train_index])
# print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
# print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
svc.fit(X_app[train_index, :][:, train_index],
y_app[train_index])

# predict on the train, validation and test set
y_pred_train = svc.predict(
X_app[train_index, :][:, train_index])
y_pred_valid = svc.predict(
X_app[valid_index, :][:, train_index])
y_pred_test = svc.predict(
X_test[:, train_index])

# accuracy scores
current_train_perf.append(
accuracy_score(y_app[train_index],
y_pred_train))
current_valid_perf.append(
accuracy_score(y_app[valid_index],
y_pred_valid))
current_test_perf.append(
accuracy_score(y_test, y_pred_test))
# except ValueError:
# print(sys.exc_info()[0])
# print(params_out, params_in)

# average performance on inner splits
train_pref[index_out][index_in] = np.mean(
current_train_perf)
val_pref[index_out][index_in] = np.mean(
current_valid_perf)
test_pref[index_out][index_in] = np.mean(
current_test_perf)
# print(time.time() - st)
# if trial == 0:
# print('val_pref: ', val_pref)
# print('test_pref: ', test_pref)

return train_pref, val_pref, test_pref


def _parallel_trial_do(self, param_list_pre_revised, param_list, y, model_type, trial):
train_pref, val_pref, test_pref = self._trial_do(param_list_pre_revised,
param_list, G_gms, y,
model_type, trial)
return train_pref, val_pref, test_pref


def printResultsInTable(self, param_list, param_list_pre_revised, average_val_scores,
std_val_scores, average_perf_scores, std_perf_scores,
average_train_scores, std_train_scores, gram_matrix_time,
model_type, verbose):
from collections import OrderedDict
from tabulate import tabulate
table_dict = {}
if model_type == 'regression':
for param_in in param_list:
param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
else:
for param_in in param_list:
param_in['C'] = '{:.2e}'.format(param_in['C'])
table_dict['params'] = [{**param_out, **param_in}
for param_in in param_list for param_out in param_list_pre_revised]
table_dict['gram_matrix_time'] = [
'{:.2f}'.format(gram_matrix_time[index_out])
for param_in in param_list
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['valid_perf'] = [
'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
std_val_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['test_perf'] = [
'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
std_perf_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]
table_dict['train_perf'] = [
'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
std_train_scores[index_out][index_in])
for index_in, _ in enumerate(param_list)
for index_out, _ in enumerate(param_list_pre_revised)
]

keyorder = [
'params', 'train_perf', 'valid_perf', 'test_perf',
'gram_matrix_time'
]
if verbose:
print()
tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
key=lambda i: keyorder.index(i[0]))), headers='keys')
# print(tb_print)
return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
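
For reference, a minimal usage sketch of the class above, mirroring how workflow.py (below) calls it. The kernel key and both parameter grids are placeholders, not values prescribed by this commit:

import numpy as np
from gklearn.dataset import Dataset
from gklearn.kernels import GRAPH_KERNELS
from gklearn.model_learning import NestedCV

ds = Dataset('MUTAG')  # a dataset name known to gklearn
estimator = GRAPH_KERNELS['ShortestPath']  # hypothetical kernel key
# NestedCV runs the whole procedure from its constructor (see run()): it
# computes the gram matrices, performs the nested CV and writes the log file.
results = NestedCV(ds, estimator,
                   param_grid_precomputed={'node_kernels': [None]},  # placeholder grid
                   param_grid={'C': np.logspace(-10, 10, num=41, base=10)},
                   model_type='classification', num_trials=30, n_jobs=4)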

+ 109 - 0 gklearn/model_learning/workflow.py

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 19:33:51 2020

@author: ljia
"""
import os
import numpy as np
import pickle
from gklearn.dataset import Dataset
from gklearn.model_learning import NestedCV
from gklearn.kernels import GRAPH_KERNELS

class Workflow(object):


def __init__(self, **kwargs):
self._job_prefix = kwargs.get('job_prefix', 'gktask')
self._max_num_running_tasks = kwargs.get('max_num_running_tasks', np.inf)
self._root_dir = kwargs.get('root_dir', 'outputs/')


def run(self, tasks):
### Check inputs.
if self._check_inputs(tasks):
self._tasks = tasks
else:
raise ValueError('Invalid "tasks": expected a list of dicts, each with "kernel" and "dataset" keys.')


### Sort tasks.
self.sort_tasks_by_complexity()


### The main process.
complete = False
while not complete:

self.get_running_tasks()

if self._num_running_tasks < self._max_num_running_tasks:

### Load results from table.
self.load_results_from_table()

for task in self._tasks:
state = self.get_task_state(task)
if state != 'complete' and state != 'running':
self.run_task(task)

if self._num_running_tasks >= self._max_num_running_tasks:
break

### Save results.
self.save_results()

complete = self.check_completeness()

# sleep()


def _check_inputs(self, tasks):
if not isinstance(tasks, list):
return False
else:
for i in tasks:
if 'kernel' not in i or 'dataset' not in i:
return False
return True


def sort_tasks_by_complexity(self):
return


def get_running_tasks(self):
command = 'squeue --user $USER --format "%.50j" --noheader'
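# In the squeue format string, "%.50j" prints each job's name
# (right-justified, up to 50 characters); --noheader drops the header line,
# so every output line is a job name to match against the job prefix.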
stream = os.popen(command)
output = stream.readlines()
running_tasks = [o for o in output if o.strip().startswith(self._job_prefix)]
self._num_running_tasks = len(running_tasks)


def load_results_from_table(self):
pass


def get_task_state(self, task):
task_dir = os.path.join(self._root_dir, task['kernel'] + '.' + task['dataset'] + '/')
fn_summary = os.path.join(task_dir, 'results_summary.pkl')
if os.path.isfile(fn_summary):
with open(fn_summary, 'rb') as f: output = pickle.load(f)
state = output['state']
return state
else:
return 'unstarted'


def run_task(self, task):
ds_name = task['dataset']
k_name = task['kernel']

# Get dataset.
ds = Dataset(ds_name)
graph_kernel = GRAPH_KERNELS[k_name]

# Start CV.
results = NestedCV(ds, graph_kernel)
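
A minimal usage sketch of the Workflow class above, assuming Slurm is available for get_running_tasks(); the kernel and dataset names are placeholder entries:

from gklearn.model_learning import Workflow

tasks = [
    {'kernel': 'ShortestPath', 'dataset': 'MUTAG'},  # hypothetical task
    {'kernel': 'Treelet', 'dataset': 'Acyclic'},  # hypothetical task
]
wf = Workflow(job_prefix='gktask', max_num_running_tasks=20, root_dir='outputs/')
wf.run(tasks)  # polls running jobs and launches unfinished tasks until all complete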
