From ed170105357449d07d5becf701dac7ff0b8029c1 Mon Sep 17 00:00:00 2001
From: jajupmochi
Date: Sat, 22 May 2021 11:59:05 +0200
Subject: [PATCH] [Major Feature][Todo] Add model_learning module, including a
 new NestedCV class and a Workflow class for graph kernel computation.

---
 gklearn/model_learning/__init__.py  |  14 +
 gklearn/model_learning/nested_cv.py | 714 ++++++++++++++++++++++++++++++++++++
 gklearn/model_learning/workflow.py  | 109 ++++++
 3 files changed, 837 insertions(+)
 create mode 100644 gklearn/model_learning/__init__.py
 create mode 100644 gklearn/model_learning/nested_cv.py
 create mode 100644 gklearn/model_learning/workflow.py

diff --git a/gklearn/model_learning/__init__.py b/gklearn/model_learning/__init__.py
new file mode 100644
index 0000000..d4c6aaa
--- /dev/null
+++ b/gklearn/model_learning/__init__.py
@@ -0,0 +1,14 @@
+# -*-coding:utf-8 -*-
+"""
+model learning.
+"""
+
+# info
+__version__ = "0.2"
+__author__ = "Linlin Jia"
+__date__ = "November 2020"
+
+
+from gklearn.model_learning.nested_cv import NestedCV
+from gklearn.model_learning.workflow import Workflow
+from gklearn.model_learning.parameters import dichotomous_permutation
\ No newline at end of file
diff --git a/gklearn/model_learning/nested_cv.py b/gklearn/model_learning/nested_cv.py
new file mode 100644
index 0000000..92bc8f9
--- /dev/null
+++ b/gklearn/model_learning/nested_cv.py
@@ -0,0 +1,714 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov 27 18:59:28 2020
+
+@author: ljia
+"""
+import os
+import datetime
+import time
+import sys
+from tqdm import tqdm
+from multiprocessing import Pool, Array
+from functools import partial
+import numpy as np
+from matplotlib import pyplot as plt
+from sklearn.model_selection import KFold, train_test_split, ParameterGrid
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score, mean_squared_error
+
+
+class NestedCV(object):
+	"""Perform model selection, fitting and testing for precomputed kernels
+	using nested CV. Print out necessary data during the process, and finally
+	the results.
+
+	Parameters
+	----------
+	dataset : gklearn.dataset.Dataset
+		The dataset on which the model is built.
+	estimator : function
+		Kernel function used for estimation. This function needs to return a
+		gram matrix.
+	param_grid_precomputed : dictionary
+		Dictionary with names (string) of parameters used to calculate gram
+		matrices as keys and lists of parameter settings to try as values. This
+		enables searching over any sequence of parameter settings. Params with
+		length 1 will be omitted.
+	param_grid : dictionary
+		Dictionary with names (string) of parameters used as penalties as keys
+		and lists of parameter settings to try as values. This enables
+		searching over any sequence of parameter settings. Params with length 1
+		will be omitted.
+	model_type : string
+		Type of the problem, can be 'regression' or 'classification'. If None,
+		it is taken from the task type of the dataset.
+	num_trials : integer
+		Number of random trials of the outer CV loop. The default is 30.
+	output_dir : string
+		Directory to save the results. The default is 'outputs/' plus the name
+		of the estimator.
+	n_jobs : int
+		Number of jobs for parallelization.
+	save_gms : boolean
+		Whether to save gram matrices. The default is True.
+	save_gm_figs : boolean
+		Whether to draw and save figures of gram matrices. The default is
+		False.
+	logging : boolean
+		Whether to log results to a file. The default is True.
+	verbose : boolean
+		Whether to print out the progress. The default is True.
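+
+	Notes
+	-----
+	Each outer trial splits the data into an app set (90%) and a test set
+	(10%); an inner 10-fold cross validation on the app set then scores every
+	combination of the two parameter grids (see trial_do below).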
+
+	Examples
+	--------
+	>>> import numpy as np
+	>>> from gklearn.dataset import Dataset
+	>>> from gklearn.model_learning import NestedCV
+	>>> from gklearn.kernels.untilHPathKernel import untilhpathkernel
+	>>>
+	>>> dataset = Dataset('MUTAG')
+	>>> estimator = untilhpathkernel
+	>>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func':
+			['MinMax', 'tanimoto'], 'compute_method': ['trie']}
+	>>> # 'C' for classification problems and 'alpha' for regression problems.
+	>>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha':
+			np.logspace(-10, 10, num=41, base=10)}]
+	>>>
+	>>> cv = NestedCV(dataset, estimator, param_grid_precomputed,
+			param_grid[0], 'classification')
+	"""
+
+	def __init__(self, dataset, estimator, param_grid_precomputed=None, param_grid=None, model_type=None, num_trials=30, output_dir=None, n_jobs=1, save_gms=True, save_gm_figs=False, logging=True, verbose=True, **kwargs):
+		tqdm.monitor_interval = 0
+		self._ds = dataset
+		self._estimator = estimator
+		self._num_trials = num_trials
+		self._n_jobs = n_jobs
+		self._save_gms = save_gms
+		self._save_gm_figs = save_gm_figs
+		self._logging = logging
+		self._verbose = verbose
+		self._kwargs = kwargs
+
+		# Set dataset name.
+		if self._ds._ds_name is None:
+			self._ds_name = 'ds-unknown'
+		else:
+			self._ds_name = self._ds._ds_name
+
+		# The output directory.
+		if output_dir is None:
+			self._output_dir = os.path.join('outputs/', estimator.__name__)
+		else:
+			self._output_dir = output_dir
+		os.makedirs(self._output_dir, exist_ok=True)
+
+		# Set up the model type.
+		if model_type is None:
+			self._model_type = dataset._task_type
+		else:
+			self._model_type = model_type.lower()
+		if self._model_type != 'regression' and self._model_type != 'classification':
+			raise Exception('The model type is incorrect! Please choose from regression or classification.')
+
+		# @todo: Set param_grid_precomputed and param_grid.
+		self._param_grid_precomputed = param_grid_precomputed
+		self._param_grid = param_grid
+
+		if self._verbose:
+			print()
+			print('--- This is a %s problem ---' % self._model_type)
+		# A string to save all the results.
+		if self._logging:
+			self._str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
+			self._str_fw += '# This file contains results of ' + self._estimator.__name__ + ' on dataset ' + self._ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
+			self._str_fw += 'This is a %s problem.\n' % self._model_type
+
+		self.run()
+
+
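+	# run() drives the whole pipeline: fit() (currently a placeholder), gram
+	# matrix computation over param_grid_precomputed, nested CV over
+	# param_grid and, when logging is enabled, the writing of all collected
+	# results to the output file.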
+	def run(self):
+		self.fit()
+		self.compute_gram_matrices()
+		if len(self._gram_matrices) == 0:
+			if self._verbose:
+				print('All gram matrices are ignored, no results obtained.')
+			if self._logging:
+				self._str_fw += '\nAll gram matrices are ignored, no results obtained.\n\n'
+			self._final_performance = None
+			self._final_confidence = None
+		else:
+			self.do_cv()
+
+			# Print out results as table.
+			if self._logging:
+				self._str_fw += self.printResultsInTable(self._param_list, self._param_list_pre_revised, self._average_val_scores, self._std_val_scores, self._average_perf_scores, self._std_perf_scores, self._average_train_scores, self._std_train_scores, self._gram_matrix_time, self._model_type, self._verbose)
+
+		# Open the file to save all results for this dataset. If the file
+		# already exists, the new results are prepended to its content.
+		if self._logging:
+			fn_out = self._output_dir + '/' + self._ds_name + '.output.txt'
+			if not os.path.exists(fn_out):
+				with open(fn_out, 'w') as f:
+					f.write(self._str_fw)
+			else:
+				with open(fn_out, 'r+') as f:
+					content = f.read()
+					f.seek(0, 0)
+					f.write(self._str_fw + '\n\n\n' + content)
+
+		return self._final_performance, self._final_confidence
+
+
+	def fit(self):
+		return
+
+
+	def compute_gram_matrices(self):
+		"""Compute all gram matrices.
+
+		Returns
+		-------
+		None.
+
+		"""
+		# Grid of parameters with a discrete number of values for each.
+		self._param_list_precomputed = list(ParameterGrid(self._param_grid_precomputed))
+		self._param_list = list(ParameterGrid(self._param_grid))
+
+		self._gram_matrices = [
+		]  # a list to store gram matrices for all param_grid_precomputed
+		self._gram_matrix_time = [
+		]  # a list to store time to calculate gram matrices
+		self._param_list_pre_revised = [
+		]  # list to store param grids precomputed ignoring the useless ones
+
+		if self._verbose:
+			print()
+			print('\n1. Computing gram matrices. This could take a while...')
+		if self._logging:
+			self._str_fw += '\nI. Gram matrices.\n\n'
+		self._tts = time.time()  # start training time
+		nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
+		for idx, params_out in enumerate(self._param_list_precomputed):
+			y = self._ds.targets[:]
+			params_out['n_jobs'] = self._n_jobs
+			params_out['verbose'] = self._verbose
+#			print(dataset)
+#			import networkx as nx
+#			nx.draw_networkx(dataset[1])
+#			plt.show()
+			rtn_data = self._estimator(self._ds.graphs[:], **params_out)  # @todo: Attention! this will not copy the graphs.
+			Kmatrix = rtn_data[0]
+			current_run_time = rtn_data[1]
+			# For some kernels, some graphs in datasets may not meet the
+			# kernels' requirements for graph structure. These graphs are trimmed.
+			if len(rtn_data) == 3:
+				idx_trim = rtn_data[2]  # the index of trimmed graph list
+				y = [y[idxt] for idxt in idx_trim]  # trim y accordingly
+#			Kmatrix = np.random.rand(2250, 2250)
+#			current_run_time = 0.1
+
+			# Remove graphs whose kernels with themselves are zeros.
+			# @todo: y not changed accordingly?
+			Kmatrix_diag = Kmatrix.diagonal().copy()
+			nb_g_ignore = 0
+			for idxk, diag in enumerate(Kmatrix_diag):
+				if diag == 0:
+					Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
+					Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
+					nb_g_ignore += 1
+
+			# Normalization.
+			# @todo: works only for undirected graph?
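+			# The loop below applies cosine normalization:
+			#     Kmatrix[i][j] <- Kmatrix[i][j] / sqrt(Kmatrix[i][i] * Kmatrix[j][j]),
+			# so that all diagonal entries of the gram matrix become 1.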
+			Kmatrix_diag = Kmatrix.diagonal().copy()
+			for i in range(len(Kmatrix)):
+				for j in range(i, len(Kmatrix)):
+					Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
+					Kmatrix[j][i] = Kmatrix[i][j]
+			if self._verbose:
+				print()
+
+			if params_out == {}:
+				if self._verbose:
+					print('the gram matrix is: ')
+				if self._logging:
+					self._str_fw += 'the gram matrix is:\n\n'
+			else:
+				if self._verbose:
+					print('the gram matrix with parameters', params_out, 'is: \n\n')
+				if self._logging:
+					self._str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
+
+			if len(Kmatrix) < 2:
+				nb_gm_ignore += 1
+				if self._verbose:
+					print('ignored, as at most one of its diagonal values is non-zero.')
+				if self._logging:
+					self._str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
+			else:
+				if np.isnan(Kmatrix).any(
+				):  # if the matrix contains elements that are not numbers
+					nb_gm_ignore += 1
+					if self._verbose:
+						print('ignored, as it contains elements that are not numbers.')
+					if self._logging:
+						self._str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
+				else:
+#					print(Kmatrix)
+					if self._logging:
+						self._str_fw += np.array2string(
+								Kmatrix,
+								separator=',') + '\n\n'
+#								separator=',',
+#								threshold=np.inf,
+#								floatmode='unique') + '\n\n'
+
+					# Draw and save Gram matrix figures.
+					if self._save_gm_figs:
+						fig_file_name = self._output_dir + '/GM[ds]' + self._ds_name
+						if params_out != {}:
+							fig_file_name += '[params]' + str(idx)
+						plt.imshow(Kmatrix)
+						plt.colorbar()
+						plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
+#						plt.show()
+						plt.clf()
+
+					self._gram_matrices.append(Kmatrix)
+					self._gram_matrix_time.append(current_run_time)
+					self._param_list_pre_revised.append(params_out)
+
+					if nb_g_ignore > 0:
+						if self._verbose:
+							print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
+						if self._logging:
+							self._str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
+
+		if self._verbose:
+			print()
+			print('{} gram matrices are calculated, {} of which are ignored.'.format(len(self._param_list_precomputed), nb_gm_ignore))
+		if self._logging:
+			self._str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(self._param_list_precomputed), nb_gm_ignore)
+			self._str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
+			self._str_fw += ''.join(['{}: {}\n'.format(idx, params_out) for idx, params_out in enumerate(self._param_list_precomputed)])
+
+
+	def do_cv(self):
+#		save gram matrices to file.
+#		np.savez(output_dir + '/' + ds_name + '.gm',
+#				 gms=gram_matrices, params=param_list_pre_revised, y=y,
+#				 gmtime=gram_matrix_time)
+		if self._verbose:
+			print('2. Fitting and predicting using nested cross validation. This could really take a while...')
+
+		# ---- use pool.imap_unordered to parallelize and track progress. ----
+#		train_pref = []
+#		val_pref = []
+#		test_pref = []
+#		def func_assign(result, var_to_assign):
+#			for idx, itm in enumerate(var_to_assign):
+#				itm.append(result[idx])
+#		trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
+#
+#		parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign,
+#					[train_pref, val_pref, test_pref], glbv=gram_matrices,
+#					method='imap_unordered', n_jobs=n_jobs, chunksize=1,
+#					itr_desc='cross validation')
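+
+		# The gram matrices are passed to each worker process once, through
+		# the pool initializer below, and read back via the module-level
+		# global G_gms; this avoids re-pickling the matrices for every trial.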
+		def init_worker(gms_toshare):
+			global G_gms
+			G_gms = gms_toshare
+
+#		gram_matrices = np.array(gram_matrices)
+#		gms_shape = gram_matrices.shape
+#		gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
+#		pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))
+		pool = Pool(processes=self._n_jobs, initializer=init_worker, initargs=(self._gram_matrices,))
+		trial_do_partial = partial(self._parallel_trial_do, self._param_list_pre_revised, self._param_list, self._ds.targets[:], self._model_type)  # @todo: maybe self._ds.targets[:] should be y.
+		train_pref = []
+		val_pref = []
+		test_pref = []
+#		if NUM_TRIALS < 1000 * n_jobs:
+#			chunksize = int(NUM_TRIALS / n_jobs) + 1
+#		else:
+#			chunksize = 1000
+		chunksize = 1
+		if self._verbose:
+			iterator = tqdm(pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize), desc='cross validation', file=sys.stdout)
+		else:
+			iterator = pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize)
+		for o1, o2, o3 in iterator:
+			train_pref.append(o1)
+			val_pref.append(o2)
+			test_pref.append(o3)
+		pool.close()
+		pool.join()
+
+#		# ---- use pool.map to parallelize. ----
+#		pool = Pool(n_jobs)
+#		trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type)
+#		result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
+#		train_pref = [item[0] for item in result_perf]
+#		val_pref = [item[1] for item in result_perf]
+#		test_pref = [item[2] for item in result_perf]
+
+#		# ---- direct running, normally use a single CPU core. ----
+#		train_pref = []
+#		val_pref = []
+#		test_pref = []
+#		for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
+#			o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
+#			train_pref.append(o1)
+#			val_pref.append(o2)
+#			test_pref.append(o3)
+#		print()
+
+		if self._verbose:
+			print()
+			print('3. Getting final performance...')
+		if self._logging:
+			self._str_fw += '\nII. Performance.\n\n'
+
+		# Averages and confidences of performances on outer trials for each
+		# combination of parameters.
+		self._average_train_scores = np.mean(train_pref, axis=0)
+#		print('val_pref: ', val_pref[0][0])
+		self._average_val_scores = np.mean(val_pref, axis=0)
+#		print('test_pref: ', test_pref[0][0])
+		self._average_perf_scores = np.mean(test_pref, axis=0)
+		# Sample std is used here.
+		self._std_train_scores = np.std(train_pref, axis=0, ddof=1)
+		self._std_val_scores = np.std(val_pref, axis=0, ddof=1)
+		self._std_perf_scores = np.std(test_pref, axis=0, ddof=1)
+
+		if self._model_type == 'regression':
+			best_val_perf = np.amin(self._average_val_scores)
+		else:
+			best_val_perf = np.amax(self._average_val_scores)
+#		print('average_val_scores: ', self._average_val_scores)
+#		print('best_val_perf: ', best_val_perf)
+#		print()
+		best_params_index = np.where(self._average_val_scores == best_val_perf)
+		# Find smallest val std with best val perf.
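+		# best_params_index holds the (row, column) positions of all entries
+		# that reach the best mean validation score; rows index the
+		# gram-matrix settings and columns the penalty settings.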
+		best_val_stds = [
+				self._std_val_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+		min_val_std = np.amin(best_val_stds)
+		best_params_index = np.where(self._std_val_scores == min_val_std)
+		best_params_out = [self._param_list_pre_revised[i] for i in best_params_index[0]]
+		best_params_in = [self._param_list[i] for i in best_params_index[1]]
+
+		if self._verbose:
+			print('best_params_out: ', best_params_out)
+			print('best_params_in: ', best_params_in)
+			print()
+			print('best_val_perf: ', best_val_perf)
+			print('best_val_std: ', min_val_std)
+		if self._logging:
+			self._str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
+			self._str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
+			self._str_fw += 'best_val_perf: %s\n' % best_val_perf
+			self._str_fw += 'best_val_std: %s\n' % min_val_std
+
+#		print(best_params_index)
+#		print(best_params_index[0])
+#		print(self._average_perf_scores)
+		self._final_performance = [
+				self._average_perf_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+		self._final_confidence = [
+				self._std_perf_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+
+		if self._verbose:
+			print('final_performance: ', self._final_performance)
+			print('final_confidence: ', self._final_confidence)
+		if self._logging:
+			self._str_fw += 'final_performance: %s\n' % self._final_performance
+			self._str_fw += 'final_confidence: %s\n' % self._final_confidence
+
+		train_performance = [
+				self._average_train_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+		train_std = [
+				self._std_train_scores[value][best_params_index[1][idx]]
+				for idx, value in enumerate(best_params_index[0])
+		]
+
+		if self._verbose:
+			print('train_performance: %s' % train_performance)
+			print('train_std: ', train_std)
+		if self._logging:
+			self._str_fw += 'train_performance: %s\n' % train_performance
+			self._str_fw += 'train_std: %s\n\n' % train_std
+
+		if self._verbose:
+			print()
+
+		tt_total = time.time() - self._tts  # training time for all hyper-parameters
+		average_gram_matrix_time = np.mean(self._gram_matrix_time)
+		std_gram_matrix_time = np.std(self._gram_matrix_time, ddof=1) if len(self._gram_matrix_time) > 1 else 0
+		best_gram_matrix_time = [self._gram_matrix_time[i] for i in best_params_index[0]]
+		ave_bgmt = np.mean(best_gram_matrix_time)
+		std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0
+
+		if self._verbose:
+			print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
+				  .format(average_gram_matrix_time, std_gram_matrix_time))
+			print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
+				  ave_bgmt, std_bgmt))
+			print('total training time with all hyper-param choices: {:.2f}s'.format(
+				  tt_total))
+		if self._logging:
+			self._str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
+			self._str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
+			self._str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)
+
+#		# save results to file
+#		np.savetxt(results_name_pre + 'average_train_scores.dt',
+#				   average_train_scores)
+#		np.savetxt(results_name_pre + 'average_val_scores', self._average_val_scores)
+#		np.savetxt(results_name_pre + 'average_perf_scores.dt',
+#				   average_perf_scores)
+#		np.savetxt(results_name_pre + 'std_train_scores.dt', self._std_train_scores)
+#		np.savetxt(results_name_pre + 'std_val_scores.dt', self._std_val_scores)
+#		np.savetxt(results_name_pre + 'std_perf_scores.dt', self._std_perf_scores)
+
+#		np.save(results_name_pre + 'best_params_index', best_params_index)
+#		np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
+#		np.save(results_name_pre + 'best_params_in.dt', best_params_in)
+#		np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
+#		np.save(results_name_pre + 'best_val_std.dt', best_val_std)
+#		np.save(results_name_pre + 'final_performance.dt', self._final_performance)
+#		np.save(results_name_pre + 'final_confidence.dt', self._final_confidence)
+#		np.save(results_name_pre + 'train_performance.dt', train_performance)
+#		np.save(results_name_pre + 'train_std.dt', train_std)
+
+#		np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
+#		np.save(results_name_pre + 'average_gram_matrix_time.dt',
+#				average_gram_matrix_time)
+#		np.save(results_name_pre + 'std_gram_matrix_time.dt',
+#				std_gram_matrix_time)
+#		np.save(results_name_pre + 'best_gram_matrix_time.dt',
+#				best_gram_matrix_time)
+
+
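+	# Each call to trial_do runs one outer trial: the gram matrix is split
+	# into an app set (90%) and a test set (10%), and a 10-fold CV on the app
+	# set scores every combination of precomputed and penalty parameters.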
+	def trial_do(self, param_list_pre_revised, param_list, gram_matrices, y, model_type, trial):  # Test set level.
+
+#		# get gram matrices from global variables.
+#		gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')
+
+		# Arrays to store scores.
+		train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+		val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+		test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
+
+		# Randomness is added to the seeds of the split function below. "high"
+		# is "size" times 10 so that at least 10 different random outputs will
+		# be yielded. Remove these lines if identical outputs are required.
+		rdm_out = np.random.RandomState(seed=None)
+		rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
+										 size=len(param_list_pre_revised))
+#		print(trial, rdm_seed_out_l)
+#		print()
+		# loop for each outer param tuple
+		for index_out, params_out in enumerate(param_list_pre_revised):
+			# get gram matrices from global variables.
+#			gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
+#			gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
+			gm_now = gram_matrices[index_out].copy()
+
+			# Split gram matrix and y to app and test sets.
+			indices = range(len(y))
+			# The argument "random_state" in the function "train_test_split"
+			# cannot be set to None, because it would use the RandomState
+			# instance used by np.random, and it is possible for multiple
+			# subprocesses to inherit the same seed if they forked at the same
+			# time, leading to identical random variates for different
+			# subprocesses. Instead, the "trial" and "index_out" parameters
+			# are used to generate different seeds for different
+			# trials/subprocesses and outer loops. "rdm_seed_out_l" is used to
+			# add randomness into seeds, so that a different output is yielded
+			# every time the program is run. To yield identical outputs every
+			# time, remove the second line below. The same method is used for
+			# the "KFold" function in the inner loop.
+			rdm_seed_out = (trial + 1) * (index_out + 1)
+			rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
+#			print(trial, rdm_seed_out)
+			X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
+					gm_now, y, indices, test_size=0.1,
+					random_state=rdm_seed_out, shuffle=True)
+#			print(trial, idx_app, idx_test)
+#			print()
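+			# For a precomputed kernel, each row of X_app / X_test holds the
+			# similarities between one split sample and all samples; keeping
+			# only the columns in idx_app makes X_app a square (n_app x n_app)
+			# kernel matrix and X_test an (n_test x n_app) matrix of
+			# test-vs-app similarities.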
+			X_app = X_app[:, idx_app]
+			X_test = X_test[:, idx_app]
+			y_app = np.array(y_app)
+			y_test = np.array(y_test)
+
+			rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
+											size=len(param_list))
+			# loop for each inner param tuple
+			for index_in, params_in in enumerate(param_list):
+#				if trial == 0:
+#					print(index_out, index_in)
+#					print('params_in: ', params_in)
+#				st = time.time()
+				rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
+#				print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
+				rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
+#				print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
+				inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
+				current_train_perf = []
+				current_valid_perf = []
+				current_test_perf = []
+
+				# For regression use the Kernel Ridge method.
+#				try:
+				if self._model_type == 'regression':
+					kr = KernelRidge(kernel='precomputed', **params_in)
+					# loop for each split on validation set level
+					# validation set level
+					for train_index, valid_index in inner_cv.split(X_app):
+#						print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
+#						if trial == 0:
+#							print('train_index: ', train_index)
+#							print('valid_index: ', valid_index)
+#							print('idx_test: ', idx_test)
+#							print('y_app[train_index]: ', y_app[train_index])
+#							print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
+#							print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
+						kr.fit(X_app[train_index, :][:, train_index],
+							   y_app[train_index])
+
+						# predict on the train, validation and test set
+						y_pred_train = kr.predict(
+								X_app[train_index, :][:, train_index])
+						y_pred_valid = kr.predict(
+								X_app[valid_index, :][:, train_index])
+#						if trial == 0:
+#							print('y_pred_valid: ', y_pred_valid)
+#							print()
+						y_pred_test = kr.predict(
+								X_test[:, train_index])
+
+						# root mean squared errors
+						current_train_perf.append(
+								np.sqrt(
+										mean_squared_error(
+												y_app[train_index], y_pred_train)))
+						current_valid_perf.append(
+								np.sqrt(
+										mean_squared_error(
+												y_app[valid_index], y_pred_valid)))
+#						if trial == 0:
+#							print(mean_squared_error(
+#									y_app[valid_index], y_pred_valid))
+						current_test_perf.append(
+								np.sqrt(
+										mean_squared_error(
+												y_test, y_pred_test)))
+				# For classification use SVM.
+				else:
+					svc = SVC(kernel='precomputed', cache_size=200,
+							  verbose=False, **params_in)
+					# loop for each split on validation set level
+					# validation set level
+					for train_index, valid_index in inner_cv.split(X_app):
+#						np.savez("bug.npy", X_app[train_index, :][:, train_index], y_app[train_index])
+#						if trial == 0:
+#							print('train_index: ', train_index)
+#							print('valid_index: ', valid_index)
+#							print('idx_test: ', idx_test)
+#							print('y_app[train_index]: ', y_app[train_index])
+#							print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
+#							print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
+						svc.fit(X_app[train_index, :][:, train_index],
+								y_app[train_index])
+
+						# predict on the train, validation and test set
+						y_pred_train = svc.predict(
+								X_app[train_index, :][:, train_index])
+						y_pred_valid = svc.predict(
+								X_app[valid_index, :][:, train_index])
+						y_pred_test = svc.predict(
+								X_test[:, train_index])
+
+						# accuracy scores
+						current_train_perf.append(
+								accuracy_score(y_app[train_index],
+											   y_pred_train))
+						current_valid_perf.append(
+								accuracy_score(y_app[valid_index],
+											   y_pred_valid))
+						current_test_perf.append(
+								accuracy_score(y_test, y_pred_test))
+#				except ValueError:
+#					print(sys.exc_info()[0])
+#					print(params_out, params_in)
+
+				# average performance on inner splits
+				train_pref[index_out][index_in] = np.mean(
+						current_train_perf)
+				val_pref[index_out][index_in] = np.mean(
+						current_valid_perf)
+				test_pref[index_out][index_in] = np.mean(
+						current_test_perf)
+#				print(time.time() - st)
+#		if trial == 0:
+#			print('val_pref: ', val_pref)
+#			print('test_pref: ', test_pref)
+
+		return train_pref, val_pref, test_pref
+
+
+	def _parallel_trial_do(self, param_list_pre_revised, param_list, y, model_type, trial):
+		train_pref, val_pref, test_pref = self.trial_do(param_list_pre_revised,
+														param_list, G_gms, y,
+														model_type, trial)
+		return train_pref, val_pref, test_pref
+
+
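+	# printResultsInTable merges each pair of outer/inner parameter settings
+	# into one row and tabulates the train/valid/test scores (mean±std over
+	# trials) together with the gram matrix computation times, using the
+	# tabulate package.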
+	def printResultsInTable(self, param_list, param_list_pre_revised, average_val_scores,
+							std_val_scores, average_perf_scores, std_perf_scores,
+							average_train_scores, std_train_scores, gram_matrix_time,
+							model_type, verbose):
+		from collections import OrderedDict
+		from tabulate import tabulate
+		table_dict = {}
+		if model_type == 'regression':
+			for param_in in param_list:
+				param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
+		else:
+			for param_in in param_list:
+				param_in['C'] = '{:.2e}'.format(param_in['C'])
+		table_dict['params'] = [{**param_out, **param_in}
+								for param_in in param_list for param_out in param_list_pre_revised]
+		table_dict['gram_matrix_time'] = [
+				'{:.2f}'.format(gram_matrix_time[index_out])
+				for param_in in param_list
+				for index_out, _ in enumerate(param_list_pre_revised)
+		]
+		table_dict['valid_perf'] = [
+				'{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
+									   std_val_scores[index_out][index_in])
+				for index_in, _ in enumerate(param_list)
+				for index_out, _ in enumerate(param_list_pre_revised)
+		]
+		table_dict['test_perf'] = [
+				'{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
+									   std_perf_scores[index_out][index_in])
+				for index_in, _ in enumerate(param_list)
+				for index_out, _ in enumerate(param_list_pre_revised)
+		]
+		table_dict['train_perf'] = [
+				'{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
+									   std_train_scores[index_out][index_in])
+				for index_in, _ in enumerate(param_list)
+				for index_out, _ in enumerate(param_list_pre_revised)
+		]
+
+		keyorder = [
+				'params', 'train_perf', 'valid_perf', 'test_perf',
+				'gram_matrix_time'
+		]
+		if verbose:
+			print()
+		tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
+											   key=lambda i: keyorder.index(i[0]))), headers='keys')
+#		print(tb_print)
+		return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
\ No newline at end of file
diff --git a/gklearn/model_learning/workflow.py b/gklearn/model_learning/workflow.py
new file mode 100644
index 0000000..7f1be1d
--- /dev/null
+++ b/gklearn/model_learning/workflow.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov 27 19:33:51 2020
+
+@author: ljia
+"""
+import os
+import numpy as np
+import pickle
+from gklearn.dataset import Dataset
+from gklearn.model_learning import NestedCV
+from gklearn.kernels import GRAPH_KERNELS
+
+class Workflow(object):
+
+
+	def __init__(self, **kwargs):
+		self._job_prefix = kwargs.get('job_prefix', 'gktask')
+		self._max_num_running_tasks = kwargs.get('max_num_running_tasks', np.inf)
+		self._root_dir = kwargs.get('root_dir', 'outputs/')
+
+
+	def run(self, tasks):
+		### Check inputs.
+		if self._check_inputs(tasks):
+			self._tasks = tasks
+		else:
+			raise ValueError('The input "tasks" is not correct.')
+
+
+		### Sort tasks.
+		self.sort_tasks_by_complexity()
+
+
+		### The main process.
+		complete = False
+		while not complete:
+
+			self.get_running_tasks()
+
+			if self._num_running_tasks < self._max_num_running_tasks:
+
+				### Load results from table.
+				self.load_results_from_table()
+
+				for task in self._tasks:
+					state = self.get_task_state(task)
+					if state != 'complete' and state != 'running':
+						self.run_task(task)
+
+					if self._num_running_tasks >= self._max_num_running_tasks:
+						break
+
+				### Save results.
+				self.save_results()  # @todo: not implemented yet.
+
+			complete = self.check_completeness()  # @todo: not implemented yet.
+
+#			sleep()
+
+
+	def _check_inputs(self, tasks):
+		if not isinstance(tasks, list):
+			return False
+		else:
+			for i in tasks:
+				if 'kernel' not in i or 'dataset' not in i:
+					return False
+		return True
+
+
+	def sort_tasks_by_complexity(self):
+		return
+
+
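+	# Running tasks are discovered by querying the job scheduler: this
+	# assumes jobs are submitted through SLURM, whose squeue command lists
+	# the current user's jobs; the job-name prefix set in __init__ marks the
+	# jobs that belong to this workflow.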
+	def get_running_tasks(self):
+		command = 'squeue --user $USER --format "%.50j" --noheader'
+		stream = os.popen(command)
+		output = stream.readlines()
+		running_tasks = [o for o in output if o.strip().startswith(self._job_prefix)]
+		self._num_running_tasks = len(running_tasks)
+
+
+	def load_results_from_table(self):
+		pass
+
+
+	def get_task_state(self, task):
+		task_dir = os.path.join(self._root_dir, task['kernel'] + '.' + task['dataset'] + '/')
+		fn_summary = os.path.join(task_dir, 'results_summary.pkl')
+		if os.path.isfile(fn_summary):
+			with open(fn_summary, 'rb') as f:
+				output = pickle.load(f)
+			state = output['state']
+			return state
+		else:
+			return 'unstarted'
+
+
+	def run_task(self, task):
+		ds_name = task['dataset']
+		k_name = task['kernel']
+
+		# Get dataset.
+		ds = Dataset(ds_name)
+		graph_kernel = GRAPH_KERNELS[k_name]
+
+		# Start CV.
+		results = NestedCV(ds, graph_kernel)
\ No newline at end of file