You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

nested_cv.py 29 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Nov 27 18:59:28 2020
  5. @author: ljia
  6. """
  7. import os
  8. import datetime
  9. import time
  10. import sys
  11. from tqdm import tqdm
  12. from multiprocessing import Pool, Array
  13. from functools import partial
  14. import numpy as np
  15. from matplotlib import pyplot as plt
  16. from sklearn.model_selection import KFold, train_test_split, ParameterGrid
  17. from sklearn.kernel_ridge import KernelRidge
  18. from sklearn.svm import SVC
  19. from sklearn.metrics import accuracy_score, mean_squared_error
  20. class NestedCV(object):
  21. """Perform model selection, fitting and testing for precomputed kernels
  22. using nested CV. Print out neccessary data during the process then finally
  23. the results.
  24. Parameters
  25. ----------
  26. datafile : string
  27. Path of dataset file.
  28. estimator : function
  29. kernel function used to estimate. This function needs to return a gram matrix.
  30. param_grid_precomputed : dictionary
  31. Dictionary with names (string) of parameters used to calculate gram
  32. matrices as keys and lists of parameter settings to try as values. This
  33. enables searching over any sequence of parameter settings. Params with
  34. length 1 will be omitted.
  35. param_grid : dictionary
  36. Dictionary with names (string) of parameters used as penelties as keys
  37. and lists of parameter settings to try as values. This enables
  38. searching over any sequence of parameter settings. Params with length 1
  39. will be omitted.
  40. model_type : string
  41. Type of the problem, can be 'regression' or 'classification'.
  42. NUM_TRIALS : integer
  43. Number of random trials of the outer CV loop. The default is 30.
  44. datafile_y : string
  45. Path of file storing y data. This parameter is optional depending on
  46. the given dataset file.
  47. extra_params : dict
  48. Extra parameters for loading dataset. See function gklearn.utils.
  49. graphfiles.loadDataset for detail.
  50. ds_name : string
  51. Name of the dataset.
  52. n_jobs : int
  53. Number of jobs for parallelization.
  54. read_gm_from_file : boolean
  55. Whether gram matrices are loaded from a file.
  56. Examples
  57. --------
  58. >>> import numpy as np
  59. >>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
  60. >>> from gklearn.kernels.untilHPathKernel import untilhpathkernel
  61. >>>
  62. >>> datafile = '../datasets/MUTAG/MUTAG_A.txt'
  63. >>> estimator = untilhpathkernel
  64. >>> param_grid_precomputed = {’depth’: np.linspace(1, 10, 10), ’k_func’:
  65. [’MinMax’, ’tanimoto’], ’compute_method’: [’trie’]}
  66. >>> # ’C’ for classification problems and ’alpha’ for regression problems.
  67. >>> param_grid = [{’C’: np.logspace(-10, 10, num=41, base=10)}, {’alpha’:
  68. np.logspace(-10, 10, num=41, base=10)}]
  69. >>>
  70. >>> model_selection_for_precomputed_kernel(datafile, estimator,
  71. param_grid_precomputed, param_grid[0], 'classification', ds_name=’MUTAG’)
  72. """
  73. def __init__(self, dataset, estimator, param_grid_precomputed=None, param_grid=None, model_type=None, num_trials=30, output_dir=None, n_jobs=1, save_gms=True, save_gm_figs=False, logging=True, verbose=True, **kwargs):
  74. tqdm.monitor_interval = 0
  75. self._ds = dataset
  76. self._estimator = estimator
  77. self._num_trials = num_trials
  78. self._n_jobs = n_jobs
  79. self._save_gms = save_gms
  80. self._save_gm_figs = save_gm_figs
  81. self._logging = logging
  82. self._verbose = verbose
  83. self._kwargs = kwargs
  84. # Set dataset name.
  85. if self._ds._ds_name is None:
  86. self._ds_name = 'ds-unknown'
  87. else:
  88. self._ds_name = self._ds._ds_name
  89. # The output directory.
  90. if output_dir is None:
  91. self._output_dir = os.path.join('outputs/', estimator.__name__)
  92. else:
  93. self._output_dir = output_dir
  94. os.makedirs(self._output_dir, exist_ok=True)
  95. # Setup the model type.
  96. if model_type is None:
  97. self._model_type = dataset._task_type
  98. else:
  99. self._model_type = model_type.lower()
  100. if self._model_type != 'regression' and self._model_type != 'classification':
  101. raise Exception('The model type is incorrect! Please choose from regression or classification.')
  102. # @todo: Set param_grid_precomputed and param_grid.
  103. self._param_grid_precomputed = param_grid_precomputed
  104. self._param_grid = param_grid
  105. if self._verbose:
  106. print()
  107. print('--- This is a %s problem ---' % self._model_type)
  108. # A string to save all the results.
  109. if self._logging:
  110. self._str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
  111. self._str_fw += '# This file contains results of ' + self._estimator.__name__ + ' on dataset ' + self._ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
  112. self._str_fw += 'This is a %s problem.\n' % self._model_type
  113. self.run()
  114. def run(self):
  115. self.fit()
  116. self.compute_gram_matrices()
  117. if len(self._gram_matrices) == 0:
  118. if self._verbose:
  119. print('All gram matrices are ignored, no results obtained.')
  120. if self._logging:
  121. self._str_fw += '\nAll gram matrices are ignored, no results obtained.\n\n'
  122. else:
  123. self.do_cv()
  124. # print out results as table.
  125. if self._logging:
  126. self._str_fw += self.printResultsInTable(self._param_list, self._param_list_pre_revised, self._average_val_scores, self._std_val_scores, self._average_perf_scores, self._std_perf_scores, self._average_train_scores, self._std_train_scores, self._gram_matrix_time, self._model_type, self._verbose)
  127. # open file to save all results for this dataset.
  128. if not os.path.exists(self._output_dir + '/' + self._ds_name + '.output.txt'):
  129. with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'w') as f:
  130. f.write(self._str_fw)
  131. else:
  132. with open(self._output_dir + '/' + self._ds_name + '.output.txt', 'r+') as f:
  133. content = f.read()
  134. f.seek(0, 0)
  135. f.write(self._str_fw + '\n\n\n' + content)
  136. return self._final_performance, self._final_confidence
  137. def fit(self):
  138. return
  139. def compute_gram_matrices(self):
  140. """Compute all gram matrices.
  141. Returns
  142. -------
  143. None.
  144. """
  145. # Grid of parameters with a discrete number of values for each.
  146. self._param_list_precomputed = list(ParameterGrid(self._param_grid_precomputed))
  147. self._param_list = list(ParameterGrid(self._param_grid))
  148. self._gram_matrices = [
  149. ] # a list to store gram matrices for all param_grid_precomputed
  150. self._gram_matrix_time = [
  151. ] # a list to store time to calculate gram matrices
  152. self._param_list_pre_revised = [
  153. ] # list to store param grids precomputed ignoring the useless ones
  154. if self._verbose:
  155. print()
  156. print('\n1. Computing gram matrices. This could take a while...')
  157. if self._logging:
  158. self._str_fw += '\nI. Gram matrices.\n\n'
  159. self._tts = time.time() # start training time
  160. nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
  161. for idx, params_out in enumerate(self._param_list_precomputed):
  162. y = self._ds.targets[:]
  163. params_out['n_jobs'] = self._n_jobs
  164. params_out['verbose'] = self._verbose
  165. # print(dataset)
  166. # import networkx as nx
  167. # nx.draw_networkx(dataset[1])
  168. # plt.show()
  169. rtn_data = self._estimator(self._ds.graphs[:], **params_out) # @todo: Attention! this will not copy the graphs.
  170. Kmatrix = rtn_data[0]
  171. current_run_time = rtn_data[1]
  172. # for some kernels, some graphs in datasets may not meet the
  173. # kernels' requirements for graph structure. These graphs are trimmed.
  174. if len(rtn_data) == 3:
  175. idx_trim = rtn_data[2] # the index of trimmed graph list
  176. y = [y[idxt] for idxt in idx_trim] # trim y accordingly
  177. # Kmatrix = np.random.rand(2250, 2250)
  178. # current_run_time = 0.1
  179. # remove graphs whose kernels with themselves are zeros
  180. # @todo: y not changed accordingly?
  181. Kmatrix_diag = Kmatrix.diagonal().copy()
  182. nb_g_ignore = 0
  183. for idxk, diag in enumerate(Kmatrix_diag):
  184. if diag == 0:
  185. Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
  186. Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
  187. nb_g_ignore += 1
  188. # normalization
  189. # @todo: works only for undirected graph?
  190. Kmatrix_diag = Kmatrix.diagonal().copy()
  191. for i in range(len(Kmatrix)):
  192. for j in range(i, len(Kmatrix)):
  193. Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
  194. Kmatrix[j][i] = Kmatrix[i][j]
  195. if self._verbose:
  196. print()
  197. if params_out == {}:
  198. if self._verbose:
  199. print('the gram matrix is: ')
  200. if self._logging:
  201. self._str_fw += 'the gram matrix is:\n\n'
  202. else:
  203. if self._verbose:
  204. print('the gram matrix with parameters', params_out, 'is: \n\n')
  205. if self._logging:
  206. self._str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
  207. if len(Kmatrix) < 2:
  208. nb_gm_ignore += 1
  209. if self._verbose:
  210. print('ignored, as at most only one of all its diagonal value is non-zero.')
  211. if self._logging:
  212. self._str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n'
  213. else:
  214. if np.isnan(Kmatrix).any(
  215. ): # if the matrix contains elements that are not numbers
  216. nb_gm_ignore += 1
  217. if self._verbose:
  218. print('ignored, as it contains elements that are not numbers.')
  219. if self._logging:
  220. self._str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
  221. else:
  222. # print(Kmatrix)
  223. if self._logging:
  224. self._str_fw += np.array2string(
  225. Kmatrix,
  226. separator=',') + '\n\n'
  227. # separator=',',
  228. # threshold=np.inf,
  229. # floatmode='unique') + '\n\n'
  230. # Draw and save Gram matrix figures.
  231. if self._save_gm_figs:
  232. fig_file_name = self._output_dir + '/GM[ds]' + self._ds_name
  233. if params_out != {}:
  234. fig_file_name += '[params]' + str(idx)
  235. plt.imshow(Kmatrix)
  236. plt.colorbar()
  237. plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
  238. # plt.show()
  239. plt.clf()
  240. self._gram_matrices.append(Kmatrix)
  241. self._gram_matrix_time.append(current_run_time)
  242. self._param_list_pre_revised.append(params_out)
  243. if nb_g_ignore > 0:
  244. if self._verbose:
  245. print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
  246. if self._logging:
  247. self._str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
  248. if self._verbose:
  249. print()
  250. print('{} gram matrices are calculated, {} of which are ignored.'.format(len(self._param_list_precomputed), nb_gm_ignore))
  251. if self._logging:
  252. self._str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(self._param_list_precomputed), nb_gm_ignore)
  253. self._str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
  254. self._str_fw += ''.join(['{}: {}\n'.format(idx, params_out) for idx, params_out in enumerate(self._param_list_precomputed)])
  255. def do_cv(self):
  256. # save gram matrices to file.
  257. # np.savez(output_dir + '/' + ds_name + '.gm',
  258. # gms=gram_matrices, params=param_list_pre_revised, y=y,
  259. # gmtime=gram_matrix_time)
  260. if self._verbose:
  261. print('2. Fitting and predicting using nested cross validation. This could really take a while...')
  262. # ---- use pool.imap_unordered to parallel and track progress. ----
  263. # train_pref = []
  264. # val_pref = []
  265. # test_pref = []
  266. # def func_assign(result, var_to_assign):
  267. # for idx, itm in enumerate(var_to_assign):
  268. # itm.append(result[idx])
  269. # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
  270. #
  271. # parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign,
  272. # [train_pref, val_pref, test_pref], glbv=gram_matrices,
  273. # method='imap_unordered', n_jobs=n_jobs, chunksize=1,
  274. # itr_desc='cross validation')
  275. def init_worker(gms_toshare):
  276. global G_gms
  277. G_gms = gms_toshare
  278. # gram_matrices = np.array(gram_matrices)
  279. # gms_shape = gram_matrices.shape
  280. # gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
  281. # pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))
  282. pool = Pool(processes=self._n_jobs, initializer=init_worker, initargs=(self._gram_matrices,))
  283. trial_do_partial = partial(self._parallel_trial_do, self._param_list_pre_revised, self._param_list, self._ds.targets[:], self._model_type) # @todo: maybe self._ds.targets[:] should be y.
  284. train_pref = []
  285. val_pref = []
  286. test_pref = []
  287. # if NUM_TRIALS < 1000 * n_jobs:
  288. # chunksize = int(NUM_TRIALS / n_jobs) + 1
  289. # else:
  290. # chunksize = 1000
  291. chunksize = 1
  292. if self._verbose:
  293. iterator = tqdm(pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize), desc='cross validation', file=sys.stdout)
  294. else:
  295. iterator = pool.imap_unordered(trial_do_partial, range(self._num_trials), chunksize)
  296. for o1, o2, o3 in iterator:
  297. train_pref.append(o1)
  298. val_pref.append(o2)
  299. test_pref.append(o3)
  300. pool.close()
  301. pool.join()
  302. # # ---- use pool.map to parallel. ----
  303. # pool = Pool(n_jobs)
  304. # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type)
  305. # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
  306. # train_pref = [item[0] for item in result_perf]
  307. # val_pref = [item[1] for item in result_perf]
  308. # test_pref = [item[2] for item in result_perf]
  309. # # ---- direct running, normally use a single CPU core. ----
  310. # train_pref = []
  311. # val_pref = []
  312. # test_pref = []
  313. # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
  314. # o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
  315. # train_pref.append(o1)
  316. # val_pref.append(o2)
  317. # test_pref.append(o3)
  318. # print()
  319. if self._verbose:
  320. print()
  321. print('3. Getting final performance...')
  322. if self._logging:
  323. self._str_fw += '\nII. Performance.\n\n'
  324. # averages and confidences of performances on outer trials for each combination of parameters
  325. self._average_train_scores = np.mean(train_pref, axis=0)
  326. # print('val_pref: ', val_pref[0][0])
  327. self._average_val_scores = np.mean(val_pref, axis=0)
  328. # print('test_pref: ', test_pref[0][0])
  329. self._average_perf_scores = np.mean(test_pref, axis=0)
  330. # sample std is used here
  331. self._std_train_scores = np.std(train_pref, axis=0, ddof=1)
  332. self._std_val_scores = np.std(val_pref, axis=0, ddof=1)
  333. self._std_perf_scores = np.std(test_pref, axis=0, ddof=1)
  334. if self._model_type == 'regression':
  335. best_val_perf = np.amin(self._average_val_scores)
  336. else:
  337. best_val_perf = np.amax(self._average_val_scores)
  338. # print('average_val_scores: ', self._average_val_scores)
  339. # print('best_val_perf: ', best_val_perf)
  340. # print()
  341. best_params_index = np.where(self._average_val_scores == best_val_perf)
  342. # find smallest val std with best val perf.
  343. best_val_stds = [
  344. self._std_val_scores[value][best_params_index[1][idx]]
  345. for idx, value in enumerate(best_params_index[0])
  346. ]
  347. min_val_std = np.amin(best_val_stds)
  348. best_params_index = np.where(self._std_val_scores == min_val_std)
  349. best_params_out = [self._param_list_pre_revised[i] for i in best_params_index[0]]
  350. best_params_in = [self._param_list[i] for i in best_params_index[1]]
  351. if self._verbose:
  352. print('best_params_out: ', best_params_out)
  353. print('best_params_in: ', best_params_in)
  354. print()
  355. print('best_val_perf: ', best_val_perf)
  356. print('best_val_std: ', min_val_std)
  357. if self._logging:
  358. self._str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
  359. self._str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
  360. self._str_fw += 'best_val_perf: %s\n' % best_val_perf
  361. self._str_fw += 'best_val_std: %s\n' % min_val_std
  362. # print(best_params_index)
  363. # print(best_params_index[0])
  364. # print(self._average_perf_scores)
  365. self._final_performance = [
  366. self._average_perf_scores[value][best_params_index[1][idx]]
  367. for idx, value in enumerate(best_params_index[0])
  368. ]
  369. self._final_confidence = [
  370. self._std_perf_scores[value][best_params_index[1][idx]]
  371. for idx, value in enumerate(best_params_index[0])
  372. ]
  373. if self._verbose:
  374. print('final_performance: ', self._final_performance)
  375. print('final_confidence: ', self._final_confidence)
  376. if self._logging:
  377. self._str_fw += 'final_performance: %s\n' % self._final_performance
  378. self._str_fw += 'final_confidence: %s\n' % self._final_confidence
  379. train_performance = [
  380. self._average_train_scores[value][best_params_index[1][idx]]
  381. for idx, value in enumerate(best_params_index[0])
  382. ]
  383. train_std = [
  384. self._std_train_scores[value][best_params_index[1][idx]]
  385. for idx, value in enumerate(best_params_index[0])
  386. ]
  387. if self._verbose:
  388. print('train_performance: %s' % train_performance)
  389. print('train_std: ', train_std)
  390. if self._logging:
  391. self._str_fw += 'train_performance: %s\n' % train_performance
  392. self._str_fw += 'train_std: %s\n\n' % train_std
  393. if self._verbose:
  394. print()
  395. tt_total = time.time() - self._tts # training time for all hyper-parameters
  396. average_gram_matrix_time = np.mean(self._gram_matrix_time)
  397. std_gram_matrix_time = np.std(self._gram_matrix_time, ddof=1) if len(self._gram_matrix_time) > 1 else 0
  398. best_gram_matrix_time = [self._gram_matrix_time[i] for i in best_params_index[0]]
  399. ave_bgmt = np.mean(best_gram_matrix_time)
  400. std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0
  401. if self._verbose:
  402. print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
  403. .format(average_gram_matrix_time, std_gram_matrix_time))
  404. print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
  405. ave_bgmt, std_bgmt))
  406. print('total training time with all hyper-param choices: {:.2f}s'.format(
  407. tt_total))
  408. if self._logging:
  409. self._str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
  410. self._str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
  411. self._str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)
  412. # # save results to file
  413. # np.savetxt(results_name_pre + 'average_train_scores.dt',
  414. # average_train_scores)
  415. # np.savetxt(results_name_pre + 'average_val_scores', self._average_val_scores)
  416. # np.savetxt(results_name_pre + 'average_perf_scores.dt',
  417. # average_perf_scores)
  418. # np.savetxt(results_name_pre + 'std_train_scores.dt', self._std_train_scores)
  419. # np.savetxt(results_name_pre + 'std_val_scores.dt', self._std_val_scores)
  420. # np.savetxt(results_name_pre + 'std_perf_scores.dt', self._std_perf_scores)
  421. # np.save(results_name_pre + 'best_params_index', best_params_index)
  422. # np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
  423. # np.save(results_name_pre + 'best_params_in.dt', best_params_in)
  424. # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
  425. # np.save(results_name_pre + 'best_val_std.dt', best_val_std)
  426. # np.save(results_name_pre + 'final_performance.dt', self._final_performance)
  427. # np.save(results_name_pre + 'final_confidence.dt', self._final_confidence)
  428. # np.save(results_name_pre + 'train_performance.dt', train_performance)
  429. # np.save(results_name_pre + 'train_std.dt', train_std)
  430. # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
  431. # np.save(results_name_pre + 'average_gram_matrix_time.dt',
  432. # average_gram_matrix_time)
  433. # np.save(results_name_pre + 'std_gram_matrix_time.dt',
  434. # std_gram_matrix_time)
  435. # np.save(results_name_pre + 'best_gram_matrix_time.dt',
  436. # best_gram_matrix_time)
  437. def trial_do(self, param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level
  438. # # get gram matrices from global variables.
  439. # gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')
  440. # Arrays to store scores
  441. train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
  442. val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
  443. test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
  444. # randomness added to seeds of split function below. "high" is "size" times
  445. # 10 so that at least 10 different random output will be yielded. Remove
  446. # these lines if identical outputs is required.
  447. rdm_out = np.random.RandomState(seed=None)
  448. rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
  449. size=len(param_list_pre_revised))
  450. # print(trial, rdm_seed_out_l)
  451. # print()
  452. # loop for each outer param tuple
  453. for index_out, params_out in enumerate(param_list_pre_revised):
  454. # get gram matrices from global variables.
  455. # gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
  456. # gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
  457. gm_now = gram_matrices[index_out].copy()
  458. # split gram matrix and y to app and test sets.
  459. indices = range(len(y))
  460. # The argument "random_state" in function "train_test_split" can not be
  461. # set to None, because it will use RandomState instance used by
  462. # np.random, which is possible for multiple subprocesses to inherit the
  463. # same seed if they forked at the same time, leading to identical
  464. # random variates for different subprocesses. Instead, we use "trial"
  465. # and "index_out" parameters to generate different seeds for different
  466. # trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add
  467. # randomness into seeds, so that it yields a different output every
  468. # time the program is run. To yield identical outputs every time,
  469. # remove the second line below. Same method is used to the "KFold"
  470. # function in the inner loop.
  471. rdm_seed_out = (trial + 1) * (index_out + 1)
  472. rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
  473. # print(trial, rdm_seed_out)
  474. X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
  475. gm_now, y, indices, test_size=0.1,
  476. random_state=rdm_seed_out, shuffle=True)
  477. # print(trial, idx_app, idx_test)
  478. # print()
  479. X_app = X_app[:, idx_app]
  480. X_test = X_test[:, idx_app]
  481. y_app = np.array(y_app)
  482. y_test = np.array(y_test)
  483. rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
  484. size=len(param_list))
  485. # loop for each inner param tuple
  486. for index_in, params_in in enumerate(param_list):
  487. # if trial == 0:
  488. # print(index_out, index_in)
  489. # print('params_in: ', params_in)
  490. # st = time.time()
  491. rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
  492. # print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
  493. rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
  494. # print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
  495. inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
  496. current_train_perf = []
  497. current_valid_perf = []
  498. current_test_perf = []
  499. # For regression use the Kernel Ridge method
  500. # try:
  501. if self._model_type == 'regression':
  502. kr = KernelRidge(kernel='precomputed', **params_in)
  503. # loop for each split on validation set level
  504. # validation set level
  505. for train_index, valid_index in inner_cv.split(X_app):
  506. # print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
  507. # if trial == 0:
  508. # print('train_index: ', train_index)
  509. # print('valid_index: ', valid_index)
  510. # print('idx_test: ', idx_test)
  511. # print('y_app[train_index]: ', y_app[train_index])
  512. # print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
  513. # print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
  514. kr.fit(X_app[train_index, :][:, train_index],
  515. y_app[train_index])
  516. # predict on the train, validation and test set
  517. y_pred_train = kr.predict(
  518. X_app[train_index, :][:, train_index])
  519. y_pred_valid = kr.predict(
  520. X_app[valid_index, :][:, train_index])
  521. # if trial == 0:
  522. # print('y_pred_valid: ', y_pred_valid)
  523. # print()
  524. y_pred_test = kr.predict(
  525. X_test[:, train_index])
  526. # root mean squared errors
  527. current_train_perf.append(
  528. np.sqrt(
  529. mean_squared_error(
  530. y_app[train_index], y_pred_train)))
  531. current_valid_perf.append(
  532. np.sqrt(
  533. mean_squared_error(
  534. y_app[valid_index], y_pred_valid)))
  535. # if trial == 0:
  536. # print(mean_squared_error(
  537. # y_app[valid_index], y_pred_valid))
  538. current_test_perf.append(
  539. np.sqrt(
  540. mean_squared_error(
  541. y_test, y_pred_test)))
  542. # For clcassification use SVM
  543. else:
  544. svc = SVC(kernel='precomputed', cache_size=200,
  545. verbose=False, **params_in)
  546. # loop for each split on validation set level
  547. # validation set level
  548. for train_index, valid_index in inner_cv.split(X_app):
  549. # np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index])
  550. # if trial == 0:
  551. # print('train_index: ', train_index)
  552. # print('valid_index: ', valid_index)
  553. # print('idx_test: ', idx_test)
  554. # print('y_app[train_index]: ', y_app[train_index])
  555. # print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
  556. # print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
  557. svc.fit(X_app[train_index, :][:, train_index],
  558. y_app[train_index])
  559. # predict on the train, validation and test set
  560. y_pred_train = svc.predict(
  561. X_app[train_index, :][:, train_index])
  562. y_pred_valid = svc.predict(
  563. X_app[valid_index, :][:, train_index])
  564. y_pred_test = svc.predict(
  565. X_test[:, train_index])
  566. # root mean squared errors
  567. current_train_perf.append(
  568. accuracy_score(y_app[train_index],
  569. y_pred_train))
  570. current_valid_perf.append(
  571. accuracy_score(y_app[valid_index],
  572. y_pred_valid))
  573. current_test_perf.append(
  574. accuracy_score(y_test, y_pred_test))
  575. # except ValueError:
  576. # print(sys.exc_info()[0])
  577. # print(params_out, params_in)
  578. # average performance on inner splits
  579. train_pref[index_out][index_in] = np.mean(
  580. current_train_perf)
  581. val_pref[index_out][index_in] = np.mean(
  582. current_valid_perf)
  583. test_pref[index_out][index_in] = np.mean(
  584. current_test_perf)
  585. # print(time.time() - st)
  586. # if trial == 0:
  587. # print('val_pref: ', val_pref)
  588. # print('test_pref: ', test_pref)
  589. return train_pref, val_pref, test_pref
  590. def _parallel_trial_do(self, param_list_pre_revised, param_list, y, model_type, trial):
  591. train_pref, val_pref, test_pref = self._trial_do(param_list_pre_revised,
  592. param_list, G_gms, y,
  593. model_type, trial)
  594. return train_pref, val_pref, test_pref
  595. def printResultsInTable(self, param_list, param_list_pre_revised, average_val_scores,
  596. std_val_scores, average_perf_scores, std_perf_scores,
  597. average_train_scores, std_train_scores, gram_matrix_time,
  598. model_type, verbose):
  599. from collections import OrderedDict
  600. from tabulate import tabulate
  601. table_dict = {}
  602. if model_type == 'regression':
  603. for param_in in param_list:
  604. param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
  605. else:
  606. for param_in in param_list:
  607. param_in['C'] = '{:.2e}'.format(param_in['C'])
  608. table_dict['params'] = [{**param_out, **param_in}
  609. for param_in in param_list for param_out in param_list_pre_revised]
  610. table_dict['gram_matrix_time'] = [
  611. '{:.2f}'.format(gram_matrix_time[index_out])
  612. for param_in in param_list
  613. for index_out, _ in enumerate(param_list_pre_revised)
  614. ]
  615. table_dict['valid_perf'] = [
  616. '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
  617. std_val_scores[index_out][index_in])
  618. for index_in, _ in enumerate(param_list)
  619. for index_out, _ in enumerate(param_list_pre_revised)
  620. ]
  621. table_dict['test_perf'] = [
  622. '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
  623. std_perf_scores[index_out][index_in])
  624. for index_in, _ in enumerate(param_list)
  625. for index_out, _ in enumerate(param_list_pre_revised)
  626. ]
  627. table_dict['train_perf'] = [
  628. '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
  629. std_train_scores[index_out][index_in])
  630. for index_in, _ in enumerate(param_list)
  631. for index_out, _ in enumerate(param_list_pre_revised)
  632. ]
  633. keyorder = [
  634. 'params', 'train_perf', 'valid_perf', 'test_perf',
  635. 'gram_matrix_time'
  636. ]
  637. if verbose:
  638. print()
  639. tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
  640. key=lambda i: keyorder.index(i[0]))), headers='keys')
  641. # print(tb_print)
  642. return 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print

A Python package for graph kernels, graph edit distances and graph pre-image problem.