You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

model_selection_precomputed.py 36 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712
  1. import numpy as np
  2. from matplotlib import pyplot as plt
  3. from sklearn.kernel_ridge import KernelRidge
  4. from sklearn.svm import SVC
  5. from sklearn.metrics import accuracy_score, mean_squared_error
  6. from sklearn.model_selection import KFold, train_test_split, ParameterGrid
  7. #from joblib import Parallel, delayed
  8. from multiprocessing import Pool
  9. from functools import partial
  10. import sys
  11. sys.path.insert(0, "../")
  12. import os
  13. import time
  14. import datetime
  15. #from os.path import basename, splitext
  16. from pygraph.utils.graphfiles import loadDataset
  17. from tqdm import tqdm
  18. def model_selection_for_precomputed_kernel(datafile,
  19. estimator,
  20. param_grid_precomputed,
  21. param_grid,
  22. model_type,
  23. NUM_TRIALS=30,
  24. datafile_y=None,
  25. extra_params=None,
  26. ds_name='ds-unknown',
  27. n_jobs=1,
  28. read_gm_from_file=False):
  29. """Perform model selection, fitting and testing for precomputed kernels using nested cv. Print out neccessary data during the process then finally the results.
  30. Parameters
  31. ----------
  32. datafile : string
  33. Path of dataset file.
  34. estimator : function
  35. kernel function used to estimate. This function needs to return a gram matrix.
  36. param_grid_precomputed : dictionary
  37. Dictionary with names (string) of parameters used to calculate gram matrices as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
  38. param_grid : dictionary
  39. Dictionary with names (string) of parameters used as penelties as keys and lists of parameter settings to try as values. This enables searching over any sequence of parameter settings. Params with length 1 will be omitted.
  40. model_type : string
  41. Typr of the problem, can be regression or classification.
  42. NUM_TRIALS : integer
  43. Number of random trials of outer cv loop. The default is 30.
  44. datafile_y : string
  45. Path of file storing y data. This parameter is optional depending on the given dataset file.
  46. read_gm_from_file : boolean
  47. Whether gram matrices are loaded from file.
  48. Examples
  49. --------
  50. >>> import numpy as np
  51. >>> import sys
  52. >>> sys.path.insert(0, "../")
  53. >>> from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
  54. >>> from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
  55. >>>
  56. >>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'
  57. >>> estimator = weisfeilerlehmankernel
  58. >>> param_grid_precomputed = {'height': [0,1,2,3,4,5,6,7,8,9,10], 'base_kernel': ['subtree']}
  59. >>> param_grid = {"alpha": np.logspace(-2, 2, num = 10, base = 10)}
  60. >>>
  61. >>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression')
  62. """
  63. tqdm.monitor_interval = 0
  64. results_dir = '../notebooks/results/' + estimator.__name__
  65. # a string to save all the results.
  66. str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
  67. str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'
  68. # setup the model type
  69. model_type = model_type.lower()
  70. if model_type != 'regression' and model_type != 'classification':
  71. raise Exception(
  72. 'The model type is incorrect! Please choose from regression or classification.'
  73. )
  74. print()
  75. print('--- This is a %s problem ---' % model_type)
  76. str_fw += 'This is a %s problem.\n' % model_type
  77. # calculate gram matrices rather than read them from file.
  78. if read_gm_from_file == False:
  79. # Load the dataset
  80. print()
  81. print('\n1. Loading dataset from file...')
  82. dataset, y = loadDataset(
  83. datafile, filename_y=datafile_y, extra_params=extra_params)
  84. # import matplotlib.pyplot as plt
  85. # import networkx as nx
  86. # nx.draw_networkx(dataset[30])
  87. # plt.show()
  88. # Grid of parameters with a discrete number of values for each.
  89. param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
  90. param_list = list(ParameterGrid(param_grid))
  91. gram_matrices = [
  92. ] # a list to store gram matrices for all param_grid_precomputed
  93. gram_matrix_time = [
  94. ] # a list to store time to calculate gram matrices
  95. param_list_pre_revised = [
  96. ] # list to store param grids precomputed ignoring the useless ones
  97. # calculate all gram matrices
  98. print()
  99. print('2. Calculating gram matrices. This could take a while...')
  100. str_fw += '\nII. Gram matrices.\n\n'
  101. tts = time.time() # start training time
  102. nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
  103. for idx, params_out in enumerate(param_list_precomputed):
  104. params_out['n_jobs'] = n_jobs
  105. rtn_data = estimator(dataset, **params_out)
  106. Kmatrix = rtn_data[0]
  107. current_run_time = rtn_data[1]
  108. # for some kernels, some graphs in datasets may not meet the
  109. # kernels' requirements for graph structure. These graphs are trimmed.
  110. if len(rtn_data) == 3:
  111. idx_trim = rtn_data[2] # the index of trimmed graph list
  112. y = [y[idx] for idx in idx_trim] # trim y accordingly
  113. Kmatrix_diag = Kmatrix.diagonal().copy()
  114. # remove graphs whose kernels with themselves are zeros
  115. nb_g_ignore = 0
  116. for idx, diag in enumerate(Kmatrix_diag):
  117. if diag == 0:
  118. Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
  119. Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
  120. nb_g_ignore += 1
  121. # normalization
  122. for i in range(len(Kmatrix)):
  123. for j in range(i, len(Kmatrix)):
  124. Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
  125. Kmatrix[j][i] = Kmatrix[i][j]
  126. print()
  127. if params_out == {}:
  128. print('the gram matrix is: ')
  129. str_fw += 'the gram matrix is:\n\n'
  130. else:
  131. print('the gram matrix with parameters', params_out, 'is: ')
  132. str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
  133. if len(Kmatrix) < 2:
  134. nb_gm_ignore += 1
  135. print('ignored, as at most only one of all its diagonal value is non-zero.')
  136. str_fw += 'ignored, as at most only one of all its diagonal value is non-zero.\n\n'
  137. else:
  138. if np.isnan(Kmatrix).any(
  139. ): # if the matrix contains elements that are not numbers
  140. nb_gm_ignore += 1
  141. print('ignored, as it contains elements that are not numbers.')
  142. str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
  143. else:
  144. print(Kmatrix)
  145. str_fw += np.array2string(
  146. Kmatrix,
  147. separator=',') + '\n\n'
  148. # separator=',',
  149. # threshold=np.inf,
  150. # floatmode='unique') + '\n\n'
  151. fig_file_name = results_dir + '/GM[ds]' + ds_name
  152. if params_out != {}:
  153. fig_file_name += '[params]' + str(idx)
  154. plt.imshow(Kmatrix)
  155. plt.colorbar()
  156. plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
  157. plt.show()
  158. gram_matrices.append(Kmatrix)
  159. gram_matrix_time.append(current_run_time)
  160. param_list_pre_revised.append(params_out)
  161. if nb_g_ignore > 0:
  162. print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
  163. str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
  164. print()
  165. print(
  166. '{} gram matrices are calculated, {} of which are ignored.'.format(
  167. len(param_list_precomputed), nb_gm_ignore))
  168. str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
  169. str_fw += 'serial numbers of gram matrix figures and their corresponding parameters settings:\n\n'
  170. str_fw += ''.join([
  171. '{}: {}\n'.format(idx, params_out)
  172. for idx, params_out in enumerate(param_list_precomputed)
  173. ])
  174. print()
  175. if len(gram_matrices) == 0:
  176. print('all gram matrices are ignored, no results obtained.')
  177. str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
  178. else:
  179. # save gram matrices to file.
  180. np.savez(results_dir + '/' + ds_name + '.gm',
  181. gms=gram_matrices, params=param_list_pre_revised, y=y,
  182. gmtime=gram_matrix_time)
  183. print(
  184. '3. Fitting and predicting using nested cross validation. This could really take a while...'
  185. )
  186. pool = Pool(n_jobs)
  187. trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
  188. train_pref = []
  189. val_pref = []
  190. test_pref = []
  191. if NUM_TRIALS < 100:
  192. chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
  193. if extra:
  194. chunksize += 1
  195. else:
  196. chunksize = 100
  197. for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
  198. train_pref.append(o1)
  199. val_pref.append(o2)
  200. test_pref.append(o3)
  201. pool.close()
  202. pool.join()
  203. # # ---- use pool.map to parallel. ----
  204. # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
  205. # train_pref = [item[0] for item in result_perf]
  206. # val_pref = [item[1] for item in result_perf]
  207. # test_pref = [item[2] for item in result_perf]
  208. # # ---- use joblib.Parallel to parallel and track progress. ----
  209. # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
  210. # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
  211. # train_pref = [item[0] for item in result_perf]
  212. # val_pref = [item[1] for item in result_perf]
  213. # test_pref = [item[2] for item in result_perf]
  214. # # ---- direct running, normally use a single CPU core. ----
  215. # train_pref = []
  216. # val_pref = []
  217. # test_pref = []
  218. # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
  219. # o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
  220. # train_pref.append(o1)
  221. # val_pref.append(o2)
  222. # test_pref.append(o3)
  223. print()
  224. print('4. Getting final performance...')
  225. str_fw += '\nIII. Performance.\n\n'
  226. # averages and confidences of performances on outer trials for each combination of parameters
  227. average_train_scores = np.mean(train_pref, axis=0)
  228. average_val_scores = np.mean(val_pref, axis=0)
  229. average_perf_scores = np.mean(test_pref, axis=0)
  230. # sample std is used here
  231. std_train_scores = np.std(train_pref, axis=0, ddof=1)
  232. std_val_scores = np.std(val_pref, axis=0, ddof=1)
  233. std_perf_scores = np.std(test_pref, axis=0, ddof=1)
  234. if model_type == 'regression':
  235. best_val_perf = np.amin(average_val_scores)
  236. else:
  237. best_val_perf = np.amax(average_val_scores)
  238. best_params_index = np.where(average_val_scores == best_val_perf)
  239. # find smallest val std with best val perf.
  240. best_val_stds = [
  241. std_val_scores[value][best_params_index[1][idx]]
  242. for idx, value in enumerate(best_params_index[0])
  243. ]
  244. min_val_std = np.amin(best_val_stds)
  245. best_params_index = np.where(std_val_scores == min_val_std)
  246. best_params_out = [
  247. param_list_pre_revised[i] for i in best_params_index[0]
  248. ]
  249. best_params_in = [param_list[i] for i in best_params_index[1]]
  250. print('best_params_out: ', best_params_out)
  251. print('best_params_in: ', best_params_in)
  252. print()
  253. print('best_val_perf: ', best_val_perf)
  254. print('best_val_std: ', min_val_std)
  255. str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
  256. str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
  257. str_fw += 'best_val_perf: %s\n' % best_val_perf
  258. str_fw += 'best_val_std: %s\n' % min_val_std
  259. final_performance = [
  260. average_perf_scores[value][best_params_index[1][idx]]
  261. for idx, value in enumerate(best_params_index[0])
  262. ]
  263. final_confidence = [
  264. std_perf_scores[value][best_params_index[1][idx]]
  265. for idx, value in enumerate(best_params_index[0])
  266. ]
  267. print('final_performance: ', final_performance)
  268. print('final_confidence: ', final_confidence)
  269. str_fw += 'final_performance: %s\n' % final_performance
  270. str_fw += 'final_confidence: %s\n' % final_confidence
  271. train_performance = [
  272. average_train_scores[value][best_params_index[1][idx]]
  273. for idx, value in enumerate(best_params_index[0])
  274. ]
  275. train_std = [
  276. std_train_scores[value][best_params_index[1][idx]]
  277. for idx, value in enumerate(best_params_index[0])
  278. ]
  279. print('train_performance: %s' % train_performance)
  280. print('train_std: ', train_std)
  281. str_fw += 'train_performance: %s\n' % train_performance
  282. str_fw += 'train_std: %s\n\n' % train_std
  283. print()
  284. tt_total = time.time() - tts # training time for all hyper-parameters
  285. average_gram_matrix_time = np.mean(gram_matrix_time)
  286. std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
  287. best_gram_matrix_time = [
  288. gram_matrix_time[i] for i in best_params_index[0]
  289. ]
  290. ave_bgmt = np.mean(best_gram_matrix_time)
  291. std_bgmt = np.std(best_gram_matrix_time, ddof=1)
  292. print(
  293. 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
  294. .format(average_gram_matrix_time, std_gram_matrix_time))
  295. print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
  296. ave_bgmt, std_bgmt))
  297. print(
  298. 'total training time with all hyper-param choices: {:.2f}s'.format(
  299. tt_total))
  300. str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
  301. str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
  302. str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)
  303. # # save results to file
  304. # np.savetxt(results_name_pre + 'average_train_scores.dt',
  305. # average_train_scores)
  306. # np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
  307. # np.savetxt(results_name_pre + 'average_perf_scores.dt',
  308. # average_perf_scores)
  309. # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
  310. # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
  311. # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
  312. # np.save(results_name_pre + 'best_params_index', best_params_index)
  313. # np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
  314. # np.save(results_name_pre + 'best_params_in.dt', best_params_in)
  315. # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
  316. # np.save(results_name_pre + 'best_val_std.dt', best_val_std)
  317. # np.save(results_name_pre + 'final_performance.dt', final_performance)
  318. # np.save(results_name_pre + 'final_confidence.dt', final_confidence)
  319. # np.save(results_name_pre + 'train_performance.dt', train_performance)
  320. # np.save(results_name_pre + 'train_std.dt', train_std)
  321. # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
  322. # np.save(results_name_pre + 'average_gram_matrix_time.dt',
  323. # average_gram_matrix_time)
  324. # np.save(results_name_pre + 'std_gram_matrix_time.dt',
  325. # std_gram_matrix_time)
  326. # np.save(results_name_pre + 'best_gram_matrix_time.dt',
  327. # best_gram_matrix_time)
  328. # print out as table.
  329. from collections import OrderedDict
  330. from tabulate import tabulate
  331. table_dict = {}
  332. if model_type == 'regression':
  333. for param_in in param_list:
  334. param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
  335. else:
  336. for param_in in param_list:
  337. param_in['C'] = '{:.2e}'.format(param_in['C'])
  338. table_dict['params'] = [{**param_out, **param_in}
  339. for param_in in param_list for param_out in param_list_pre_revised]
  340. table_dict['gram_matrix_time'] = [
  341. '{:.2f}'.format(gram_matrix_time[index_out])
  342. for param_in in param_list
  343. for index_out, _ in enumerate(param_list_pre_revised)
  344. ]
  345. table_dict['valid_perf'] = [
  346. '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
  347. std_val_scores[index_out][index_in])
  348. for index_in, _ in enumerate(param_list)
  349. for index_out, _ in enumerate(param_list_pre_revised)
  350. ]
  351. table_dict['test_perf'] = [
  352. '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
  353. std_perf_scores[index_out][index_in])
  354. for index_in, _ in enumerate(param_list)
  355. for index_out, _ in enumerate(param_list_pre_revised)
  356. ]
  357. table_dict['train_perf'] = [
  358. '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
  359. std_train_scores[index_out][index_in])
  360. for index_in, _ in enumerate(param_list)
  361. for index_out, _ in enumerate(param_list_pre_revised)
  362. ]
  363. keyorder = [
  364. 'params', 'train_perf', 'valid_perf', 'test_perf',
  365. 'gram_matrix_time'
  366. ]
  367. print()
  368. tb_print = tabulate(
  369. OrderedDict(
  370. sorted(table_dict.items(),
  371. key=lambda i: keyorder.index(i[0]))),
  372. headers='keys')
  373. print(tb_print)
  374. str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
  375. # read gram matrices from file.
  376. else:
  377. # Grid of parameters with a discrete number of values for each.
  378. # param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
  379. param_list = list(ParameterGrid(param_grid))
  380. # read gram matrices from file.
  381. print()
  382. print('2. Reading gram matrices from file...')
  383. str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n'
  384. gmfile = np.load(results_dir + '/' + ds_name + '.gm.npz')
  385. gram_matrices = gmfile['gms'] # a list to store gram matrices for all param_grid_precomputed
  386. param_list_pre_revised = gmfile['params'] # list to store param grids precomputed ignoring the useless ones
  387. y = gmfile['y'].tolist()
  388. tts = time.time() # start training time
  389. # nb_gm_ignore = 0 # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
  390. print(
  391. '3. Fitting and predicting using nested cross validation. This could really take a while...'
  392. )
  393. pool = Pool(n_jobs)
  394. trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
  395. train_pref = []
  396. val_pref = []
  397. test_pref = []
  398. if NUM_TRIALS < 100:
  399. chunksize, extra = divmod(NUM_TRIALS, n_jobs * 4)
  400. if extra:
  401. chunksize += 1
  402. else:
  403. chunksize = 100
  404. for o1, o2, o3 in tqdm(pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout):
  405. train_pref.append(o1)
  406. val_pref.append(o2)
  407. test_pref.append(o3)
  408. pool.close()
  409. pool.join()
  410. # # ---- use pool.map to parallel. ----
  411. # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
  412. # train_pref = [item[0] for item in result_perf]
  413. # val_pref = [item[1] for item in result_perf]
  414. # test_pref = [item[2] for item in result_perf]
  415. # # ---- use joblib.Parallel to parallel and track progress. ----
  416. # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
  417. # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
  418. # train_pref = [item[0] for item in result_perf]
  419. # val_pref = [item[1] for item in result_perf]
  420. # test_pref = [item[2] for item in result_perf]
  421. # # ---- direct running, normally use a single CPU core. ----
  422. # train_pref = []
  423. # val_pref = []
  424. # test_pref = []
  425. # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
  426. # o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
  427. # train_pref.append(o1)
  428. # val_pref.append(o2)
  429. # test_pref.append(o3)
  430. print()
  431. print('4. Getting final performance...')
  432. str_fw += '\nIII. Performance.\n\n'
  433. # averages and confidences of performances on outer trials for each combination of parameters
  434. average_train_scores = np.mean(train_pref, axis=0)
  435. average_val_scores = np.mean(val_pref, axis=0)
  436. average_perf_scores = np.mean(test_pref, axis=0)
  437. # sample std is used here
  438. std_train_scores = np.std(train_pref, axis=0, ddof=1)
  439. std_val_scores = np.std(val_pref, axis=0, ddof=1)
  440. std_perf_scores = np.std(test_pref, axis=0, ddof=1)
  441. if model_type == 'regression':
  442. best_val_perf = np.amin(average_val_scores)
  443. else:
  444. best_val_perf = np.amax(average_val_scores)
  445. best_params_index = np.where(average_val_scores == best_val_perf)
  446. # find smallest val std with best val perf.
  447. best_val_stds = [
  448. std_val_scores[value][best_params_index[1][idx]]
  449. for idx, value in enumerate(best_params_index[0])
  450. ]
  451. min_val_std = np.amin(best_val_stds)
  452. best_params_index = np.where(std_val_scores == min_val_std)
  453. best_params_out = [
  454. param_list_pre_revised[i] for i in best_params_index[0]
  455. ]
  456. best_params_in = [param_list[i] for i in best_params_index[1]]
  457. print('best_params_out: ', best_params_out)
  458. print('best_params_in: ', best_params_in)
  459. print()
  460. print('best_val_perf: ', best_val_perf)
  461. print('best_val_std: ', min_val_std)
  462. str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
  463. str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
  464. str_fw += 'best_val_perf: %s\n' % best_val_perf
  465. str_fw += 'best_val_std: %s\n' % min_val_std
  466. final_performance = [
  467. average_perf_scores[value][best_params_index[1][idx]]
  468. for idx, value in enumerate(best_params_index[0])
  469. ]
  470. final_confidence = [
  471. std_perf_scores[value][best_params_index[1][idx]]
  472. for idx, value in enumerate(best_params_index[0])
  473. ]
  474. print('final_performance: ', final_performance)
  475. print('final_confidence: ', final_confidence)
  476. str_fw += 'final_performance: %s\n' % final_performance
  477. str_fw += 'final_confidence: %s\n' % final_confidence
  478. train_performance = [
  479. average_train_scores[value][best_params_index[1][idx]]
  480. for idx, value in enumerate(best_params_index[0])
  481. ]
  482. train_std = [
  483. std_train_scores[value][best_params_index[1][idx]]
  484. for idx, value in enumerate(best_params_index[0])
  485. ]
  486. print('train_performance: %s' % train_performance)
  487. print('train_std: ', train_std)
  488. str_fw += 'train_performance: %s\n' % train_performance
  489. str_fw += 'train_std: %s\n\n' % train_std
  490. print()
  491. tt_poster = time.time() - tts # training time with hyper-param choices who did not participate in calculation of gram matrices
  492. # average_gram_matrix_time = np.mean(gram_matrix_time)
  493. # std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
  494. # best_gram_matrix_time = [
  495. # gram_matrix_time[i] for i in best_params_index[0]
  496. # ]
  497. # ave_bgmt = np.mean(best_gram_matrix_time)
  498. # std_bgmt = np.std(best_gram_matrix_time, ddof=1)
  499. # print(
  500. # 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
  501. # .format(average_gram_matrix_time, std_gram_matrix_time))
  502. # print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
  503. # ave_bgmt, std_bgmt))
  504. print(
  505. 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s'.format(
  506. tt_poster))
  507. # str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
  508. # str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
  509. str_fw += 'training time with hyper-param choices who did not participate in calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)
  510. # print out as table.
  511. from collections import OrderedDict
  512. from tabulate import tabulate
  513. table_dict = {}
  514. if model_type == 'regression':
  515. for param_in in param_list:
  516. param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
  517. else:
  518. for param_in in param_list:
  519. param_in['C'] = '{:.2e}'.format(param_in['C'])
  520. table_dict['params'] = [{**param_out, **param_in}
  521. for param_in in param_list for param_out in param_list_pre_revised]
  522. # table_dict['gram_matrix_time'] = [
  523. # '{:.2f}'.format(gram_matrix_time[index_out])
  524. # for param_in in param_list
  525. # for index_out, _ in enumerate(param_list_pre_revised)
  526. # ]
  527. table_dict['valid_perf'] = [
  528. '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
  529. std_val_scores[index_out][index_in])
  530. for index_in, _ in enumerate(param_list)
  531. for index_out, _ in enumerate(param_list_pre_revised)
  532. ]
  533. table_dict['test_perf'] = [
  534. '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
  535. std_perf_scores[index_out][index_in])
  536. for index_in, _ in enumerate(param_list)
  537. for index_out, _ in enumerate(param_list_pre_revised)
  538. ]
  539. table_dict['train_perf'] = [
  540. '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
  541. std_train_scores[index_out][index_in])
  542. for index_in, _ in enumerate(param_list)
  543. for index_out, _ in enumerate(param_list_pre_revised)
  544. ]
  545. keyorder = [
  546. 'params', 'train_perf', 'valid_perf', 'test_perf'
  547. ]
  548. print()
  549. tb_print = tabulate(
  550. OrderedDict(
  551. sorted(table_dict.items(),
  552. key=lambda i: keyorder.index(i[0]))),
  553. headers='keys')
  554. print(tb_print)
  555. str_fw += 'table of performance v.s. hyper-params:\n\n%s\n\n' % tb_print
  556. # open file to save all results for this dataset.
  557. if not os.path.exists(results_dir):
  558. os.makedirs(results_dir)
  559. # open file to save all results for this dataset.
  560. if not os.path.exists(results_dir):
  561. os.makedirs(results_dir)
  562. if not os.path.exists(results_dir + '/' + ds_name + '.output.txt'):
  563. with open(results_dir + '/' + ds_name + '.output.txt', 'w') as f:
  564. f.write(str_fw)
  565. else:
  566. with open(results_dir + '/' + ds_name + '.output.txt', 'r+') as f:
  567. content = f.read()
  568. f.seek(0, 0)
  569. f.write(str_fw + '\n\n\n' + content)
  570. def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial): # Test set level
  571. # Arrays to store scores
  572. train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
  573. val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
  574. test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
  575. # loop for each outer param tuple
  576. for index_out, params_out in enumerate(param_list_pre_revised):
  577. # split gram matrix and y to app and test sets.
  578. indices = range(len(y))
  579. X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
  580. gram_matrices[index_out], y, indices, test_size=0.1,
  581. random_state=None, shuffle=True)
  582. X_app = X_app[:, idx_app]
  583. X_test = X_test[:, idx_app]
  584. y_app = np.array(y_app)
  585. y_test = np.array(y_test)
  586. # loop for each inner param tuple
  587. for index_in, params_in in enumerate(param_list):
  588. inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
  589. current_train_perf = []
  590. current_valid_perf = []
  591. current_test_perf = []
  592. # For regression use the Kernel Ridge method
  593. try:
  594. if model_type == 'regression':
  595. kr = KernelRidge(kernel='precomputed', **params_in)
  596. # loop for each split on validation set level
  597. # validation set level
  598. for train_index, valid_index in inner_cv.split(X_app):
  599. kr.fit(X_app[train_index, :][:, train_index],
  600. y_app[train_index])
  601. # predict on the train, validation and test set
  602. y_pred_train = kr.predict(
  603. X_app[train_index, :][:, train_index])
  604. y_pred_valid = kr.predict(
  605. X_app[valid_index, :][:, train_index])
  606. y_pred_test = kr.predict(
  607. X_test[:, train_index])
  608. # root mean squared errors
  609. current_train_perf.append(
  610. np.sqrt(
  611. mean_squared_error(
  612. y_app[train_index], y_pred_train)))
  613. current_valid_perf.append(
  614. np.sqrt(
  615. mean_squared_error(
  616. y_app[valid_index], y_pred_valid)))
  617. current_test_perf.append(
  618. np.sqrt(
  619. mean_squared_error(
  620. y_test, y_pred_test)))
  621. # For clcassification use SVM
  622. else:
  623. svc = SVC(kernel='precomputed', cache_size=200,
  624. verbose=False, **params_in)
  625. # loop for each split on validation set level
  626. # validation set level
  627. for train_index, valid_index in inner_cv.split(X_app):
  628. # np.savez("bug.npy",X_app[train_index, :][:, train_index],y_app[train_index])
  629. svc.fit(X_app[train_index, :][:, train_index],
  630. y_app[train_index])
  631. # predict on the train, validation and test set
  632. y_pred_train = svc.predict(
  633. X_app[train_index, :][:, train_index])
  634. y_pred_valid = svc.predict(
  635. X_app[valid_index, :][:, train_index])
  636. y_pred_test = svc.predict(
  637. X_test[:, train_index])
  638. # root mean squared errors
  639. current_train_perf.append(
  640. accuracy_score(y_app[train_index],
  641. y_pred_train))
  642. current_valid_perf.append(
  643. accuracy_score(y_app[valid_index],
  644. y_pred_valid))
  645. current_test_perf.append(
  646. accuracy_score(y_test, y_pred_test))
  647. except ValueError:
  648. print(sys.exc_info()[0])
  649. print(params_out, params_in)
  650. # average performance on inner splits
  651. train_pref[index_out][index_in] = np.mean(
  652. current_train_perf)
  653. val_pref[index_out][index_in] = np.mean(
  654. current_valid_perf)
  655. test_pref[index_out][index_in] = np.mean(
  656. current_test_perf)
  657. return train_pref, val_pref, test_pref

A Python package for graph kernels, graph edit distances and graph pre-image problem.