You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

model_selection_precomputed.py 18 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
def model_selection_for_precomputed_kernel(datafile, estimator,
                                           param_grid_precomputed, param_grid,
                                           model_type, NUM_TRIALS=30,
                                           datafile_y=None,
                                           extra_params=None):
    """Perform model selection, fitting and testing for precomputed kernels
    using nested cross-validation. Print out necessary data during the process,
    then finally the results.

    Parameters
    ----------
    datafile : string
        Path of dataset file.
    estimator : function
        Kernel function used to estimate. This function needs to return a gram
        matrix (and its computation time — see the unpacking of its return
        value below).
    param_grid_precomputed : dictionary
        Dictionary with names (string) of parameters used to calculate gram
        matrices as keys and lists of parameter settings to try as values.
        This enables searching over any sequence of parameter settings.
    param_grid : dictionary
        Dictionary with names (string) of parameters used as penalties as keys
        and lists of parameter settings to try as values. This enables
        searching over any sequence of parameter settings.
    model_type : string
        Type of the problem, can be regression or classification.
    NUM_TRIALS : integer
        Number of random trials of the outer cv loop. The default is 30.
    datafile_y : string
        Path of file storing y data. This parameter is optional depending on
        the given dataset file.
    extra_params : dictionary
        Extra keyword parameters forwarded to the dataset loader. Optional.

    Examples
    --------
    >>> import numpy as np
    >>> import sys
    >>> sys.path.insert(0, "../")
    >>> from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
    >>> from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel
    >>>
    >>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'
    >>> estimator = weisfeilerlehmankernel
    >>> param_grid_precomputed = {'height': [0,1,2,3,4,5,6,7,8,9,10], 'base_kernel': ['subtree']}
    >>> param_grid = {"alpha": np.logspace(-2, 2, num = 10, base = 10)}
    >>>
    >>> model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, 'regression')
    """
    # Function-local imports: this function is self-contained so it can be
    # used directly from a notebook without touching module-level imports.
    import numpy as np
    from matplotlib import pyplot as plt
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score, mean_squared_error
    from sklearn.model_selection import KFold, train_test_split, ParameterGrid
    import sys
    sys.path.insert(0, "../")
    import os
    from os.path import basename
    from pygraph.utils.graphfiles import loadDataset
    from tqdm import tqdm
    # Work around a tqdm monitor-thread issue; disables the monitor interval.
    tqdm.monitor_interval = 0

    # All result files are written under a per-estimator directory, prefixed
    # with the dataset file's base name.
    results_dir = '../notebooks/results/' + estimator.__name__
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    results_name_pre = results_dir + '/' + basename(datafile) + '_'

    # setup the model type
    model_type = model_type.lower()
    if model_type != 'regression' and model_type != 'classification':
        raise Exception(
            'The model type is incorrect! Please choose from regression or classification.')
    print()
    print('--- This is a %s problem ---' % model_type)

    # Load the dataset
    print()
    print('1. Loading dataset from file...')
    dataset, y = loadDataset(datafile, filename_y=datafile_y, extra_params=extra_params)

    # Grid of parameters with a discrete number of values for each.
    param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
    param_list = list(ParameterGrid(param_grid))

    gram_matrices = []  # a list to store gram matrices for all param_grid_precomputed
    gram_matrix_time = []  # a list to store time to calculate gram matrices
    param_list_pre_revised = []  # list to store param grids precomputed ignoring the useless ones

    # calculate all gram matrices
    print()
    print('2. Calculating gram matrices. This could take a while...')
    nb_gm_ignore = 0  # the number of gram matrices those should not be considered, as they may contain elements that are not numbers (NaN)
    for params_out in param_list_precomputed:
        print()
        if params_out != {}:
            print('gram matrix with parameters', params_out, 'is: ')
        # NOTE(review): estimator is assumed to return (gram_matrix, run_time)
        # — confirm against the kernel implementations.
        Kmatrix, current_run_time = estimator(dataset, **params_out)
        # Cosine-style normalization of the gram matrix using its diagonal:
        # K[i][j] /= sqrt(K[i][i] * K[j][j]); produces NaN where a diagonal
        # entry is zero, which is caught by the isnan check below.
        Kmatrix_diag = Kmatrix.diagonal().copy()
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                Kmatrix[j][i] = Kmatrix[i][j]
        if np.isnan(Kmatrix).any():  # if the matrix contains elements that are not numbers
            nb_gm_ignore += 1
            print('ignored, as it contains elements that are not numbers.')
        else:
            print(Kmatrix)
            # Visualize and save a heat map of the (kept) gram matrix.
            plt.matshow(Kmatrix)
            plt.colorbar()
            fig_name_suffix = '_'.join(['{}-{}'.format(key, val)
                                        for key, val in sorted(params_out.items())])
            plt.savefig(
                results_name_pre + 'gram_matrix_{}.png'.format(fig_name_suffix))
            plt.show()
            gram_matrices.append(Kmatrix)
            gram_matrix_time.append(current_run_time)
            param_list_pre_revised.append(params_out)
    # NOTE(review): np.save appends '.npy' to names without that extension, so
    # these '.dt' names produce '*.dt.npy' files on disk — confirm consumers
    # expect that.
    np.save(results_name_pre + 'gram_matrices.dt', gram_matrices)
    np.save(results_name_pre + 'param_list_precomputed.dt', param_list_pre_revised)
    np.save(results_name_pre + 'param_list.dt', param_list)
    print()
    print('{} gram matrices are calculated, {} of which are ignored.'.format(len(param_list_precomputed), nb_gm_ignore))

    print()
    print('3. Fitting and predicting using nested cross validation. This could really take a while...')
    # Arrays to store scores: trial x outer (gram-matrix) params x inner
    # (learner) params.
    train_pref = np.zeros(
        (NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
    val_pref = np.zeros(
        (NUM_TRIALS, len(param_list_pre_revised), len(param_list)))
    test_pref = np.zeros(
        (NUM_TRIALS, len(param_list_pre_revised), len(param_list)))

    # Loop for each trial
    pbar = tqdm(total=NUM_TRIALS * len(param_list_pre_revised) * len(param_list),
                desc='calculate performance', file=sys.stdout)
    for trial in range(NUM_TRIALS):  # Test set level
        # loop for each outer param tuple
        for index_out, params_out in enumerate(param_list_pre_revised):
            # split gram matrix and y to app and test sets.
            X_app, X_test, y_app, y_test = train_test_split(
                gram_matrices[index_out], y, test_size=0.1)
            # Recover which columns of the gram matrix belong to the app set by
            # looking up each y value's position in y.
            # NOTE(review): y.index(y_i) returns the FIRST occurrence, so this
            # picks wrong/duplicated columns whenever y contains duplicate
            # values — verify y values are unique for the datasets used.
            split_index_app = [y.index(y_i) for y_i in y_app if y_i in y]
            X_app = X_app[:, split_index_app]
            X_test = X_test[:, split_index_app]
            y_app = np.array(y_app)
            y_test = np.array(y_test)
            # loop for each inner param tuple
            for index_in, params_in in enumerate(param_list):
                # random_state=trial makes the inner folds reproducible per
                # trial while differing across trials.
                inner_cv = KFold(n_splits=10, shuffle=True, random_state=trial)
                current_train_perf = []
                current_valid_perf = []
                current_test_perf = []
                # For regression use the Kernel Ridge method
                try:
                    if model_type == 'regression':
                        KR = KernelRidge(kernel='precomputed', **params_in)
                        # loop for each split on validation set level
                        # validation set level
                        for train_index, valid_index in inner_cv.split(X_app):
                            # Sub-select the precomputed kernel rows/columns
                            # for the current training fold.
                            KR.fit(X_app[train_index, :]
                                   [:, train_index], y_app[train_index])
                            # predict on the train, validation and test set
                            y_pred_train = KR.predict(
                                X_app[train_index, :][:, train_index])
                            y_pred_valid = KR.predict(
                                X_app[valid_index, :][:, train_index])
                            y_pred_test = KR.predict(X_test[:, train_index])
                            # root mean squared errors
                            current_train_perf.append(
                                np.sqrt(mean_squared_error(y_app[train_index], y_pred_train)))
                            current_valid_perf.append(
                                np.sqrt(mean_squared_error(y_app[valid_index], y_pred_valid)))
                            current_test_perf.append(
                                np.sqrt(mean_squared_error(y_test, y_pred_test)))
                    # For classification use SVM
                    else:
                        KR = SVC(kernel='precomputed', **params_in)
                        # loop for each split on validation set level
                        # validation set level
                        for train_index, valid_index in inner_cv.split(X_app):
                            KR.fit(X_app[train_index, :]
                                   [:, train_index], y_app[train_index])
                            # predict on the train, validation and test set
                            y_pred_train = KR.predict(
                                X_app[train_index, :][:, train_index])
                            y_pred_valid = KR.predict(
                                X_app[valid_index, :][:, train_index])
                            y_pred_test = KR.predict(
                                X_test[:, train_index])
                            # accuracies (not RMSE) for classification
                            current_train_perf.append(accuracy_score(
                                y_app[train_index], y_pred_train))
                            current_valid_perf.append(accuracy_score(
                                y_app[valid_index], y_pred_valid))
                            current_test_perf.append(
                                accuracy_score(y_test, y_pred_test))
                except ValueError:
                    # NOTE(review): if fitting fails, the per-fold lists may be
                    # empty and np.mean([]) below yields NaN (with a warning)
                    # for this cell — confirm this is the intended behavior.
                    print(sys.exc_info()[0])
                    print(params_out, params_in)

                # average performance on inner splits
                train_pref[trial][index_out][index_in] = np.mean(
                    current_train_perf)
                val_pref[trial][index_out][index_in] = np.mean(
                    current_valid_perf)
                test_pref[trial][index_out][index_in] = np.mean(
                    current_test_perf)
                pbar.update(1)
    pbar.clear()
    np.save(results_name_pre + 'train_pref.dt', train_pref)
    np.save(results_name_pre + 'val_pref.dt', val_pref)
    np.save(results_name_pre + 'test_pref.dt', test_pref)

    print()
    print('4. Getting final performances...')
    # averages and confidences of performances on outer trials for each combination of parameters
    average_train_scores = np.mean(train_pref, axis=0)
    average_val_scores = np.mean(val_pref, axis=0)
    average_perf_scores = np.mean(test_pref, axis=0)
    # sample std is used here
    std_train_scores = np.std(train_pref, axis=0, ddof=1)
    std_val_scores = np.std(val_pref, axis=0, ddof=1)
    std_perf_scores = np.std(test_pref, axis=0, ddof=1)

    # Model selection on validation score: lower is better for RMSE
    # (regression), higher is better for accuracy (classification).
    if model_type == 'regression':
        best_val_perf = np.amin(average_val_scores)
    else:
        best_val_perf = np.amax(average_val_scores)
    best_params_index = np.where(average_val_scores == best_val_perf)
    best_params_out = [param_list_pre_revised[i] for i in best_params_index[0]]
    best_params_in = [param_list[i] for i in best_params_index[1]]
    print('best_params_out: ', best_params_out)
    print('best_params_in: ', best_params_in)
    print()
    print('best_val_perf: ', best_val_perf)

    # below: only find one performance; multiple perf might exist (ties are
    # resolved by taking the first index pair).
    best_val_std = std_val_scores[best_params_index[0]
                                  [0]][best_params_index[1][0]]
    print('best_val_std: ', best_val_std)

    final_performance = average_perf_scores[best_params_index[0]
                                            [0]][best_params_index[1][0]]
    final_confidence = std_perf_scores[best_params_index[0]
                                       [0]][best_params_index[1][0]]
    print('final_performance: ', final_performance)
    print('final_confidence: ', final_confidence)
    train_performance = average_train_scores[best_params_index[0]
                                             [0]][best_params_index[1][0]]
    train_std = std_train_scores[best_params_index[0]
                                 [0]][best_params_index[1][0]]
    print('train_performance: ', train_performance)
    print('train_std: ', train_std)

    print()
    average_gram_matrix_time = np.mean(gram_matrix_time)
    std_gram_matrix_time = np.std(gram_matrix_time, ddof=1)
    best_gram_matrix_time = gram_matrix_time[best_params_index[0][0]]
    print('time to calculate gram matrix with different hyperpapams: {:.2f}±{:.2f}'
          .format(average_gram_matrix_time, std_gram_matrix_time))
    print('time to calculate best gram matrix: ', best_gram_matrix_time, 's')

    # save results to file
    np.savetxt(results_name_pre + 'average_train_scores.dt',
               average_train_scores)
    # NOTE(review): filename below is missing the '.dt' suffix used by its
    # siblings — verify whether downstream code depends on either name.
    np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
    np.savetxt(results_name_pre + 'average_perf_scores.dt',
               average_perf_scores)
    np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
    np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
    np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)

    np.save(results_name_pre + 'best_params_index', best_params_index)
    np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
    np.save(results_name_pre + 'best_params_in.dt', best_params_in)
    np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
    np.save(results_name_pre + 'best_val_std.dt', best_val_std)
    np.save(results_name_pre + 'final_performance.dt', final_performance)
    np.save(results_name_pre + 'final_confidence.dt', final_confidence)
    np.save(results_name_pre + 'train_performance.dt', train_performance)
    np.save(results_name_pre + 'train_std.dt', train_std)

    np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
    np.save(results_name_pre + 'average_gram_matrix_time.dt',
            average_gram_matrix_time)
    np.save(results_name_pre + 'std_gram_matrix_time.dt',
            std_gram_matrix_time)
    np.save(results_name_pre + 'best_gram_matrix_time.dt',
            best_gram_matrix_time)

    # print out as table.
    from collections import OrderedDict
    from tabulate import tabulate
    table_dict = {}
    # Pretty-print the penalty hyperparameter in scientific notation.
    # NOTE(review): this mutates the dicts inside param_list in place.
    if model_type == 'regression':
        for param_in in param_list:
            param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
    else:
        for param_in in param_list:
            param_in['C'] = '{:.2e}'.format(param_in['C'])
    # One table row per (inner, outer) parameter combination; the outer
    # (gram-matrix) parameters vary fastest.
    table_dict['params'] = [{**param_out, **param_in}
                            for param_in in param_list for param_out in param_list_pre_revised]
    table_dict['gram_matrix_time'] = ['{:.2f}'.format(gram_matrix_time[index_out])
                                      for param_in in param_list for index_out, _ in enumerate(param_list_pre_revised)]
    table_dict['valid_perf'] = ['{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in], std_val_scores[index_out][index_in])
                                for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
    table_dict['test_perf'] = ['{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in], std_perf_scores[index_out][index_in])
                               for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
    table_dict['train_perf'] = ['{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in], std_train_scores[index_out][index_in])
                                for index_in, _ in enumerate(param_list) for index_out, _ in enumerate(param_list_pre_revised)]
    keyorder = ['params', 'train_perf', 'valid_perf',
                'test_perf', 'gram_matrix_time']
    print()
    print(tabulate(OrderedDict(sorted(table_dict.items(),
                                      key=lambda i: keyorder.index(i[0]))), headers='keys'))
    np.save(results_name_pre + 'results_vs_params.dt', table_dict)

A Python package for graph kernels, graph edit distances and the graph pre-image problem.