model_selection_precomputed.py
import numpy as np
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split, ParameterGrid

#from joblib import Parallel, delayed
from multiprocessing import Pool, Array
from functools import partial
import sys
import os
import time
import datetime
#from os.path import basename, splitext
from gklearn.utils.graphfiles import loadDataset
from tqdm import tqdm

#from memory_profiler import profile

#@profile
def model_selection_for_precomputed_kernel(datafile,
                                           estimator,
                                           param_grid_precomputed,
                                           param_grid,
                                           model_type,
                                           NUM_TRIALS=30,
                                           datafile_y=None,
                                           extra_params=None,
                                           ds_name='ds-unknown',
                                           output_dir='outputs/',
                                           n_jobs=1,
                                           read_gm_from_file=False,
                                           verbose=True):
    """Perform model selection, fitting and testing for precomputed kernels
    using nested CV. Print out necessary data during the process, then finally
    the results.

    Parameters
    ----------
    datafile : string
        Path of dataset file.
    estimator : function
        Kernel function used to estimate. This function needs to return a gram matrix.
    param_grid_precomputed : dictionary
        Dictionary with names (string) of parameters used to calculate gram
        matrices as keys and lists of parameter settings to try as values. This
        enables searching over any sequence of parameter settings. Params with
        length 1 will be omitted.
    param_grid : dictionary
        Dictionary with names (string) of parameters used as penalties as keys
        and lists of parameter settings to try as values. This enables
        searching over any sequence of parameter settings. Params with length 1
        will be omitted.
    model_type : string
        Type of the problem, can be 'regression' or 'classification'.
    NUM_TRIALS : integer
        Number of random trials of the outer CV loop. The default is 30.
    datafile_y : string
        Path of file storing y data. This parameter is optional depending on
        the given dataset file.
    extra_params : dict
        Extra parameters for loading dataset. See function gklearn.utils.
        graphfiles.loadDataset for detail.
    ds_name : string
        Name of the dataset.
    output_dir : string
        Directory to save the output results. The default is 'outputs/'.
    n_jobs : int
        Number of jobs for parallelization.
    read_gm_from_file : boolean
        Whether gram matrices are loaded from a file.
    verbose : boolean
        Whether to print out progress information. The default is True.

    Examples
    --------
    >>> import numpy as np
    >>> from gklearn.utils.model_selection_precomputed import model_selection_for_precomputed_kernel
    >>> from gklearn.kernels.untilHPathKernel import untilhpathkernel
    >>>
    >>> datafile = '../datasets/MUTAG/MUTAG_A.txt'
    >>> estimator = untilhpathkernel
    >>> param_grid_precomputed = {'depth': np.linspace(1, 10, 10), 'k_func':
    ...     ['MinMax', 'tanimoto'], 'compute_method': ['trie']}
    >>> # 'C' for classification problems and 'alpha' for regression problems.
    >>> param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, {'alpha':
    ...     np.logspace(-10, 10, num=41, base=10)}]
    >>>
    >>> model_selection_for_precomputed_kernel(datafile, estimator,
    ...     param_grid_precomputed, param_grid[0], 'classification', ds_name='MUTAG')
    """
    tqdm.monitor_interval = 0

    output_dir += estimator.__name__
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # a string to save all the results.
    str_fw = '###################### log time: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '. ######################\n\n'
    str_fw += '# This file contains results of ' + estimator.__name__ + ' on dataset ' + ds_name + ',\n# including gram matrices, serial numbers for gram matrix figures and performance.\n\n'

    # setup the model type
    model_type = model_type.lower()
    if model_type != 'regression' and model_type != 'classification':
        raise Exception(
            'The model type is incorrect! Please choose from regression or classification.'
        )
    if verbose:
        print()
        print('--- This is a %s problem ---' % model_type)
    str_fw += 'This is a %s problem.\n' % model_type

    # calculate gram matrices rather than read them from file.
    if not read_gm_from_file:
        # Load the dataset
        if verbose:
            print()
            print('\n1. Loading dataset from file...')
        if isinstance(datafile, str):
            dataset, y_all = loadDataset(
                datafile, filename_y=datafile_y, extra_params=extra_params)
        else:  # load data directly from variable.
            dataset = datafile
            y_all = datafile_y

        # import matplotlib.pyplot as plt
        # import networkx as nx
        # nx.draw_networkx(dataset[30])
        # plt.show()

        # Grid of parameters with a discrete number of values for each.
        param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
        param_list = list(ParameterGrid(param_grid))

        gram_matrices = [
        ]  # a list to store gram matrices for all param_grid_precomputed
        gram_matrix_time = [
        ]  # a list to store time to calculate gram matrices
        param_list_pre_revised = [
        ]  # list to store param grids precomputed ignoring the useless ones

        # calculate all gram matrices
        if verbose:
            print()
            print('2. Calculating gram matrices. This could take a while...')
        str_fw += '\nII. Gram matrices.\n\n'
        tts = time.time()  # start training time
        nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
        for idx, params_out in enumerate(param_list_precomputed):
            y = y_all[:]
            params_out['n_jobs'] = n_jobs
            params_out['verbose'] = verbose
            # print(dataset)
            # import networkx as nx
            # nx.draw_networkx(dataset[1])
            # plt.show()
            rtn_data = estimator(dataset[:], **params_out)
            Kmatrix = rtn_data[0]
            current_run_time = rtn_data[1]
            # for some kernels, some graphs in datasets may not meet the
            # kernels' requirements for graph structure. These graphs are trimmed.
            if len(rtn_data) == 3:
                idx_trim = rtn_data[2]  # the index of trimmed graph list
                y = [y[idxt] for idxt in idx_trim]  # trim y accordingly
            # Kmatrix = np.random.rand(2250, 2250)
            # current_run_time = 0.1

            # remove graphs whose kernels with themselves are zeros
            Kmatrix_diag = Kmatrix.diagonal().copy()
            nb_g_ignore = 0
            for idxk, diag in enumerate(Kmatrix_diag):
                if diag == 0:
                    Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
                    Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
                    nb_g_ignore += 1
            # keep y consistent with the trimmed Kmatrix (addresses the former
            # "@todo: y not changed accordingly?"), so that the later splits do
            # not fail on a length mismatch.
            y = [yv for yv, diag in zip(y, Kmatrix_diag) if diag != 0]
            # normalization
            # @todo: works only for undirected graph?
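            # cosine normalization of the kernel matrix:
            # K'[i][j] = K[i][j] / sqrt(K[i][i] * K[j][j]),
            # which scales every self-kernel on the diagonal to 1.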
            Kmatrix_diag = Kmatrix.diagonal().copy()
            for i in range(len(Kmatrix)):
                for j in range(i, len(Kmatrix)):
                    Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                    Kmatrix[j][i] = Kmatrix[i][j]

            if verbose:
                print()
            if params_out == {}:
                if verbose:
                    print('the gram matrix is: ')
                str_fw += 'the gram matrix is:\n\n'
            else:
                if verbose:
                    print('the gram matrix with parameters', params_out, 'is: \n\n')
                str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
            if len(Kmatrix) < 2:
                nb_gm_ignore += 1
                if verbose:
                    print('ignored, as at most one of its diagonal values is non-zero.')
                str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
            else:
                if np.isnan(Kmatrix).any(
                ):  # if the matrix contains elements that are not numbers
                    nb_gm_ignore += 1
                    if verbose:
                        print('ignored, as it contains elements that are not numbers.')
                    str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
                else:
                    # print(Kmatrix)
                    str_fw += np.array2string(
                        Kmatrix,
                        separator=',') + '\n\n'
                    # separator=',',
                    # threshold=np.inf,
                    # floatmode='unique') + '\n\n'
                    fig_file_name = output_dir + '/GM[ds]' + ds_name
                    if params_out != {}:
                        fig_file_name += '[params]' + str(idx)
                    plt.imshow(Kmatrix)
                    plt.colorbar()
                    plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
                    # plt.show()
                    plt.clf()
                    gram_matrices.append(Kmatrix)
                    gram_matrix_time.append(current_run_time)
                    param_list_pre_revised.append(params_out)
                    if nb_g_ignore > 0:
                        if verbose:
                            print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
                        str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
        if verbose:
            print()
            print(
                '{} gram matrices are calculated, {} of which are ignored.'.format(
                    len(param_list_precomputed), nb_gm_ignore))
        str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
        str_fw += 'serial numbers of gram matrix figures and their corresponding parameter settings:\n\n'
        str_fw += ''.join([
            '{}: {}\n'.format(idx, params_out)
            for idx, params_out in enumerate(param_list_precomputed)
        ])

        if verbose:
            print()
        if len(gram_matrices) == 0:
            if verbose:
                print('all gram matrices are ignored, no results obtained.')
            str_fw += '\nall gram matrices are ignored, no results obtained.\n\n'
        else:
            # save gram matrices to file.
            # np.savez(output_dir + '/' + ds_name + '.gm',
            #          gms=gram_matrices, params=param_list_pre_revised, y=y,
            #          gmtime=gram_matrix_time)
            if verbose:
                print(
                    '3. Fitting and predicting using nested cross validation. This could really take a while...'
                )

            # ---- use pool.imap_unordered to parallel and track progress. ----
            # train_pref = []
            # val_pref = []
            # test_pref = []
            # def func_assign(result, var_to_assign):
            #     for idx, itm in enumerate(var_to_assign):
            #         itm.append(result[idx])
            # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, y, model_type)
            #
            # parallel_me(trial_do_partial, range(NUM_TRIALS), func_assign,
            #             [train_pref, val_pref, test_pref], glbv=gram_matrices,
            #             method='imap_unordered', n_jobs=n_jobs, chunksize=1,
            #             itr_desc='cross validation')
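            # init_worker runs once in each worker process and stores the gram
            # matrices in a worker-global variable (G_gms), so parallel_trial_do
            # can read them without them being re-pickled for every single task.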
            def init_worker(gms_toshare):
                global G_gms
                G_gms = gms_toshare

            # gram_matrices = np.array(gram_matrices)
            # gms_shape = gram_matrices.shape
            # gms_array = Array('d', np.reshape(gram_matrices.copy(), -1, order='C'))
            # pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gms_array, gms_shape))
            pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
            trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
            train_pref = []
            val_pref = []
            test_pref = []
            # if NUM_TRIALS < 1000 * n_jobs:
            #     chunksize = int(NUM_TRIALS / n_jobs) + 1
            # else:
            #     chunksize = 1000
            chunksize = 1
            if verbose:
                iterator = tqdm(pool.imap_unordered(trial_do_partial,
                    range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout)
            else:
                iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize)
            for o1, o2, o3 in iterator:
                train_pref.append(o1)
                val_pref.append(o2)
                test_pref.append(o3)
            pool.close()
            pool.join()

            # # ---- use pool.map to parallel. ----
            # pool = Pool(n_jobs)
            # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y[0:250], model_type)
            # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
            # train_pref = [item[0] for item in result_perf]
            # val_pref = [item[1] for item in result_perf]
            # test_pref = [item[2] for item in result_perf]

            # # ---- direct running, normally use a single CPU core. ----
            # train_pref = []
            # val_pref = []
            # test_pref = []
            # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
            #     o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
            #     train_pref.append(o1)
            #     val_pref.append(o2)
            #     test_pref.append(o3)
            # print()

            if verbose:
                print()
                print('4. Getting final performance...')
            str_fw += '\nIII. Performance.\n\n'

            # averages and confidences of performances on outer trials for each combination of parameters
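            # each element of train_pref/val_pref/test_pref has shape
            # (n_params_out, n_params_in), so the stacked arrays are
            # (NUM_TRIALS, n_params_out, n_params_in) and averaging over
            # axis 0 averages over the random outer trials.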
            average_train_scores = np.mean(train_pref, axis=0)
            # print('val_pref: ', val_pref[0][0])
            average_val_scores = np.mean(val_pref, axis=0)
            # print('test_pref: ', test_pref[0][0])
            average_perf_scores = np.mean(test_pref, axis=0)
            # sample std is used here
            std_train_scores = np.std(train_pref, axis=0, ddof=1)
            std_val_scores = np.std(val_pref, axis=0, ddof=1)
            std_perf_scores = np.std(test_pref, axis=0, ddof=1)

            if model_type == 'regression':
                best_val_perf = np.amin(average_val_scores)
            else:
                best_val_perf = np.amax(average_val_scores)
            # print('average_val_scores: ', average_val_scores)
            # print('best_val_perf: ', best_val_perf)
            # print()
            best_params_index = np.where(average_val_scores == best_val_perf)
            # find smallest val std with best val perf.
            best_val_stds = [
                std_val_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            min_val_std = np.amin(best_val_stds)
            best_params_index = np.where(std_val_scores == min_val_std)
            best_params_out = [
                param_list_pre_revised[i] for i in best_params_index[0]
            ]
            best_params_in = [param_list[i] for i in best_params_index[1]]
            if verbose:
                print('best_params_out: ', best_params_out)
                print('best_params_in: ', best_params_in)
                print()
                print('best_val_perf: ', best_val_perf)
                print('best_val_std: ', min_val_std)
            str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
            str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
            str_fw += 'best_val_perf: %s\n' % best_val_perf
            str_fw += 'best_val_std: %s\n' % min_val_std

            # print(best_params_index)
            # print(best_params_index[0])
            # print(average_perf_scores)
            final_performance = [
                average_perf_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            final_confidence = [
                std_perf_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            if verbose:
                print('final_performance: ', final_performance)
                print('final_confidence: ', final_confidence)
            str_fw += 'final_performance: %s\n' % final_performance
            str_fw += 'final_confidence: %s\n' % final_confidence
            train_performance = [
                average_train_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            train_std = [
                std_train_scores[value][best_params_index[1][idx]]
                for idx, value in enumerate(best_params_index[0])
            ]
            if verbose:
                print('train_performance: %s' % train_performance)
                print('train_std: ', train_std)
            str_fw += 'train_performance: %s\n' % train_performance
            str_fw += 'train_std: %s\n\n' % train_std

            if verbose:
                print()
            tt_total = time.time() - tts  # training time for all hyper-parameters
            average_gram_matrix_time = np.mean(gram_matrix_time)
            std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0
            best_gram_matrix_time = [
                gram_matrix_time[i] for i in best_params_index[0]
            ]
            ave_bgmt = np.mean(best_gram_matrix_time)
            std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0
            if verbose:
                print('time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
                      .format(average_gram_matrix_time, std_gram_matrix_time))
                print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
                    ave_bgmt, std_bgmt))
                print('total training time with all hyper-param choices: {:.2f}s'.format(
                    tt_total))
            str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
            str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
            str_fw += 'total training time with all hyper-param choices: {:.2f}s\n\n'.format(tt_total)

            # # save results to file
            # np.savetxt(results_name_pre + 'average_train_scores.dt',
            #            average_train_scores)
            # np.savetxt(results_name_pre + 'average_val_scores', average_val_scores)
            # np.savetxt(results_name_pre + 'average_perf_scores.dt',
            #            average_perf_scores)
            # np.savetxt(results_name_pre + 'std_train_scores.dt', std_train_scores)
            # np.savetxt(results_name_pre + 'std_val_scores.dt', std_val_scores)
            # np.savetxt(results_name_pre + 'std_perf_scores.dt', std_perf_scores)
            # np.save(results_name_pre + 'best_params_index', best_params_index)
            # np.save(results_name_pre + 'best_params_pre.dt', best_params_out)
            # np.save(results_name_pre + 'best_params_in.dt', best_params_in)
            # np.save(results_name_pre + 'best_val_perf.dt', best_val_perf)
            # np.save(results_name_pre + 'best_val_std.dt', best_val_std)
            # np.save(results_name_pre + 'final_performance.dt', final_performance)
            # np.save(results_name_pre + 'final_confidence.dt', final_confidence)
            # np.save(results_name_pre + 'train_performance.dt', train_performance)
            # np.save(results_name_pre + 'train_std.dt', train_std)
            # np.save(results_name_pre + 'gram_matrix_time.dt', gram_matrix_time)
            # np.save(results_name_pre + 'average_gram_matrix_time.dt',
            #         average_gram_matrix_time)
            # np.save(results_name_pre + 'std_gram_matrix_time.dt',
            #         std_gram_matrix_time)
            # np.save(results_name_pre + 'best_gram_matrix_time.dt',
            #         best_gram_matrix_time)
    # read gram matrices from file.
    else:
        # Grid of parameters with a discrete number of values for each.
        # param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
        param_list = list(ParameterGrid(param_grid))

        # read gram matrices from file.
        if verbose:
            print()
            print('2. Reading gram matrices from file...')
        str_fw += '\nII. Gram matrices.\n\nGram matrices are read from file, see last log for detail.\n'
        gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz')
        gram_matrices = gmfile['gms']  # a list to store gram matrices for all param_grid_precomputed
        gram_matrix_time = gmfile['gmtime']  # time used to compute the gram matrices
        param_list_pre_revised = gmfile['params']  # list to store param grids precomputed ignoring the useless ones
        y = gmfile['y'].tolist()

        tts = time.time()  # start training time
        # nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
        if verbose:
            print(
                '3. Fitting and predicting using nested cross validation. This could really take a while...'
            )

        # ---- use pool.imap_unordered to parallel and track progress. ----
        def init_worker(gms_toshare):
            global G_gms
            G_gms = gms_toshare

        pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(gram_matrices,))
        trial_do_partial = partial(parallel_trial_do, param_list_pre_revised, param_list, y, model_type)
        train_pref = []
        val_pref = []
        test_pref = []
        chunksize = 1
        if verbose:
            iterator = tqdm(pool.imap_unordered(trial_do_partial,
                range(NUM_TRIALS), chunksize), desc='cross validation', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(trial_do_partial, range(NUM_TRIALS), chunksize)
        for o1, o2, o3 in iterator:
            train_pref.append(o1)
            val_pref.append(o2)
            test_pref.append(o3)
        pool.close()
        pool.join()

        # # ---- use pool.map to parallel. ----
        # result_perf = pool.map(trial_do_partial, range(NUM_TRIALS))
        # train_pref = [item[0] for item in result_perf]
        # val_pref = [item[1] for item in result_perf]
        # test_pref = [item[2] for item in result_perf]

        # # ---- use joblib.Parallel to parallel and track progress. ----
        # trial_do_partial = partial(trial_do, param_list_pre_revised, param_list, gram_matrices, y, model_type)
        # result_perf = Parallel(n_jobs=n_jobs, verbose=10)(delayed(trial_do_partial)(trial) for trial in range(NUM_TRIALS))
        # train_pref = [item[0] for item in result_perf]
        # val_pref = [item[1] for item in result_perf]
        # test_pref = [item[2] for item in result_perf]

        # # ---- direct running, normally use a single CPU core. ----
        # train_pref = []
        # val_pref = []
        # test_pref = []
        # for i in tqdm(range(NUM_TRIALS), desc='cross validation', file=sys.stdout):
        #     o1, o2, o3 = trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, i)
        #     train_pref.append(o1)
        #     val_pref.append(o2)
        #     test_pref.append(o3)

        if verbose:
            print()
            print('4. Getting final performance...')
        str_fw += '\nIII. Performance.\n\n'

        # averages and confidences of performances on outer trials for each combination of parameters
        average_train_scores = np.mean(train_pref, axis=0)
        average_val_scores = np.mean(val_pref, axis=0)
        average_perf_scores = np.mean(test_pref, axis=0)
        # sample std is used here
        std_train_scores = np.std(train_pref, axis=0, ddof=1)
        std_val_scores = np.std(val_pref, axis=0, ddof=1)
        std_perf_scores = np.std(test_pref, axis=0, ddof=1)

        if model_type == 'regression':
            best_val_perf = np.amin(average_val_scores)
        else:
            best_val_perf = np.amax(average_val_scores)
        best_params_index = np.where(average_val_scores == best_val_perf)
        # find smallest val std with best val perf.
        best_val_stds = [
            std_val_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        min_val_std = np.amin(best_val_stds)
        best_params_index = np.where(std_val_scores == min_val_std)
        best_params_out = [
            param_list_pre_revised[i] for i in best_params_index[0]
        ]
        best_params_in = [param_list[i] for i in best_params_index[1]]
        if verbose:
            print('best_params_out: ', best_params_out)
            print('best_params_in: ', best_params_in)
            print()
            print('best_val_perf: ', best_val_perf)
            print('best_val_std: ', min_val_std)
        str_fw += 'best settings of hyper-params to build gram matrix: %s\n' % best_params_out
        str_fw += 'best settings of other hyper-params: %s\n\n' % best_params_in
        str_fw += 'best_val_perf: %s\n' % best_val_perf
        str_fw += 'best_val_std: %s\n' % min_val_std

        final_performance = [
            average_perf_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        final_confidence = [
            std_perf_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        if verbose:
            print('final_performance: ', final_performance)
            print('final_confidence: ', final_confidence)
        str_fw += 'final_performance: %s\n' % final_performance
        str_fw += 'final_confidence: %s\n' % final_confidence
        train_performance = [
            average_train_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        train_std = [
            std_train_scores[value][best_params_index[1][idx]]
            for idx, value in enumerate(best_params_index[0])
        ]
        if verbose:
            print('train_performance: %s' % train_performance)
            print('train_std: ', train_std)
        str_fw += 'train_performance: %s\n' % train_performance
        str_fw += 'train_std: %s\n\n' % train_std

        if verbose:
            print()
        average_gram_matrix_time = np.mean(gram_matrix_time)
        std_gram_matrix_time = np.std(gram_matrix_time, ddof=1) if len(gram_matrix_time) > 1 else 0
        best_gram_matrix_time = [
            gram_matrix_time[i] for i in best_params_index[0]
        ]
        ave_bgmt = np.mean(best_gram_matrix_time)
        std_bgmt = np.std(best_gram_matrix_time, ddof=1) if len(best_gram_matrix_time) > 1 else 0
        if verbose:
            print(
                'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s'
                .format(average_gram_matrix_time, std_gram_matrix_time))
            print('time to calculate best gram matrix: {:.2f}±{:.2f}s'.format(
                ave_bgmt, std_bgmt))
        tt_poster = time.time() - tts  # training time for hyper-param choices that did not participate in the calculation of gram matrices
        if verbose:
            print(
                'training time with hyper-param choices that did not participate in the calculation of gram matrices: {:.2f}s'.format(
                    tt_poster))
            print('total training time with all hyper-param choices: {:.2f}s'.format(
                tt_poster + np.sum(gram_matrix_time)))
        # str_fw += 'time to calculate gram matrix with different hyper-params: {:.2f}±{:.2f}s\n'.format(average_gram_matrix_time, std_gram_matrix_time)
        # str_fw += 'time to calculate best gram matrix: {:.2f}±{:.2f}s\n'.format(ave_bgmt, std_bgmt)
        str_fw += 'training time with hyper-param choices that did not participate in the calculation of gram matrices: {:.2f}s\n\n'.format(tt_poster)
    # create the output directory if it does not exist.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # print out results as table.
    str_fw += printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
                                  std_val_scores, average_perf_scores, std_perf_scores,
                                  average_train_scores, std_train_scores, gram_matrix_time,
                                  model_type, verbose)

    # open file to save all results for this dataset.
    if not os.path.exists(output_dir + '/' + ds_name + '.output.txt'):
        with open(output_dir + '/' + ds_name + '.output.txt', 'w') as f:
            f.write(str_fw)
    else:
        with open(output_dir + '/' + ds_name + '.output.txt', 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(str_fw + '\n\n\n' + content)

    return final_performance, final_confidence

def trial_do(param_list_pre_revised, param_list, gram_matrices, y, model_type, trial):  # Test set level
    # # get gram matrices from global variables.
    # gram_matrices = np.reshape(G_gms.copy(), G_gms_shape, order='C')

    # Arrays to store scores
    train_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
    val_pref = np.zeros((len(param_list_pre_revised), len(param_list)))
    test_pref = np.zeros((len(param_list_pre_revised), len(param_list)))

    # randomness added to seeds of the split function below. "high" is "size"
    # times 10 so that at least 10 different random outputs will be yielded.
    # Remove these lines if identical outputs are required.
    rdm_out = np.random.RandomState(seed=None)
    rdm_seed_out_l = rdm_out.uniform(high=len(param_list_pre_revised) * 10,
                                     size=len(param_list_pre_revised))
    # print(trial, rdm_seed_out_l)
    # print()
    # loop for each outer param tuple
    for index_out, params_out in enumerate(param_list_pre_revised):
        # get gram matrices from global variables.
        # gm_now = G_gms[index_out * G_gms_shape[1] * G_gms_shape[2]:(index_out + 1) * G_gms_shape[1] * G_gms_shape[2]]
        # gm_now = np.reshape(gm_now.copy(), (G_gms_shape[1], G_gms_shape[2]), order='C')
        gm_now = gram_matrices[index_out].copy()

        # split gram matrix and y to app and test sets.
        indices = range(len(y))
        # The argument "random_state" in the function "train_test_split" can
        # not be set to None, because it would use the RandomState instance
        # used by np.random, and multiple subprocesses may inherit the same
        # seed if they forked at the same time, leading to identical random
        # variates for different subprocesses. Instead, we use the "trial" and
        # "index_out" parameters to generate different seeds for different
        # trials/subprocesses and outer loops. "rdm_seed_out_l" is used to add
        # randomness into seeds, so that it yields a different output every
        # time the program is run. To yield identical outputs every time,
        # remove the second line below. The same method is used for the
        # "KFold" function in the inner loop.
        rdm_seed_out = (trial + 1) * (index_out + 1)
        rdm_seed_out = (rdm_seed_out + int(rdm_seed_out_l[index_out])) % (2 ** 32 - 1)
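        # the modulo keeps the combined seed within numpy's accepted range
        # (a RandomState seed must lie in [0, 2 ** 32 - 1]).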
        # print(trial, rdm_seed_out)
        X_app, X_test, y_app, y_test, idx_app, idx_test = train_test_split(
            gm_now, y, indices, test_size=0.1,
            random_state=rdm_seed_out, shuffle=True)
        # print(trial, idx_app, idx_test)
        # print()
        X_app = X_app[:, idx_app]
        X_test = X_test[:, idx_app]
        y_app = np.array(y_app)
        y_test = np.array(y_test)
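        # nested CV structure: the outer split above holds out 10% of the
        # graphs as the test set; the inner 10-fold CV below selects
        # hyper-params on the remaining 90% (the "app" set).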
        rdm_seed_in_l = rdm_out.uniform(high=len(param_list) * 10,
                                        size=len(param_list))
        # loop for each inner param tuple
        for index_in, params_in in enumerate(param_list):
            # if trial == 0:
            #     print(index_out, index_in)
            #     print('params_in: ', params_in)
            # st = time.time()
            rdm_seed_in = (trial + 1) * (index_out + 1) * (index_in + 1)
            # print("rdm_seed_in1: ", trial, index_in, rdm_seed_in)
            rdm_seed_in = (rdm_seed_in + int(rdm_seed_in_l[index_in])) % (2 ** 32 - 1)
            # print("rdm_seed_in2: ", trial, index_in, rdm_seed_in)
            inner_cv = KFold(n_splits=10, shuffle=True, random_state=rdm_seed_in)
            current_train_perf = []
            current_valid_perf = []
            current_test_perf = []

            # For regression use the Kernel Ridge method
            # try:
            if model_type == 'regression':
                kr = KernelRidge(kernel='precomputed', **params_in)
                # loop for each split on validation set level
                # validation set level
                for train_index, valid_index in inner_cv.split(X_app):
                    # print("train_index, valid_index: ", trial, index_in, train_index, valid_index)
                    # if trial == 0:
                    #     print('train_index: ', train_index)
                    #     print('valid_index: ', valid_index)
                    #     print('idx_test: ', idx_test)
                    #     print('y_app[train_index]: ', y_app[train_index])
                    #     print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
                    #     print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
                    kr.fit(X_app[train_index, :][:, train_index],
                           y_app[train_index])

                    # predict on the train, validation and test set
                    y_pred_train = kr.predict(
                        X_app[train_index, :][:, train_index])
                    y_pred_valid = kr.predict(
                        X_app[valid_index, :][:, train_index])
                    # if trial == 0:
                    #     print('y_pred_valid: ', y_pred_valid)
                    #     print()
                    y_pred_test = kr.predict(
                        X_test[:, train_index])

                    # root mean squared errors
                    current_train_perf.append(
                        np.sqrt(
                            mean_squared_error(
                                y_app[train_index], y_pred_train)))
                    current_valid_perf.append(
                        np.sqrt(
                            mean_squared_error(
                                y_app[valid_index], y_pred_valid)))
                    # if trial == 0:
                    #     print(mean_squared_error(
                    #         y_app[valid_index], y_pred_valid))
                    current_test_perf.append(
                        np.sqrt(
                            mean_squared_error(
                                y_test, y_pred_test)))
            # For classification use SVM
            else:
                svc = SVC(kernel='precomputed', cache_size=200,
                          verbose=False, **params_in)
                # loop for each split on validation set level
                # validation set level
                for train_index, valid_index in inner_cv.split(X_app):
                    # np.savez("bug.npy", X_app[train_index, :][:, train_index], y_app[train_index])
                    # if trial == 0:
                    #     print('train_index: ', train_index)
                    #     print('valid_index: ', valid_index)
                    #     print('idx_test: ', idx_test)
                    #     print('y_app[train_index]: ', y_app[train_index])
                    #     print('X_app[train_index, :][:, train_index]: ', X_app[train_index, :][:, train_index])
                    #     print('X_app[valid_index, :][:, train_index]: ', X_app[valid_index, :][:, train_index])
                    svc.fit(X_app[train_index, :][:, train_index],
                            y_app[train_index])

                    # predict on the train, validation and test set
                    y_pred_train = svc.predict(
                        X_app[train_index, :][:, train_index])
                    y_pred_valid = svc.predict(
                        X_app[valid_index, :][:, train_index])
                    y_pred_test = svc.predict(
                        X_test[:, train_index])

                    # classification accuracies
                    current_train_perf.append(
                        accuracy_score(y_app[train_index],
                                       y_pred_train))
                    current_valid_perf.append(
                        accuracy_score(y_app[valid_index],
                                       y_pred_valid))
                    current_test_perf.append(
                        accuracy_score(y_test, y_pred_test))
            # except ValueError:
            #     print(sys.exc_info()[0])
            #     print(params_out, params_in)

            # average performance on inner splits
            train_pref[index_out][index_in] = np.mean(
                current_train_perf)
            val_pref[index_out][index_in] = np.mean(
                current_valid_perf)
            test_pref[index_out][index_in] = np.mean(
                current_test_perf)
            # print(time.time() - st)
    # if trial == 0:
    #     print('val_pref: ', val_pref)
    #     print('test_pref: ', test_pref)

    return train_pref, val_pref, test_pref

def parallel_trial_do(param_list_pre_revised, param_list, y, model_type, trial):
    train_pref, val_pref, test_pref = trial_do(param_list_pre_revised,
                                               param_list, G_gms, y,
                                               model_type, trial)
    return train_pref, val_pref, test_pref

def compute_gram_matrices(dataset, y, estimator, param_list_precomputed,
                          output_dir, ds_name,
                          n_jobs=1, str_fw='', verbose=True):
    gram_matrices = [
    ]  # a list to store gram matrices for all param_grid_precomputed
    gram_matrix_time = [
    ]  # a list to store time to calculate gram matrices
    param_list_pre_revised = [
    ]  # list to store param grids precomputed ignoring the useless ones

    nb_gm_ignore = 0  # the number of gram matrices that should not be considered, as they may contain elements that are not numbers (NaN)
    for idx, params_out in enumerate(param_list_precomputed):
        params_out['n_jobs'] = n_jobs
        # print(dataset)
        # import networkx as nx
        # nx.draw_networkx(dataset[1])
        # plt.show()
        rtn_data = estimator(dataset[:], **params_out)
        Kmatrix = rtn_data[0]
        current_run_time = rtn_data[1]
        # for some kernels, some graphs in datasets may not meet the
        # kernels' requirements for graph structure. These graphs are trimmed.
        if len(rtn_data) == 3:
            idx_trim = rtn_data[2]  # the index of trimmed graph list
            y = [y[idxt] for idxt in idx_trim]  # trim y accordingly

        # remove graphs whose kernels with themselves are zeros
        Kmatrix_diag = Kmatrix.diagonal().copy()
        nb_g_ignore = 0
        for idxk, diag in enumerate(Kmatrix_diag):
            if diag == 0:
                Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=0)
                Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
                nb_g_ignore += 1
        # keep y consistent with the trimmed Kmatrix, so that later splits do
        # not fail on a length mismatch.
        y = [yv for yv, diag in zip(y, Kmatrix_diag) if diag != 0]
        # normalization (recompute the diagonal after rows/columns were
        # removed above, as in model_selection_for_precomputed_kernel)
        Kmatrix_diag = Kmatrix.diagonal().copy()
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                Kmatrix[j][i] = Kmatrix[i][j]

        if verbose:
            print()
        if params_out == {}:
            if verbose:
                print('the gram matrix is: ')
            str_fw += 'the gram matrix is:\n\n'
        else:
            if verbose:
                print('the gram matrix with parameters', params_out, 'is: ')
            str_fw += 'the gram matrix with parameters %s is:\n\n' % params_out
        if len(Kmatrix) < 2:
            nb_gm_ignore += 1
            if verbose:
                print('ignored, as at most one of its diagonal values is non-zero.')
            str_fw += 'ignored, as at most one of its diagonal values is non-zero.\n\n'
        else:
            if np.isnan(Kmatrix).any(
            ):  # if the matrix contains elements that are not numbers
                nb_gm_ignore += 1
                if verbose:
                    print('ignored, as it contains elements that are not numbers.')
                str_fw += 'ignored, as it contains elements that are not numbers.\n\n'
            else:
                # print(Kmatrix)
                str_fw += np.array2string(
                    Kmatrix,
                    separator=',') + '\n\n'
                # separator=',',
                # threshold=np.inf,
                # floatmode='unique') + '\n\n'
                fig_file_name = output_dir + '/GM[ds]' + ds_name
                if params_out != {}:
                    fig_file_name += '[params]' + str(idx)
                plt.imshow(Kmatrix)
                plt.colorbar()
                plt.savefig(fig_file_name + '.eps', format='eps', dpi=300)
                # plt.show()
                plt.clf()
                gram_matrices.append(Kmatrix)
                gram_matrix_time.append(current_run_time)
                param_list_pre_revised.append(params_out)
                if nb_g_ignore > 0:
                    if verbose:
                        print(', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore)
                    str_fw += ', where %d graphs are ignored as their graph kernels with themselves are zeros.' % nb_g_ignore
    if verbose:
        print()
        print(
            '{} gram matrices are calculated, {} of which are ignored.'.format(
                len(param_list_precomputed), nb_gm_ignore))
    str_fw += '{} gram matrices are calculated, {} of which are ignored.\n\n'.format(len(param_list_precomputed), nb_gm_ignore)
    str_fw += 'serial numbers of gram matrix figures and their corresponding parameter settings:\n\n'
    str_fw += ''.join([
        '{}: {}\n'.format(idx, params_out)
        for idx, params_out in enumerate(param_list_precomputed)
    ])

    return gram_matrices, gram_matrix_time, param_list_pre_revised, y, str_fw

def read_gram_matrices_from_file(output_dir, ds_name):
    gmfile = np.load(output_dir + '/' + ds_name + '.gm.npz')
    gram_matrices = gmfile['gms']  # a list to store gram matrices for all param_grid_precomputed
    param_list_pre_revised = gmfile['params']  # list to store param grids precomputed ignoring the useless ones
    y = gmfile['y'].tolist()
    return gram_matrices, param_list_pre_revised, y

def printResultsInTable(param_list, param_list_pre_revised, average_val_scores,
                        std_val_scores, average_perf_scores, std_perf_scores,
                        average_train_scores, std_train_scores, gram_matrix_time,
                        model_type, verbose):
    from collections import OrderedDict
    from tabulate import tabulate
    table_dict = {}
    if model_type == 'regression':
        for param_in in param_list:
            param_in['alpha'] = '{:.2e}'.format(param_in['alpha'])
    else:
        for param_in in param_list:
            param_in['C'] = '{:.2e}'.format(param_in['C'])
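    # each table row pairs one precomputed (outer) param setting with one
    # inner param setting; the comprehensions below put param_in in the outer
    # loop position, so param_out varies fastest down the rows.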
    table_dict['params'] = [{**param_out, **param_in}
                            for param_in in param_list
                            for param_out in param_list_pre_revised]
    table_dict['gram_matrix_time'] = [
        '{:.2f}'.format(gram_matrix_time[index_out])
        for param_in in param_list
        for index_out, _ in enumerate(param_list_pre_revised)
    ]
    table_dict['valid_perf'] = [
        '{:.2f}±{:.2f}'.format(average_val_scores[index_out][index_in],
                               std_val_scores[index_out][index_in])
        for index_in, _ in enumerate(param_list)
        for index_out, _ in enumerate(param_list_pre_revised)
    ]
    table_dict['test_perf'] = [
        '{:.2f}±{:.2f}'.format(average_perf_scores[index_out][index_in],
                               std_perf_scores[index_out][index_in])
        for index_in, _ in enumerate(param_list)
        for index_out, _ in enumerate(param_list_pre_revised)
    ]
    table_dict['train_perf'] = [
        '{:.2f}±{:.2f}'.format(average_train_scores[index_out][index_in],
                               std_train_scores[index_out][index_in])
        for index_in, _ in enumerate(param_list)
        for index_out, _ in enumerate(param_list_pre_revised)
    ]
    keyorder = [
        'params', 'train_perf', 'valid_perf', 'test_perf',
        'gram_matrix_time'
    ]
    if verbose:
        print()
    tb_print = tabulate(OrderedDict(sorted(table_dict.items(),
        key=lambda i: keyorder.index(i[0]))), headers='keys')
    # print(tb_print)
    return 'table of performance vs. hyper-params:\n\n%s\n\n' % tb_print
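

# ---------------------------------------------------------------------------
# A minimal usage sketch, mirroring the docstring example above. The dataset
# path and the untilhpathkernel estimator are assumptions taken from that
# example and must be available locally for this to run.
if __name__ == '__main__':
    from gklearn.kernels.untilHPathKernel import untilhpathkernel

    datafile = '../datasets/MUTAG/MUTAG_A.txt'
    param_grid_precomputed = {'depth': np.linspace(1, 10, 10),
                              'k_func': ['MinMax', 'tanimoto'],
                              'compute_method': ['trie']}
    # 'C' applies to classification problems, 'alpha' to regression problems.
    param_grid = {'C': np.logspace(-10, 10, num=41, base=10)}
    model_selection_for_precomputed_kernel(
        datafile, untilhpathkernel, param_grid_precomputed, param_grid,
        'classification', NUM_TRIALS=30, ds_name='MUTAG', n_jobs=1)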
