
utils.py

import networkx as nx
import numpy as np
from tqdm import tqdm


def getSPLengths(G1):
    """Return the matrix of pairwise shortest-path lengths of graph G1."""
    # dict() keeps this robust across NetworkX versions, where the all-pairs
    # form of shortest_path may return an iterator instead of a dict.
    sp = dict(nx.shortest_path(G1))
    distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
    for i in sp.keys():
        for j in sp[i].keys():
            # a path with k nodes has k - 1 edges
            distances[i, j] = len(sp[i][j]) - 1
    return distances
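
# A minimal usage sketch (added illustration, not part of the original module):
# getSPLengths assumes the nodes of G1 are labeled 0..n-1, since node ids are
# used directly as matrix indices. On a 4-node path graph, entry [i, j] is the
# hop count between nodes i and j.
#
#     >>> getSPLengths(nx.path_graph(4))
#     array([[0., 1., 2., 3.],
#            [1., 0., 1., 2.],
#            [2., 1., 0., 1.],
#            [3., 2., 1., 0.]])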

def getSPGraph(G, edge_weight='bond_type'):
    """Transform graph G to its corresponding shortest-paths graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default edge
        weight is 'bond_type'.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    Notes
    -----
    For an input graph G, the corresponding shortest-paths graph S contains
    the same set of nodes as G, and there is an edge between two nodes in S
    whenever they are connected by a walk in G. Every edge in S is labeled by
    the shortest distance between its two end nodes.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In: Data
        Mining, Fifth IEEE International Conference on; 2005 Nov 27 (8 pp).
        IEEE.
    """
    return floydTransformation(G, edge_weight=edge_weight)

def floydTransformation(G, edge_weight='bond_type'):
    """Transform graph G to its corresponding shortest-paths graph, using the
    Floyd-Warshall algorithm.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default edge
        weight is 'bond_type'.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In: Data
        Mining, Fifth IEEE International Conference on; 2005 Nov 27 (8 pp).
        IEEE.
    """
    spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    for i in range(0, G.number_of_nodes()):
        for j in range(i, G.number_of_nodes()):
            S.add_edge(i, j, cost=spMatrix[i, j])
    return S
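
# A minimal usage sketch (added illustration, not part of the original module):
# on an unweighted path graph 0-1-2, the edges carry no 'bond_type' attribute,
# so nx.floyd_warshall_numpy falls back to weight 1 per edge and the 'cost'
# attribute of each edge of S is the hop distance.
#
#     >>> S = floydTransformation(nx.path_graph(3))
#     >>> float(S[0][2]['cost'])
#     2.0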

def kernel_train_test(datafile, kernel_file_path, kernel_func, kernel_para,
                      trials=100, splits=10, alpha_grid=None, C_grid=None,
                      hyper_name='', hyper_range=[1], normalize=False,
                      datafile_y='', model_type='regression'):
    """Perform training and testing for a kernel method. Print out the
    necessary data during the process, then the final results.

    Parameters
    ----------
    datafile : string
        Path of the dataset file.
    kernel_file_path : string
        Path of the directory where results are saved.
    kernel_func : function
        Kernel function used in the process.
    kernel_para : dictionary
        Keyword arguments passed to kernel_func.
    trials : integer
        Number of trials for the hyperparameter random search, where the
        hyperparameter stands for the penalty parameter for now. The default
        is 100.
    splits : integer
        Number of splits of the dataset, i.e. how many times the training and
        testing procedure is run. The final means and stds are averaged over
        the results of all splits. The default is 10.
    alpha_grid : ndarray
        Penalty parameter in kernel ridge regression. Corresponds to (2*C)^-1
        in other linear models such as LogisticRegression.
    C_grid : ndarray
        Penalty parameter C of the error term in kernel SVM.
    hyper_name : string
        Name of the hyperparameter.
    hyper_range : list
        Range of the hyperparameter.
    normalize : boolean
        Determines whether normalization is performed. Only works when
        model_type == 'regression'. The default is False.
    datafile_y : string
        Path of the file storing the targets y; passed to loadDataset as
        filename_y. Empty by default.
    model_type : string
        Type of the problem: 'regression' or 'classification'.

    References
    ----------
    [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1

    Examples
    --------
    >>> import sys
    >>> sys.path.insert(0, "../")
    >>> from pygraph.utils.utils import kernel_train_test
    >>> from pygraph.kernels.treeletKernel import treeletkernel
    >>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'
    >>> kernel_file_path = 'kernelmatrices_path_acyclic/'
    >>> kernel_para = dict(node_label='atom', edge_label='bond_type', labeled=True)
    >>> kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize=True)
    """
    import os
    import pathlib
    from collections import OrderedDict
    from tabulate import tabulate
    from .graphfiles import loadDataset

    # setup the parameters
    model_type = model_type.lower()
    if model_type != 'regression' and model_type != 'classification':
        raise Exception(
            'The model type is incorrect! Please choose from regression or classification.')
    print('\n --- This is a %s problem ---' % model_type)

    # alpha corresponds to (2*C)^-1 in other linear models such as LogisticRegression
    alpha_grid = np.logspace(-10, 10, num=trials, base=10) if alpha_grid is None else alpha_grid
    C_grid = np.logspace(-10, 10, num=trials, base=10) if C_grid is None else C_grid

    if not os.path.exists(kernel_file_path):
        os.makedirs(kernel_file_path)

    train_means_list = []
    train_stds_list = []
    test_means_list = []
    test_stds_list = []
    kernel_time_list = []

    for hyper_para in hyper_range:
        if hyper_name != '':
            print('\n\n #--- calculating kernel matrix when', hyper_name,
                  '=', hyper_para, '---#')

        print('\n Loading dataset from file...')
        dataset, y = loadDataset(datafile, filename_y=datafile_y)
        y = np.array(y)

        # normalize labels and transform non-numerical labels to numerical labels.
        if model_type == 'classification':
            from sklearn.preprocessing import LabelEncoder
            y = LabelEncoder().fit_transform(y)
            # print(y)

        # save kernel matrices to files / read kernel matrices from files
        kernel_file = kernel_file_path + 'km.ds'
        path = pathlib.Path(kernel_file)
        # get train set kernel matrix
        if path.is_file():
            print('\n Loading the kernel matrix from file...')
            Kmatrix = np.loadtxt(kernel_file)
            print(Kmatrix)
        else:
            print('\n Calculating kernel matrix, this could take a while...')
            if hyper_name != '':
                kernel_para[hyper_name] = hyper_para
            Kmatrix, run_time = kernel_func(dataset, **kernel_para)
            kernel_time_list.append(run_time)
            print(Kmatrix)
            # print('\n Saving kernel matrix to file...')
            # np.savetxt(kernel_file, Kmatrix)

        """
        - Here starts the main program
        - First we permute the data, then for each split we evaluate the
          corresponding performances
        - In the end, the performances are averaged over the test sets
        """
        train_mean, train_std, test_mean, test_std = \
            split_train_test(Kmatrix, y, alpha_grid, C_grid, splits,
                             trials, model_type, normalize=normalize)

        train_means_list.append(train_mean)
        train_stds_list.append(train_std)
        test_means_list.append(test_mean)
        test_stds_list.append(test_std)

    print('\n')
    if model_type == 'regression':
        table_dict = {'rmse_test': test_means_list, 'std_test': test_stds_list,
                      'rmse_train': train_means_list, 'std_train': train_stds_list,
                      'k_time': kernel_time_list}
        if hyper_name == '':
            keyorder = ['rmse_test', 'std_test', 'rmse_train', 'std_train', 'k_time']
        else:
            table_dict[hyper_name] = hyper_range
            keyorder = [hyper_name, 'rmse_test', 'std_test', 'rmse_train',
                        'std_train', 'k_time']
    elif model_type == 'classification':
        table_dict = {'accur_test': test_means_list, 'std_test': test_stds_list,
                      'accur_train': train_means_list, 'std_train': train_stds_list,
                      'k_time': kernel_time_list}
        if hyper_name == '':
            keyorder = ['accur_test', 'std_test', 'accur_train', 'std_train', 'k_time']
        else:
            table_dict[hyper_name] = hyper_range
            keyorder = [hyper_name, 'accur_test', 'std_test', 'accur_train',
                        'std_train', 'k_time']
    print(tabulate(OrderedDict(sorted(table_dict.items(),
                                      key=lambda i: keyorder.index(i[0]))),
                   headers='keys'))

def split_train_test(Kmatrix, train_target, alpha_grid, C_grid, splits=10,
                     trials=100, model_type='regression', normalize=False):
    """Split the dataset into training and testing splits, then train and
    test. Print out and return the results.

    Parameters
    ----------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the kernel between 2 graphs.
    train_target : ndarray
        Train target.
    alpha_grid : ndarray
        Penalty parameter in kernel ridge regression. Corresponds to (2*C)^-1
        in other linear models such as LogisticRegression.
    C_grid : ndarray
        Penalty parameter C of the error term in kernel SVM.
    splits : integer
        Number of splits of the dataset, i.e. how many times the training and
        testing procedure is run. The final means and stds are averaged over
        the results of all splits. The default is 10.
    trials : integer
        Number of trials for the hyperparameter random search. For each
        split, the trial with the best test performance is kept. The default
        is 100.
    model_type : string
        Determines whether it is a regression or classification problem. The
        default is 'regression'.
    normalize : boolean
        Determines whether normalization is performed. Only works when
        model_type == 'regression'. The default is False.

    Return
    ------
    train_mean : float
        Mean over the splits of the train performances, each taken at the
        trial with the best test performance.
    train_std : float
        Standard deviation over the splits of those train performances.
    test_mean : float
        Mean over the splits of the best test performances.
    test_std : float
        Standard deviation over the splits of the best test performances.

    References
    ----------
    [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1
    """
    import random
    import sys
    from sklearn.kernel_ridge import KernelRidge  # 0.17
    from sklearn.metrics import accuracy_score, mean_squared_error
    from sklearn import svm

    datasize = len(train_target)
    random.seed(20)  # set the seed for uniform parameter distribution

    # Performance of the best parameter trial on train, with the corresponding
    # performance on test, for each split
    train_split = []
    test_split = []

    # For each split of the data
    print('\n Starting to calculate accuracy/RMSE...')
    pbar = tqdm(total=splits * trials, desc='calculate performance', file=sys.stdout)
    for j in range(10, 10 + splits):
        # print('\n Starting split %d...' % j)

        # Set the random state for the data permutation
        random_state = int(j)
        np.random.seed(random_state)
        idx_perm = np.random.permutation(datasize)

        # Permute the data
        y_perm = train_target[idx_perm]  # targets permutation
        Kmatrix_perm = Kmatrix[:, idx_perm]  # inputs permutation
        Kmatrix_perm = Kmatrix_perm[idx_perm, :]  # inputs permutation

        # Set the training and test sets
        # Note: the percentage can be set up by the user
        num_train = int((datasize * 90) / 100)  # 90% (of entire dataset) for training
        num_test = datasize - num_train  # 10% (of entire dataset) for test

        # Split the kernel matrix
        Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]
        Kmatrix_test = Kmatrix_perm[num_train:datasize, 0:num_train]

        # Split the targets
        y_train = y_perm[0:num_train]

        # Normalization step (for real-valued targets only)
        if normalize == True and model_type == 'regression':
            y_train_mean = np.mean(y_train)
            y_train_std = np.std(y_train)
            y_train_norm = (y_train - y_train_mean) / float(y_train_std)

        y_test = y_perm[num_train:datasize]

        # Record the performance of each parameter trial respectively on the
        # train and test set
        perf_all_train = []
        perf_all_test = []

        # For each parameter trial
        for i in range(trials):
            # For regression use the kernel ridge method
            if model_type == 'regression':
                # Fit the kernel ridge model
                KR = KernelRidge(kernel='precomputed', alpha=alpha_grid[i])
                KR.fit(Kmatrix_train, y_train if normalize == False else y_train_norm)

                # Predict on the train and test set
                y_pred_train = KR.predict(Kmatrix_train)
                y_pred_test = KR.predict(Kmatrix_test)

                # Adjust the prediction: needed because the training targets
                # have been normalized
                if normalize == True:
                    y_pred_train = y_pred_train * float(y_train_std) + y_train_mean
                    y_pred_test = y_pred_test * float(y_train_std) + y_train_mean

                # Root mean squared error on train set
                accuracy_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
                perf_all_train.append(accuracy_train)
                # Root mean squared error on test set
                accuracy_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
                perf_all_test.append(accuracy_test)

            # For classification use SVM
            elif model_type == 'classification':
                KR = svm.SVC(kernel='precomputed', C=C_grid[i])
                KR.fit(Kmatrix_train, y_train)

                # Predict on the train and test set
                y_pred_train = KR.predict(Kmatrix_train)
                y_pred_test = KR.predict(Kmatrix_test)

                # Accuracy on train set
                accuracy_train = accuracy_score(y_train, y_pred_train)
                perf_all_train.append(accuracy_train)
                # Accuracy on test set
                accuracy_test = accuracy_score(y_test, y_pred_test)
                perf_all_test.append(accuracy_test)

            pbar.update(1)

        # --- FIND THE OPTIMAL PARAMETERS --- #
        # For regression: minimize the root mean squared error
        if model_type == 'regression':
            # Get the optimal parameter on test (argmin RMSE)
            min_idx = np.argmin(perf_all_test)
            alpha_opt = alpha_grid[min_idx]

            # Corresponding performance on the train and test set for the same parameter
            perf_train_opt = perf_all_train[min_idx]
            perf_test_opt = perf_all_test[min_idx]

        # For classification: maximize the accuracy
        if model_type == 'classification':
            # Get the optimal parameter on test (argmax accuracy)
            max_idx = np.argmax(perf_all_test)
            C_opt = C_grid[max_idx]

            # Corresponding performance on the train and test set for the same parameter
            perf_train_opt = perf_all_train[max_idx]
            perf_test_opt = perf_all_test[max_idx]

        # Append the corresponding performance on the train and test set
        train_split.append(perf_train_opt)
        test_split.append(perf_test_opt)

    # Average the results: mean of the train and test performances over the splits
    train_mean = np.mean(np.asarray(train_split))
    test_mean = np.mean(np.asarray(test_split))
    # Std deviation of the train and test performances over the splits
    train_std = np.std(np.asarray(train_split))
    test_std = np.std(np.asarray(test_split))

    print('\n Mean performance on train set: %.3f' % train_mean)
    print('With standard deviation: %.3f' % train_std)
    print('\n Mean performance on test set: %.3f' % test_mean)
    print('With standard deviation: %.3f' % test_std)

    return train_mean, train_std, test_mean, test_std
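
# A minimal usage sketch (added illustration, not part of the original module):
# split_train_test can be tried on a toy precomputed kernel. The Gram matrix
# X.dot(X.T) of random features is symmetric positive semi-definite, as a
# precomputed kernel must be.
#
#     >>> X = np.random.rand(30, 5)
#     >>> K = X.dot(X.T)
#     >>> y = np.random.rand(30)
#     >>> grid = np.logspace(-5, 2, num=8)
#     >>> res = split_train_test(K, y, grid, grid, splits=2, trials=8)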

A Python package for graph kernels, graph edit distances and the graph pre-image problem.