
utils.py 14 kB

import networkx as nx
import numpy as np


def getSPLengths(G1):
    """Return the matrix of shortest-path lengths (in number of edges) between all node pairs of G1."""
    sp = nx.shortest_path(G1)
    distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
    for i in sp.keys():
        for j in sp[i].keys():
            # a shortest path with k nodes has k - 1 edges
            distances[i, j] = len(sp[i][j]) - 1
    return distances
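# A minimal usage sketch for getSPLengths (not part of the original file).
# It assumes nodes are labeled 0..n-1, since the distances matrix is indexed
# directly by node identity:
#
# >>> G = nx.path_graph(4)       # 0 - 1 - 2 - 3
# >>> getSPLengths(G)[0, 3]      # three edges on the shortest path
# 3.0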
def getSPGraph(G, edge_weight='bond_type'):
    """Transform graph G to its corresponding shortest-paths graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default edge weight is 'bond_type'.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    Notes
    -----
    For an input graph G, its corresponding shortest-paths graph S contains the same set of nodes as G; an edge exists between two nodes in S whenever they are connected by a walk in G, and that edge is labeled with the shortest distance between the two nodes.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    return floydTransformation(G, edge_weight=edge_weight)
def floydTransformation(G, edge_weight='bond_type'):
    """Transform graph G to its corresponding shortest-paths graph using the Floyd-Warshall transformation.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default edge weight is 'bond_type'.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.
    """
    spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    for i in range(0, G.number_of_nodes()):
        for j in range(0, G.number_of_nodes()):
            # only node pairs connected in G get an edge in the shortest-paths graph
            if spMatrix[i, j] != np.inf:
                S.add_edge(i, j, cost=spMatrix[i, j])
    return S
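# A minimal usage sketch for getSPGraph (not part of the original file; the
# graph below is made up, and 'cost' is the edge attribute floydTransformation
# assigns above):
#
# >>> G = nx.cycle_graph(4)
# >>> nx.set_edge_attributes(G, 1, 'bond_type')   # uniform edge weights
# >>> S = getSPGraph(G, edge_weight='bond_type')
# >>> S[0][2]['cost']                             # two hops across the cycle
# 2.0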
import os
import pathlib
from collections import OrderedDict
from tabulate import tabulate
from .graphfiles import loadDataset


def kernel_train_test(datafile, kernel_file_path, kernel_func, kernel_para, trials=100, splits=10, alpha_grid=None, C_grid=None, hyper_name='', hyper_range=[1], normalize=False):
    """Perform training and testing for a kernel method. Print out necessary data during the process, then finally the results.

    Parameters
    ----------
    datafile : string
        Path of dataset file.
    kernel_file_path : string
        Path of the directory to save results.
    kernel_func : function
        Kernel function to use in the process.
    kernel_para : dictionary
        Keyword arguments passed to kernel_func.
    trials : integer
        Number of trials for the hyperparameter random search, where hyperparameter stands for the penalty parameter for now. The default is 100.
    splits : integer
        Number of splits of the dataset, i.e. how many times the training and testing procedure is run. The final means and stds are averaged over the results of all the splits. The default is 10.
    alpha_grid : ndarray
        Penalty parameter in kernel ridge regression. Corresponds to (2*C)^-1 in other linear models such as LogisticRegression.
    C_grid : ndarray
        Penalty parameter C of the error term in kernel SVM.
    hyper_name : string
        Name of the hyperparameter.
    hyper_range : list
        Range of the hyperparameter.
    normalize : boolean
        Determines whether normalization is performed. The default is False.

    References
    ----------
    [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1

    Examples
    --------
    >>> import sys
    >>> sys.path.insert(0, "../")
    >>> from pygraph.utils.utils import kernel_train_test
    >>> from pygraph.kernels.treeletKernel import treeletkernel
    >>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'
    >>> kernel_file_path = 'kernelmatrices_path_acyclic/'
    >>> kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', labeled = True)
    >>> kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = True)
    """
    # setup the parameters
    model_type = 'regression'  # regression or classification problem
    print('\n --- This is a %s problem ---' % model_type)
    # corresponds to (2*C)^-1 in other linear models such as LogisticRegression
    alpha_grid = np.logspace(-10, 10, num=trials, base=10) if alpha_grid is None else alpha_grid
    C_grid = np.logspace(-10, 10, num=trials, base=10) if C_grid is None else C_grid
    if not os.path.exists(kernel_file_path):
        os.makedirs(kernel_file_path)

    train_means_list = []
    train_stds_list = []
    test_means_list = []
    test_stds_list = []
    kernel_time_list = []

    for hyper_para in hyper_range:
        print('' if hyper_name == '' else '\n\n #--- calculating kernel matrix when %s = %.1f ---#' % (hyper_name, hyper_para))
        print('\n Loading dataset from file...')
        dataset, y = loadDataset(datafile)
        y = np.array(y)

        # save kernel matrices to files / read kernel matrices from files
        kernel_file = kernel_file_path + 'km.ds'
        path = pathlib.Path(kernel_file)
        # get train set kernel matrix
        if path.is_file():
            print('\n Loading the kernel matrix from file...')
            Kmatrix = np.loadtxt(kernel_file)
            print(Kmatrix)
        else:
            print('\n Calculating kernel matrix, this could take a while...')
            if hyper_name != '':
                kernel_para[hyper_name] = hyper_para
            Kmatrix, run_time = kernel_func(dataset, **kernel_para)
            kernel_time_list.append(run_time)
            print(Kmatrix)
            print('\n Saving kernel matrix to file...')
            # np.savetxt(kernel_file, Kmatrix)

        """
        - Here starts the main program
        - First we permute the data, then for each split we evaluate the corresponding performances
        - In the end, the performances are averaged over the test sets
        """
        train_mean, train_std, test_mean, test_std = \
            split_train_test(Kmatrix, y, alpha_grid, C_grid, splits, trials, model_type, normalize=normalize)

        train_means_list.append(train_mean)
        train_stds_list.append(train_std)
        test_means_list.append(test_mean)
        test_stds_list.append(test_std)

    print('\n')
    table_dict = {'RMSE_test': test_means_list, 'std_test': test_stds_list,
                  'RMSE_train': train_means_list, 'std_train': train_stds_list, 'k_time': kernel_time_list}
    if hyper_name == '':
        keyorder = ['RMSE_test', 'std_test', 'RMSE_train', 'std_train', 'k_time']
    else:
        table_dict[hyper_name] = hyper_range
        keyorder = [hyper_name, 'RMSE_test', 'std_test', 'RMSE_train', 'std_train', 'k_time']
    print(tabulate(OrderedDict(sorted(table_dict.items(), key=lambda i: keyorder.index(i[0]))), headers='keys'))
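# A hedged sketch of scanning a kernel hyperparameter via hyper_name /
# hyper_range (the kernel module and parameter name below are assumptions,
# not confirmed by this file; any kernel_func accepting that keyword argument
# would do, and datafile / kernel_file_path are as in the Examples above):
#
# >>> from pygraph.kernels.marginalizedKernel import marginalizedkernel  # assumed import
# >>> kernel_para = dict(node_label='atom', edge_label='bond_type')
# >>> kernel_train_test(datafile, kernel_file_path, marginalizedkernel, kernel_para,
# ...                   hyper_name='p_quit', hyper_range=np.linspace(0.1, 0.9, 9))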
import random

from sklearn.kernel_ridge import KernelRidge  # 0.17
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn import svm


def split_train_test(Kmatrix, train_target, alpha_grid, C_grid, splits=10, trials=100, model_type='regression', normalize=False):
    """Split the dataset into training and testing splits, then train and test. Print out and return the results.

    Parameters
    ----------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the kernel between 2 graphs.
    train_target : ndarray
        Train target.
    alpha_grid : ndarray
        Penalty parameter in kernel ridge regression. Corresponds to (2*C)^-1 in other linear models such as LogisticRegression.
    C_grid : ndarray
        Penalty parameter C of the error term in kernel SVM.
    splits : integer
        Number of splits of the dataset, i.e. how many times the training and testing procedure is run. The final means and stds are averaged over the results of all the splits. The default is 10.
    trials : integer
        Number of trials for the hyperparameter random search. The final means and stds are the ones in the same trial with the best test mean. The default is 100.
    model_type : string
        Determines whether it is a regression or classification problem. The default is 'regression'.
    normalize : boolean
        Determines whether normalization is performed. The default is False.

    Return
    ------
    train_mean : float
        Mean of train accuracies in the same trial with the best test mean.
    train_std : float
        Mean of train stds in the same trial with the best test mean.
    test_mean : float
        Mean of the best tests.
    test_std : float
        Mean of test stds in the same trial with the best test mean.

    References
    ----------
    [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1
    """
    datasize = len(train_target)
    random.seed(20)  # set the seed for uniform parameter distribution

    # Initialize the performance of the best parameter trial on train with the corresponding performance on test
    train_split = []
    test_split = []

    # For each split of the data
    for j in range(10, 10 + splits):
        # Set the random seed for data permutation
        random_state = int(j)
        np.random.seed(random_state)
        idx_perm = np.random.permutation(datasize)

        # Permute the data
        y_perm = train_target[idx_perm]  # targets permutation
        Kmatrix_perm = Kmatrix[:, idx_perm]  # inputs permutation
        Kmatrix_perm = Kmatrix_perm[idx_perm, :]  # inputs permutation

        # Set the training and test sizes
        # Note: the percentage can be set up by the user
        num_train = int((datasize * 90) / 100)  # 90% (of entire dataset) for training
        num_test = datasize - num_train  # 10% (of entire dataset) for test

        # Split the kernel matrix
        Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]
        Kmatrix_test = Kmatrix_perm[num_train:datasize, 0:num_train]

        # Split the targets
        y_train = y_perm[0:num_train]

        # Normalization step (for real-valued targets only)
        if normalize == True and model_type == 'regression':
            y_train_mean = np.mean(y_train)
            y_train_std = np.std(y_train)
            y_train_norm = (y_train - y_train_mean) / float(y_train_std)

        y_test = y_perm[num_train:datasize]

        # Record the performance for each parameter trial respectively on the train and test set
        perf_all_train = []
        perf_all_test = []

        # For each parameter trial
        for i in range(trials):
            # For regression use the kernel ridge method
            if model_type == 'regression':
                # Fit the kernel ridge model
                KR = KernelRidge(kernel='precomputed', alpha=alpha_grid[i])
                # KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])
                KR.fit(Kmatrix_train, y_train if normalize == False else y_train_norm)

                # predict on the train and test set
                y_pred_train = KR.predict(Kmatrix_train)
                y_pred_test = KR.predict(Kmatrix_test)

                # adjust predictions: needed because the training targets have been normalized
                if normalize == True:
                    y_pred_train = y_pred_train * float(y_train_std) + y_train_mean
                    y_pred_test = y_pred_test * float(y_train_std) + y_train_mean

                # root mean squared error on the train set
                rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
                perf_all_train.append(rmse_train)
                # root mean squared error on the test set
                rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
                perf_all_test.append(rmse_test)

        # --- FIND THE OPTIMAL PARAMETERS --- #
        # For regression: minimise the mean squared error
        if model_type == 'regression':
            # get the optimal parameter on test (argmin mean squared error)
            min_idx = np.argmin(perf_all_test)
            alpha_opt = alpha_grid[min_idx]

            # corresponding performance on the train and test set for the same parameter
            perf_train_opt = perf_all_train[min_idx]
            perf_test_opt = perf_all_test[min_idx]

            # append the corresponding performance on the train and test set
            train_split.append(perf_train_opt)
            test_split.append(perf_test_opt)

    # average the results
    # mean of the train and test performances over the splits
    train_mean = np.mean(np.asarray(train_split))
    test_mean = np.mean(np.asarray(test_split))
    # std deviation of the train and test performances over the splits
    train_std = np.std(np.asarray(train_split))
    test_std = np.std(np.asarray(test_split))

    print('\n Mean performance on train set: %.3f' % train_mean)
    print('With standard deviation: %.3f' % train_std)
    print('\n Mean performance on test set: %.3f' % test_mean)
    print('With standard deviation: %.3f' % test_std)

    return train_mean, train_std, test_mean, test_std
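# A minimal, self-contained sketch of calling split_train_test directly (not
# part of the original file; the linear kernel on random features below is
# made up purely to produce a valid positive semi-definite kernel matrix):
#
# >>> X = np.random.rand(50, 4)        # 50 synthetic "graphs" as feature vectors
# >>> Kmatrix = X.dot(X.T)             # linear kernel, guaranteed PSD
# >>> y = np.random.rand(50)           # synthetic regression targets
# >>> alpha_grid = np.logspace(-10, 10, num=100, base=10)
# >>> C_grid = np.logspace(-10, 10, num=100, base=10)
# >>> split_train_test(Kmatrix, y, alpha_grid, C_grid, splits=10, trials=100)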

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.