
utils.py

import networkx as nx
import numpy as np
from tqdm import tqdm


def getSPLengths(G1):
    """Compute the matrix of pairwise shortest-path lengths of graph G1.

    Assumes the nodes of G1 are labeled 0 to n-1, since the result is a
    dense matrix indexed by node id.
    """
    sp = nx.shortest_path(G1)
    distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
    for i in sp.keys():
        for j in sp[i].keys():
            # a path over k nodes spans k - 1 edges
            distances[i, j] = len(sp[i][j]) - 1
    return distances
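
# A minimal usage sketch for getSPLengths (an illustration, not part of the
# module API; it assumes nodes labeled 0..n-1 and a networkx version where
# nx.shortest_path(G) returns a dict of dicts of paths, as this module expects):
#
# >>> G = nx.path_graph(4)  # nodes 0-1-2-3
# >>> getSPLengths(G)
# array([[0., 1., 2., 3.],
#        [1., 0., 1., 2.],
#        [2., 1., 0., 1.],
#        [3., 2., 1., 0.]])
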
def getSPGraph(G, edge_weight='bond_type'):
    """Transform graph G to its corresponding shortest-paths graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default edge
        weight is bond_type.

    Returns
    -------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    Notes
    -----
    For an input graph G, its corresponding shortest-paths graph S contains
    the same set of nodes as G, while there exists an edge between all nodes
    in S which are connected by a walk in G. Every edge in S between two
    nodes is labeled by the shortest distance between these two nodes.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In: Data
        Mining, Fifth IEEE International Conference on, 2005 Nov 27, 8 pp.
        IEEE.
    """
    return floydTransformation(G, edge_weight=edge_weight)
def floydTransformation(G, edge_weight='bond_type'):
    """Transform graph G to its corresponding shortest-paths graph using the Floyd-Warshall algorithm.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default edge
        weight is bond_type.

    Returns
    -------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In: Data
        Mining, Fifth IEEE International Conference on, 2005 Nov 27, 8 pp.
        IEEE.
    """
    spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    for i in range(0, G.number_of_nodes()):
        for j in range(i, G.number_of_nodes()):
            # only connect node pairs that are actually reachable in G
            if spMatrix[i, j] != np.inf:
                S.add_edge(i, j, cost=spMatrix[i, j])
    return S
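
# A minimal usage sketch for the shortest-paths transform (an illustration,
# not part of the module API; nx.path_graph edges carry no 'bond_type'
# attribute, in which case networkx treats every edge weight as 1):
#
# >>> G = nx.path_graph(3)  # nodes 0-1-2
# >>> S = getSPGraph(G)
# >>> S[0][2]['cost']  # nodes 0 and 2 are two hops apart in G
# 2.0
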
# def kernel_train_test(datafile, kernel_file_path, kernel_func, kernel_para,
#                       trials=100, splits=10, alpha_grid=None, C_grid=None,
#                       hyper_name='', hyper_range=[1], normalize=False,
#                       datafile_y='', model_type='regression'):
#     """Perform training and testing for a kernel method. Print out necessary
#     data during the process, then finally the results.
#
#     Parameters
#     ----------
#     datafile : string
#         Path of the dataset file.
#     kernel_file_path : string
#         Path of the directory in which to save results.
#     kernel_func : function
#         Kernel function to use in the process.
#     kernel_para : dictionary
#         Keyword arguments passed to kernel_func.
#     trials : integer
#         Number of trials for the hyperparameter random search, where the
#         hyperparameter stands for the penalty parameter for now. The default
#         is 100.
#     splits : integer
#         Number of splits of the dataset, i.e. how many times the training and
#         testing procedure is run. The final means and stds are averaged over
#         the results of all splits. The default is 10.
#     alpha_grid : ndarray
#         Penalty parameter in kernel ridge regression. Corresponds to
#         (2*C)^-1 in other linear models such as LogisticRegression.
#     C_grid : ndarray
#         Penalty parameter C of the error term in kernel SVM.
#     hyper_name : string
#         Name of the hyperparameter.
#     hyper_range : list
#         Range of the hyperparameter.
#     normalize : boolean
#         Determines whether normalization is performed. Only works when
#         model_type == 'regression'. The default is False.
#     model_type : string
#         Type of the problem, a regression or classification problem.
#
#     References
#     ----------
#     [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1
#
#     Examples
#     --------
#     >>> import sys
#     >>> sys.path.insert(0, "../")
#     >>> from pygraph.utils.utils import kernel_train_test
#     >>> from pygraph.kernels.treeletKernel import treeletkernel
#     >>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'
#     >>> kernel_file_path = 'kernelmatrices_path_acyclic/'
#     >>> kernel_para = dict(node_label='atom', edge_label='bond_type', labeled=True)
#     >>> kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize=True)
#     """
#     import os
#     import pathlib
#     from collections import OrderedDict
#     from tabulate import tabulate
#     from .graphfiles import loadDataset
#
#     # set up the parameters
#     model_type = model_type.lower()
#     if model_type != 'regression' and model_type != 'classification':
#         raise Exception('The model type is incorrect! Please choose from regression or classification.')
#     print('\n --- This is a %s problem ---' % model_type)
#
#     # corresponds to (2*C)^-1 in other linear models such as LogisticRegression
#     alpha_grid = np.logspace(-10, 10, num=trials, base=10) if alpha_grid is None else alpha_grid
#     C_grid = np.logspace(-10, 10, num=trials, base=10) if C_grid is None else C_grid
#
#     if not os.path.exists(kernel_file_path):
#         os.makedirs(kernel_file_path)
#
#     train_means_list = []
#     train_stds_list = []
#     test_means_list = []
#     test_stds_list = []
#     kernel_time_list = []
#
#     for hyper_para in hyper_range:
#         if hyper_name != '':
#             print('\n\n #--- calculating kernel matrix when', hyper_name, '=', hyper_para, '---#')
#         print('\n Loading dataset from file...')
#         dataset, y = loadDataset(datafile, filename_y=datafile_y)
#         y = np.array(y)
#
#         # normalize labels and transform non-numerical labels to numerical labels.
#         if model_type == 'classification':
#             from sklearn.preprocessing import LabelEncoder
#             y = LabelEncoder().fit_transform(y)
#             # print(y)
#
#         # save kernel matrices to files / read kernel matrices from files
#         kernel_file = kernel_file_path + 'km.ds'
#         path = pathlib.Path(kernel_file)
#
#         # get train set kernel matrix
#         if path.is_file():
#             print('\n Loading the kernel matrix from file...')
#             Kmatrix = np.loadtxt(kernel_file)
#             print(Kmatrix)
#         else:
#             print('\n Calculating kernel matrix, this could take a while...')
#             if hyper_name != '':
#                 kernel_para[hyper_name] = hyper_para
#             Kmatrix, run_time = kernel_func(dataset, **kernel_para)
#             kernel_time_list.append(run_time)
#             import matplotlib.pyplot as plt
#             plt.matshow(Kmatrix)
#             # print('\n Saving kernel matrix to file...')
#             # np.savetxt(kernel_file, Kmatrix)
#
#         """
#         - Here starts the main program
#         - First we permute the data, then for each split we evaluate the corresponding performances
#         - In the end, the performances are averaged over the test sets
#         """
#         train_mean, train_std, test_mean, test_std = \
#             split_train_test(Kmatrix, y, alpha_grid, C_grid, splits, trials, model_type, normalize=normalize)
#
#         train_means_list.append(train_mean)
#         train_stds_list.append(train_std)
#         test_means_list.append(test_mean)
#         test_stds_list.append(test_std)
#
#     print('\n')
#     if model_type == 'regression':
#         table_dict = {'rmse_test': test_means_list, 'std_test': test_stds_list,
#                       'rmse_train': train_means_list, 'std_train': train_stds_list, 'k_time': kernel_time_list}
#         if hyper_name == '':
#             keyorder = ['rmse_test', 'std_test', 'rmse_train', 'std_train', 'k_time']
#         else:
#             table_dict[hyper_name] = hyper_range
#             keyorder = [hyper_name, 'rmse_test', 'std_test', 'rmse_train', 'std_train', 'k_time']
#     elif model_type == 'classification':
#         table_dict = {'accur_test': test_means_list, 'std_test': test_stds_list,
#                       'accur_train': train_means_list, 'std_train': train_stds_list, 'k_time': kernel_time_list}
#         if hyper_name == '':
#             keyorder = ['accur_test', 'std_test', 'accur_train', 'std_train', 'k_time']
#         else:
#             table_dict[hyper_name] = hyper_range
#             keyorder = [hyper_name, 'accur_test', 'std_test', 'accur_train', 'std_train', 'k_time']
#     print(tabulate(OrderedDict(sorted(table_dict.items(), key=lambda i: keyorder.index(i[0]))), headers='keys'))
# def split_train_test(Kmatrix, train_target, alpha_grid, C_grid, splits=10,
#                      trials=100, model_type='regression', normalize=False):
#     """Split the dataset into training and testing splits, then train and test. Print out and return the results.
#
#     Parameters
#     ----------
#     Kmatrix : Numpy matrix
#         Kernel matrix, each element of which is the kernel between 2 graphs.
#     train_target : ndarray
#         Train target.
#     alpha_grid : ndarray
#         Penalty parameter in kernel ridge regression. Corresponds to
#         (2*C)^-1 in other linear models such as LogisticRegression.
#     C_grid : ndarray
#         Penalty parameter C of the error term in kernel SVM.
#     splits : integer
#         Number of splits of the dataset, i.e. how many times the training and
#         testing procedure is run. The final means and stds are averaged over
#         the results of all splits. The default is 10.
#     trials : integer
#         Number of trials for the hyperparameter random search. The final
#         means and stds are the ones in the same trial with the best test
#         mean. The default is 100.
#     model_type : string
#         Determines whether it is a regression or classification problem. The
#         default is 'regression'.
#     normalize : boolean
#         Determines whether normalization is performed. Only works when
#         model_type == 'regression'. The default is False.
#
#     Returns
#     -------
#     train_mean : float
#         Mean of train accuracies in the same trial with the best test mean.
#     train_std : float
#         Mean of train stds in the same trial with the best test mean.
#     test_mean : float
#         Mean of the best tests.
#     test_std : float
#         Mean of test stds in the same trial with the best test mean.
#
#     References
#     ----------
#     [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1
#     """
#     import random
#     import sys
#     from sklearn.kernel_ridge import KernelRidge  # 0.17
#     from sklearn.metrics import accuracy_score, mean_squared_error
#     from sklearn import svm
#
#     datasize = len(train_target)
#     random.seed(20)  # set the seed for uniform parameter distribution
#
#     # initialize the performance of the best parameter trial on train with the corresponding performance on test
#     train_split = []
#     test_split = []
#
#     # for each split of the data
#     print('\n Starting to calculate accuracy/rmse...')
#     pbar = tqdm(total=splits * trials, desc='calculate performance', file=sys.stdout)
#     for j in range(10, 10 + splits):
#         # print('\n Starting split %d...' % j)
#
#         # set the random seed for data permutation
#         random_state = int(j)
#         np.random.seed(random_state)
#         idx_perm = np.random.permutation(datasize)
#
#         # permute the data
#         y_perm = train_target[idx_perm]  # targets permutation
#         Kmatrix_perm = Kmatrix[:, idx_perm]  # inputs permutation
#         Kmatrix_perm = Kmatrix_perm[idx_perm, :]  # inputs permutation
#
#         # set the training and test sizes
#         # note: the percentage can be set up by the user
#         num_train = int((datasize * 90) / 100)  # 90% (of entire dataset) for training
#         num_test = datasize - num_train  # 10% (of entire dataset) for test
#
#         # split the kernel matrix
#         Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]
#         Kmatrix_test = Kmatrix_perm[num_train:datasize, 0:num_train]
#
#         # split the targets
#         y_train = y_perm[0:num_train]
#
#         # normalization step (for real-valued targets only)
#         if normalize == True and model_type == 'regression':
#             y_train_mean = np.mean(y_train)
#             y_train_std = np.std(y_train)
#             y_train_norm = (y_train - y_train_mean) / float(y_train_std)
#
#         y_test = y_perm[num_train:datasize]
#
#         # record the performance for each parameter trial respectively on train and test set
#         perf_all_train = []
#         perf_all_test = []
#
#         # for each parameter trial
#         for i in range(trials):
#             # for regression use the kernel ridge method
#             if model_type == 'regression':
#                 # fit the kernel ridge model
#                 KR = KernelRidge(kernel='precomputed', alpha=alpha_grid[i])
#                 KR.fit(Kmatrix_train, y_train if normalize == False else y_train_norm)
#
#                 # predict on the train and test set
#                 y_pred_train = KR.predict(Kmatrix_train)
#                 y_pred_test = KR.predict(Kmatrix_test)
#
#                 # adjust the predictions: needed because the training targets have been normalized
#                 if normalize == True:
#                     y_pred_train = y_pred_train * float(y_train_std) + y_train_mean
#                     y_pred_test = y_pred_test * float(y_train_std) + y_train_mean
#
#                 # root mean squared error on train set
#                 accuracy_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
#                 perf_all_train.append(accuracy_train)
#
#                 # root mean squared error on test set
#                 accuracy_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
#                 perf_all_test.append(accuracy_test)
#
#             # for classification use SVM
#             elif model_type == 'classification':
#                 KR = svm.SVC(kernel='precomputed', C=C_grid[i])
#                 KR.fit(Kmatrix_train, y_train)
#
#                 # predict on the train and test set
#                 y_pred_train = KR.predict(Kmatrix_train)
#                 y_pred_test = KR.predict(Kmatrix_test)
#
#                 # accuracy on train set
#                 accuracy_train = accuracy_score(y_train, y_pred_train)
#                 perf_all_train.append(accuracy_train)
#
#                 # accuracy on test set
#                 accuracy_test = accuracy_score(y_test, y_pred_test)
#                 perf_all_test.append(accuracy_test)
#
#             pbar.update(1)
#
#         # --- FIND THE OPTIMAL PARAMETERS --- #
#         # for regression: minimise the root mean squared error
#         if model_type == 'regression':
#             # get the optimal parameter (argmin of the train RMSE)
#             min_idx = np.argmin(perf_all_train)
#             alpha_opt = alpha_grid[min_idx]
#
#             # corresponding performance on train and test set for the same parameter
#             perf_train_opt = perf_all_train[min_idx]
#             perf_test_opt = perf_all_test[min_idx]
#
#         # for classification: maximise the accuracy
#         if model_type == 'classification':
#             # get the optimal parameter (argmax of the train accuracy)
#             max_idx = np.argmax(perf_all_train)
#             C_opt = C_grid[max_idx]
#
#             # corresponding performance on train and test set for the same parameter
#             perf_train_opt = perf_all_train[max_idx]
#             perf_test_opt = perf_all_test[max_idx]
#
#         # append the corresponding performance on the train and test set
#         train_split.append(perf_train_opt)
#         test_split.append(perf_test_opt)
#
#     # average the results
#     # mean of the train and test performances over the splits
#     train_mean = np.mean(np.asarray(train_split))
#     test_mean = np.mean(np.asarray(test_split))
#     # std deviation of the train and test over the splits
#     train_std = np.std(np.asarray(train_split))
#     test_std = np.std(np.asarray(test_split))
#
#     print('\n Mean performance on train set: %3f' % train_mean)
#     print('With standard deviation: %3f' % train_std)
#     print('\n Mean performance on test set: %3f' % test_mean)
#     print('With standard deviation: %3f' % test_std)
#
#     return train_mean, train_std, test_mean, test_std
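#
# A minimal sketch of the precomputed-kernel protocol used above
# (illustrative only: a random PSD matrix stands in for a real graph kernel
# matrix, and the variable names below are hypothetical). Rows and columns
# are permuted together, the train block K[train][:, train] and the cross
# block K[test][:, train] are sliced out, and a precomputed-kernel model is
# fit on the train block:
#
# >>> from sklearn.kernel_ridge import KernelRidge
# >>> rng = np.random.RandomState(0)
# >>> X = rng.rand(20, 5)
# >>> K = X @ X.T                      # PSD stand-in for a graph kernel matrix
# >>> y = rng.rand(20)
# >>> idx = rng.permutation(20)
# >>> K_perm, y_perm = K[idx][:, idx], y[idx]
# >>> n_train = 18                     # 90% train split, as in split_train_test
# >>> K_train = K_perm[:n_train, :n_train]
# >>> K_test = K_perm[n_train:, :n_train]
# >>> model = KernelRidge(kernel='precomputed', alpha=1e-3)
# >>> y_pred = model.fit(K_train, y_perm[:n_train]).predict(K_test)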

A Python package for graph kernels, graph edit distances and the graph pre-image problem.