import networkx as nx
import numpy as np
from tqdm import tqdm


def getSPLengths(G1):
    """Return the matrix of shortest-path lengths between all node pairs.

    Parameters
    ----------
    G1 : NetworkX graph
        Input graph. Nodes are used directly as matrix indices, so they
        are assumed to be integers 0 .. n-1 -- TODO confirm against callers.

    Return
    ------
    distances : ndarray
        distances[i, j] is the number of edges on a shortest path from i
        to j; entries stay 0 when no path exists (or when i == j).
    """
    sp = nx.shortest_path(G1)
    n = G1.number_of_nodes()
    distances = np.zeros((n, n))
    for i in sp.keys():
        for j in sp[i].keys():
            # A path listed as k nodes has k - 1 edges.
            distances[i, j] = len(sp[i][j]) - 1
    return distances


def getSPGraph(G, edge_weight='bond_type'):
    """Transform graph G to its corresponding shortest-paths graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default
        edge weight is bond_type.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    Notes
    ------
    For an input graph G, its corresponding shortest-paths graph S
    contains the same set of nodes as G, while there exists an edge
    between all nodes in S which are connected by a walk in G. Every
    edge in S between two nodes is labeled by the shortest distance
    between these two nodes.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs.
    InData Mining, Fifth IEEE International Conference on 2005 Nov 27
    (pp. 8-pp). IEEE.
    """
    return floydTransformation(G, edge_weight=edge_weight)


def floydTransformation(G, edge_weight='bond_type'):
    """Transform graph G to its shortest-paths graph using the Floyd transformation.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default
        edge weight is bond_type.

    Return
    ------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs.
    InData Mining, Fifth IEEE International Conference on 2005 Nov 27
    (pp. 8-pp). IEEE.
    """
    spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    # NOTE(review): this intentionally keeps the original behavior of
    # adding self-loop edges (cost 0) and edges with infinite cost for
    # disconnected pairs -- confirm downstream kernels expect this.
    for i in range(0, G.number_of_nodes()):
        for j in range(i, G.number_of_nodes()):
            S.add_edge(i, j, cost=spMatrix[i, j])
    return S


def kernel_train_test(datafile, kernel_file_path, kernel_func, kernel_para,
                      trials=100, splits=10, alpha_grid=None, C_grid=None,
                      hyper_name='', hyper_range=[1], normalize=False,
                      datafile_y='', model_type='regression'):
    """Perform training and testing for a kernel method.

    Print out necessary data during the process then finally the results.

    Parameters
    ----------
    datafile : string
        Path of dataset file.
    kernel_file_path : string
        Path of the directory to save results.
    kernel_func : function
        Kernel function to use in the process.
    kernel_para : dictionary
        Keyword arguments passed to kernel_func.
    trials : integer
        Number of trials for hyperparameter random search, where
        hyperparameter stands for penalty parameter for now. The default
        is 100.
    splits : integer
        Number of splits of dataset. Times of training and testing
        procedure processed. The final means and stds are the average of
        the results of all the splits. The default is 10.
    alpha_grid : ndarray
        Penalty parameter in kernel ridge regression. Corresponds to
        (2*C)^-1 in other linear models such as LogisticRegression.
    C_grid : ndarray
        Penalty parameter C of the error term in kernel SVM.
    hyper_name : string
        Name of the hyperparameter.
    hyper_range : list
        Range of the hyperparameter.
    normalize : string
        Determine whether or not that normalization is performed. Only
        works when model_type == 'regression'. The default is False.
    model_type : string
        Type of the problem, regression or classification problem.

    References
    ----------
    [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1

    Examples
    --------
    >>> import sys
    >>> sys.path.insert(0, "../")
    >>> from pygraph.utils.utils import kernel_train_test
    >>> from pygraph.kernels.treeletKernel import treeletkernel
    >>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'
    >>> kernel_file_path = 'kernelmatrices_path_acyclic/'
    >>> kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', labeled = True)
    >>> kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = True)
    """
    import os
    import pathlib
    from collections import OrderedDict
    from tabulate import tabulate
    from .graphfiles import loadDataset

    # setup the parameters
    model_type = model_type.lower()
    if model_type != 'regression' and model_type != 'classification':
        raise Exception(
            'The model type is incorrect! Please choose from regression or classification.')
    print('\n --- This is a %s problem ---' % model_type)

    # BUGFIX: use `is None` -- comparing an ndarray to None with `==`
    # raises "truth value of an array is ambiguous" when a grid is passed.
    if alpha_grid is None:
        # corresponds to (2*C)^-1 in other linear models such as LogisticRegression
        alpha_grid = np.logspace(-10, 10, num=trials, base=10)
    if C_grid is None:
        C_grid = np.logspace(-10, 10, num=trials, base=10)

    if not os.path.exists(kernel_file_path):
        os.makedirs(kernel_file_path)

    train_means_list = []
    train_stds_list = []
    test_means_list = []
    test_stds_list = []
    kernel_time_list = []

    for hyper_para in hyper_range:
        # BUGFIX: the original conditional only replaced the first print
        # argument, so junk like " = 1 ---#" was printed even when no
        # hyperparameter name was given.
        if hyper_name != '':
            print('\n\n #--- calculating kernel matrix when', hyper_name,
                  '=', hyper_para, '---#')

        print('\n Loading dataset from file...')
        dataset, y = loadDataset(datafile, filename_y=datafile_y)
        y = np.array(y)

        # normalize labels and transform non-numerical labels to numerical labels.
        if model_type == 'classification':
            from sklearn.preprocessing import LabelEncoder
            y = LabelEncoder().fit_transform(y)
            # print(y)

        # save kernel matrices to files / read kernel matrices from files
        kernel_file = kernel_file_path + 'km.ds'
        path = pathlib.Path(kernel_file)
        # get train set kernel matrix
        if path.is_file():
            print('\n Loading the kernel matrix from file...')
            Kmatrix = np.loadtxt(kernel_file)
            print(Kmatrix)
            # NOTE(review): no run time is recorded on this branch, so
            # kernel_time_list may end up shorter than the other result
            # lists used to build the summary table -- confirm intended.
        else:
            print('\n Calculating kernel matrix, this could take a while...')
            if hyper_name != '':
                kernel_para[hyper_name] = hyper_para
            Kmatrix, run_time = kernel_func(dataset, **kernel_para)
            kernel_time_list.append(run_time)
            print(Kmatrix)
            # print('\n Saving kernel matrix to file...')
            # np.savetxt(kernel_file, Kmatrix)

        """
        - Here starts the main program
        - First we permute the data, then for each split we evaluate corresponding performances
        - In the end, the performances are averaged over the test sets
        """
        train_mean, train_std, test_mean, test_std = \
            split_train_test(Kmatrix, y, alpha_grid, C_grid, splits, trials,
                             model_type, normalize=normalize)

        train_means_list.append(train_mean)
        train_stds_list.append(train_std)
        test_means_list.append(test_mean)
        test_stds_list.append(test_std)

    print('\n')
    if model_type == 'regression':
        table_dict = {'rmse_test': test_means_list, 'std_test': test_stds_list,
                      'rmse_train': train_means_list, 'std_train': train_stds_list,
                      'k_time': kernel_time_list}
        if hyper_name == '':
            keyorder = ['rmse_test', 'std_test', 'rmse_train', 'std_train', 'k_time']
        else:
            table_dict[hyper_name] = hyper_range
            keyorder = [hyper_name, 'rmse_test', 'std_test', 'rmse_train',
                        'std_train', 'k_time']
    elif model_type == 'classification':
        table_dict = {'accur_test': test_means_list, 'std_test': test_stds_list,
                      'accur_train': train_means_list, 'std_train': train_stds_list,
                      'k_time': kernel_time_list}
        if hyper_name == '':
            keyorder = ['accur_test', 'std_test', 'accur_train', 'std_train', 'k_time']
        else:
            table_dict[hyper_name] = hyper_range
            keyorder = [hyper_name, 'accur_test', 'std_test', 'accur_train',
                        'std_train', 'k_time']
    print(tabulate(OrderedDict(sorted(table_dict.items(),
                                      key=lambda i: keyorder.index(i[0]))),
                   headers='keys'))


def split_train_test(Kmatrix, train_target, alpha_grid, C_grid, splits=10,
                     trials=100, model_type='regression', normalize=False):
    """Split dataset to training and testing splits, train and test.

    Print out and return the results.

    Parameters
    ----------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the kernel between 2 graphs.
    train_target : ndarray
        train target.
    alpha_grid : ndarray
        Penalty parameter in kernel ridge regression. Corresponds to
        (2*C)^-1 in other linear models such as LogisticRegression.
    C_grid : ndarray
        Penalty parameter C of the error term in kernel SVM.
    splits : integer
        Number of splits of dataset. Times of training and testing
        procedure processed. The final means and stds are the average of
        the results of all the splits. The default is 10.
    trials : integer
        Number of trials for hyperparameters random search. The final
        means and stds are the ones in the same trial with the best test
        mean. The default is 100.
    model_type : string
        Determine whether it is a regression or classification problem.
        The default is 'regression'.
    normalize : string
        Determine whether or not that normalization is performed. Only
        works when model_type == 'regression'. The default is False.

    Return
    ------
    train_mean : float
        mean of train accuracies in the same trial with the best test mean.
    train_std : float
        mean of train stds in the same trial with the best test mean.
    test_mean : float
        mean of the best tests.
    test_std : float
        mean of test stds in the same trial with the best test mean.

    References
    ----------
    [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1
    """
    import sys
    from sklearn.kernel_ridge import KernelRidge  # 0.17
    from sklearn.metrics import accuracy_score, mean_squared_error
    from sklearn import svm

    datasize = len(train_target)
    # Initialize the performance of the best parameter trial on train
    # with the corresponding performance on test
    train_split = []
    test_split = []

    # For each split of the data
    print('\n Starting calculate accuracy/rmse...')
    pbar = tqdm(total=splits * trials, desc='calculate performance',
                file=sys.stdout)
    for j in range(10, 10 + splits):
        # print('\n Starting split %d...' % j)

        # Set the random set for data permutation
        random_state = int(j)
        np.random.seed(random_state)
        idx_perm = np.random.permutation(datasize)

        # Permute the data
        y_perm = train_target[idx_perm]  # targets permutation
        Kmatrix_perm = Kmatrix[:, idx_perm]  # inputs permutation
        Kmatrix_perm = Kmatrix_perm[idx_perm, :]  # inputs permutation

        # Set the training, test
        # Note: the percentage can be set up by the user
        num_train = int((datasize * 90) / 100)  # 90% (of entire dataset) for training
        num_test = datasize - num_train  # 10% (of entire dataset) for test

        # Split the kernel matrix
        Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]
        Kmatrix_test = Kmatrix_perm[num_train:datasize, 0:num_train]

        # Split the targets
        y_train = y_perm[0:num_train]

        # Normalization step (for real valued targets only)
        if normalize == True and model_type == 'regression':
            y_train_mean = np.mean(y_train)
            y_train_std = np.std(y_train)
            y_train_norm = (y_train - y_train_mean) / float(y_train_std)

        y_test = y_perm[num_train:datasize]

        # Record the performance for each parameter trial respectively
        # on train and test set
        perf_all_train = []
        perf_all_test = []

        # For each parameter trial
        for i in range(trials):
            # For regression use the Kernel Ridge method
            if model_type == 'regression':
                # Fit the kernel ridge model
                KR = KernelRidge(kernel='precomputed', alpha=alpha_grid[i])
                KR.fit(Kmatrix_train,
                       y_train if normalize == False else y_train_norm)

                # predict on the train and test set
                y_pred_train = KR.predict(Kmatrix_train)
                y_pred_test = KR.predict(Kmatrix_test)

                # adjust prediction: needed because the training targets
                # have been normalized
                if normalize == True:
                    y_pred_train = y_pred_train * float(y_train_std) + y_train_mean
                    y_pred_test = y_pred_test * float(y_train_std) + y_train_mean

                # root mean squared error on train set
                accuracy_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
                perf_all_train.append(accuracy_train)
                # root mean squared error on test set
                accuracy_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
                perf_all_test.append(accuracy_test)

            # For classification use SVM
            elif model_type == 'classification':
                KR = svm.SVC(kernel='precomputed', C=C_grid[i])
                KR.fit(Kmatrix_train, y_train)

                # predict on the train and test set
                y_pred_train = KR.predict(Kmatrix_train)
                y_pred_test = KR.predict(Kmatrix_test)

                # accuracy on train set
                accuracy_train = accuracy_score(y_train, y_pred_train)
                perf_all_train.append(accuracy_train)
                # accuracy on test set
                accuracy_test = accuracy_score(y_test, y_pred_test)
                perf_all_test.append(accuracy_test)

            pbar.update(1)

        # --- FIND THE OPTIMAL PARAMETERS --- #
        # For regression: minimise the mean squared error
        if model_type == 'regression':
            # get optimal parameter on test (argmin mean squared error)
            min_idx = np.argmin(perf_all_test)
            alpha_opt = alpha_grid[min_idx]

            # corresponding performance on train and test set for the same parameter
            perf_train_opt = perf_all_train[min_idx]
            perf_test_opt = perf_all_test[min_idx]

        # For classification: maximise the accuracy
        if model_type == 'classification':
            # get optimal parameter on test (argmax accuracy)
            max_idx = np.argmax(perf_all_test)
            C_opt = C_grid[max_idx]

            # corresponding performance on train and test set for the same parameter
            perf_train_opt = perf_all_train[max_idx]
            perf_test_opt = perf_all_test[max_idx]

        # append the corresponding performance on the train and test set
        train_split.append(perf_train_opt)
        test_split.append(perf_test_opt)

    # BUGFIX: close the progress bar so its output buffer is flushed.
    pbar.close()

    # average the results
    # mean of the train and test performances over the splits
    train_mean = np.mean(np.asarray(train_split))
    test_mean = np.mean(np.asarray(test_split))
    # std deviation of the train and test over the splits
    train_std = np.std(np.asarray(train_split))
    test_std = np.std(np.asarray(test_split))

    print('\n Mean performance on train set: %.3f' % train_mean)
    print('With standard deviation: %.3f' % train_std)
    print('\n Mean performance on test set: %.3f' % test_mean)
    print('With standard deviation: %.3f' % test_std)

    return train_mean, train_std, test_mean, test_std