
utils.py

import networkx as nx
import numpy as np
from tqdm import tqdm


def getSPLengths(G1):
    """Compute the matrix of pairwise shortest-path lengths of graph G1.

    Assumes the nodes of G1 are labeled 0 to n-1, since the result is a
    dense matrix indexed by node id.
    """
    sp = nx.shortest_path(G1)
    distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes()))
    for i in sp.keys():
        for j in sp[i].keys():
            # a path over k nodes spans k - 1 edges
            distances[i, j] = len(sp[i][j]) - 1
    return distances
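
# A minimal usage sketch for getSPLengths (an illustration, not part of the
# module API; it assumes nodes labeled 0..n-1 and a networkx version where
# nx.shortest_path(G) returns a dict of dicts of paths, as this module expects):
#
# >>> G = nx.path_graph(4)  # nodes 0-1-2-3
# >>> getSPLengths(G)
# array([[0., 1., 2., 3.],
#        [1., 0., 1., 2.],
#        [2., 1., 0., 1.],
#        [3., 2., 1., 0.]])
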
def getSPGraph(G, edge_weight='bond_type'):
    """Transform graph G to its corresponding shortest-paths graph.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default edge
        weight is bond_type.

    Returns
    -------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    Notes
    -----
    For an input graph G, its corresponding shortest-paths graph S contains
    the same set of nodes as G, while there exists an edge between all nodes
    in S which are connected by a walk in G. Every edge in S between two
    nodes is labeled by the shortest distance between these two nodes.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In: Data
        Mining, Fifth IEEE International Conference on, 2005 Nov 27, 8 pp.
        IEEE.
    """
    return floydTransformation(G, edge_weight=edge_weight)
def floydTransformation(G, edge_weight='bond_type'):
    """Transform graph G to its corresponding shortest-paths graph using the Floyd-Warshall algorithm.

    Parameters
    ----------
    G : NetworkX graph
        The graph to be transformed.
    edge_weight : string
        Edge attribute corresponding to the edge weight. The default edge
        weight is bond_type.

    Returns
    -------
    S : NetworkX graph
        The shortest-paths graph corresponding to G.

    References
    ----------
    [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In: Data
        Mining, Fifth IEEE International Conference on, 2005 Nov 27, 8 pp.
        IEEE.
    """
    spMatrix = nx.floyd_warshall_numpy(G, weight=edge_weight)
    S = nx.Graph()
    S.add_nodes_from(G.nodes(data=True))
    for i in range(0, G.number_of_nodes()):
        for j in range(i, G.number_of_nodes()):
            # only connect node pairs that are actually reachable in G
            if spMatrix[i, j] != np.inf:
                S.add_edge(i, j, cost=spMatrix[i, j])
    return S
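
# A minimal usage sketch for the shortest-paths transform (an illustration,
# not part of the module API; nx.path_graph edges carry no 'bond_type'
# attribute, in which case networkx treats every edge weight as 1):
#
# >>> G = nx.path_graph(3)  # nodes 0-1-2
# >>> S = getSPGraph(G)
# >>> S[0][2]['cost']  # nodes 0 and 2 are two hops apart in G
# 2.0
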
# def kernel_train_test(datafile, kernel_file_path, kernel_func, kernel_para,
#                       trials=100, splits=10, alpha_grid=None, C_grid=None,
#                       hyper_name='', hyper_range=[1], normalize=False,
#                       datafile_y='', model_type='regression'):
#     """Perform training and testing for a kernel method. Print out necessary
#     data during the process, then finally the results.
#
#     Parameters
#     ----------
#     datafile : string
#         Path of the dataset file.
#     kernel_file_path : string
#         Path of the directory in which to save results.
#     kernel_func : function
#         Kernel function to use in the process.
#     kernel_para : dictionary
#         Keyword arguments passed to kernel_func.
#     trials : integer
#         Number of trials for the hyperparameter random search, where the
#         hyperparameter stands for the penalty parameter for now. The default
#         is 100.
#     splits : integer
#         Number of splits of the dataset, i.e. how many times the training and
#         testing procedure is run. The final means and stds are averaged over
#         the results of all splits. The default is 10.
#     alpha_grid : ndarray
#         Penalty parameter in kernel ridge regression. Corresponds to
#         (2*C)^-1 in other linear models such as LogisticRegression.
#     C_grid : ndarray
#         Penalty parameter C of the error term in kernel SVM.
#     hyper_name : string
#         Name of the hyperparameter.
#     hyper_range : list
#         Range of the hyperparameter.
#     normalize : boolean
#         Determines whether normalization is performed. Only works when
#         model_type == 'regression'. The default is False.
#     model_type : string
#         Type of the problem, a regression or classification problem.
#
#     References
#     ----------
#     [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1
#
#     Examples
#     --------
#     >>> import sys
#     >>> sys.path.insert(0, "../")
#     >>> from pygraph.utils.utils import kernel_train_test
#     >>> from pygraph.kernels.treeletKernel import treeletkernel
#     >>> datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'
#     >>> kernel_file_path = 'kernelmatrices_path_acyclic/'
#     >>> kernel_para = dict(node_label='atom', edge_label='bond_type', labeled=True)
#     >>> kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize=True)
#     """
#     import os
#     import pathlib
#     from collections import OrderedDict
#     from tabulate import tabulate
#     from .graphfiles import loadDataset
#
#     # set up the parameters
#     model_type = model_type.lower()
#     if model_type != 'regression' and model_type != 'classification':
#         raise Exception('The model type is incorrect! Please choose from regression or classification.')
#     print('\n --- This is a %s problem ---' % model_type)
#
#     # corresponds to (2*C)^-1 in other linear models such as LogisticRegression
#     alpha_grid = np.logspace(-10, 10, num=trials, base=10) if alpha_grid is None else alpha_grid
#     C_grid = np.logspace(-10, 10, num=trials, base=10) if C_grid is None else C_grid
#
#     if not os.path.exists(kernel_file_path):
#         os.makedirs(kernel_file_path)
#
#     train_means_list = []
#     train_stds_list = []
#     test_means_list = []
#     test_stds_list = []
#     kernel_time_list = []
#
#     for hyper_para in hyper_range:
#         if hyper_name != '':
#             print('\n\n #--- calculating kernel matrix when', hyper_name, '=', hyper_para, '---#')
#         print('\n Loading dataset from file...')
#         dataset, y = loadDataset(datafile, filename_y=datafile_y)
#         y = np.array(y)
#
#         # normalize labels and transform non-numerical labels to numerical labels.
#         if model_type == 'classification':
#             from sklearn.preprocessing import LabelEncoder
#             y = LabelEncoder().fit_transform(y)
#             # print(y)
#
#         # save kernel matrices to files / read kernel matrices from files
#         kernel_file = kernel_file_path + 'km.ds'
#         path = pathlib.Path(kernel_file)
#
#         # get train set kernel matrix
#         if path.is_file():
#             print('\n Loading the kernel matrix from file...')
#             Kmatrix = np.loadtxt(kernel_file)
#             print(Kmatrix)
#         else:
#             print('\n Calculating kernel matrix, this could take a while...')
#             if hyper_name != '':
#                 kernel_para[hyper_name] = hyper_para
#             Kmatrix, run_time = kernel_func(dataset, **kernel_para)
#             kernel_time_list.append(run_time)
#             import matplotlib.pyplot as plt
#             plt.matshow(Kmatrix)
#             # print('\n Saving kernel matrix to file...')
#             # np.savetxt(kernel_file, Kmatrix)
#
#         """
#         - Here starts the main program
#         - First we permute the data, then for each split we evaluate the corresponding performances
#         - In the end, the performances are averaged over the test sets
#         """
#         train_mean, train_std, test_mean, test_std = \
#             split_train_test(Kmatrix, y, alpha_grid, C_grid, splits, trials, model_type, normalize=normalize)
#
#         train_means_list.append(train_mean)
#         train_stds_list.append(train_std)
#         test_means_list.append(test_mean)
#         test_stds_list.append(test_std)
#
#     print('\n')
#     if model_type == 'regression':
#         table_dict = {'rmse_test': test_means_list, 'std_test': test_stds_list,
#                       'rmse_train': train_means_list, 'std_train': train_stds_list, 'k_time': kernel_time_list}
#         if hyper_name == '':
#             keyorder = ['rmse_test', 'std_test', 'rmse_train', 'std_train', 'k_time']
#         else:
#             table_dict[hyper_name] = hyper_range
#             keyorder = [hyper_name, 'rmse_test', 'std_test', 'rmse_train', 'std_train', 'k_time']
#     elif model_type == 'classification':
#         table_dict = {'accur_test': test_means_list, 'std_test': test_stds_list,
#                       'accur_train': train_means_list, 'std_train': train_stds_list, 'k_time': kernel_time_list}
#         if hyper_name == '':
#             keyorder = ['accur_test', 'std_test', 'accur_train', 'std_train', 'k_time']
#         else:
#             table_dict[hyper_name] = hyper_range
#             keyorder = [hyper_name, 'accur_test', 'std_test', 'accur_train', 'std_train', 'k_time']
#     print(tabulate(OrderedDict(sorted(table_dict.items(), key=lambda i: keyorder.index(i[0]))), headers='keys'))
# def split_train_test(Kmatrix, train_target, alpha_grid, C_grid, splits=10,
#                      trials=100, model_type='regression', normalize=False):
#     """Split the dataset into training and testing splits, then train and test. Print out and return the results.
#
#     Parameters
#     ----------
#     Kmatrix : Numpy matrix
#         Kernel matrix, each element of which is the kernel between 2 graphs.
#     train_target : ndarray
#         Train target.
#     alpha_grid : ndarray
#         Penalty parameter in kernel ridge regression. Corresponds to
#         (2*C)^-1 in other linear models such as LogisticRegression.
#     C_grid : ndarray
#         Penalty parameter C of the error term in kernel SVM.
#     splits : integer
#         Number of splits of the dataset, i.e. how many times the training and
#         testing procedure is run. The final means and stds are averaged over
#         the results of all splits. The default is 10.
#     trials : integer
#         Number of trials for the hyperparameter random search. The final
#         means and stds are the ones in the same trial with the best test
#         mean. The default is 100.
#     model_type : string
#         Determines whether it is a regression or classification problem. The
#         default is 'regression'.
#     normalize : boolean
#         Determines whether normalization is performed. Only works when
#         model_type == 'regression'. The default is False.
#
#     Returns
#     -------
#     train_mean : float
#         Mean of train accuracies in the same trial with the best test mean.
#     train_std : float
#         Mean of train stds in the same trial with the best test mean.
#     test_mean : float
#         Mean of the best tests.
#     test_std : float
#         Mean of test stds in the same trial with the best test mean.
#
#     References
#     ----------
#     [1] Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py, 2018.1
#     """
#     import random
#     import sys
#     from sklearn.kernel_ridge import KernelRidge  # 0.17
#     from sklearn.metrics import accuracy_score, mean_squared_error
#     from sklearn import svm
#
#     datasize = len(train_target)
#     random.seed(20)  # set the seed for uniform parameter distribution
#
#     # initialize the performance of the best parameter trial on train with the corresponding performance on test
#     train_split = []
#     test_split = []
#
#     # for each split of the data
#     print('\n Starting to calculate accuracy/rmse...')
#     pbar = tqdm(total=splits * trials, desc='calculate performance', file=sys.stdout)
#     for j in range(10, 10 + splits):
#         # print('\n Starting split %d...' % j)
#
#         # set the random seed for data permutation
#         random_state = int(j)
#         np.random.seed(random_state)
#         idx_perm = np.random.permutation(datasize)
#
#         # permute the data
#         y_perm = train_target[idx_perm]  # targets permutation
#         Kmatrix_perm = Kmatrix[:, idx_perm]  # inputs permutation
#         Kmatrix_perm = Kmatrix_perm[idx_perm, :]  # inputs permutation
#
#         # set the training and test sizes
#         # note: the percentage can be set up by the user
#         num_train = int((datasize * 90) / 100)  # 90% (of entire dataset) for training
#         num_test = datasize - num_train  # 10% (of entire dataset) for test
#
#         # split the kernel matrix
#         Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]
#         Kmatrix_test = Kmatrix_perm[num_train:datasize, 0:num_train]
#
#         # split the targets
#         y_train = y_perm[0:num_train]
#
#         # normalization step (for real-valued targets only)
#         if normalize == True and model_type == 'regression':
#             y_train_mean = np.mean(y_train)
#             y_train_std = np.std(y_train)
#             y_train_norm = (y_train - y_train_mean) / float(y_train_std)
#
#         y_test = y_perm[num_train:datasize]
#
#         # record the performance for each parameter trial respectively on train and test set
#         perf_all_train = []
#         perf_all_test = []
#
#         # for each parameter trial
#         for i in range(trials):
#             # for regression use the kernel ridge method
#             if model_type == 'regression':
#                 # fit the kernel ridge model
#                 KR = KernelRidge(kernel='precomputed', alpha=alpha_grid[i])
#                 KR.fit(Kmatrix_train, y_train if normalize == False else y_train_norm)
#
#                 # predict on the train and test set
#                 y_pred_train = KR.predict(Kmatrix_train)
#                 y_pred_test = KR.predict(Kmatrix_test)
#
#                 # adjust the predictions: needed because the training targets have been normalized
#                 if normalize == True:
#                     y_pred_train = y_pred_train * float(y_train_std) + y_train_mean
#                     y_pred_test = y_pred_test * float(y_train_std) + y_train_mean
#
#                 # root mean squared error on train set
#                 accuracy_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
#                 perf_all_train.append(accuracy_train)
#
#                 # root mean squared error on test set
#                 accuracy_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
#                 perf_all_test.append(accuracy_test)
#
#             # for classification use SVM
#             elif model_type == 'classification':
#                 KR = svm.SVC(kernel='precomputed', C=C_grid[i])
#                 KR.fit(Kmatrix_train, y_train)
#
#                 # predict on the train and test set
#                 y_pred_train = KR.predict(Kmatrix_train)
#                 y_pred_test = KR.predict(Kmatrix_test)
#
#                 # accuracy on train set
#                 accuracy_train = accuracy_score(y_train, y_pred_train)
#                 perf_all_train.append(accuracy_train)
#
#                 # accuracy on test set
#                 accuracy_test = accuracy_score(y_test, y_pred_test)
#                 perf_all_test.append(accuracy_test)
#
#             pbar.update(1)
#
#         # --- FIND THE OPTIMAL PARAMETERS --- #
#         # for regression: minimise the root mean squared error
#         if model_type == 'regression':
#             # get the optimal parameter (argmin of the train RMSE)
#             min_idx = np.argmin(perf_all_train)
#             alpha_opt = alpha_grid[min_idx]
#
#             # corresponding performance on train and test set for the same parameter
#             perf_train_opt = perf_all_train[min_idx]
#             perf_test_opt = perf_all_test[min_idx]
#
#         # for classification: maximise the accuracy
#         if model_type == 'classification':
#             # get the optimal parameter (argmax of the train accuracy)
#             max_idx = np.argmax(perf_all_train)
#             C_opt = C_grid[max_idx]
#
#             # corresponding performance on train and test set for the same parameter
#             perf_train_opt = perf_all_train[max_idx]
#             perf_test_opt = perf_all_test[max_idx]
#
#         # append the corresponding performance on the train and test set
#         train_split.append(perf_train_opt)
#         test_split.append(perf_test_opt)
#
#     # average the results
#     # mean of the train and test performances over the splits
#     train_mean = np.mean(np.asarray(train_split))
#     test_mean = np.mean(np.asarray(test_split))
#     # std deviation of the train and test over the splits
#     train_std = np.std(np.asarray(train_split))
#     test_std = np.std(np.asarray(test_split))
#
#     print('\n Mean performance on train set: %3f' % train_mean)
#     print('With standard deviation: %3f' % train_std)
#     print('\n Mean performance on test set: %3f' % test_mean)
#     print('With standard deviation: %3f' % test_std)
#
#     return train_mean, train_std, test_mean, test_std
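#
# A minimal sketch of the precomputed-kernel protocol used above
# (illustrative only: a random PSD matrix stands in for a real graph kernel
# matrix, and the variable names below are hypothetical). Rows and columns
# are permuted together, the train block K[train][:, train] and the cross
# block K[test][:, train] are sliced out, and a precomputed-kernel model is
# fit on the train block:
#
# >>> from sklearn.kernel_ridge import KernelRidge
# >>> rng = np.random.RandomState(0)
# >>> X = rng.rand(20, 5)
# >>> K = X @ X.T                      # PSD stand-in for a graph kernel matrix
# >>> y = rng.rand(20)
# >>> idx = rng.permutation(20)
# >>> K_perm, y_perm = K[idx][:, idx], y[idx]
# >>> n_train = 18                     # 90% train split, as in split_train_test
# >>> K_train = K_perm[:n_train, :n_train]
# >>> K_test = K_perm[n_train:, :n_train]
# >>> model = KernelRidge(kernel='precomputed', alpha=1e-3)
# >>> y_pred = model.fit(K_train, y_perm[:n_train]).predict(K_test)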

A Python package for graph kernels, graph edit distances and the graph pre-image problem.