{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Author: Elisabetta Ghisu\n", "\n", "\"\"\"\n", "- Script containing functions for computing the shortest path kernel\n", "- The Floyd Warshall algorithm is first implemented\n", "- Then the SP is calculated\n", "\"\"\"\n", "\n", "\n", "#######################\n", "# - IMPORT PACKAGES - #\n", "#######################\n", "\n", "\n", "\n", "import numpy.matlib as matlib\n", "import numpy as np\n", "\n", "\"\"\"\n", "### FLOYD WARSHALL ALGORITHM\n", "Input:\n", "- Adjancency matrix A\n", "Output:\n", "- Shortest path matrix S\n", "\"\"\"\n", "\n", "def floyd_warshall(A):\n", "\n", "\t# nuber of nodes\n", "\tn = A.shape[0]\n", "\n", "\t# initialize shortes path matrix\n", "\tS = np.zeros(shape = (n,n))\n", "\n", "\tfor i in range(n):\n", "\t\tfor j in range(n):\n", "\t\t\tif A[i,j] == 0 and i!=j:\n", "\t\t\t\tS[i,j] = float(\"inf\")\n", "\t\t\telse:\n", "\t\t\t\tS[i,j] = A[i,j]\n", "\n", "\t# Compute the shortest path matrix\n", "\tfor k in range(n):\n", "\t\tfor i in range(n):\n", "\t\t\tfor j in range(n):\n", "\t\t\t\tif S[i,j] > S[i,k] + S[k,j]:\n", "\t\t\t\t\tS[i,j] = S[i,k] + S[k,j]\n", "\n", "\treturn S\t\t\t\t\t\t\t\t\n", "\n", "\n", "\n", "\"\"\"\n", "SHORTEST PATH KERNEL: This is a fast implementation of the shortest path\n", "kernel algorithm\n", "Inputs\n", "- Adjancency matrix\n", "- List of list of node labels for each graph\n", "- Total number of node labels \n", "Outputs\n", "- Kernel matrix\n", "- Feature matrix\n", "\"\"\"\n", "\n", "def sp_kernel_fast(adj_mat, labels, L):\n", "\n", "\t# Number of graphs\n", "\tn = len(adj_mat)\n", "\tL = int(L)\n", "\tS = []\n", "\n", "\t# shortest path matrices\n", "\tfor i in xrange(n):\n", "\t\tif i%1000 == 0 and i !=0:\n", " \t\t\tprint('haha') #( \"%d\" % i)\n", "\t\tS.append(floyd_warshall(adj_mat[i]))\n", "\t\n", "\t# maximum length of shortest paths in the dataset\n", "\tmax_path = 0\n", "\n", "\t# for each graph in dataset\n", "\tfor i in xrange(n):\n", "\n", "\t\tS_cur = np.copy(S[i])\n", "\t\tS_cur[S_cur == np.inf] = 0\n", "\t\tnew_max = np.max(S_cur)\n", "\t\t\n", "\t\tif new_max > max_path:\n", "\t\t\tmax_path = new_max # get max short path in all Ss\n", "\n", "\t# maximum length of shortest paths\n", "\tmax_path = int(max_path)\n", "\n", "\t# initialize feature matrix\n", "\tsp = np.zeros(((max_path + 1) * L * (L+1) /2,n))\n", "\n", "\t# compute feature map for shortest path\n", "\tfor i in xrange(n):\n", "\n", "\t\tif i % 1000 == 0:\n", "\t\t\tprint('haha') #\"Processed %d graphs\" %i\n", "\n", "\t\tS_graph = S[i]\n", "\t\tlabels_graph = np.asarray(labels[i].reshape((len(labels[i]),1)))\n", "\t\tlabels_graph = labels_graph + 1\n", "\t\t\n", "\t\tlabels_aux = matlib.repmat(labels_graph, 1, len(labels_graph))\n", "\t\t\n", "\t\tmin_lab = np.minimum(labels_aux, labels_aux.T)\n", "\t\t\n", "\t\tmax_lab = np.maximum(labels_aux, labels_aux.T)\n", "\t\tsub_path = np.triu(~(np.isinf(S_graph))).T\n", "\n", "\t\tmin_lab = min_lab[sub_path]\n", "\t\tmax_lab = max_lab[sub_path]\n", "\n", "\n", "\t\tind = S_graph[sub_path] * L * (L + 1) / 2 + (min_lab - 1) * (2*L + 2 - min_lab) / 2 + max_lab - min_lab\n", "\t\tind = ind.astype(int)\n", "\t\taccum = np.zeros((max_path + 1) * L * (L + 1) /2)\n", "\t\taccum[:ind.max() + 1] += np.bincount(ind.astype(int))\n", "\t\tsp[ind,i] = accum[ind]\n", "\t\n", "\tsum_cols = np.sum(sp, axis = 1)\n", 
"\tind_true = sum_cols != 0\n", "\tsp = sp[ind_true,:]\n", "\t\n", "\t# compute kernel matrix\n", "\tK = np.dot(sp.T,sp)\n", " \n", "\treturn K, sp" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "ename": "ImportError", "evalue": "No module named 'igraph'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;31m# iGraph imports to handle graphs and for graph I/O\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0migraph\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mGraph\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mImportError\u001b[0m: No module named 'igraph'" ] } ], "source": [ "#Authors: Elisabetta Ghisu, Felipe Llinares Lopez\n", "\n", "\"\"\"\n", "- This script includes a list of functions for analyzing \n", "parsing and formatting graphs\n", "- The graphs are given in graphml format\n", "- It also cntans functions for loading, processing the graphs\n", "and extract graph statistics\n", "\"\"\"\n", "\n", "\n", "import numpy as np\n", "from numpy import genfromtxt\n", "\n", "# iGraph imports to handle graphs and for graph I/O\n", "from igraph import Graph\n", "\n", "\n", "# ---------------------------------GRAPHML I/O FUNCTIONS------------------------------------ #\n", "\n", "# INPUT:\n", "# filenames_graphs: list of GraphML files, where each file contains one graph in the dataset\n", "# filename_labels: text file with labels corresponding to each graph in the dataset, in the same order as they are in\n", "# filename_graphs\n", "# OUTPUT:\n", "# G: A list containing one iGraph object for each graph in the dataset\n", "# Y: A Numpy array containing the labels corresponding to each graph, in the same order as G\n", "def load_graphml(filenames_graphs, filename_labels):\n", " G = []\n", " for fname in filenames_graphs:\n", " G.append(Graph.Read_GraphML(fname))\n", " Y = genfromtxt(filename_labels)\n", " return (G, Y)\n", "\n", "\n", "# Loads a list of paths to GraphML files from filename_list\n", "def load_file_list(filename_flist):\n", " f = open(filename_flist, 'r')\n", " f_graphs = []\n", " for line in f:\n", " f_graphs.append(line.strip())\n", " f.close()\n", " return f_graphs\n", "\n", "\n", "# --------------------------------COMPUTE STATISTICS---------------------------------------- #\n", "\n", "\n", "# Retrieve labels of all vertices belonging to any graph in the list of iGraph objects G and\n", "# returns the entire list, and a list with the alphabet of the vertex labels\n", "def get_all_vertex_labels(G, att_name='label'):\n", " v_l = []\n", " for g in G:\n", " v_l += g.vs[att_name]\n", " return (v_l, np.unique(v_l))\n", "\n", "\n", "# Retrieve labels of all edges belonging to any graph in the list of iGraph objects G and\n", "# returns the entire list, and a list with the alphabet of the edge labels\n", "def get_all_edge_labels(G, att_name='label'):\n", " e_l = []\n", " for g in G:\n", " e_l += g.es[att_name]\n", " return (e_l, np.unique(e_l))\n", "\n", "\n", "# Returns a list where each element is itself the adjacency list of the 
 "# The adjacency list of a graph has the following format:\n",
 "# it is a list where each element is a list containing the ids of the adjacent nodes\n",
 "def get_adj_list(G):\n",
 "    ad_l = []\n",
 "    for g in G:\n",
 "        ad_l.append(g.get_adjlist())\n",
 "    return ad_l\n",
 "\n",
 "# Returns a list where each element is the adjacency matrix of the corresponding graph\n",
 "# The adjacency matrix is in iGraph format\n",
 "def get_adj_mat(G):\n",
 "    ad_m = []\n",
 "    for g in G:\n",
 "        ad_m.append(g.get_adjacency())\n",
 "    return ad_m\n",
 "\n",
 "# Returns a list where each element contains the node labels of a graph\n",
 "def get_node_labels(G, att_name = 'label'):\n",
 "    node_l = []\n",
 "    for g in G:\n",
 "        node_l.append(g.vs[att_name])\n",
 "    return node_l\n",
 "\n",
 "\n",
 "\n",
 "# ----------------- LOAD AND PROCESS THE GRAPHS --------------- #\n",
 "\n",
 "\n",
 "\"\"\"\n",
 "Inputs:\n",
 "- list of graphs file\n",
 "- labels file\n",
 "- path to the data folder\n",
 "Outputs:\n",
 "- List of node labels\n",
 "- List of adjacency lists\n",
 "- List of graphs in graphml format\n",
 "- Targets\n",
 "\"\"\"\n",
 "\n",
 "\n",
 "def load_and_process(filenames_graphs, filename_labels, path_to_dataset):\n",
 "\n",
 "    # load a list of names of graphml files\n",
 "    f_graphs = load_file_list(filenames_graphs)\n",
 "    # sample size\n",
 "    n = len(f_graphs)\n",
 "\n",
 "    # create a list of paths to the files\n",
 "    f_graphs_path = []\n",
 "\n",
 "    # for each graph in dataset\n",
 "    for i in range(n):\n",
 "\n",
 "        # index the graph\n",
 "        graph_name = f_graphs[i]\n",
 "\n",
 "        # path to the data folder\n",
 "        path = \"%s/%s\" % (path_to_dataset, graph_name)\n",
 "        f_graphs_path.append(path)\n",
 "\n",
 "    # If the dataset is DD, we have to delete one element (corrupted file)\n",
 "    if graph_name == \"DD\":\n",
 "        del f_graphs_path[148]\n",
 "        n = n-1\n",
 "\n",
 "    # Load the graphs in graphml format\n",
 "    # G is a list of graphml graphs\n",
 "    # Y is an array of targets\n",
 "    G,Y = load_graphml(f_graphs_path, filename_labels)\n",
 "\n",
 "    # Delete corrupted file in DD\n",
 "    if graph_name == \"DD\": \n",
 "        Y = np.delete(Y, 148)\n",
 "\n",
 "    # get adjacency list and matrix for all the graphs in G\n",
 "    ad_list = get_adj_list(G)\n",
 "    ad_mat = get_adj_mat(G)\n",
 "\n",
 "    # get a list containing lists of node labels\n",
 "    node_label = get_node_labels(G)\n",
 "\n",
 "    return node_label, ad_list, G, Y\n",
 "\n",
 "\n",
 "\n",
 "\"\"\"\n",
 "RENAME NODES: function to rename the node labels as consecutive integers 0,...,L-1\n",
 "Input\n",
 "- list of lists of node labels, one per graph\n",
 "Output\n",
 "- L: total number of different labels in the dataset\n",
 "- labels: the renamed labels\n",
 "\"\"\"\n",
 "\n",
 "def rename_nodes(node_label): \n",
 "    \n",
 "    # number of graphs in the dataset\n",
 "    n = len(node_label)\n",
 "\n",
 "    # labels will store the new labels\n",
 "    labels = [0] * n\n",
 "\n",
 "    # dictionary containing the map from the old to the new labels\n",
 "    label_lookup = {}\n",
 "\n",
 "    # counter of unique labels\n",
 "    label_counter = 0\n",
 "\n",
 "    # for each graph in dataset\n",
 "    for i in range(n):\n",
 "\n",
 "\n",
 "        # number of nodes in graph[i]\n",
 "        num_nodes = len(node_label[i]) \n",
 "\n",
 "        # will be used to store the new labels\n",
 "        labels[i] = np.zeros(num_nodes, dtype = np.uint64) # positive integers\n",
 "\n",
 "        # for each node in the graph\n",
 "        for j in range(num_nodes):\n",
 "\n",
 "            # convert the node label to a string\n",
 "            l_node_str = str(np.copy(node_label[i][j]))\n",
 "            \n",
 "            # if the string has not been observed yet\n",
 "            # the corresponding node is assigned a new label\n",
 "            # otherwise it will be named with the same label\n",
 "            # already assigned to an identical string\n",
 "\n",
 "            if l_node_str not in label_lookup:\n",
 "                label_lookup[l_node_str] = label_counter\n",
 "                labels[i][j] = label_counter \n",
 "                label_counter += 1\n",
 "            else:\n",
 "                labels[i][j] = label_lookup[l_node_str]\n",
 "\n",
 "    # total number of labels in the dataset\n",
 "    L = label_counter\n",
 "    print('Number of original labels %d' % L)\n",
 "\n",
 "    return L, labels" ] },
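 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A small, hypothetical illustration of rename_nodes (made-up labels, not read from a real\n",
 "# dataset; it assumes the cell above ran to completion, i.e. that the python-igraph import at\n",
 "# its top succeeded): two graphs whose node labels are chemical-element strings; the function\n",
 "# maps each distinct string to an integer id and reports how many distinct labels were seen.\n",
 "toy_labels = [np.array(['C', 'C', 'O']), np.array(['N', 'C'])]\n",
 "L_toy, relabelled_toy = rename_nodes(toy_labels)\n",
 "print(L_toy)           # 3 distinct labels: 'C', 'O', 'N'\n",
 "print(relabelled_toy)  # integer labels per graph, e.g. [array([0, 0, 1]), array([2, 0])]" ] },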
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "usage: ipykernel_launcher.py [-h] --dataset DATASET\n", "ipykernel_launcher.py: error: the following arguments are required: --dataset\n" ] }, { "ename": "SystemExit", "evalue": "2", "output_type": "error", "traceback": [ "An exception has occurred, use %tb to see the full traceback.\n", "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py:2918: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n", " warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n" ] } ], "source": [ "# Author: Elisabetta Ghisu\n",
 "\n",
 "\"\"\"\n",
 "- Script for computing the kernel matrix and feature map\n",
 "using the shortest path kernel\n",
 "\"\"\"\n",
 "\n",
 "###########################\n",
 "# --- IMPORT PACKAGES --- #\n",
 "###########################\n",
 "\n",
 "import numpy as np\n",
 "import argparse\n",
 "import os\n",
 "import pickle\n",
 "\n",
 "from numpy import genfromtxt\n",
 "\n",
 "# in the original scripts these come from separate modules; in this notebook the functions are defined in the cells above\n",
 "# from sp_functions import *\n",
 "# from parse_graphs import *\n",
 "\n",
 "\n",
 "\n",
 "##############################\n",
 "### Command Line Arguments ###\n",
 "##############################\n",
 "\n",
 "parser = argparse.ArgumentParser(description = \"Compute kernel and feature matrices via the shortest path kernel\")\n",
 "parser.add_argument(\"--dataset\", required = True, help = \"Name of the dataset\")\n",
 "args = parser.parse_args()\n",
 "\n",
 "\n",
 "#####################\n",
 "### LOAD THE DATA ###\n",
 "#####################\n",
 "\n",
 "\"\"\"\n",
 "- Here we load the input data and targets\n",
 "- The data are assumed to be graphs\n",
 "- They should be in graphml format\n",
 "\"\"\"\n",
 "\n",
 "# path to the list of graphs and dataset\n",
 "filenames_graphs = \"data/%s.list\" % (args.dataset)\n",
 "path_to_dataset = \"data/%s\" % (args.dataset) \n",
 "\n",
 "# Load the targets\n",
 "filename_labels = \"data/%s_label.txt\" % (args.dataset)\n",
 "\n",
 "# load and process graphs\n",
 "node_label, ad_list, G, Y = load_and_process(filenames_graphs, filename_labels, path_to_dataset)\n",
 "\n",
 "# output directory\n",
 "out_path = \"kernel_matrices/%s/sp\" % args.dataset\n",
 "\n",
 "# If the output directory does not exist, then create it\n",
 "if not os.path.exists(out_path):\n",
 "    os.makedirs(out_path)\n",
 "\n",
 "\n",
 "#########################\n",
 "# --- SHORTEST PATH --- #\n",
 "#########################\n",
 "\n",
 "\n",
 "# assign labels starting from zero to the nodes\n",
 "L, labels = rename_nodes(node_label)\n",
 "\n",
 "\n",
 "# Compute the adjacency matrices\n",
 "adj_mat = get_adj_mat(G)\n",
 "\n",
 "# Compute kernel and feature maps using shortest path\n",
 "K, phi = sp_kernel_fast(adj_mat, labels, L)\n",
 "\n",
 "# save kernel matrix\n",
 "file_name = \"%s/%s_ker_mat\" % (out_path, args.dataset)\n",
 "np.save(file_name, K)\n",
 "\n",
 "# save feature map\n",
 "file_name = \"%s/%s_phi_map\" % (out_path, args.dataset)\n",
 "np.save(file_name, phi)" ] },
 { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 0. 2. 3. 1. 2.]]\n", "{0: {0: [0], 1: [0, 3, 1], 2: [0, 3, 4, 2], 3: [0, 3], 4: [0, 3, 4]}, 1: {0: [1, 3, 0], 1: [1], 2: [1, 3, 4, 2], 3: [1, 3], 4: [1, 3, 4]}, 2: {0: [2, 4, 3, 0], 1: [2, 4, 3, 1], 2: [2], 3: [2, 4, 3], 4: [2, 4]}, 3: {0: [3, 0], 1: [3, 1], 2: [3, 4, 2], 3: [3], 4: [3, 4]}, 4: {0: [4, 3, 0], 1: [4, 3, 1], 2: [4, 2], 3: [4, 3], 4: [4]}}\n", "[[ 0. 2. 3. 1. 2.]\n", " [ 2. 0. 3. 1. 2.]\n", " [ 3. 3. 0. 2. 1.]\n", " [ 1. 1. 2. 0. 1.]\n", " [ 2. 2. 1. 1. 0.]]\n" ] } ], "source": [ "# networkx and matplotlib are needed for the drawing calls below;\n",
 "# loadDataset and getSPGraph are defined in the cells further down in this notebook\n",
 "import networkx as nx\n",
 "import matplotlib.pyplot as plt\n",
 "\n",
 "dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
 "G1 = dataset[12]\n",
 "\n",
 "nx.draw_networkx(G1)\n",
 "# print(list(dataset[12][4]))\n",
 "\n",
 "l = nx.shortest_path(G1)\n",
 "\n",
 "l2 = nx.floyd_warshall_numpy(G1)\n",
 "print(np.array(l2[0]))\n",
 "print(l)\n",
 "print(l2)\n",
 "plt.show()\n",
 "\n",
 "S = getSPGraph(G1)\n",
 "nx.draw_networkx(S)\n",
 "pos = nx.spring_layout(S)\n",
 "edge_labels = nx.get_edge_attributes(S,'cost')\n",
 "print(edge_labels)\n",
 "nx.draw_networkx_edge_labels(S, pos, edge_labels = edge_labels)\n",
 "plt.show()" ] },
 { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import networkx as nx\n",
 " \n",
 "def loadCT(filename):\n",
 "    \"\"\"load data from a .ct file.\n",
 "    \n",
 "    Notes\n",
 "    ------ \n",
 "    a typical example of data in .ct is like this:\n",
 "    \n",
 "    3 2 <- number of nodes and edges\n",
 "    0.0000 0.0000 0.0000 C <- each line describes a node, the last parameter in which is the label of the node, representing a chemical element @Q what are the first 3 numbers?\n",
 "    0.0000 0.0000 0.0000 C\n",
 "    0.0000 0.0000 0.0000 O\n",
 "    1 3 1 1 <- each line describes an edge, the first two numbers represent two nodes of the edge, the last number represents the label. 
@Q what is the 3rd number?\n",
 "    2 3 1 1\n",
 "    \"\"\"\n",
 "    content = open(filename).read().splitlines()\n",
 "    G = nx.Graph(name=str(content[0]))  # set name of the graph\n",
 "    tmp = content[1].split(\" \")\n",
 "    if tmp[0] == '':\n",
 "        nb_nodes = int(tmp[1])  # number of the nodes\n",
 "        nb_edges = int(tmp[2])  # number of the edges\n",
 "    else:\n",
 "        nb_nodes = int(tmp[0])\n",
 "        nb_edges = int(tmp[1])\n",
 "\n",
 "    for i in range(0, nb_nodes):\n",
 "        tmp = content[i + 2].split(\" \")\n",
 "        tmp = [x for x in tmp if x != '']\n",
 "        G.add_node(i, label=tmp[3])\n",
 "\n",
 "    for i in range(0, nb_edges):\n",
 "        tmp = content[i + G.number_of_nodes() + 2].split(\" \")\n",
 "        tmp = [x for x in tmp if x != '']\n",
 "        G.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, label=int(tmp[3]))\n",
 "    return G\n",
 "\n",
 "\n",
 "def loadGXL(filename):\n",
 "    import networkx as nx\n",
 "    import xml.etree.ElementTree as ET\n",
 "\n",
 "    tree = ET.parse(filename)\n",
 "    root = tree.getroot()\n",
 "    index = 0\n",
 "    G = nx.Graph()\n",
 "    dic = {}\n",
 "    for node in root.iter('node'):\n",
 "        label = node.find('attr')[0].text\n",
 "        dic[node.attrib['id']] = index\n",
 "        G.add_node(index, id=node.attrib['id'], label=label)\n",
 "        index += 1\n",
 "    \n",
 "    for edge in root.iter('edge'):\n",
 "        label = edge.find('attr')[0].text\n",
 "        G.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], label=label)\n",
 "    return G\n",
 " \n",
 "def loadDataset(filename):\n",
 "    \"\"\"load file list of the dataset.\n",
 "    \"\"\"\n",
 "    from os.path import dirname, splitext\n",
 "\n",
 "    dirname_dataset = dirname(filename)\n",
 "    extension = splitext(filename)[1][1:]\n",
 "    data = []\n",
 "    y = []\n",
 "    if(extension == \"ds\"):\n",
 "        content = open(filename).read().splitlines()\n",
 "        for i in range(0, len(content)):\n",
 "            tmp = content[i].split(' ')\n",
 "            data.append(loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1)))  # remove the '#'s in file names\n",
 "            y.append(float(tmp[1]))\n",
 "    elif(extension == \"cxl\"):\n",
 "        import xml.etree.ElementTree as ET\n",
 "\n",
 "        tree = ET.parse(filename)\n",
 "        root = tree.getroot()\n",
 "        data = []\n",
 "        y = []\n",
 "        for graph in root.iter('print'):\n",
 "            mol_filename = graph.attrib['file']\n",
 "            mol_class = graph.attrib['class']\n",
 "            data.append(loadGXL(dirname_dataset + '/' + mol_filename))\n",
 "            y.append(mol_class)\n",
 "\n",
 "    return data, y" ] },
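 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A tiny, self-contained check of loadCT (it assumes the cell above has been run). The file\n",
 "# content below is made up for illustration and mirrors the example in the docstring: write a\n",
 "# 3-node .ct file to a temporary location, parse it back, and print the labelled nodes and edges.\n",
 "import os\n",
 "import tempfile\n",
 "\n",
 "ct_text = \"\"\"toy molecule\n",
 "3 2\n",
 "0.0000 0.0000 0.0000 C\n",
 "0.0000 0.0000 0.0000 C\n",
 "0.0000 0.0000 0.0000 O\n",
 "1 3 1 1\n",
 "2 3 1 1\n",
 "\"\"\"\n",
 "ct_path = os.path.join(tempfile.gettempdir(), \"toy_molecule.ct\")\n",
 "with open(ct_path, \"w\") as f:\n",
 "    f.write(ct_text)\n",
 "\n",
 "g_toy = loadCT(ct_path)\n",
 "print(g_toy.nodes(data=True))  # node ids with their element labels\n",
 "print(g_toy.edges(data=True))  # edges with their integer labels" ] },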
IEEE.\n", " \"\"\"\n", " Kmatrix = np.zeros((len(Gn), len(Gn)))\n", " \n", " Sn = [] # get shortest path graphs of Gn\n", " for i in range(0, len(Gn)):\n", " Sn.append(getSPGraph(Gn[i]))\n", " \n", "# print(S1.nodes(data = True))\n", "# print(S2.nodes(data = True))\n", "# print(S1.edges(data = True))\n", "# print(S2.edges(data = True))\n", " \n", " start_time = time.time()\n", " for i in range(0, len(Gn)):\n", " for j in range(i, len(Gn)):\n", " for e1 in Sn[i].edges(data = True):\n", " for e2 in Sn[j].edges(data = True): \n", " if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):\n", " Kmatrix[i][j] += 1\n", " Kmatrix[j][i] += (i == j ? 0 : 1)\n", " \n", " print(\"--- %s seconds ---\" % (time.time() - start_time))\n", " \n", " return Kmatrix" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "--- 0.05678129196166992 seconds ---\n", "1\n", "--- 0.15176129341125488 seconds ---\n", "2\n", "--- 0.20930719375610352 seconds ---\n", "3\n", "--- 0.3049781322479248 seconds ---\n", "4\n", "--- 0.4029049873352051 seconds ---\n", "5\n", "--- 0.5458371639251709 seconds ---\n", "6\n", "--- 0.6920650005340576 seconds ---\n", "7\n", "--- 0.7972092628479004 seconds ---\n", "8\n", "--- 0.947425365447998 seconds ---\n", "9\n", "--- 1.1016933917999268 seconds ---\n", "10\n", "--- 1.2554333209991455 seconds ---\n", "11\n", "--- 1.4140815734863281 seconds ---\n", "12\n", "--- 1.562861442565918 seconds ---\n", "13\n", "--- 1.7876057624816895 seconds ---\n", "14\n", "--- 1.9889881610870361 seconds ---\n", "15\n", "--- 2.2633984088897705 seconds ---\n", "16\n", "--- 2.480710983276367 seconds ---\n", "17\n", "--- 2.683915138244629 seconds ---\n", "18\n", "--- 2.8276052474975586 seconds ---\n", "19\n", "--- 2.972059488296509 seconds ---\n", "20\n", "--- 3.11892032623291 seconds ---\n", "21\n", "--- 3.330472469329834 seconds ---\n", "22\n", "--- 3.5461206436157227 seconds ---\n", "23\n", "--- 3.7521393299102783 seconds ---\n", "24\n", "--- 3.956348180770874 seconds ---\n", "25\n", "--- 4.162136793136597 seconds ---\n", "26\n", "--- 4.365236759185791 seconds ---\n", "27\n", "--- 4.572294473648071 seconds ---\n", "28\n", "--- 4.778241872787476 seconds ---\n", "29\n", "--- 4.981487035751343 seconds ---\n", "30\n", "--- 5.189010143280029 seconds ---\n", "31\n", "--- 5.466430902481079 seconds ---\n", "32\n", "--- 5.73804497718811 seconds ---\n", "33\n", "--- 6.0193397998809814 seconds ---\n", "34\n", "--- 6.293334245681763 seconds ---\n", "35\n", "--- 6.569210767745972 seconds ---\n", "36\n", "--- 6.783808708190918 seconds ---\n", "37\n", "--- 6.999167203903198 seconds ---\n", "38\n", "--- 7.209052085876465 seconds ---\n", "39\n", "--- 7.414280652999878 seconds ---\n", "40\n", "--- 7.620949983596802 seconds ---\n", "41\n", "--- 7.892791986465454 seconds ---\n", "42\n", "--- 8.166114330291748 seconds ---\n", "43\n", "--- 8.46480393409729 seconds ---\n", "44\n", "--- 8.75532841682434 seconds ---\n", "45\n", "--- 9.027160882949829 seconds ---\n", "46\n", "--- 9.303063869476318 seconds ---\n", "47\n", "--- 9.575549125671387 seconds ---\n", "48\n", "--- 9.867429733276367 seconds ---\n", "49\n", "--- 10.160123109817505 seconds ---\n", "50\n", "--- 10.437638759613037 seconds ---\n", "51\n", "--- 10.714671611785889 seconds ---\n", "52\n", "--- 10.987818479537964 seconds ---\n", "53\n", "--- 11.259410381317139 seconds ---\n", "54\n", "--- 
11.535178184509277 seconds ---\n", "55\n", "--- 11.807695865631104 seconds ---\n", "56\n", "--- 12.158225774765015 seconds ---\n", "57\n", "--- 12.506253004074097 seconds ---\n", "58\n", "--- 12.856064319610596 seconds ---\n", "59\n", "--- 13.203948497772217 seconds ---\n", "60\n", "--- 13.552793741226196 seconds ---\n", "61\n", "--- 13.906684160232544 seconds ---\n", "62\n", "--- 14.256698369979858 seconds ---\n", "63\n", "--- 14.606950283050537 seconds ---\n", "64\n", "--- 14.876070022583008 seconds ---\n", "65\n", "--- 15.148754596710205 seconds ---\n", "66\n", "--- 15.43168306350708 seconds ---\n", "67\n", "--- 15.710469961166382 seconds ---\n", "68\n", "--- 15.98047399520874 seconds ---\n", "69\n", "--- 16.25121569633484 seconds ---\n", "70\n", "--- 16.52086853981018 seconds ---\n", "71\n", "--- 16.790047645568848 seconds ---\n", "72\n", "--- 17.06355619430542 seconds ---\n", "73\n", "--- 17.335728406906128 seconds ---\n", "74\n", "--- 17.607405424118042 seconds ---\n", "75\n", "--- 17.955402135849 seconds ---\n", "76\n", "--- 18.303555488586426 seconds ---\n", "77\n", "--- 18.654282808303833 seconds ---\n", "78\n", "--- 19.004570245742798 seconds ---\n", "79\n", "--- 19.35291624069214 seconds ---\n", "80\n", "--- 19.700473070144653 seconds ---\n", "81\n", "--- 20.04847502708435 seconds ---\n", "82\n", "--- 20.39787983894348 seconds ---\n", "83\n", "--- 20.74629044532776 seconds ---\n", "84\n", "--- 21.094562768936157 seconds ---\n", "85\n", "--- 21.445199489593506 seconds ---\n", "86\n", "--- 21.794403791427612 seconds ---\n", "87\n", "--- 22.143365383148193 seconds ---\n", "88\n", "--- 22.49206042289734 seconds ---\n", "89\n", "--- 22.840426445007324 seconds ---\n", "90\n", "--- 23.189460515975952 seconds ---\n", "91\n", "--- 23.539386749267578 seconds ---\n", "92\n", "--- 23.888701677322388 seconds ---\n", "93\n", "--- 24.23668909072876 seconds ---\n", "94\n", "--- 24.58505630493164 seconds ---\n", "95\n", "--- 25.019609451293945 seconds ---\n", "96\n", "--- 25.456527709960938 seconds ---\n", "97\n", "--- 25.891918182373047 seconds ---\n", "98\n", "--- 26.32820987701416 seconds ---\n", "99\n", "--- 26.76149344444275 seconds ---\n", "100\n", "--- 27.197012424468994 seconds ---\n", "101\n", "--- 27.63314127922058 seconds ---\n", "102\n", "--- 28.068315029144287 seconds ---\n", "103\n", "--- 28.50419807434082 seconds ---\n", "104\n", "--- 28.852453231811523 seconds ---\n", "105\n", "--- 29.205727338790894 seconds ---\n", "106\n", "--- 29.554840087890625 seconds ---\n", "107\n", "--- 29.90355086326599 seconds ---\n", "108\n", "--- 30.251071214675903 seconds ---\n", "109\n", "--- 30.599868059158325 seconds ---\n", "110\n", "--- 30.94942593574524 seconds ---\n", "111\n", "--- 31.298285245895386 seconds ---\n", "112\n", "--- 31.648550271987915 seconds ---\n", "113\n", "--- 32.0825355052948 seconds ---\n", "114\n", "--- 32.516993045806885 seconds ---\n", "115\n", "--- 32.950743198394775 seconds ---\n", "116\n", "--- 33.38488531112671 seconds ---\n", "117\n", "--- 33.81857705116272 seconds ---\n", "118\n", "--- 34.27995991706848 seconds ---\n", "119\n", "--- 34.728654623031616 seconds ---\n", "120\n", "--- 35.16262221336365 seconds ---\n", "121\n", "--- 35.5960898399353 seconds ---\n", "122\n", "--- 36.02964925765991 seconds ---\n", "123\n", "--- 36.46674466133118 seconds ---\n", "124\n", "--- 36.91917443275452 seconds ---\n", "125\n", "--- 37.381704330444336 seconds ---\n", "126\n", "--- 37.81864261627197 seconds ---\n", "127\n", "--- 38.3528311252594 seconds ---\n", "128\n", "--- 
38.89131188392639 seconds ---\n", "129\n", "--- 39.42161011695862 seconds ---\n", "130\n", "--- 39.95006561279297 seconds ---\n", "131\n", "--- 40.476089000701904 seconds ---\n", "132\n", "--- 41.00121235847473 seconds ---\n", "133\n", "--- 41.4318163394928 seconds ---\n", "134\n", "--- 41.86459708213806 seconds ---\n", "135\n", "--- 42.29518222808838 seconds ---\n", "136\n", "--- 42.729474782943726 seconds ---\n", "137\n", "--- 43.16999864578247 seconds ---\n", "138\n", "--- 43.606104135513306 seconds ---\n", "139\n", "--- 44.04209113121033 seconds ---\n", "140\n", "--- 44.4772834777832 seconds ---\n", "141\n", "--- 45.01142644882202 seconds ---\n", "142\n", "--- 45.543590784072876 seconds ---\n", "143\n", "--- 46.07910680770874 seconds ---\n", "144\n", "--- 46.612366914749146 seconds ---\n", "145\n", "--- 47.1452751159668 seconds ---\n", "146\n", "--- 47.67322564125061 seconds ---\n", "147\n", "--- 48.20156168937683 seconds ---\n", "148\n", "--- 48.73471546173096 seconds ---\n", "149\n", "--- 49.2733519077301 seconds ---\n", "150\n", "--- 49.806400537490845 seconds ---\n", "151\n", "--- 50.33490014076233 seconds ---\n", "152\n", "--- 50.86489534378052 seconds ---\n", "153\n", "--- 51.39602565765381 seconds ---\n", "154\n", "--- 51.93729043006897 seconds ---\n", "155\n", "--- 52.473469972610474 seconds ---\n", "156\n", "--- 53.01401090621948 seconds ---\n", "157\n", "--- 53.58053278923035 seconds ---\n", "158\n", "--- 54.22534370422363 seconds ---\n", "159\n", "--- 54.870089292526245 seconds ---\n", "160\n", "--- 55.50953507423401 seconds ---\n", "161\n", "--- 56.144059896469116 seconds ---\n", "162\n", "--- 56.779675245285034 seconds ---\n", "163\n", "--- 57.41550326347351 seconds ---\n", "164\n", "--- 58.04742622375488 seconds ---\n", "165\n", "--- 58.57527136802673 seconds ---\n", "166\n", "--- 59.10521101951599 seconds ---\n", "167\n", "--- 59.737877368927 seconds ---\n", "168\n", "--- 60.373518228530884 seconds ---\n", "169\n", "--- 61.00429916381836 seconds ---\n", "170\n", "--- 61.64198398590088 seconds ---\n", "171\n", "--- 62.27683877944946 seconds ---\n", "172\n", "--- 62.91516971588135 seconds ---\n", "173\n", "--- 63.26179265975952 seconds ---\n", "174\n", "--- 63.89728498458862 seconds ---\n", "175\n", "--- 64.53867197036743 seconds ---\n", "176\n", "--- 65.18402314186096 seconds ---\n", "177\n", "--- 65.82770419120789 seconds ---\n", "178\n", "--- 66.46093964576721 seconds ---\n", "179\n", "--- 67.09133172035217 seconds ---\n", "180\n", "--- 67.73470735549927 seconds ---\n", "181\n", "--- 68.4084141254425 seconds ---\n", "182\n", "--- 69.05118441581726 seconds ---\n", "183\n", "--- 69.68487596511841 seconds ---\n", "184\n", "--- 70.3204357624054 seconds ---\n", "[[ 3. 1. 3. ..., 1. 1. 1.]\n", " [ 1. 6. 1. ..., 0. 0. 3.]\n", " [ 3. 1. 3. ..., 1. 1. 1.]\n", " ..., \n", " [ 1. 0. 1. ..., 55. 21. 7.]\n", " [ 1. 0. 1. ..., 21. 55. 7.]\n", " [ 1. 3. 1. ..., 7. 7. 55.]]\n" ] } ], "source": [ "dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n", "G1 = dataset[12]\n", "G2 = dataset[20]\n", "Kmatrix = spkernel(dataset)\n", "\n", "print(Kmatrix)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }