|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "\n",
- "\"\"\"\n",
- "- Script containing functions for computing the shortest path kernel\n",
- "- The Floyd Warshall algorithm is first implemented\n",
- "- Then the SP is calculated\n",
- "\"\"\"\n",
- "\n",
- "\n",
- "#######################\n",
- "# - IMPORT PACKAGES - #\n",
- "#######################\n",
- "\n",
- "\n",
- "\n",
- "import numpy.matlib as matlib\n",
- "import numpy as np\n",
- "\n",
- "\"\"\"\n",
- "### FLOYD WARSHALL ALGORITHM\n",
- "Input:\n",
- "- Adjacency matrix A\n",
- "Output:\n",
- "- Shortest path matrix S\n",
- "\"\"\"\n",
- "\n",
- "def floyd_warshall(A):\n",
- "\n",
- "\t# number of nodes\n",
- "\tn = A.shape[0]\n",
- "\n",
- "\t# initialize shortest path matrix\n",
- "\tS = np.zeros(shape = (n,n))\n",
- "\n",
- "\tfor i in range(n):\n",
- "\t\tfor j in range(n):\n",
- "\t\t\tif A[i,j] == 0 and i!=j:\n",
- "\t\t\t\tS[i,j] = float(\"inf\")\n",
- "\t\t\telse:\n",
- "\t\t\t\tS[i,j] = A[i,j]\n",
- "\n",
- "\t# Compute the shortest path matrix\n",
- "\tfor k in range(n):\n",
- "\t\tfor i in range(n):\n",
- "\t\t\tfor j in range(n):\n",
- "\t\t\t\tif S[i,j] > S[i,k] + S[k,j]:\n",
- "\t\t\t\t\tS[i,j] = S[i,k] + S[k,j]\n",
- "\n",
- "\treturn S\t\t\t\t\t\t\t\t\n",
- "\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "SHORTEST PATH KERNEL: This is a fast implementation of the shortest path\n",
- "kernel algorithm\n",
- "Inputs\n",
- "- List of adjacency matrices, one per graph\n",
- "- List of list of node labels for each graph\n",
- "- Total number of node labels \n",
- "Outputs\n",
- "- Kernel matrix\n",
- "- Feature matrix\n",
- "\"\"\"\n",
- "\n",
- "def sp_kernel_fast(adj_mat, labels, L):\n",
- "\n",
- "\t# Number of graphs\n",
- "\tn = len(adj_mat)\n",
- "\tL = int(L)\n",
- "\tS = []\n",
- "\n",
- "\t# shortest path matrices\n",
- "\tfor i in xrange(n):\n",
- "\t\tif i%1000 == 0 and i !=0:\n",
- " \t\t\tprint('haha') #( \"%d\" % i)\n",
- "\t\tS.append(floyd_warshall(adj_mat[i]))\n",
- "\t\n",
- "\t# maximum length of shortest paths in the dataset\n",
- "\tmax_path = 0\n",
- "\n",
- "\t# for each graph in dataset\n",
- "\tfor i in xrange(n):\n",
- "\n",
- "\t\tS_cur = np.copy(S[i])\n",
- "\t\tS_cur[S_cur == np.inf] = 0\n",
- "\t\tnew_max = np.max(S_cur)\n",
- "\t\t\n",
- "\t\tif new_max > max_path:\n",
- "\t\t\tmax_path = new_max # get max short path in all Ss\n",
- "\n",
- "\t# maximum length of shortest paths\n",
- "\tmax_path = int(max_path)\n",
- "\n",
- "\t# initialize feature matrix\n",
- "\tsp = np.zeros(((max_path + 1) * L * (L+1) /2,n))\n",
- "\n",
- "\t# compute feature map for shortest path\n",
- "\tfor i in xrange(n):\n",
- "\n",
- "\t\tif i % 1000 == 0:\n",
- "\t\t\tprint('haha') #\"Processed %d graphs\" %i\n",
- "\n",
- "\t\tS_graph = S[i]\n",
- "\t\tlabels_graph = np.asarray(labels[i].reshape((len(labels[i]),1)))\n",
- "\t\tlabels_graph = labels_graph + 1\n",
- "\t\t\n",
- "\t\tlabels_aux = matlib.repmat(labels_graph, 1, len(labels_graph))\n",
- "\t\t\n",
- "\t\tmin_lab = np.minimum(labels_aux, labels_aux.T)\n",
- "\t\t\n",
- "\t\tmax_lab = np.maximum(labels_aux, labels_aux.T)\n",
- "\t\tsub_path = np.triu(~(np.isinf(S_graph))).T\n",
- "\n",
- "\t\tmin_lab = min_lab[sub_path]\n",
- "\t\tmax_lab = max_lab[sub_path]\n",
- "\n",
- "\n",
- "\t\tind = S_graph[sub_path] * L * (L + 1) / 2 + (min_lab - 1) * (2*L + 2 - min_lab) / 2 + max_lab - min_lab\n",
- "\t\tind = ind.astype(int)\n",
- "\t\taccum = np.zeros((max_path + 1) * L * (L + 1) /2)\n",
- "\t\taccum[:ind.max() + 1] += np.bincount(ind.astype(int))\n",
- "\t\tsp[ind,i] = accum[ind]\n",
- "\t\n",
- "\tsum_cols = np.sum(sp, axis = 1)\n",
- "\tind_true = sum_cols != 0\n",
- "\tsp = sp[ind_true,:]\n",
- "\t\n",
- "\t# compute kernel matrix\n",
- "\tK = np.dot(sp.T,sp)\n",
- " \n",
- "\treturn K, sp"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "ename": "ImportError",
- "evalue": "No module named 'igraph'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-11-effbaf3a1e10>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;31m# iGraph imports to handle graphs and for graph I/O\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0migraph\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mGraph\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mImportError\u001b[0m: No module named 'igraph'"
- ]
- }
- ],
- "source": [
- "#Authors: Elisabetta Ghisu, Felipe Llinares Lopez\n",
- "\n",
- "\"\"\"\n",
- "- This script includes a list of functions for analyzing \n",
- "parsing and formatting graphs\n",
- "- The graphs are given in graphml format\n",
- "- It also contains functions for loading and processing the graphs\n",
- "and extracting graph statistics\n",
- "\"\"\"\n",
- "\n",
- "\n",
- "import numpy as np\n",
- "from numpy import genfromtxt\n",
- "\n",
- "# iGraph imports to handle graphs and for graph I/O\n",
- "from igraph import Graph\n",
- "\n",
- "\n",
- "# ---------------------------------GRAPHML I/O FUNCTIONS------------------------------------ #\n",
- "\n",
- "# INPUT:\n",
- "# filenames_graphs: list of GraphML files, where each file contains one graph in the dataset\n",
- "# filename_labels: text file with labels corresponding to each graph in the dataset, in the same order as they are in\n",
- "# filename_graphs\n",
- "# OUTPUT:\n",
- "# G: A list containing one iGraph object for each graph in the dataset\n",
- "# Y: A Numpy array containing the labels corresponding to each graph, in the same order as G\n",
- "def load_graphml(filenames_graphs, filename_labels):\n",
- " G = []\n",
- " for fname in filenames_graphs:\n",
- " G.append(Graph.Read_GraphML(fname))\n",
- " Y = genfromtxt(filename_labels)\n",
- " return (G, Y)\n",
- "\n",
- "\n",
- "# Loads a list of paths to GraphML files from filename_list\n",
- "def load_file_list(filename_flist):\n",
- " f = open(filename_flist, 'r')\n",
- " f_graphs = []\n",
- " for line in f:\n",
- " f_graphs.append(line.strip())\n",
- " f.close()\n",
- " return f_graphs\n",
- "\n",
- "\n",
- "# --------------------------------COMPUTE STATISTICS---------------------------------------- #\n",
- "\n",
- "\n",
- "# Retrieve labels of all vertices belonging to any graph in the list of iGraph objects G and\n",
- "# returns the entire list, and a list with the alphabet of the vertex labels\n",
- "def get_all_vertex_labels(G, att_name='label'):\n",
- " v_l = []\n",
- " for g in G:\n",
- " v_l += g.vs[att_name]\n",
- " return (v_l, np.unique(v_l))\n",
- "\n",
- "\n",
- "# Retrieve labels of all edges belonging to any graph in the list of iGraph objects G and\n",
- "# returns the entire list, and a list with the alphabet of the edge labels\n",
- "def get_all_edge_labels(G, att_name='label'):\n",
- " e_l = []\n",
- " for g in G:\n",
- " e_l += g.es[att_name]\n",
- " return (e_l, np.unique(e_l))\n",
- "\n",
- "\n",
- "# Returns a list where each element is itself the adjacency list of the corresponding graph\n",
- "# The adjacency list of a graph has the following format:\n",
- "# it is a list where each element is a list containing the id of adjacent nodes\n",
- "def get_adj_list(G):\n",
- " ad_l = []\n",
- " for g in G:\n",
- " ad_l.append(g.get_adjlist())\n",
- " return ad_l\n",
- "\n",
- "# Returns a list where each element is the adjacency matrix of the graph \n",
- "# The adjacency matrix is in iGraph format\n",
- "def get_adj_mat(G):\n",
- " ad_m = []\n",
- " for g in G:\n",
- " ad_m.append(g.get_adjacency())\n",
- " return ad_m\n",
- "\n",
- "# Returns a list where each element contains the nodes label for a graph\n",
- "def get_node_labels(G, att_name = 'label'):\n",
- " node_l = []\n",
- " for g in G:\n",
- " node_l.append(g.vs[att_name])\n",
- " return node_l\n",
- "\n",
- "\n",
- "\n",
- "# ----------------- LOAD AND PROCESS THE GRAPHS --------------- #\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "Inputs:\n",
- "- list of graphs file\n",
- "- labels file\n",
- "- path to the data folder\n",
- "Outputs:\n",
- "- List of node labels\n",
- "- List of adjacency lists\n",
- "- List of graphs in graphml format\n",
- "- Targets\n",
- "- number of classes\n",
- "- sample size\n",
- "\"\"\"\n",
- "\n",
- "\n",
- "def load_and_process(filenames_graphs, filename_labels, path_to_dataset):\n",
- "\n",
- " # load a list of names to graphml files\n",
- " f_graphs = load_file_list(filenames_graphs)\n",
- " # sample size\n",
- " n = len(f_graphs)\n",
- "\n",
- " # create a list of paths to the files\n",
- " f_graphs_path =[]\n",
- "\n",
- " # for each graph in dataset\n",
- " for i in range(n):\n",
- "\n",
- " # index the graph\n",
- " graph_name = f_graphs[i]\n",
- "\n",
- " # path to the data folder\n",
- " path = \"%s/%s\" % (path_to_dataset, graph_name)\n",
- " f_graphs_path.append(path)\n",
- "\n",
- " # If the data is DD have to delete an element (corrupted file)\n",
- " if graph_name == \"DD\":\n",
- " del f_graphs_path[148]\n",
- " n = n-1\n",
- "\n",
- " # Load the graphs in graphml format\n",
- " # G is a list of GraphML graphs\n",
- " # Y is an array of targets\n",
- " G,Y = load_graphml(f_graphs_path, filename_labels)\n",
- "\n",
- " # Delete corrupted file in DD\n",
- " if graph_name == \"DD\": \n",
- " Y = np.delete(Y, 148)\n",
- "\n",
- " # get adjacency list and matrix for all the graphs in G\n",
- " ad_list = get_adj_list(G)\n",
- " ad_mat = get_adj_mat(G)\n",
- "\n",
- " # get a list containing lists of node labels\n",
- " node_label = get_node_labels(G)\n",
- "\n",
- " return node_label, ad_list, G, Y\n",
- "\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "RENAME NODES: function to rename nodes from 0,...,num_nodes\n",
- "Input\n",
- "- list of list of node labels in each graph\n",
- "Output\n",
- "- L: total number of different labels in the dataset\n",
- "- node_label: new renamed labels\n",
- "\"\"\"\n",
- "\n",
- "def rename_nodes(node_label): \n",
- " \n",
- " # number of graphs in the dataset\n",
- " n = len(node_label)\n",
- "\n",
- " # labels will store the new labels\n",
- " labels = [0] * n\n",
- "\n",
- " # dictionary containing the map from the old to the new labels\n",
- " label_lookup = {}\n",
- "\n",
- " # counter of unique labels\n",
- " label_counter = 0\n",
- "\n",
- " # for each graph in dataset\n",
- " for i in range(n):\n",
- "\n",
- "\n",
- " # number of nodes in graph[i]\n",
- " num_nodes = len(node_label[i]) \n",
- "\n",
- " # will be used to store the new labels\n",
- " labels[i] = np.zeros(num_nodes, dtype = np.uint64) # positive integers\n",
- "\n",
- " # for each node in the graph\n",
- " for j in range(num_nodes):\n",
- "\n",
- " # the node label to a string\n",
- " l_node_str = str(np.copy(node_label[i][j]))\n",
- " \n",
- " # if the string has not been observed yet\n",
- " # the corresponding node is assigned a new label\n",
- " # otherwise it will be named with the same label\n",
- " # already assigned to an identical string\n",
- "\n",
- " if not label_lookup.has_key(l_node_str):\n",
- " label_lookup[l_node_str] = label_counter\n",
- " labels[i][j] = label_counter \n",
- " label_counter += 1\n",
- " else:\n",
- " labels[i][j] = label_lookup[l_node_str]\n",
- "\n",
- " # total number of labels in the dataset\n",
- " L = label_counter\n",
- " print('haha') #'Number of original labels %d' % L \n",
- "\n",
- " return L, labels"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "usage: ipykernel_launcher.py [-h] --dataset DATASET\n",
- "ipykernel_launcher.py: error: the following arguments are required: --dataset\n"
- ]
- },
- {
- "ename": "SystemExit",
- "evalue": "2",
- "output_type": "error",
- "traceback": [
- "An exception has occurred, use %tb to see the full traceback.\n",
- "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 2\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py:2918: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n",
- " warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n"
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "\n",
- "\"\"\"\n",
- "- Script for computing the kernel matrix and features map \n",
- "using shortest path kernel\n",
- "\"\"\"\n",
- "\n",
- "###########################\n",
- "# --- IMPORT PACKAGES --- #\n",
- "###########################\n",
- "\n",
- "import numpy as np\n",
- "import argparse\n",
- "import os\n",
- "import pickle\n",
- "\n",
- "from numpy import genfromtxt\n",
- "\n",
- "# from sp_functions import *\n",
- "# from parse_graphs import *\n",
- "\n",
- "\n",
- "\n",
- "##############################\n",
- "### Command Line Arguments ###\n",
- "##############################\n",
- "\n",
- "parser = argparse.ArgumentParser(description = \"Compute kernel and features matrices via shortest path kernel\")\n",
- "parser.add_argument(\"--dataset\", required = True, help = \"Name of the dataset\")\n",
- "args = parser.parse_args()\n",
- "\n",
- "\n",
- "#####################\n",
- "### LOAD THE DATA ###\n",
- "#####################\n",
- "\n",
- "\"\"\"\n",
- "- Here we load the data input and targets\n",
- "- The data are assumed to be in graph formats\n",
- "- They should be in graphml format \n",
- "\"\"\"\n",
- "\n",
- "# path to the list of graphs and dataset\n",
- "filenames_graphs = \"data/%s.list\" % (args.dataset)\n",
- "path_to_dataset = \"data/%s\" % (args.dataset) \n",
- "\n",
- "# Load the targets\n",
- "filename_labels = \"data/%s_label.txt\" % (args.dataset)\n",
- "\n",
- "# load and process graphs\n",
- "node_label, ad_list, G, Y = load_and_process(filenames_graphs, filename_labels, path_to_dataset)\n",
- "\n",
- "# output directory\n",
- "out_path = \"kernel_matrices/%s/sp\" % args.dataset\n",
- "\n",
- "# If the output directory does not exist, then create it\n",
- "if not os.path.exists(out_path):\n",
- " os.makedirs(out_path)\n",
- "\n",
- "\n",
- "#########################\n",
- "# --- SHORTEST PATH --- #\n",
- "#########################\n",
- "\n",
- "\n",
- "# assign labels starting from zero to the nodes\n",
- "L, labels = rename_nodes(node_label)\n",
- "\n",
- "\n",
- "# Compute adjacency matrix \n",
- "adj_mat = get_adj_mat(G)\n",
- "\n",
- "# Compute kernel and feature maps using shortest path\n",
- "K, phi = sp_kernel_fast(adj_mat, labels, L)\n",
- "\n",
- "# save kernel matrix\n",
- "file_name = \"%s/%s_ker_mat\" % (out_path, args.dataset)\n",
- "np.save(file_name, K)\n",
- "\n",
- "# save feature map\n",
- "file_name = \"%s/%s_phi_map\" % (out_path, args.dataset)\n",
- "np.save(file_name, phi)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[[ 0. 2. 3. 1. 2.]]\n",
- "{0: {0: [0], 1: [0, 3, 1], 2: [0, 3, 4, 2], 3: [0, 3], 4: [0, 3, 4]}, 1: {0: [1, 3, 0], 1: [1], 2: [1, 3, 4, 2], 3: [1, 3], 4: [1, 3, 4]}, 2: {0: [2, 4, 3, 0], 1: [2, 4, 3, 1], 2: [2], 3: [2, 4, 3], 4: [2, 4]}, 3: {0: [3, 0], 1: [3, 1], 2: [3, 4, 2], 3: [3], 4: [3, 4]}, 4: {0: [4, 3, 0], 1: [4, 3, 1], 2: [4, 2], 3: [4, 3], 4: [4]}}\n",
- "[[ 0. 2. 3. 1. 2.]\n",
- " [ 2. 0. 3. 1. 2.]\n",
- " [ 3. 3. 0. 2. 1.]\n",
- " [ 1. 1. 2. 0. 1.]\n",
- " [ 2. 2. 1. 1. 0.]]\n"
- ]
- },
- {
- "ename": "NameError",
- "evalue": "name 'plt' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-17-c1e1e7524d30>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ml\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ml2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetSPGraph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mNameError\u001b[0m: name 'plt' is not defined"
- ]
- }
- ],
- "source": [
- "dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "G1 = dataset[12]\n",
- "\n",
- "nx.draw_networkx(G1)\n",
- "# print(list(dataset[12][4]))\n",
- "\n",
- "l = nx.shortest_path(G1)\n",
- "\n",
- "l2 = nx.floyd_warshall_numpy(G1)\n",
- "print(np.array(l2[0]))\n",
- "print(l)\n",
- "print(l2)\n",
- "plt.show()\n",
- "\n",
- "S = getSPGraph(G1)\n",
- "nx.draw_networkx(S)\n",
- "pos = nx.spring_layout(S)\n",
- "edge_labels = nx.get_edge_attributes(S,'cost')\n",
- "print(edge_labels)\n",
- "nx.draw_networkx_edge_labels(S, pos, edge_labels = edge_labels)\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "import networkx as nx\n",
- " \n",
- "def loadCT(filename):\n",
- " \"\"\"load data from .ct file.\n",
- " \n",
- " Notes\n",
- " ------ \n",
- " a typical example of data in .ct is like this:\n",
- " \n",
- " 3 2 <- number of nodes and edges\n",
- " 0.0000 0.0000 0.0000 C <- each line describes a node, the last parameter in which is the label of the node, representing a chemical element @Q what are the first 3 numbers?\n",
- " 0.0000 0.0000 0.0000 C\n",
- " 0.0000 0.0000 0.0000 O\n",
- " 1 3 1 1 <- each line describes an edge, the first two numbers represent two nodes of the edge, the last number represents the label. @Q what are the 3th numbers?\n",
- " 2 3 1 1\n",
- " \"\"\"\n",
- " content = open(filename).read().splitlines()\n",
- " G = nx.Graph(name=str(content[0])) # set name of the graph\n",
- " tmp = content[1].split(\" \")\n",
- " if tmp[0] == '':\n",
- " nb_nodes = int(tmp[1]) # number of the nodes\n",
- " nb_edges = int(tmp[2]) # number of the edges\n",
- " else:\n",
- " nb_nodes = int(tmp[0])\n",
- " nb_edges = int(tmp[1])\n",
- "\n",
- " for i in range(0, nb_nodes):\n",
- " tmp = content[i + 2].split(\" \")\n",
- " tmp = [x for x in tmp if x != '']\n",
- " G.add_node(i, label=tmp[3])\n",
- "\n",
- " for i in range(0, nb_edges):\n",
- " tmp = content[i + G.number_of_nodes() + 2].split(\" \")\n",
- " tmp = [x for x in tmp if x != '']\n",
- " G.add_edge(int(tmp[0]) - 1, int(tmp[1]) - 1, label=int(tmp[3]))\n",
- " return G\n",
- "\n",
- "\n",
- "def loadGXL(filename):\n",
- " import networkx as nx\n",
- " import xml.etree.ElementTree as ET\n",
- "\n",
- " tree = ET.parse(filename)\n",
- " root = tree.getroot()\n",
- " index = 0\n",
- " G = nx.Graph()\n",
- " dic={}\n",
- " for node in root.iter('node'):\n",
- " label = node.find('attr')[0].text\n",
- " dic[node.attrib['id']] = index\n",
- " G.add_node(index, id=node.attrib['id'], label=label)\n",
- " index += 1\n",
- " \n",
- " for edge in root.iter('edge'):\n",
- " label = edge.find('attr')[0].text\n",
- " G.add_edge(dic[edge.attrib['from']], dic[edge.attrib['to']], label=label)\n",
- " return G\n",
- " \n",
- "def loadDataset(filename):\n",
- " \"\"\"load file list of the dataset.\n",
- " \"\"\"\n",
- " from os.path import dirname, splitext\n",
- "\n",
- " dirname_dataset = dirname(filename)\n",
- " extension = splitext(filename)[1][1:]\n",
- " data = []\n",
- " y = []\n",
- " if(extension == \"ds\"):\n",
- " content = open(filename).read().splitlines()\n",
- " for i in range(0, len(content)):\n",
- " tmp = content[i].split(' ')\n",
- " data.append(loadCT(dirname_dataset + '/' + tmp[0].replace('#', '', 1))) # remove the '#'s in file names\n",
- " y.append(float(tmp[1]))\n",
- " elif(extension == \"cxl\"):\n",
- " import xml.etree.ElementTree as ET\n",
- "\n",
- " tree = ET.parse(filename)\n",
- " root = tree.getroot()\n",
- " data = []\n",
- " y = []\n",
- " for graph in root.iter('print'):\n",
- " mol_filename = graph.attrib['file']\n",
- " mol_class = graph.attrib['class']\n",
- " data.append(loadGXL(dirname_dataset + '/' + mol_filename))\n",
- " y.append(mol_class)\n",
- "\n",
- " return data, y"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 82,
- "metadata": {},
- "outputs": [
- {
- "ename": "SyntaxError",
- "evalue": "invalid syntax (<ipython-input-82-ac9ab22d42ef>, line 48)",
- "output_type": "error",
- "traceback": [
- "\u001b[0;36m File \u001b[0;32m\"<ipython-input-82-ac9ab22d42ef>\"\u001b[0;36m, line \u001b[0;32m48\u001b[0m\n\u001b[0;31m Kmatrix[j][i] += (i == j ? 0 : 1)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
- ]
- }
- ],
- "source": [
- "import sys\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../\")\n",
- "\n",
- "\n",
- "import networkx as nx\n",
- "import numpy as np\n",
- "import time\n",
- "\n",
- "from utils.utils import getSPGraph\n",
- "\n",
- "\n",
- "def spkernel(Gn):\n",
- " \"\"\"Compute the shortest-path kernel matrix for a list of graphs.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " Gn : list of NetworkX graphs\n",
- " The graphs between which the kernel is computed.\n",
- " \n",
- " Return\n",
- " ------\n",
- " Kmatrix : Numpy array of shape (len(Gn), len(Gn))\n",
- " The kernel matrix; entry (i, j) is the kernel value between the i-th and j-th graphs.\n",
- " \n",
- " References\n",
- " ----------\n",
- " [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. In Data Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE.\n",
- " \"\"\"\n",
- " Kmatrix = np.zeros((len(Gn), len(Gn)))\n",
- " \n",
- " Sn = [] # get shortest path graphs of Gn\n",
- " for i in range(0, len(Gn)):\n",
- " Sn.append(getSPGraph(Gn[i]))\n",
- " \n",
- "# print(S1.nodes(data = True))\n",
- "# print(S2.nodes(data = True))\n",
- "# print(S1.edges(data = True))\n",
- "# print(S2.edges(data = True))\n",
- " \n",
- " start_time = time.time()\n",
- " for i in range(0, len(Gn)):\n",
- " for j in range(i, len(Gn)):\n",
- " for e1 in Sn[i].edges(data = True):\n",
- " for e2 in Sn[j].edges(data = True): \n",
- " if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):\n",
- " Kmatrix[i][j] += 1\n",
- " Kmatrix[j][i] += (i == j ? 0 : 1)\n",
- " \n",
- " print(\"--- %s seconds ---\" % (time.time() - start_time))\n",
- " \n",
- " return Kmatrix"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 83,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0\n",
- "--- 0.05678129196166992 seconds ---\n",
- "1\n",
- "--- 0.15176129341125488 seconds ---\n",
- "2\n",
- "--- 0.20930719375610352 seconds ---\n",
- "3\n",
- "--- 0.3049781322479248 seconds ---\n",
- "4\n",
- "--- 0.4029049873352051 seconds ---\n",
- "5\n",
- "--- 0.5458371639251709 seconds ---\n",
- "6\n",
- "--- 0.6920650005340576 seconds ---\n",
- "7\n",
- "--- 0.7972092628479004 seconds ---\n",
- "8\n",
- "--- 0.947425365447998 seconds ---\n",
- "9\n",
- "--- 1.1016933917999268 seconds ---\n",
- "10\n",
- "--- 1.2554333209991455 seconds ---\n",
- "11\n",
- "--- 1.4140815734863281 seconds ---\n",
- "12\n",
- "--- 1.562861442565918 seconds ---\n",
- "13\n",
- "--- 1.7876057624816895 seconds ---\n",
- "14\n",
- "--- 1.9889881610870361 seconds ---\n",
- "15\n",
- "--- 2.2633984088897705 seconds ---\n",
- "16\n",
- "--- 2.480710983276367 seconds ---\n",
- "17\n",
- "--- 2.683915138244629 seconds ---\n",
- "18\n",
- "--- 2.8276052474975586 seconds ---\n",
- "19\n",
- "--- 2.972059488296509 seconds ---\n",
- "20\n",
- "--- 3.11892032623291 seconds ---\n",
- "21\n",
- "--- 3.330472469329834 seconds ---\n",
- "22\n",
- "--- 3.5461206436157227 seconds ---\n",
- "23\n",
- "--- 3.7521393299102783 seconds ---\n",
- "24\n",
- "--- 3.956348180770874 seconds ---\n",
- "25\n",
- "--- 4.162136793136597 seconds ---\n",
- "26\n",
- "--- 4.365236759185791 seconds ---\n",
- "27\n",
- "--- 4.572294473648071 seconds ---\n",
- "28\n",
- "--- 4.778241872787476 seconds ---\n",
- "29\n",
- "--- 4.981487035751343 seconds ---\n",
- "30\n",
- "--- 5.189010143280029 seconds ---\n",
- "31\n",
- "--- 5.466430902481079 seconds ---\n",
- "32\n",
- "--- 5.73804497718811 seconds ---\n",
- "33\n",
- "--- 6.0193397998809814 seconds ---\n",
- "34\n",
- "--- 6.293334245681763 seconds ---\n",
- "35\n",
- "--- 6.569210767745972 seconds ---\n",
- "36\n",
- "--- 6.783808708190918 seconds ---\n",
- "37\n",
- "--- 6.999167203903198 seconds ---\n",
- "38\n",
- "--- 7.209052085876465 seconds ---\n",
- "39\n",
- "--- 7.414280652999878 seconds ---\n",
- "40\n",
- "--- 7.620949983596802 seconds ---\n",
- "41\n",
- "--- 7.892791986465454 seconds ---\n",
- "42\n",
- "--- 8.166114330291748 seconds ---\n",
- "43\n",
- "--- 8.46480393409729 seconds ---\n",
- "44\n",
- "--- 8.75532841682434 seconds ---\n",
- "45\n",
- "--- 9.027160882949829 seconds ---\n",
- "46\n",
- "--- 9.303063869476318 seconds ---\n",
- "47\n",
- "--- 9.575549125671387 seconds ---\n",
- "48\n",
- "--- 9.867429733276367 seconds ---\n",
- "49\n",
- "--- 10.160123109817505 seconds ---\n",
- "50\n",
- "--- 10.437638759613037 seconds ---\n",
- "51\n",
- "--- 10.714671611785889 seconds ---\n",
- "52\n",
- "--- 10.987818479537964 seconds ---\n",
- "53\n",
- "--- 11.259410381317139 seconds ---\n",
- "54\n",
- "--- 11.535178184509277 seconds ---\n",
- "55\n",
- "--- 11.807695865631104 seconds ---\n",
- "56\n",
- "--- 12.158225774765015 seconds ---\n",
- "57\n",
- "--- 12.506253004074097 seconds ---\n",
- "58\n",
- "--- 12.856064319610596 seconds ---\n",
- "59\n",
- "--- 13.203948497772217 seconds ---\n",
- "60\n",
- "--- 13.552793741226196 seconds ---\n",
- "61\n",
- "--- 13.906684160232544 seconds ---\n",
- "62\n",
- "--- 14.256698369979858 seconds ---\n",
- "63\n",
- "--- 14.606950283050537 seconds ---\n",
- "64\n",
- "--- 14.876070022583008 seconds ---\n",
- "65\n",
- "--- 15.148754596710205 seconds ---\n",
- "66\n",
- "--- 15.43168306350708 seconds ---\n",
- "67\n",
- "--- 15.710469961166382 seconds ---\n",
- "68\n",
- "--- 15.98047399520874 seconds ---\n",
- "69\n",
- "--- 16.25121569633484 seconds ---\n",
- "70\n",
- "--- 16.52086853981018 seconds ---\n",
- "71\n",
- "--- 16.790047645568848 seconds ---\n",
- "72\n",
- "--- 17.06355619430542 seconds ---\n",
- "73\n",
- "--- 17.335728406906128 seconds ---\n",
- "74\n",
- "--- 17.607405424118042 seconds ---\n",
- "75\n",
- "--- 17.955402135849 seconds ---\n",
- "76\n",
- "--- 18.303555488586426 seconds ---\n",
- "77\n",
- "--- 18.654282808303833 seconds ---\n",
- "78\n",
- "--- 19.004570245742798 seconds ---\n",
- "79\n",
- "--- 19.35291624069214 seconds ---\n",
- "80\n",
- "--- 19.700473070144653 seconds ---\n",
- "81\n",
- "--- 20.04847502708435 seconds ---\n",
- "82\n",
- "--- 20.39787983894348 seconds ---\n",
- "83\n",
- "--- 20.74629044532776 seconds ---\n",
- "84\n",
- "--- 21.094562768936157 seconds ---\n",
- "85\n",
- "--- 21.445199489593506 seconds ---\n",
- "86\n",
- "--- 21.794403791427612 seconds ---\n",
- "87\n",
- "--- 22.143365383148193 seconds ---\n",
- "88\n",
- "--- 22.49206042289734 seconds ---\n",
- "89\n",
- "--- 22.840426445007324 seconds ---\n",
- "90\n",
- "--- 23.189460515975952 seconds ---\n",
- "91\n",
- "--- 23.539386749267578 seconds ---\n",
- "92\n",
- "--- 23.888701677322388 seconds ---\n",
- "93\n",
- "--- 24.23668909072876 seconds ---\n",
- "94\n",
- "--- 24.58505630493164 seconds ---\n",
- "95\n",
- "--- 25.019609451293945 seconds ---\n",
- "96\n",
- "--- 25.456527709960938 seconds ---\n",
- "97\n",
- "--- 25.891918182373047 seconds ---\n",
- "98\n",
- "--- 26.32820987701416 seconds ---\n",
- "99\n",
- "--- 26.76149344444275 seconds ---\n",
- "100\n",
- "--- 27.197012424468994 seconds ---\n",
- "101\n",
- "--- 27.63314127922058 seconds ---\n",
- "102\n",
- "--- 28.068315029144287 seconds ---\n",
- "103\n",
- "--- 28.50419807434082 seconds ---\n",
- "104\n",
- "--- 28.852453231811523 seconds ---\n",
- "105\n",
- "--- 29.205727338790894 seconds ---\n",
- "106\n",
- "--- 29.554840087890625 seconds ---\n",
- "107\n",
- "--- 29.90355086326599 seconds ---\n",
- "108\n",
- "--- 30.251071214675903 seconds ---\n",
- "109\n",
- "--- 30.599868059158325 seconds ---\n",
- "110\n",
- "--- 30.94942593574524 seconds ---\n",
- "111\n",
- "--- 31.298285245895386 seconds ---\n",
- "112\n",
- "--- 31.648550271987915 seconds ---\n",
- "113\n",
- "--- 32.0825355052948 seconds ---\n",
- "114\n",
- "--- 32.516993045806885 seconds ---\n",
- "115\n",
- "--- 32.950743198394775 seconds ---\n",
- "116\n",
- "--- 33.38488531112671 seconds ---\n",
- "117\n",
- "--- 33.81857705116272 seconds ---\n",
- "118\n",
- "--- 34.27995991706848 seconds ---\n",
- "119\n",
- "--- 34.728654623031616 seconds ---\n",
- "120\n",
- "--- 35.16262221336365 seconds ---\n",
- "121\n",
- "--- 35.5960898399353 seconds ---\n",
- "122\n",
- "--- 36.02964925765991 seconds ---\n",
- "123\n",
- "--- 36.46674466133118 seconds ---\n",
- "124\n",
- "--- 36.91917443275452 seconds ---\n",
- "125\n",
- "--- 37.381704330444336 seconds ---\n",
- "126\n",
- "--- 37.81864261627197 seconds ---\n",
- "127\n",
- "--- 38.3528311252594 seconds ---\n",
- "128\n",
- "--- 38.89131188392639 seconds ---\n",
- "129\n",
- "--- 39.42161011695862 seconds ---\n",
- "130\n",
- "--- 39.95006561279297 seconds ---\n",
- "131\n",
- "--- 40.476089000701904 seconds ---\n",
- "132\n",
- "--- 41.00121235847473 seconds ---\n",
- "133\n",
- "--- 41.4318163394928 seconds ---\n",
- "134\n",
- "--- 41.86459708213806 seconds ---\n",
- "135\n",
- "--- 42.29518222808838 seconds ---\n",
- "136\n",
- "--- 42.729474782943726 seconds ---\n",
- "137\n",
- "--- 43.16999864578247 seconds ---\n",
- "138\n",
- "--- 43.606104135513306 seconds ---\n",
- "139\n",
- "--- 44.04209113121033 seconds ---\n",
- "140\n",
- "--- 44.4772834777832 seconds ---\n",
- "141\n",
- "--- 45.01142644882202 seconds ---\n",
- "142\n",
- "--- 45.543590784072876 seconds ---\n",
- "143\n",
- "--- 46.07910680770874 seconds ---\n",
- "144\n",
- "--- 46.612366914749146 seconds ---\n",
- "145\n",
- "--- 47.1452751159668 seconds ---\n",
- "146\n",
- "--- 47.67322564125061 seconds ---\n",
- "147\n",
- "--- 48.20156168937683 seconds ---\n",
- "148\n",
- "--- 48.73471546173096 seconds ---\n",
- "149\n",
- "--- 49.2733519077301 seconds ---\n",
- "150\n",
- "--- 49.806400537490845 seconds ---\n",
- "151\n",
- "--- 50.33490014076233 seconds ---\n",
- "152\n",
- "--- 50.86489534378052 seconds ---\n",
- "153\n",
- "--- 51.39602565765381 seconds ---\n",
- "154\n",
- "--- 51.93729043006897 seconds ---\n",
- "155\n",
- "--- 52.473469972610474 seconds ---\n",
- "156\n",
- "--- 53.01401090621948 seconds ---\n",
- "157\n",
- "--- 53.58053278923035 seconds ---\n",
- "158\n",
- "--- 54.22534370422363 seconds ---\n",
- "159\n",
- "--- 54.870089292526245 seconds ---\n",
- "160\n",
- "--- 55.50953507423401 seconds ---\n",
- "161\n",
- "--- 56.144059896469116 seconds ---\n",
- "162\n",
- "--- 56.779675245285034 seconds ---\n",
- "163\n",
- "--- 57.41550326347351 seconds ---\n",
- "164\n",
- "--- 58.04742622375488 seconds ---\n",
- "165\n",
- "--- 58.57527136802673 seconds ---\n",
- "166\n",
- "--- 59.10521101951599 seconds ---\n",
- "167\n",
- "--- 59.737877368927 seconds ---\n",
- "168\n",
- "--- 60.373518228530884 seconds ---\n",
- "169\n",
- "--- 61.00429916381836 seconds ---\n",
- "170\n",
- "--- 61.64198398590088 seconds ---\n",
- "171\n",
- "--- 62.27683877944946 seconds ---\n",
- "172\n",
- "--- 62.91516971588135 seconds ---\n",
- "173\n",
- "--- 63.26179265975952 seconds ---\n",
- "174\n",
- "--- 63.89728498458862 seconds ---\n",
- "175\n",
- "--- 64.53867197036743 seconds ---\n",
- "176\n",
- "--- 65.18402314186096 seconds ---\n",
- "177\n",
- "--- 65.82770419120789 seconds ---\n",
- "178\n",
- "--- 66.46093964576721 seconds ---\n",
- "179\n",
- "--- 67.09133172035217 seconds ---\n",
- "180\n",
- "--- 67.73470735549927 seconds ---\n",
- "181\n",
- "--- 68.4084141254425 seconds ---\n",
- "182\n",
- "--- 69.05118441581726 seconds ---\n",
- "183\n",
- "--- 69.68487596511841 seconds ---\n",
- "184\n",
- "--- 70.3204357624054 seconds ---\n",
- "[[ 3. 1. 3. ..., 1. 1. 1.]\n",
- " [ 1. 6. 1. ..., 0. 0. 3.]\n",
- " [ 3. 1. 3. ..., 1. 1. 1.]\n",
- " ..., \n",
- " [ 1. 0. 1. ..., 55. 21. 7.]\n",
- " [ 1. 0. 1. ..., 21. 55. 7.]\n",
- " [ 1. 3. 1. ..., 7. 7. 55.]]\n"
- ]
- }
- ],
- "source": [
- "dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "G1 = dataset[12]\n",
- "G2 = dataset[20]\n",
- "Kmatrix = spkernel(dataset)\n",
- "\n",
- "print(Kmatrix)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|