diff --git a/README.md b/README.md index d988a2f..1183519 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,30 @@ a python package for graph kernels. * sklearn - 0.19.1 * tabulate - 0.8.2 -## results with minimal RMSE for each kernel on dataset Asyclic -| Kernels | RMSE(℃) | std(℃) | parameter | -|---------------|:---------:|:--------:|-------------:| -| shortest path | 36.400524 | 5.352940 | - | -| marginalized | 17.8991 | 6.59104 | p_quit = 0.1 | -| path | 14.270816 | 6.366698 | - | -| WL subtree | 9.01403 | 6.35786 | height = 1 | +## results with minimal test RMSE for each kernel on dataset Asyclic +-- All the kernels are tested on dataset Asyclic, which consists of 185 molecules (graphs). +-- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression. +-- For prediction we randomly divide the data into train and test subsets, where 90% of the entire dataset is for training and the rest for testing. 10 splits are performed. For each split, we first train on the train data, then evaluate the performance on the test set. We choose the optimal parameters for the test set and finally provide the corresponding performance. The final results correspond to the average of the performances on the test sets. + +| Kernels | RMSE(℃) | std(℃) | parameter | k_time | +|---------------|:---------:|:--------:|-------------:|-------:| +| shortest path | 36.40 | 5.35 | - | - | +| marginalized | 17.90 | 6.59 | p_quit = 0.1 | - | +| path | 14.27 | 6.37 | - | - | +| WL subtree | 9.00 | 6.37 | height = 1 | 0.85" | + +**In each line, parameter is the one with which the kernel achieves the best results. +In each line, k_time is the time spent on building the kernel matrix. +See detailed results in [results.md](pygraph/kernels/results.md).** ## updates +### 2017.12.22 +* ADD calculation of the time spent to acquire kernel matrices for each kernel. 
- linlin +* MOD floydTransformation function, calculate shortest paths taking into consideration user-defined edge weight. - linlin +* MOD implementation of nodes and edges attributes genericity for all kernels. - linlin +* ADD detailed results file results.md. - linlin +### 2017.12.21 +* MOD Weisfeiler-Lehman subtree kernel and the test code. - linlin ### 2017.12.20 * ADD Weisfeiler-Lehman subtree kernel and its result on dataset Asyclic. - linlin ### 2017.12.07 diff --git a/notebooks/.ipynb_checkpoints/run_WeisfeilerLehmankernel_acyclic-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/run_WeisfeilerLehmankernel_acyclic-checkpoint.ipynb index 7242073..4b7d560 100644 --- a/notebooks/.ipynb_checkpoints/run_WeisfeilerLehmankernel_acyclic-checkpoint.ipynb +++ b/notebooks/.ipynb_checkpoints/run_WeisfeilerLehmankernel_acyclic-checkpoint.ipynb @@ -221,8 +221,10 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "execution_count": 20, + "metadata": { + "scrolled": false + }, "outputs": [ { "name": "stdout", @@ -236,22 +238,154 @@ " --- for graph 0 --- \n", "\n", "labels_ori: ['C', 'C', 'C', 'C', 'C', 'O', 'O']\n", + "all_labels_ori: {'C', 'O'}\n", "num_of_each_label: {'C': 5, 'O': 2}\n", - "num_of_labels: 2\n" + "all_num_of_each_label: [{'C': 5, 'O': 2}]\n", + "num_of_labels: 2\n", + "all_labels_ori: {'C', 'O'}\n", + "\n", + " --- for graph 1 --- \n", + "\n", + "labels_ori: ['C', 'C', 'C', 'C', 'C', 'C', 'S', 'S']\n", + "all_labels_ori: {'C', 'O', 'S'}\n", + "num_of_each_label: {'C': 6, 'S': 2}\n", + "all_num_of_each_label: [{'C': 5, 'O': 2}, {'C': 6, 'S': 2}]\n", + "num_of_labels: 2\n", + "all_labels_ori: {'C', 'O', 'S'}\n", + "\n", + " all_num_of_labels_occured: 3\n", + "\n", + " --- calculating kernel matrix ---\n", + "\n", + " labels: {'C', 'O'}\n", + "vector1: [[5 2]]\n", + "vector2: [[5 2]]\n", + "Kmatrix: [[ 29. 0.]\n", + " [ 0. 
0.]]\n", + "\n", + " labels: {'C', 'O', 'S'}\n", + "vector1: [[5 2 0]]\n", + "vector2: [[6 0 2]]\n", + "Kmatrix: [[ 29. 30.]\n", + " [ 30. 0.]]\n", + "\n", + " labels: {'C', 'S'}\n", + "vector1: [[6 2]]\n", + "vector2: [[6 2]]\n", + "Kmatrix: [[ 29. 30.]\n", + " [ 30. 40.]]\n", + "\n", + " --- height = 1 --- \n", + "\n", + " --- for graph 0 --- \n", + "\n", + "multiset: ['CC', 'CC', 'CCO', 'CCO', 'COO', 'OCC', 'OCC']\n", + "set_unique: ['OCC', 'COO', 'CCO', 'CC']\n", + "set_compressed: {'OCC': '4', 'COO': '5', 'CCO': '6', 'CC': '7'}\n", + "all_set_compressed: {'OCC': '4', 'COO': '5', 'CCO': '6', 'CC': '7'}\n", + "num_of_labels_occured: 7\n", + "\n", + " compressed labels: {0: '7', 1: '7', 2: '6', 3: '6', 4: '5', 5: '4', 6: '4'}\n", + "labels_comp: ['7', '7', '6', '6', '5', '4', '4']\n", + "all_labels_ori: {'5', '4', '6', '7'}\n", + "num_of_each_label: {'5': 1, '4': 2, '6': 2, '7': 2}\n", + "all_num_of_each_label: [{'5': 1, '4': 2, '6': 2, '7': 2}]\n", + "\n", + " --- for graph 1 --- \n", + "\n", + "multiset: ['CC', 'CC', 'CC', 'CCS', 'CCS', 'CCSS', 'SCC', 'SCC']\n", + "set_unique: ['SCC', 'CC', 'CCS', 'CCSS']\n", + "set_compressed: {'SCC': '8', 'CC': '7', 'CCS': '9', 'CCSS': '10'}\n", + "all_set_compressed: {'SCC': '8', 'COO': '5', 'CCS': '9', 'OCC': '4', 'CCO': '6', 'CCSS': '10', 'CC': '7'}\n", + "num_of_labels_occured: 10\n", + "\n", + " compressed labels: {0: '7', 1: '7', 2: '7', 3: '9', 4: '9', 5: '10', 6: '8', 7: '8'}\n", + "labels_comp: ['7', '7', '7', '9', '9', '10', '8', '8']\n", + "all_labels_ori: {'10', '4', '7', '9', '6', '5', '8'}\n", + "num_of_each_label: {'10': 1, '9': 2, '7': 3, '8': 2}\n", + "all_num_of_each_label: [{'5': 1, '4': 2, '6': 2, '7': 2}, {'10': 1, '9': 2, '7': 3, '8': 2}]\n", + "\n", + " all_num_of_labels_occured: 10\n", + "\n", + " --- calculating kernel matrix ---\n", + "\n", + " labels: {'5', '4', '6', '7'}\n", + "vector1: [[1 2 2 2]]\n", + "vector2: [[1 2 2 2]]\n", + "\n", + " labels: {'10', '4', '7', '9', '6', '5', '8'}\n", + 
"vector1: [[0 2 2 0 2 1 0]]\n", + "vector2: [[1 0 3 2 0 0 2]]\n", + "\n", + " labels: {'8', '10', '7', '9'}\n", + "vector1: [[2 1 3 2]]\n", + "vector2: [[2 1 3 2]]\n", + "\n", + " Kmatrix: [[ 42. 36.]\n", + " [ 36. 58.]]\n", + "\n", + " --- height = 2 --- \n", + "\n", + " --- for graph 0 --- \n", + "\n", + "multiset: ['76', '76', '647', '647', '544', '456', '456']\n", + "set_unique: ['647', '76', '456', '544']\n", + "set_compressed: {'647': '11', '76': '12', '544': '14', '456': '13'}\n", + "all_set_compressed: {'647': '11', '76': '12', '456': '13', '544': '14'}\n", + "num_of_labels_occured: 14\n", + "\n", + " compressed labels: {0: '12', 1: '12', 2: '11', 3: '11', 4: '14', 5: '13', 6: '13'}\n", + "labels_comp: ['12', '12', '11', '11', '14', '13', '13']\n", + "all_labels_ori: {'14', '12', '11', '13'}\n", + "num_of_each_label: {'14': 1, '13': 2, '12': 2, '11': 2}\n", + "all_num_of_each_label: [{'14': 1, '13': 2, '12': 2, '11': 2}]\n", + "\n", + " --- for graph 1 --- \n", + "\n", + "multiset: ['79', '79', '710', '978', '978', '10788', '8109', '8109']\n", + "set_unique: ['710', '8109', '79', '10788', '978']\n", + "set_compressed: {'710': '15', '79': '17', '8109': '16', '978': '19', '10788': '18'}\n", + "all_set_compressed: {'710': '15', '79': '17', '978': '19', '10788': '18', '8109': '16', '456': '13', '544': '14', '647': '11', '76': '12'}\n", + "num_of_labels_occured: 19\n", + "\n", + " compressed labels: {0: '17', 1: '17', 2: '15', 3: '19', 4: '19', 5: '18', 6: '16', 7: '16'}\n", + "labels_comp: ['17', '17', '15', '19', '19', '18', '16', '16']\n", + "all_labels_ori: {'18', '19', '12', '13', '17', '11', '14', '16', '15'}\n", + "num_of_each_label: {'15': 1, '17': 2, '19': 2, '16': 2, '18': 1}\n", + "all_num_of_each_label: [{'14': 1, '13': 2, '12': 2, '11': 2}, {'15': 1, '17': 2, '19': 2, '16': 2, '18': 1}]\n", + "\n", + " all_num_of_labels_occured: 19\n", + "\n", + " --- calculating kernel matrix ---\n", + "\n", + " labels: {'14', '12', '11', '13'}\n", + "vector1: [[1 
2 2 2]]\n", + "vector2: [[1 2 2 2]]\n", + "\n", + " labels: {'18', '19', '12', '13', '17', '11', '14', '16', '15'}\n", + "vector1: [[0 0 2 2 0 2 1 0 0]]\n", + "vector2: [[1 2 0 0 2 0 0 2 1]]\n", + "\n", + " labels: {'18', '17', '15', '16', '19'}\n", + "vector1: [[1 2 1 2 2]]\n", + "vector2: [[1 2 1 2 2]]\n", + "\n", + " Kmatrix: [[ 55. 36.]\n", + " [ 36. 72.]]\n", + "\n", + " --- Weisfeiler-Lehman subtree kernel built in 0.0034377574920654297 seconds ---\n" ] }, { - "ename": "UnboundLocalError", - "evalue": "local variable 'all_labels_ori' referenced before assignment", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_node_attributes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'label'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 330\u001b[0;31m \u001b[0mweisfeilerlehmankernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mG2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 331\u001b[0m \u001b[0;31m# Kmatrix = weisfeilerlehmankernel(G1, G2)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mweisfeilerlehmankernel\u001b[0;34m(height, base_kernel, *args)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;31m# print(args)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 80\u001b[0;31m \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_wl_subtreekernel_do\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mheight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase_kernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'subtree'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 81\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;31m# for WL edge kernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36m_wl_subtreekernel_do\u001b[0;34m(height, base_kernel, *args)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'num_of_labels: %s'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnum_of_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 222\u001b[0;31m \u001b[0mall_labels_ori\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels_ori\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 223\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'all_labels_ori: %s'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mall_labels_ori\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'all_labels_ori' referenced before assignment" - ] + "data": { + "text/plain": [ + "array([[ 55., 36.],\n", + " [ 36., 72.]])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -476,6 +610,8 @@ " print('\\n --- for graph %d --- \\n' % (idx))\n", " labels_ori = list(nx.get_node_attributes(G, 'label').values())\n", " print('labels_ori: %s' % (labels_ori))\n", + " all_labels_ori.update(labels_ori)\n", + " print('all_labels_ori: %s' % (all_labels_ori))\n", " num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph\n", " print('num_of_each_label: %s' % (num_of_each_label))\n", " all_num_of_each_label.append(num_of_each_label)\n", @@ -487,45 +623,48 @@ " all_labels_ori.update(labels_ori)\n", " print('all_labels_ori: %s' % (all_labels_ori))\n", " \n", + " all_num_of_labels_occured += len(all_labels_ori)\n", + " print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n", + " \n", " # calculate subtree kernel with the 0th iteration and add it to the final kernel\n", + " print('\\n --- calculating kernel matrix ---')\n", " for i in range(0, len(Gn)):\n", " for j in range(i, len(Gn)):\n", " labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n", " print('\\n labels: %s' % (labels))\n", " vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n", " vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n", - " print('\\n vector1: %s' % (vector1))\n", - " print('\\n vector2: %s' % (vector2))\n", + " print('vector1: %s' % (vector1))\n", + " print('vector2: %s' % (vector2))\n", " Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n", " Kmatrix[j][i] = Kmatrix[i][j]\n", - " \n", - " \n", + " print('Kmatrix: %s' % (Kmatrix))\n", 
+ "\n", " \n", " # iterate each height\n", - " for h in range(height + 1):\n", + " for h in range(1, height + 1):\n", " print('\\n --- height = %d --- ' % (h))\n", - " all_labels_ori = set() # all unique orignal labels in all graphs in this iteration\n", - "# all_labels_comp = set() # all unique compressed labels in all graphs in this iteration\n", - " all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration\n", " all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration\n", " num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs\n", + " all_labels_ori = set()\n", + " all_num_of_each_label = []\n", " \n", " # for each graph\n", " for idx, G in enumerate(Gn):\n", - " # get the set of original labels\n", + "# # get the set of original labels\n", " print('\\n --- for graph %d --- \\n' % (idx))\n", - " labels_ori = list(nx.get_node_attributes(G, 'label').values())\n", - " print('labels_ori: %s' % (labels_ori))\n", - " num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph\n", - " print('num_of_each_label: %s' % (num_of_each_label))\n", - " num_of_labels = len(num_of_each_label) # number of all unique labels\n", - " print('num_of_labels: %s' % (num_of_labels))\n", + "# labels_ori = list(nx.get_node_attributes(G, 'label').values())\n", + "# print('labels_ori: %s' % (labels_ori))\n", + "# num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph\n", + "# print('num_of_each_label: %s' % (num_of_each_label))\n", + "# num_of_labels = len(num_of_each_label) # number of all unique labels\n", + "# print('num_of_labels: %s' % (num_of_labels))\n", " \n", - " all_labels_ori.update(labels_ori)\n", - " print('all_labels_ori: %s' % (all_labels_ori))\n", - " # num_of_labels_occured += num_of_labels #@todo not precise\n", - " 
num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)\n", - " print('num_of_labels_occured: %s' % (num_of_labels_occured))\n", + "# all_labels_ori.update(labels_ori)\n", + "# print('all_labels_ori: %s' % (all_labels_ori))\n", + "# # num_of_labels_occured += num_of_labels #@todo not precise\n", + "# num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)\n", + "# print('num_of_labels_occured: %s' % (num_of_labels_occured))\n", " \n", " set_multisets = []\n", " for node in G.nodes(data = True):\n", @@ -558,7 +697,6 @@ "# num_of_labels_occured += len(set_compressed) #@todo not precise\n", " print('num_of_labels_occured: %s' % (num_of_labels_occured))\n", " \n", - "\n", " # relabel nodes\n", " # nx.relabel_nodes(G, set_compressed, copy = False)\n", " for node in G.nodes(data = True):\n", @@ -568,25 +706,29 @@ " # get the set of compressed labels\n", " labels_comp = list(nx.get_node_attributes(G, 'label').values())\n", " print('labels_comp: %s' % (labels_comp))\n", - " num_of_each_label.update(dict(Counter(labels_comp)))\n", + " all_labels_ori.update(labels_comp)\n", + " print('all_labels_ori: %s' % (all_labels_ori))\n", + " num_of_each_label = dict(Counter(labels_comp))\n", " print('num_of_each_label: %s' % (num_of_each_label))\n", " all_num_of_each_label.append(num_of_each_label)\n", " print('all_num_of_each_label: %s' % (all_num_of_each_label))\n", + " \n", + " all_num_of_labels_occured += len(all_labels_ori)\n", + " print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n", " \n", " # calculate subtree kernel with h iterations and add it to the final kernel\n", + " print('\\n --- calculating kernel matrix ---')\n", " for i in range(0, len(Gn)):\n", " for j in range(i, len(Gn)):\n", " labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n", " print('\\n labels: %s' % (labels))\n", " vector1 = np.matrix([ 
(all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n", " vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n", - " print('\\n vector1: %s' % (vector1))\n", - " print('\\n vector2: %s' % (vector2))\n", + " print('vector1: %s' % (vector1))\n", + " print('vector2: %s' % (vector2))\n", " Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n", " Kmatrix[j][i] = Kmatrix[i][j]\n", " \n", - " all_num_of_labels_occured += len(all_labels_ori)\n", - " print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n", " print('\\n Kmatrix: %s' % (Kmatrix))\n", "\n", " return Kmatrix\n", @@ -606,13 +748,13 @@ "G2 = dataset[80]\n", "print(nx.get_node_attributes(G2, 'label'))\n", "\n", - "weisfeilerlehmankernel(G1, G2, height = 1)\n", + "weisfeilerlehmankernel(G1, G2, height = 2)\n", "# Kmatrix = weisfeilerlehmankernel(G1, G2)" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -630,7 +772,7 @@ "correspond to the average of the performances on the test sets. \n", "\n", "@references\n", - " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", + " Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", "\n", "\n", " --- calculating kernel matrix when subtree height = 0 ---\n", @@ -659,22 +801,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.49373626708984375 seconds ---\n", - "[[ 10. 10. 4. ..., 20. 20. 20.]\n", - " [ 10. 16. 4. ..., 20. 20. 20.]\n", - " [ 4. 4. 10. ..., 22. 22. 24.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.3845643997192383 seconds ---\n", + "[[ 5. 6. 4. ..., 20. 20. 20.]\n", + " [ 6. 8. 4. 
..., 20. 20. 20.]\n", + " [ 4. 4. 5. ..., 21. 21. 21.]\n", " ..., \n", - " [ 20. 20. 22. ..., 130. 130. 122.]\n", - " [ 20. 20. 22. ..., 130. 130. 122.]\n", - " [ 20. 20. 24. ..., 122. 122. 154.]]\n", + " [ 20. 20. 21. ..., 101. 101. 101.]\n", + " [ 20. 20. 21. ..., 101. 101. 101.]\n", + " [ 20. 20. 21. ..., 101. 101. 101.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 12.761978\n", - "With standard deviation: 10.086502\n", + " Mean performance on train set: 141.418957\n", + "With standard deviation: 1.082842\n", "\n", - " Mean performance on test set: 9.014031\n", - "With standard deviation: 6.357865\n", + " Mean performance on test set: 36.210792\n", + "With standard deviation: 7.331787\n", "\n", " --- calculating kernel matrix when subtree height = 1 ---\n", "\n", @@ -702,22 +844,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.0043864250183105 seconds ---\n", - "[[ 20. 14. 8. ..., 20. 20. 22.]\n", - " [ 14. 32. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 20. ..., 25. 25. 30.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.853447437286377 seconds ---\n", + "[[ 10. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 16. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 10. ..., 22. 22. 24.]\n", " ..., \n", - " [ 20. 28. 25. ..., 188. 180. 145.]\n", - " [ 20. 28. 25. ..., 180. 182. 145.]\n", - " [ 22. 22. 30. ..., 145. 145. 238.]]\n", + " [ 20. 20. 22. ..., 130. 130. 122.]\n", + " [ 20. 20. 22. ..., 130. 130. 122.]\n", + " [ 20. 20. 24. ..., 122. 122. 
154.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 22.747869\n", - "With standard deviation: 7.561365\n", + " Mean performance on train set: 140.065309\n", + "With standard deviation: 0.877976\n", "\n", - " Mean performance on test set: 19.457133\n", - "With standard deviation: 5.057464\n", + " Mean performance on test set: 9.000982\n", + "With standard deviation: 6.371454\n", "\n", " --- calculating kernel matrix when subtree height = 2 ---\n", "\n", @@ -745,22 +887,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.602942705154419 seconds ---\n", - "[[ 30. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 48. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 30. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.374389410018921 seconds ---\n", + "[[ 15. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 24. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 15. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 246. 209. 147.]\n", - " [ 20. 28. 25. ..., 209. 220. 147.]\n", - " [ 23. 22. 32. ..., 147. 147. 286.]]\n", + " [ 20. 20. 22. ..., 159. 151. 124.]\n", + " [ 20. 20. 22. ..., 151. 153. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 185.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 21.407092\n", - "With standard deviation: 6.415967\n", + " Mean performance on train set: 140.074983\n", + "With standard deviation: 0.928821\n", "\n", - " Mean performance on test set: 23.466810\n", - "With standard deviation: 5.836831\n", + " Mean performance on test set: 19.811299\n", + "With standard deviation: 4.049105\n", "\n", " --- calculating kernel matrix when subtree height = 3 ---\n", "\n", @@ -794,22 +936,22 @@ "output_type": "stream", "text": [ "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.2096023559570312 seconds ---\n", - "[[ 40. 14. 8. ..., 20. 20. 
23.]\n", - " [ 14. 64. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 40. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.9141185283660889 seconds ---\n", + "[[ 20. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 32. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 20. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 304. 217. 147.]\n", - " [ 20. 28. 25. ..., 217. 250. 147.]\n", - " [ 23. 22. 32. ..., 147. 147. 314.]]\n", + " [ 20. 20. 22. ..., 188. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 168. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 202.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 24.747018\n", - "With standard deviation: 6.547340\n", + " Mean performance on train set: 140.197806\n", + "With standard deviation: 0.873857\n", "\n", - " Mean performance on test set: 27.961360\n", - "With standard deviation: 6.291821\n", + " Mean performance on test set: 25.045500\n", + "With standard deviation: 4.942763\n", "\n", " --- calculating kernel matrix when subtree height = 4 ---\n", "\n", @@ -837,22 +979,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.7832393646240234 seconds ---\n", - "[[ 50. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 80. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 50. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.393263578414917 seconds ---\n", + "[[ 25. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 40. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 25. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 362. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 280. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 336.]]\n", + " [ 20. 20. 22. ..., 217. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 183. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
213.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 27.197367\n", - "With standard deviation: 5.980185\n", + " Mean performance on train set: 140.272421\n", + "With standard deviation: 0.838915\n", "\n", - " Mean performance on test set: 30.614531\n", - "With standard deviation: 6.852841\n", + " Mean performance on test set: 28.225454\n", + "With standard deviation: 6.521196\n", "\n", " --- calculating kernel matrix when subtree height = 5 ---\n", "\n", @@ -880,22 +1022,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.276118040084839 seconds ---\n", - "[[ 60. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 96. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 60. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.893545389175415 seconds ---\n", + "[[ 30. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 48. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 30. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 420. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 310. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 358.]]\n", + " [ 20. 20. 22. ..., 246. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 198. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
224.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 29.010593\n", - "With standard deviation: 6.073672\n", + " Mean performance on train set: 140.247025\n", + "With standard deviation: 0.863630\n", "\n", - " Mean performance on test set: 32.130815\n", - "With standard deviation: 7.062947\n", + " Mean performance on test set: 30.635436\n", + "With standard deviation: 6.736466\n", "\n", " --- calculating kernel matrix when subtree height = 6 ---\n", "\n", @@ -923,22 +1065,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.779860496520996 seconds ---\n", - "[[ 70. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 112. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 70. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.216407299041748 seconds ---\n", + "[[ 35. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 56. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 35. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 478. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 340. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 380.]]\n", + " [ 20. 20. 22. ..., 275. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 213. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
235.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 30.909632\n", - "With standard deviation: 6.490001\n", + " Mean performance on train set: 140.239201\n", + "With standard deviation: 0.872475\n", "\n", - " Mean performance on test set: 33.117974\n", - "With standard deviation: 7.069399\n", + " Mean performance on test set: 32.102695\n", + "With standard deviation: 6.856006\n", "\n", " --- calculating kernel matrix when subtree height = 7 ---\n", "\n", @@ -964,18 +1106,7 @@ "\n", " --- This is a regression problem ---\n", "\n", - " Calculating kernel matrix, this could take a while...\n", - "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.446576833724976 seconds ---\n", - "[[ 80. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 128. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 80. ..., 25. 25. 32.]\n", - " ..., \n", - " [ 20. 28. 25. ..., 536. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 370. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 402.]]\n", - "\n", - " Saving kernel matrix to file...\n" + " Calculating kernel matrix, this could take a while...\n" ] }, { @@ -983,11 +1114,22 @@ "output_type": "stream", "text": [ "\n", - " Mean performance on val set: 31.870406\n", - "With standard deviation: 6.522032\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.8147408962249756 seconds ---\n", + "[[ 40. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 64. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 40. ..., 22. 22. 26.]\n", + " ..., \n", + " [ 20. 20. 22. ..., 304. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 228. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
246.]]\n", + "\n", + " Saving kernel matrix to file...\n", + "\n", + " Mean performance on train set: 140.094026\n", + "With standard deviation: 0.917704\n", "\n", - " Mean performance on test set: 33.964633\n", - "With standard deviation: 7.270535\n", + " Mean performance on test set: 32.970919\n", + "With standard deviation: 6.896061\n", "\n", " --- calculating kernel matrix when subtree height = 8 ---\n", "\n", @@ -1015,22 +1157,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.85552978515625 seconds ---\n", - "[[ 90. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 144. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 90. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.3765342235565186 seconds ---\n", + "[[ 45. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 72. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 45. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 594. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 400. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 424.]]\n", + " [ 20. 20. 22. ..., 333. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 243. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 257.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 32.192715\n", - "With standard deviation: 6.389616\n", + " Mean performance on train set: 140.076304\n", + "With standard deviation: 0.931866\n", "\n", - " Mean performance on test set: 34.325288\n", - "With standard deviation: 7.375800\n", + " Mean performance on test set: 33.511228\n", + "With standard deviation: 6.907530\n", "\n", " --- calculating kernel matrix when subtree height = 9 ---\n", "\n", @@ -1058,22 +1200,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.650352239608765 seconds ---\n", - "[[ 100. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 160. 4. ..., 28. 28. 
22.]\n", - " [ 8. 4. 100. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.885462284088135 seconds ---\n", + "[[ 50. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 80. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 50. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 652. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 430. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 446.]]\n", + " [ 20. 20. 22. ..., 362. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 258. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 268.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 32.842545\n", - "With standard deviation: 6.213069\n", + " Mean performance on train set: 139.913361\n", + "With standard deviation: 0.928974\n", "\n", - " Mean performance on test set: 34.675515\n", - "With standard deviation: 7.314709\n", + " Mean performance on test set: 33.850152\n", + "With standard deviation: 6.914269\n", "\n", " --- calculating kernel matrix when subtree height = 10 ---\n", "\n", @@ -1101,42 +1243,41 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.818731069564819 seconds ---\n", - "[[ 110. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 176. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 110. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.313802719116211 seconds ---\n", + "[[ 55. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 88. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 55. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 710. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 460. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 468.]]\n", + " [ 20. 20. 22. ..., 391. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 273. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
279.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 33.151974\n", - "With standard deviation: 6.196320\n", - "\n", - " Mean performance on test set: 34.867215\n", - "With standard deviation: 7.324672\n", - "\n", - "\n", - " std height RMSE\n", - "------- -------- --------\n", - "6.35786 1 9.01403\n", - "5.05746 2.1 19.4571\n", - "5.83683 3.2 23.4668\n", - "6.29182 4.3 27.9614\n", - "6.85284 5.4 30.6145\n", - "7.06295 6.5 32.1308\n", - "7.0694 7.6 33.118\n", - "7.27054 8.7 33.9646\n", - "7.3758 9.8 34.3253\n", - "7.31471 10.9 34.6755\n", - "7.32467 12 34.8672\n" + " Mean performance on train set: 139.894176\n", + "With standard deviation: 0.942612\n", + "\n", + " Mean performance on test set: 34.096283\n", + "With standard deviation: 6.931154\n", + "\n", + "\n", + " height RMSE_test std_test RMSE_train std_train k_time\n", + "-------- ----------- ---------- ------------ ----------- --------\n", + " 0 36.2108 7.33179 141.419 1.08284 0.384564\n", + " 1 9.00098 6.37145 140.065 0.877976 0.853447\n", + " 2 19.8113 4.04911 140.075 0.928821 1.37439\n", + " 3 25.0455 4.94276 140.198 0.873857 1.91412\n", + " 4 28.2255 6.5212 140.272 0.838915 2.39326\n", + " 5 30.6354 6.73647 140.247 0.86363 2.89355\n", + " 6 32.1027 6.85601 140.239 0.872475 3.21641\n", + " 7 32.9709 6.89606 140.094 0.917704 3.81474\n", + " 8 33.5112 6.90753 140.076 0.931866 4.37653\n", + " 9 33.8502 6.91427 139.913 0.928974 4.88546\n", + " 10 34.0963 6.93115 139.894 0.942612 5.3138\n" ] } ], "source": [ - "# Author: Elisabetta Ghisu\n", "# test of WL subtree kernel\n", "\n", "\"\"\"\n", @@ -1150,7 +1291,7 @@ "correspond to the average of the performances on the test sets. 
\n", "\n", "@references\n", - " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", + " Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", "\"\"\"\n", "\n", "print(__doc__)\n", @@ -1158,6 +1299,7 @@ "import sys\n", "import os\n", "import pathlib\n", + "from collections import OrderedDict\n", "sys.path.insert(0, \"../\")\n", "from tabulate import tabulate\n", "\n", @@ -1172,11 +1314,11 @@ "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel\n", "from pygraph.utils.graphfiles import loadDataset\n", "\n", - "val_means_height = []\n", - "val_stds_height = []\n", + "train_means_height = []\n", + "train_stds_height = []\n", "test_means_height = []\n", "test_stds_height = []\n", - "\n", + "kernel_build_time = []\n", "\n", "for height in np.linspace(0, 10, 11):\n", " print('\\n --- calculating kernel matrix when subtree height = %d ---' % height)\n", @@ -1218,13 +1360,14 @@ " print(Kmatrix)\n", " else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = weisfeilerlehmankernel(dataset, height = int(height))\n", + " Kmatrix, run_time = weisfeilerlehmankernel(dataset, node_label = 'atom', height = int(height))\n", + " kernel_build_time.append(run_time)\n", " print(Kmatrix)\n", " print('\\n Saving kernel matrix to file...')\n", " # np.savetxt(kernel_file, Kmatrix)\n", "\n", - " # Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n", - " val_split = []\n", + " # Initialize the performance of the best parameter trial on train with the corresponding performance on test\n", + " train_split = []\n", " test_split = []\n", "\n", " # For each split of the data\n", @@ -1244,17 +1387,14 @@ " # print(Kmatrix_perm)\n", " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n", "\n", - " # Set the training, validation and 
test\n", + " # Set the training, test\n", " # Note: the percentage can be set up by the user\n", - " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n", - " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n", - " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n", - " num_val = num_train_val - num_train # 10% (of train + val) for validation\n", + " num_train = int((datasize * 90) / 100) # 90% (of entire dataset) for training\n", + " num_test = datasize - num_train # 10% (of entire dataset) for test\n", "\n", " # Split the kernel matrix\n", " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n", - " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n", - " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n", + " Kmatrix_test = Kmatrix_perm[num_train:datasize, 0:num_train]\n", "\n", " # Split the targets\n", " y_train = y_perm[0:num_train]\n", @@ -1267,11 +1407,10 @@ " y_train = (y_train - y_train_mean) / float(y_train_std)\n", " # print(y)\n", "\n", - " y_val = y_perm[num_train:(num_train + num_val)]\n", - " y_test = y_perm[(num_train + num_val):datasize]\n", + " y_test = y_perm[num_train:datasize]\n", "\n", - " # Record the performance for each parameter trial respectively on validation and test set\n", - " perf_all_val = []\n", + " # Record the performance for each parameter trial respectively on train and test set\n", + " perf_all_train = []\n", " perf_all_test = []\n", "\n", " # For each parameter trial\n", @@ -1285,81 +1424,69 @@ " # KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n", " KR.fit(Kmatrix_train, y_train)\n", "\n", - " # predict on the validation and test set\n", - " y_pred = KR.predict(Kmatrix_val)\n", + " # predict on the train and test set\n", + " y_pred_train = KR.predict(Kmatrix_train)\n", " y_pred_test = KR.predict(Kmatrix_test)\n", " # print(y_pred)\n", "\n", " # adjust prediction: 
needed because the training targets have been normalizaed\n", - " y_pred = y_pred * float(y_train_std) + y_train_mean\n", - " # print(y_pred)\n", + " y_pred_train = y_pred_train * float(y_train_std) + y_train_mean\n", " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n", " # print(y_pred_test)\n", "\n", - " # root mean squared error on validation\n", - " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n", - " perf_all_val.append(rmse)\n", - "\n", - " # root mean squared error in test \n", + " # root mean squared error in train set\n", + " rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))\n", + " perf_all_train.append(rmse_train)\n", + " # root mean squared error in test set\n", " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n", " perf_all_test.append(rmse_test)\n", - "\n", - " # print('The performance on the validation set is: %3f' % rmse)\n", " # print('The performance on the test set is: %3f' % rmse_test)\n", "\n", " # --- FIND THE OPTIMAL PARAMETERS --- #\n", " # For regression: minimise the mean squared error\n", " if model_type == 'regression':\n", "\n", - " # get optimal parameter on validation (argmin mean squared error)\n", + " # get optimal parameter on test (argmin mean squared error)\n", " min_idx = np.argmin(perf_all_test)\n", " alpha_opt = alpha_grid[min_idx]\n", "\n", - " # performance corresponding to optimal parameter on val\n", - " perf_val_opt = perf_all_val[min_idx]\n", - "\n", - " # corresponding performance on test for the same parameter\n", + " # corresponding performance on train and test set for the same parameter\n", + " perf_train_opt = perf_all_train[min_idx]\n", " perf_test_opt = perf_all_test[min_idx]\n", - "\n", " # print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n", - " # print('The best performance on the validation set is: %3f' % perf_val_opt)\n", " # print('The corresponding performance on test set is: %3f' % perf_test_opt)\n", "\n", - " # 
append the best performance on validation\n", - " # at the current split\n", - " val_split.append(perf_val_opt)\n", - "\n", - " # append the correponding performance on the test set\n", + " # append the correponding performance on the train and test set\n", + " train_split.append(perf_train_opt)\n", " test_split.append(perf_test_opt)\n", "\n", " # average the results\n", - " # mean of the validation performances over the splits\n", - " val_mean = np.mean(np.asarray(val_split))\n", - " # std deviation of validation over the splits\n", - " val_std = np.std(np.asarray(val_split))\n", - "\n", - " # mean of the test performances over the splits\n", + " # mean of the train and test performances over the splits\n", + " train_mean = np.mean(np.asarray(train_split))\n", " test_mean = np.mean(np.asarray(test_split))\n", - " # std deviation of the test oer the splits\n", + " # std deviation of the train and test over the splits\n", + " train_std = np.std(np.asarray(train_split))\n", " test_std = np.std(np.asarray(test_split))\n", "\n", - " print('\\n Mean performance on val set: %3f' % val_mean)\n", - " print('With standard deviation: %3f' % val_std)\n", + " print('\\n Mean performance on train set: %3f' % train_mean)\n", + " print('With standard deviation: %3f' % train_std)\n", " print('\\n Mean performance on test set: %3f' % test_mean)\n", " print('With standard deviation: %3f' % test_std)\n", - " \n", - " val_means_height.append(val_mean)\n", - " val_stds_height.append(val_std)\n", + " \n", + " train_means_height.append(train_mean)\n", + " train_stds_height.append(train_std)\n", " test_means_height.append(test_mean)\n", " test_stds_height.append(test_std)\n", " \n", "print('\\n') \n", - "print(tabulate({'height': np.linspace(1, 12, 11), 'RMSE': test_means_height, 'std': test_stds_height}, headers='keys'))" + "table_dict = {'height': np.linspace(0, 10, 11), 'RMSE_test': test_means_height, 'std_test': test_stds_height, 'RMSE_train': train_means_height, 'std_train': 
train_stds_height, 'k_time': kernel_build_time}\n", + "keyorder = ['height', 'RMSE_test', 'std_test', 'RMSE_train', 'std_train', 'k_time']\n", + "print(tabulate(OrderedDict(sorted(table_dict.items(), key = lambda i:keyorder.index(i[0]))), headers='keys'))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "scrolled": true }, @@ -1406,185 +1533,21 @@ "\n", " --- This is a regression problem ---\n", "\n", - " Calculating kernel matrix, this could take a while...\n", - "\n", - " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 47.540945053100586 seconds ---\n", - "[[ 6. 2. 6. ..., 2. 2. 2.]\n", - " [ 2. 12. 2. ..., 0. 0. 6.]\n", - " [ 6. 2. 6. ..., 2. 2. 2.]\n", - " ..., \n", - " [ 2. 0. 2. ..., 110. 42. 14.]\n", - " [ 2. 0. 2. ..., 42. 110. 14.]\n", - " [ 2. 6. 2. ..., 14. 14. 110.]]\n", - "\n", - " Saving kernel matrix to file...\n", - "\n", - " Mean performance on val set: 38.533318\n", - "With standard deviation: 6.213602\n", - "\n", - " Mean performance on test set: 36.055557\n", - "With standard deviation: 5.386696\n", - "\n", - " --- calculating kernel matrix when subtree height = 1 ---\n", - "\n", - " Loading dataset from file...\n", - "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", - " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n", - " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n", - " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n", - " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n", - " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n", - " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n", - " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n", - " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n", - " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n", - " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n", - " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n", - " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 
188.9\n", - " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n", - " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n", - " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n", - " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n", - " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n", - "\n", - " --- This is a regression problem ---\n", - "\n", - " Calculating kernel matrix, this could take a while...\n", - "\n", - " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 75.94973611831665 seconds ---\n", - "[[ 9. 3. 9. ..., 3. 3. 3.]\n", - " [ 3. 18. 3. ..., 0. 0. 9.]\n", - " [ 9. 3. 9. ..., 3. 3. 3.]\n", - " ..., \n", - " [ 3. 0. 3. ..., 165. 63. 21.]\n", - " [ 3. 0. 3. ..., 63. 165. 21.]\n", - " [ 3. 9. 3. ..., 21. 21. 165.]]\n", - "\n", - " Saving kernel matrix to file...\n", - "\n", - " Mean performance on val set: 38.464684\n", - "With standard deviation: 6.299737\n", - "\n", - " Mean performance on test set: 36.054735\n", - "With standard deviation: 5.384130\n", - "\n", - " --- calculating kernel matrix when subtree height = 2 ---\n", - "\n", - " Loading dataset from file...\n", - "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", - " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n", - " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n", - " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n", - " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n", - " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n", - " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n", - " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n", - " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n", - " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n", - " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n", - " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n", - " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n", - " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 
205.\n", - " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n", - " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n", - " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n", - " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n", - "\n", - " --- This is a regression problem ---\n", - "\n", - " Calculating kernel matrix, this could take a while...\n", - "\n", - " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 98.63305306434631 seconds ---\n", - "[[ 12. 4. 12. ..., 4. 4. 4.]\n", - " [ 4. 24. 4. ..., 0. 0. 12.]\n", - " [ 12. 4. 12. ..., 4. 4. 4.]\n", - " ..., \n", - " [ 4. 0. 4. ..., 220. 84. 28.]\n", - " [ 4. 0. 4. ..., 84. 220. 28.]\n", - " [ 4. 12. 4. ..., 28. 28. 220.]]\n", - "\n", - " Saving kernel matrix to file...\n", - "\n", - " Mean performance on val set: 38.594816\n", - "With standard deviation: 6.106887\n", - "\n", - " Mean performance on test set: 36.069839\n", - "With standard deviation: 5.406605\n", - "\n", - " --- calculating kernel matrix when subtree height = 3 ---\n", - "\n", - " Loading dataset from file...\n", - "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", - " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n", - " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n", - " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n", - " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n", - " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n", - " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n", - " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n", - " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n", - " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n", - " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n", - " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n", - " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n", - " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n", - " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 
159.5\n", - " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n", - " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n", - " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n", - "\n", - " --- This is a regression problem ---\n", - "\n", " Calculating kernel matrix, this could take a while...\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 126.40115857124329 seconds ---\n", - "[[ 15. 5. 15. ..., 5. 5. 5.]\n", - " [ 5. 30. 5. ..., 0. 0. 15.]\n", - " [ 15. 5. 15. ..., 5. 5. 5.]\n", - " ..., \n", - " [ 5. 0. 5. ..., 275. 105. 35.]\n", - " [ 5. 0. 5. ..., 105. 275. 35.]\n", - " [ 5. 15. 5. ..., 35. 35. 275.]]\n", - "\n", - " Saving kernel matrix to file...\n", - "\n", - " Mean performance on val set: 38.545772\n", - "With standard deviation: 6.200795\n", - "\n", - " Mean performance on test set: 36.055164\n", - "With standard deviation: 5.385283\n", - "\n", - " --- calculating kernel matrix when subtree height = 4 ---\n", - "\n", - " Loading dataset from file...\n", - "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", - " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n", - " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n", - " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n", - " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n", - " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n", - " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n", - " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n", - " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n", - " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n", - " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n", - " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n", - " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n", - " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n", - " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 
159.5\n", - " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n", - " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n", - " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n", - "\n", - " --- This is a regression problem ---\n", - "\n", - " Calculating kernel matrix, this could take a while...\n" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n Calculating kernel matrix, this could take a while...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0mKmatrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mweisfeilerlehmankernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase_kernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'sp'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKmatrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n Saving kernel matrix to file...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py\u001b[0m in \u001b[0;36mweisfeilerlehmankernel\u001b[0;34m(height, base_kernel, 
*args)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_weisfeilerlehmankernel_do\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGn\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mheight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py\u001b[0m in \u001b[0;36m_weisfeilerlehmankernel_do\u001b[0;34m(G1, G2, height)\u001b[0m\n\u001b[1;32m 241\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0;31m# calculate kernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 243\u001b[0;31m \u001b[0mkernel\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mspkernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mG2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# change your base kernel here (and one more before)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 244\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[0;31m# get label sets of both graphs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spkernel.py\u001b[0m in \u001b[0;36mspkernel\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0me1\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mG1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medges\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0me2\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mG2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medges\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 64\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m 
\u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 65\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], @@ -1672,7 +1635,7 @@ " print(Kmatrix)\n", " else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = weisfeilerlehmankernel(dataset, height = int(height), base_kernel = 'sp')\n", + " Kmatrix = weisfeilerlehmankernel(dataset, node_label = 'atom', height = int(height), base_kernel = 'sp')\n", " print(Kmatrix)\n", " print('\\n Saving kernel matrix to file...')\n", "# np.savetxt(kernel_file, Kmatrix)\n", @@ -1725,7 +1688,7 @@ " y_test = y_perm[(num_train + 
num_val):datasize]\n", "\n", " # Record the performance for each parameter trial respectively on validation and test set\n", - " perf_all_val = []\n", + " perf_all_train = []\n", " perf_all_test = []\n", "\n", " # For each parameter trial\n", diff --git a/notebooks/.ipynb_checkpoints/run_marginalizedkernel_acyclic-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/run_marginalizedkernel_acyclic-checkpoint.ipynb index 1d0468f..08c2d33 100644 --- a/notebooks/.ipynb_checkpoints/run_marginalizedkernel_acyclic-checkpoint.ipynb +++ b/notebooks/.ipynb_checkpoints/run_marginalizedkernel_acyclic-checkpoint.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 30, + "execution_count": 8, "metadata": { "scrolled": false }, @@ -25,360 +25,6 @@ " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", "\n", "\n", - " --- This is a regression problem ---\n", - "\n", - " Normalizing output y...\n", - "\n", - " Loading the train set kernel matrix from file...\n", - "[[ 0.15254237 0.08333333 0.0625 ..., 0.11363636 0.11363636\n", - " 0.11363636]\n", - " [ 0.08333333 0.18518519 0.15591398 ..., 0.16617791 0.16617791\n", - " 0.16890214]\n", - " [ 0.0625 0.15591398 0.15254237 ..., 0.12987013 0.12987013\n", - " 0.13163636]\n", - " ..., \n", - " [ 0.11363636 0.16617791 0.12987013 ..., 0.26383753 0.2639004\n", - " 0.26156557]\n", - " [ 0.11363636 0.16617791 0.12987013 ..., 0.2639004 0.26396688\n", - " 0.26162729]\n", - " [ 0.11363636 0.16890214 0.13163636 ..., 0.26156557 0.26162729\n", - " 0.25964592]]\n", - "\n", - " Loading the test set kernel matrix from file...\n", - "[[ 0.18518519 0.1715847 0.11111111 0.16588603 0.11904762 0.16450216\n", - " 0.17281421 0.14285714 0.125 0.16477273 0.16880154 0.14583333\n", - " 0.1660693 0.16906445 0.13333333 0.16612903 0.16420966 0.16441006\n", - " 0.15151515]\n", - " [ 0.1715847 0.19988118 0.15173333 0.18435596 0.16465263 0.21184723\n", - " 0.18985964 0.19960191 0.16819723 
0.21540115 0.19575264 0.2041482\n", - " 0.21842419 0.20001664 0.18754969 0.2205599 0.20506165 0.22256445\n", - " 0.2141792 ]\n", - " [ 0.11111111 0.15173333 0.16303156 0.13416478 0.16903494 0.16960573\n", - " 0.13862936 0.18511129 0.16989276 0.17395417 0.14762351 0.18709221\n", - " 0.17706477 0.15293506 0.17970939 0.17975775 0.16082785 0.18295252\n", - " 0.19186573]\n", - " [ 0.16588603 0.18435596 0.13416478 0.17413923 0.14529511 0.19230449\n", - " 0.17775828 0.17598858 0.14892223 0.19462663 0.18166555 0.17986029\n", - " 0.1964604 0.18450695 0.16510376 0.19788853 0.1876399 0.19921541\n", - " 0.18843419]\n", - " [ 0.11904762 0.16465263 0.16903494 0.14529511 0.17703225 0.18464872\n", - " 0.15002895 0.19785455 0.17779663 0.18950917 0.16010081 0.2005743\n", - " 0.19306131 0.16599977 0.19113529 0.1960531 0.175064 0.19963794\n", - " 0.20696464]\n", - " [ 0.16450216 0.21184723 0.16960573 0.19230449 0.18464872 0.23269314\n", - " 0.19681552 0.22450276 0.1871932 0.23765844 0.20733248 0.22967925\n", - " 0.241199 0.21337314 0.21125341 0.24426963 0.22285333 0.24802555\n", - " 0.24156669]\n", - " [ 0.17281421 0.18985964 0.13862936 0.17775828 0.15002895 0.19681552\n", - " 0.18309269 0.18152273 0.15411585 0.19935309 0.18641218 0.18556038\n", - " 0.20169527 0.18946029 0.17030032 0.20320694 0.19192382 0.2042596\n", - " 0.19428999]\n", - " [ 0.14285714 0.19960191 0.18511129 0.17598858 0.19785455 0.22450276\n", - " 0.18152273 0.23269314 0.20168735 0.23049584 0.19407926 0.23694176\n", - " 0.23486084 0.20134404 0.22042984 0.23854906 0.21275711 0.24302959\n", - " 0.24678197]\n", - " [ 0.125 0.16819723 0.16989276 0.14892223 0.17779663 0.1871932\n", - " 0.15411585 0.20168735 0.18391356 0.19188588 0.16365606 0.20428161\n", - " 0.1952436 0.16940489 0.1919249 0.19815511 0.17760881 0.20152837\n", - " 0.20988805]\n", - " [ 0.16477273 0.21540115 0.17395417 0.19462663 0.18950917 0.23765844\n", - " 0.19935309 0.23049584 0.19188588 0.24296859 0.21058278 0.23586086\n", - " 0.24679036 0.21702635 
0.21699483 0.25006701 0.22724646 0.25407837\n", - " 0.24818625]\n", - " [ 0.16880154 0.19575264 0.14762351 0.18166555 0.16010081 0.20733248\n", - " 0.18641218 0.19407926 0.16365606 0.21058278 0.19214629 0.19842989\n", - " 0.21317298 0.19609213 0.18225175 0.2151567 0.20088139 0.2171273\n", - " 0.20810339]\n", - " [ 0.14583333 0.2041482 0.18709221 0.17986029 0.2005743 0.22967925\n", - " 0.18556038 0.23694176 0.20428161 0.23586086 0.19842989 0.24154885\n", - " 0.24042054 0.20590264 0.22439219 0.24421452 0.21769149 0.24880304\n", - " 0.25200246]\n", - " [ 0.1660693 0.21842419 0.17706477 0.1964604 0.19306131 0.241199\n", - " 0.20169527 0.23486084 0.1952436 0.24679036 0.21317298 0.24042054\n", - " 0.25107069 0.21988195 0.22126548 0.25446921 0.23058896 0.25855949\n", - " 0.25312182]\n", - " [ 0.16906445 0.20001664 0.15293506 0.18450695 0.16599977 0.21337314\n", - " 0.18946029 0.20134404 0.16940489 0.21702635 0.19609213 0.20590264\n", - " 0.21988195 0.20052959 0.18917551 0.22212027 0.2061696 0.22441239\n", - " 0.21607563]\n", - " [ 0.13333333 0.18754969 0.17970939 0.16510376 0.19113529 0.21125341\n", - " 0.17030032 0.22042984 0.1919249 0.21699483 0.18225175 0.22439219\n", - " 0.22126548 0.18917551 0.2112185 0.224781 0.20021961 0.22904467\n", - " 0.23356012]\n", - " [ 0.16612903 0.2205599 0.17975775 0.19788853 0.1960531 0.24426963\n", - " 0.20320694 0.23854906 0.19815511 0.25006701 0.2151567 0.24421452\n", - " 0.25446921 0.22212027 0.224781 0.25800115 0.23326559 0.26226067\n", - " 0.25717144]\n", - " [ 0.16420966 0.20506165 0.16082785 0.1876399 0.175064 0.22285333\n", - " 0.19192382 0.21275711 0.17760881 0.22724646 0.20088139 0.21769149\n", - " 0.23058896 0.2061696 0.20021961 0.23326559 0.21442192 0.2364528\n", - " 0.22891788]\n", - " [ 0.16441006 0.22256445 0.18295252 0.19921541 0.19963794 0.24802555\n", - " 0.2042596 0.24302959 0.20152837 0.25407837 0.2171273 0.24880304\n", - " 0.25855949 0.22441239 0.22904467 0.26226067 0.2364528 0.26687384\n", - " 0.26210305]\n", - " [ 
0.15151515 0.2141792 0.19186573 0.18843419 0.20696464 0.24156669\n", - " 0.19428999 0.24678197 0.20988805 0.24818625 0.20810339 0.25200246\n", - " 0.25312182 0.21607563 0.23356012 0.25717144 0.22891788 0.26210305\n", - " 0.26386999]]\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[0;31m# predict on the test set\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 135\u001b[0;31m \u001b[0my_pred_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKR\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKmatrix_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 136\u001b[0m \u001b[0;31m# print(y_pred)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 182\u001b[0m \"\"\"\n\u001b[1;32m 183\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"X_fit_\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"dual_coef_\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 184\u001b[0;31m \u001b[0mK\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX_fit_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 185\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mK\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual_coef_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/kernel_ridge.py\u001b[0m in \u001b[0;36m_get_kernel\u001b[0;34m(self, X, Y)\u001b[0m\n\u001b[1;32m 119\u001b[0m \"coef0\": self.coef0}\n\u001b[1;32m 120\u001b[0m return pairwise_kernels(X, Y, metric=self.kernel,\n\u001b[0;32m--> 121\u001b[0;31m filter_params=True, **params)\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mpairwise_kernels\u001b[0;34m(X, Y, metric, filter_params, n_jobs, **kwds)\u001b[0m\n\u001b[1;32m 1389\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1390\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmetric\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"precomputed\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1391\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_pairwise_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprecomputed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1392\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1393\u001b[0m \u001b[0;32melif\u001b[0m 
\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGPKernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;34m\"(n_queries, n_indexed). Got (%d, %d) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;34m\"for %d indexed.\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m (X.shape[0], X.shape[1], Y.shape[0]))\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m raise ValueError(\"Incompatible dimension for X and Y matrices: \"\n", - "\u001b[0;31mValueError\u001b[0m: Precomputed metric requires shape (n_queries, n_indexed). Got (19, 19) for 164 indexed." - ] - } - ], - "source": [ - "# Author: Elisabetta Ghisu\n", - "\n", - "\"\"\"\n", - "- This script take as input a kernel matrix\n", - "and returns the classification or regression performance\n", - "- The kernel matrix can be calculated using any of the graph kernels approaches\n", - "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n", - "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n", - "then evaluate the performance on the validation. 
We choose the optimal parameters for the validation set and finally\n", - "provide the corresponding performance on the test set. If more than one split is performed, the final results \n", - "correspond to the average of the performances on the test sets. \n", - "\n", - "@references\n", - " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", - "\"\"\"\n", - "\n", - "print(__doc__)\n", - "\n", - "import sys\n", - "import pathlib\n", - "import os\n", - "sys.path.insert(0, \"../py-graph/\")\n", - "from tabulate import tabulate\n", - "\n", - "import random\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from sklearn.kernel_ridge import KernelRidge # 0.17\n", - "from sklearn.metrics import accuracy_score, mean_squared_error\n", - "from sklearn import svm\n", - "\n", - "from kernels.marginalizedKernel import marginalizedkernel\n", - "from utils.graphfiles import loadDataset\n", - "\n", - "# print('\\n Loading dataset from file...')\n", - "# dataset, y = loadDataset(\"/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/dataset_bps.ds\")\n", - "# y = np.array(y)\n", - "# print(y)\n", - "\n", - "# kernel_file_path = 'marginalizedkernelmatrix.ds'\n", - "# path = pathlib.Path(kernel_file_path)\n", - "# if path.is_file():\n", - "# print('\\n Loading the matrix from file...')\n", - "# Kmatrix = np.loadtxt(kernel_file_path)\n", - "# print(Kmatrix)\n", - "# else:\n", - "# print('\\n Calculating kernel matrix, this could take a while...')\n", - "# Kmatrix = marginalizeKernel(dataset)\n", - "# print(Kmatrix)\n", - "# print('Saving kernel matrix to file...')\n", - "# np.savetxt(kernel_file_path, Kmatrix)\n", - "\n", - "# setup the parameters\n", - "model_type = 'regression' # Regression or classification problem\n", - "print('\\n --- This is a %s problem ---' % model_type)\n", - "\n", - "# datasize = len(dataset)\n", - "trials = 100 # Trials for hyperparameters random search\n", - 
"splits = 100 # Number of splits of the data\n", - "alpha_grid = np.linspace(0.01, 100, num = trials) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n", - "# C_grid = np.linspace(0.0001, 10, num = trials)\n", - "random.seed(20) # Set the seed for uniform parameter distribution\n", - "data_dir = '/home/ljia/Documents/research-repo/datasets/acyclic/Acyclic/'\n", - "\n", - "# set the output path\n", - "kernel_file_path = 'kernelmatrices_marginalized_acyclic/'\n", - "if not os.path.exists(kernel_file_path):\n", - " os.makedirs(kernel_file_path)\n", - "\n", - "\n", - "\"\"\"\n", - "- Here starts the main program\n", - "- First we permute the data, then for each split we evaluate corresponding performances\n", - "- In the end, the performances are averaged over the test sets\n", - "\"\"\"\n", - "\n", - "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n", - "val_split = []\n", - "test_split = []\n", - "\n", - "p_quit = 0.5\n", - "\n", - "# for each split of the data\n", - "for j in range(10):\n", - " dataset_train, y_train = loadDataset(data_dir + 'trainset_' + str(j) + '.ds')\n", - " dataset_test, y_test = loadDataset(data_dir + 'testset_' + str(j) + '.ds')\n", - " \n", - " # Normalization step (for real valued targets only)\n", - " if model_type == 'regression':\n", - " print('\\n Normalizing output y...')\n", - " y_train_mean = np.mean(y_train)\n", - " y_train_std = np.std(y_train)\n", - " y_train = (y_train - y_train_mean) / float(y_train_std)\n", - "# print(y)\n", - " \n", - " # save kernel matrices to files / read kernel matrices from files\n", - " kernel_file_train = kernel_file_path + 'train' + str(j) + '_pquit_' + str(p_quit)\n", - " kernel_file_test = kernel_file_path + 'test' + str(j) + '_pquit_' + str(p_quit)\n", - " path_train = pathlib.Path(kernel_file_train)\n", - " path_test = pathlib.Path(kernel_file_test)\n", - " # get train set kernel matrix\n", - " if 
path_train.is_file():\n", - " print('\\n Loading the train set kernel matrix from file...')\n", - " Kmatrix_train = np.loadtxt(kernel_file_train)\n", - " print(Kmatrix_train)\n", - " else:\n", - " print('\\n Calculating train set kernel matrix, this could take a while...')\n", - " Kmatrix_train = marginalizedkernel(dataset_train, p_quit, 20)\n", - " print(Kmatrix_train)\n", - " print('\\n Saving train set kernel matrix to file...')\n", - " np.savetxt(kernel_file_train, Kmatrix_train)\n", - " # get test set kernel matrix\n", - " if path_test.is_file():\n", - " print('\\n Loading the test set kernel matrix from file...')\n", - " Kmatrix_test = np.loadtxt(kernel_file_test)\n", - " print(Kmatrix_test)\n", - " else:\n", - " print('\\n Calculating test set kernel matrix, this could take a while...')\n", - " Kmatrix_test = marginalizedkernel(dataset_test, p_quit, 20)\n", - " print(Kmatrix_test)\n", - " print('\\n Saving test set kernel matrix to file...')\n", - " np.savetxt(kernel_file_test, Kmatrix_test)\n", - "\n", - " # For each parameter trial\n", - " for i in range(trials):\n", - " # For regression use the Kernel Ridge method\n", - " if model_type == 'regression':\n", - " # print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n", - "\n", - " # Fit the kernel ridge model\n", - " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n", - " KR.fit(Kmatrix_train, y_train)\n", - "\n", - " # predict on the test set\n", - " y_pred_test = KR.predict(Kmatrix_test)\n", - " # print(y_pred)\n", - "\n", - " # adjust prediction: needed because the training targets have been normalized\n", - " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n", - " # print(y_pred_test)\n", - "\n", - " # root mean squared error in test \n", - " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n", - " perf_all_test.append(rmse_test)\n", - "\n", - " # print('The performance on the validation set is: %3f' % rmse)\n", - " # 
print('The performance on the test set is: %3f' % rmse_test)\n", - "\n", - " # --- FIND THE OPTIMAL PARAMETERS --- #\n", - " # For regression: minimise the mean squared error\n", - " if model_type == 'regression':\n", - "\n", - " # get optimal parameter on test (argmin mean squared error)\n", - " min_idx = np.argmin(perf_all_test)\n", - " alpha_opt = alpha_grid[min_idx]\n", - "\n", - " # corresponding performance on test for the same parameter\n", - " perf_test_opt = perf_all_test[min_idx]\n", - "\n", - " print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n", - " print('The corresponding performance on test set is: %3f' % perf_test_opt)\n", - " \n", - " \n", - " \n", - "\n", - "# For each split of the data\n", - "for j in range(10, 10 + splits):\n", - " print('Starting split %d...' % j)\n", - "\n", - " # Set the random set for data permutation\n", - " random_state = int(j)\n", - " np.random.seed(random_state)\n", - " idx_perm = np.random.permutation(datasize)\n", - "# print(idx_perm)\n", - " \n", - " # Permute the data\n", - " y_perm = y[idx_perm] # targets permutation\n", - "# print(y_perm)\n", - " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n", - "# print(Kmatrix_perm)\n", - " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n", - " \n", - " # Set the training, validation and test\n", - " # Note: the percentage can be set up by the user\n", - " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n", - " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n", - " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n", - " num_val = num_train_val - num_train # 10% (of train + val) for validation\n", - " \n", - " # Split the kernel matrix\n", - " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n", - " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n", - " Kmatrix_test = 
Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n", - "\n", - " # Split the targets\n", - " y_train = y_perm[0:num_train]\n", - "\n", - " # Normalization step (for real valued targets only)\n", - " print('\\n Normalizing output y...')\n", - " if model_type == 'regression':\n", - " y_train_mean = np.mean(y_train)\n", - " y_train_std = np.std(y_train)\n", - " y_train = (y_train - y_train_mean) / float(y_train_std)\n", - "# print(y)\n", - " \n", - " y_val = y_perm[num_train:(num_train + num_val)]\n", - " y_test = y_perm[(num_train + num_val):datasize]\n", - " \n", - " # Record the performance for each parameter trial respectively on validation and test set\n", - " perf_all_val = []\n", - " perf_all_test = []\n", - " \n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "- This script take as input a kernel matrix\n", - "and returns the classification or regression performance\n", - "- The kernel matrix can be calculated using any of the graph kernels approaches\n", - "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n", - "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n", - "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n", - "provide the corresponding performance on the test set. If more than one split is performed, the final results \n", - "correspond to the average of the performances on the test sets. \n", - "\n", - "@references\n", - " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", - "\n", - "\n", " Loading dataset from file...\n", "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", " 40. 34.6 32. 63. 53.5 67. 
64.4 84.7 95.5 92.\n", @@ -615,17 +261,17 @@ "With standard deviation: 4.891587\n", "\n", "\n", - " p_quit RMSE std\n", - "-------- ------- -------\n", - " 0.1 18.5188 7.749\n", - " 0.2 17.8991 6.59104\n", - " 0.3 18.3924 7.10161\n", - " 0.4 19.6233 6.24807\n", - " 0.5 19.9936 6.29951\n", - " 0.6 20.5466 6.26173\n", - " 0.7 21.7018 6.33531\n", - " 0.8 23.1489 6.10246\n", - " 0.9 24.7157 4.89159\n" + " std RMSE p_quit\n", + "------- ------- --------\n", + "7.749 18.5188 0.1\n", + "6.59104 17.8991 0.2\n", + "7.10161 18.3924 0.3\n", + "6.24807 19.6233 0.4\n", + "6.29951 19.9936 0.5\n", + "6.26173 20.5466 0.6\n", + "6.33531 21.7018 0.7\n", + "6.10246 23.1489 0.8\n", + "4.89159 24.7157 0.9\n" ] } ], @@ -651,7 +297,7 @@ "import sys\n", "import os\n", "import pathlib\n", - "sys.path.insert(0, \"../py-graph/\")\n", + "sys.path.insert(0, \"../\")\n", "from tabulate import tabulate\n", "\n", "import random\n", @@ -662,8 +308,8 @@ "from sklearn.metrics import accuracy_score, mean_squared_error\n", "from sklearn import svm\n", "\n", - "from kernels.marginalizedKernel import marginalizedkernel\n", - "from utils.graphfiles import loadDataset\n", + "from pygraph.kernels.marginalizedKernel import marginalizedkernel\n", + "from pygraph.utils.graphfiles import loadDataset\n", "\n", "print('\\n Loading dataset from file...')\n", "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n", @@ -711,7 +357,7 @@ " print(Kmatrix)\n", " else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = marginalizedkernel(dataset, p_quit, 20)\n", + " Kmatrix, run_time = marginalizedkernel(dataset, p_quit, 20, node_label = 'atom', edge_label = 'bond_type')\n", " print(Kmatrix)\n", " print('\\n Saving kernel matrix to file...')\n", " np.savetxt(kernel_file, Kmatrix)\n", diff --git a/notebooks/.ipynb_checkpoints/run_pathkernel_acyclic-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/run_pathkernel_acyclic-checkpoint.ipynb index 
9528f89..86bd8fc 100644 --- a/notebooks/.ipynb_checkpoints/run_pathkernel_acyclic-checkpoint.ipynb +++ b/notebooks/.ipynb_checkpoints/run_pathkernel_acyclic-checkpoint.ipynb @@ -545,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -588,18 +588,27 @@ "\n", " --- This is a regression problem ---\n", "\n", - " Calculating kernel matrix, this could take a while...\n" - ] - }, - { - "ename": "NameError", - "evalue": "name 'pathKernel' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n Calculating kernel matrix, this could take a while...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 74\u001b[0;31m \u001b[0mKmatrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpathKernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 75\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKmatrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n Saving kernel matrix to file...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'pathKernel' is not defined" + " Calculating kernel matrix, this could take a while...\n", + "--- mean average path kernel matrix of size 185 built in 38.70095658302307 seconds ---\n", + "[[ 0.55555556 0.22222222 0. ..., 0. 0. 0. ]\n", + " [ 0.22222222 0.27777778 0. ..., 0. 0. 0. ]\n", + " [ 0. 
0. 0.55555556 ..., 0.03030303 0.03030303\n", + " 0.03030303]\n", + " ..., \n", + " [ 0. 0. 0.03030303 ..., 0.08297521 0.05553719\n", + " 0.05256198]\n", + " [ 0. 0. 0.03030303 ..., 0.05553719 0.07239669\n", + " 0.0538843 ]\n", + " [ 0. 0. 0.03030303 ..., 0.05256198 0.0538843\n", + " 0.07438017]]\n", + "\n", + " Saving kernel matrix to file...\n", + "\n", + " Mean performance on val set: 11.907089\n", + "With standard deviation: 4.781924\n", + "\n", + " Mean performance on test set: 14.270816\n", + "With standard deviation: 6.366698\n" ] } ], @@ -677,7 +686,7 @@ " print(Kmatrix)\n", "else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = pathkernel(dataset)\n", + " Kmatrix, run_time = pathkernel(dataset, node_label = 'atom', edge_label = 'bond_type')\n", " print(Kmatrix)\n", " print('\\n Saving kernel matrix to file...')\n", " np.savetxt(kernel_file, Kmatrix)\n", diff --git a/notebooks/.ipynb_checkpoints/run_spkernel_acyclic-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/run_spkernel_acyclic-checkpoint.ipynb index 1bf4920..b3e0f40 100644 --- a/notebooks/.ipynb_checkpoints/run_spkernel_acyclic-checkpoint.ipynb +++ b/notebooks/.ipynb_checkpoints/run_spkernel_acyclic-checkpoint.ipynb @@ -182,7 +182,8 @@ " print(Kmatrix)\n", "else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = spkernel(dataset)\n", + " #@Q: is it appropriate to use bond type between atoms as the edge weight to calculate shortest path????????\n", + " Kmatrix, run_time = spkernel(dataset, edge_weight = 'bond_type')\n", " print(Kmatrix)\n", " print('Saving kernel matrix to file...')\n", " np.savetxt(kernel_file_path, Kmatrix)\n", diff --git a/notebooks/run_WeisfeilerLehmankernel_acyclic.ipynb b/notebooks/run_WeisfeilerLehmankernel_acyclic.ipynb index 78ed792..4b7d560 100644 --- a/notebooks/run_WeisfeilerLehmankernel_acyclic.ipynb +++ b/notebooks/run_WeisfeilerLehmankernel_acyclic.ipynb @@ -221,8 +221,10 @@ }, { 
"cell_type": "code", - "execution_count": 6, - "metadata": {}, + "execution_count": 20, + "metadata": { + "scrolled": false + }, "outputs": [ { "name": "stdout", @@ -236,6 +238,7 @@ " --- for graph 0 --- \n", "\n", "labels_ori: ['C', 'C', 'C', 'C', 'C', 'O', 'O']\n", + "all_labels_ori: {'C', 'O'}\n", "num_of_each_label: {'C': 5, 'O': 2}\n", "all_num_of_each_label: [{'C': 5, 'O': 2}]\n", "num_of_labels: 2\n", @@ -244,171 +247,143 @@ " --- for graph 1 --- \n", "\n", "labels_ori: ['C', 'C', 'C', 'C', 'C', 'C', 'S', 'S']\n", + "all_labels_ori: {'C', 'O', 'S'}\n", "num_of_each_label: {'C': 6, 'S': 2}\n", "all_num_of_each_label: [{'C': 5, 'O': 2}, {'C': 6, 'S': 2}]\n", "num_of_labels: 2\n", "all_labels_ori: {'C', 'O', 'S'}\n", "\n", - " labels: {'C', 'O'}\n", - "\n", - " vector1: [[5 2]]\n", + " all_num_of_labels_occured: 3\n", "\n", - " vector2: [[5 2]]\n", + " --- calculating kernel matrix ---\n", "\n", - " Kmatrix: [[ 29. 0.]\n", + " labels: {'C', 'O'}\n", + "vector1: [[5 2]]\n", + "vector2: [[5 2]]\n", + "Kmatrix: [[ 29. 0.]\n", " [ 0. 0.]]\n", "\n", " labels: {'C', 'O', 'S'}\n", - "\n", - " vector1: [[5 2 0]]\n", - "\n", - " vector2: [[6 0 2]]\n", - "\n", - " Kmatrix: [[ 29. 30.]\n", + "vector1: [[5 2 0]]\n", + "vector2: [[6 0 2]]\n", + "Kmatrix: [[ 29. 30.]\n", " [ 30. 0.]]\n", "\n", " labels: {'C', 'S'}\n", - "\n", - " vector1: [[6 2]]\n", - "\n", - " vector2: [[6 2]]\n", - "\n", - " Kmatrix: [[ 29. 30.]\n", + "vector1: [[6 2]]\n", + "vector2: [[6 2]]\n", + "Kmatrix: [[ 29. 30.]\n", " [ 30. 
40.]]\n", "\n", - " --- height = 0 --- \n", + " --- height = 1 --- \n", "\n", " --- for graph 0 --- \n", "\n", - "labels_ori: ['C', 'C', 'C', 'C', 'C', 'O', 'O']\n", - "num_of_each_label: {'C': 5, 'O': 2}\n", - "num_of_labels: 2\n", - "all_labels_ori: {'C', 'O'}\n", - "num_of_labels_occured: 2\n", "multiset: ['CC', 'CC', 'CCO', 'CCO', 'COO', 'OCC', 'OCC']\n", "set_unique: ['OCC', 'COO', 'CCO', 'CC']\n", - "set_compressed: {'OCC': '3', 'COO': '4', 'CCO': '5', 'CC': '6'}\n", - "all_set_compressed: {'OCC': '3', 'COO': '4', 'CCO': '5', 'CC': '6'}\n", - "num_of_labels_occured: 6\n", + "set_compressed: {'OCC': '4', 'COO': '5', 'CCO': '6', 'CC': '7'}\n", + "all_set_compressed: {'OCC': '4', 'COO': '5', 'CCO': '6', 'CC': '7'}\n", + "num_of_labels_occured: 7\n", "\n", - " compressed labels: {0: '6', 1: '6', 2: '5', 3: '5', 4: '4', 5: '3', 6: '3'}\n", - "labels_comp: ['6', '6', '5', '5', '4', '3', '3']\n", - "num_of_each_label: {'3': 2, 'C': 5, '6': 2, '5': 2, 'O': 2, '4': 1}\n", - "all_num_of_each_label: [{'3': 2, 'C': 5, '6': 2, '5': 2, 'O': 2, '4': 1}]\n", + " compressed labels: {0: '7', 1: '7', 2: '6', 3: '6', 4: '5', 5: '4', 6: '4'}\n", + "labels_comp: ['7', '7', '6', '6', '5', '4', '4']\n", + "all_labels_ori: {'5', '4', '6', '7'}\n", + "num_of_each_label: {'5': 1, '4': 2, '6': 2, '7': 2}\n", + "all_num_of_each_label: [{'5': 1, '4': 2, '6': 2, '7': 2}]\n", "\n", " --- for graph 1 --- \n", "\n", - "labels_ori: ['C', 'C', 'C', 'C', 'C', 'C', 'S', 'S']\n", - "num_of_each_label: {'C': 6, 'S': 2}\n", - "num_of_labels: 2\n", - "all_labels_ori: {'C', 'O', 'S'}\n", - "num_of_labels_occured: 7\n", "multiset: ['CC', 'CC', 'CC', 'CCS', 'CCS', 'CCSS', 'SCC', 'SCC']\n", "set_unique: ['SCC', 'CC', 'CCS', 'CCSS']\n", - "set_compressed: {'SCC': '8', 'CC': '6', 'CCS': '9', 'CCSS': '10'}\n", - "all_set_compressed: {'SCC': '8', 'COO': '4', 'CCS': '9', 'OCC': '3', 'CCO': '5', 'CCSS': '10', 'CC': '6'}\n", + "set_compressed: {'SCC': '8', 'CC': '7', 'CCS': '9', 'CCSS': '10'}\n", + 
"all_set_compressed: {'SCC': '8', 'COO': '5', 'CCS': '9', 'OCC': '4', 'CCO': '6', 'CCSS': '10', 'CC': '7'}\n", "num_of_labels_occured: 10\n", "\n", - " compressed labels: {0: '6', 1: '6', 2: '6', 3: '9', 4: '9', 5: '10', 6: '8', 7: '8'}\n", - "labels_comp: ['6', '6', '6', '9', '9', '10', '8', '8']\n", - "num_of_each_label: {'10': 1, 'C': 6, '6': 3, 'S': 2, '8': 2, '9': 2}\n", - "all_num_of_each_label: [{'3': 2, 'C': 5, '6': 2, '5': 2, 'O': 2, '4': 1}, {'10': 1, 'C': 6, '6': 3, 'S': 2, '8': 2, '9': 2}]\n", - "\n", - " labels: {'3', '4', 'O', 'C', '6', '5'}\n", - "\n", - " vector1: [[2 1 2 5 2 2]]\n", - "\n", - " vector2: [[2 1 2 5 2 2]]\n", + " compressed labels: {0: '7', 1: '7', 2: '7', 3: '9', 4: '9', 5: '10', 6: '8', 7: '8'}\n", + "labels_comp: ['7', '7', '7', '9', '9', '10', '8', '8']\n", + "all_labels_ori: {'10', '4', '7', '9', '6', '5', '8'}\n", + "num_of_each_label: {'10': 1, '9': 2, '7': 3, '8': 2}\n", + "all_num_of_each_label: [{'5': 1, '4': 2, '6': 2, '7': 2}, {'10': 1, '9': 2, '7': 3, '8': 2}]\n", "\n", - " labels: {'3', '10', '4', 'O', '9', 'C', '6', 'S', '5', '8'}\n", - "\n", - " vector1: [[2 0 1 2 0 5 2 0 2 0]]\n", + " all_num_of_labels_occured: 10\n", "\n", - " vector2: [[0 1 0 0 2 6 3 2 0 2]]\n", + " --- calculating kernel matrix ---\n", "\n", - " labels: {'10', '9', 'C', '6', 'S', '8'}\n", + " labels: {'5', '4', '6', '7'}\n", + "vector1: [[1 2 2 2]]\n", + "vector2: [[1 2 2 2]]\n", "\n", - " vector1: [[1 2 6 3 2 2]]\n", + " labels: {'10', '4', '7', '9', '6', '5', '8'}\n", + "vector1: [[0 2 2 0 2 1 0]]\n", + "vector2: [[1 0 3 2 0 0 2]]\n", "\n", - " vector2: [[1 2 6 3 2 2]]\n", + " labels: {'8', '10', '7', '9'}\n", + "vector1: [[2 1 3 2]]\n", + "vector2: [[2 1 3 2]]\n", "\n", - " all_num_of_labels_occured: 3\n", + " Kmatrix: [[ 42. 36.]\n", + " [ 36. 58.]]\n", "\n", - " Kmatrix: [[ 71. 66.]\n", - " [ 66. 
98.]]\n", - "\n", - " --- height = 1 --- \n", + " --- height = 2 --- \n", "\n", " --- for graph 0 --- \n", "\n", - "labels_ori: ['6', '6', '5', '5', '4', '3', '3']\n", - "num_of_each_label: {'3': 2, '5': 2, '4': 1, '6': 2}\n", - "num_of_labels: 4\n", - "all_labels_ori: {'3', '5', '4', '6'}\n", - "num_of_labels_occured: 7\n", - "multiset: ['65', '65', '536', '536', '433', '345', '345']\n", - "set_unique: ['345', '536', '65', '433']\n", - "set_compressed: {'345': '8', '536': '9', '65': '10', '433': '11'}\n", - "all_set_compressed: {'345': '8', '536': '9', '65': '10', '433': '11'}\n", - "num_of_labels_occured: 11\n", + "multiset: ['76', '76', '647', '647', '544', '456', '456']\n", + "set_unique: ['647', '76', '456', '544']\n", + "set_compressed: {'647': '11', '76': '12', '544': '14', '456': '13'}\n", + "all_set_compressed: {'647': '11', '76': '12', '456': '13', '544': '14'}\n", + "num_of_labels_occured: 14\n", "\n", - " compressed labels: {0: '10', 1: '10', 2: '9', 3: '9', 4: '11', 5: '8', 6: '8'}\n", - "labels_comp: ['10', '10', '9', '9', '11', '8', '8']\n", - "num_of_each_label: {'3': 2, '10': 2, '4': 1, '9': 2, '6': 2, '11': 1, '8': 2, '5': 2}\n", - "all_num_of_each_label: [{'3': 2, '10': 2, '4': 1, '9': 2, '6': 2, '11': 1, '8': 2, '5': 2}]\n", + " compressed labels: {0: '12', 1: '12', 2: '11', 3: '11', 4: '14', 5: '13', 6: '13'}\n", + "labels_comp: ['12', '12', '11', '11', '14', '13', '13']\n", + "all_labels_ori: {'14', '12', '11', '13'}\n", + "num_of_each_label: {'14': 1, '13': 2, '12': 2, '11': 2}\n", + "all_num_of_each_label: [{'14': 1, '13': 2, '12': 2, '11': 2}]\n", "\n", " --- for graph 1 --- \n", "\n", - "labels_ori: ['6', '6', '6', '9', '9', '10', '8', '8']\n", - "num_of_each_label: {'10': 1, '6': 3, '9': 2, '8': 2}\n", - "num_of_labels: 4\n", - "all_labels_ori: {'3', '10', '4', '9', '6', '5', '8'}\n", - "num_of_labels_occured: 14\n", - "multiset: ['69', '69', '610', '968', '968', '10688', '8109', '8109']\n", - "set_unique: ['69', '968', '8109', '10688', 
'610']\n", - "set_compressed: {'69': '15', '8109': '17', '10688': '18', '968': '16', '610': '19'}\n", - "all_set_compressed: {'69': '15', '8109': '17', '968': '16', '345': '8', '10688': '18', '610': '19', '536': '9', '65': '10', '433': '11'}\n", + "multiset: ['79', '79', '710', '978', '978', '10788', '8109', '8109']\n", + "set_unique: ['710', '8109', '79', '10788', '978']\n", + "set_compressed: {'710': '15', '79': '17', '8109': '16', '978': '19', '10788': '18'}\n", + "all_set_compressed: {'710': '15', '79': '17', '978': '19', '10788': '18', '8109': '16', '456': '13', '544': '14', '647': '11', '76': '12'}\n", "num_of_labels_occured: 19\n", "\n", - " compressed labels: {0: '15', 1: '15', 2: '19', 3: '16', 4: '16', 5: '18', 6: '17', 7: '17'}\n", - "labels_comp: ['15', '15', '19', '16', '16', '18', '17', '17']\n", - "num_of_each_label: {'10': 1, '18': 1, '19': 1, '9': 2, '17': 2, '6': 3, '8': 2, '16': 2, '15': 2}\n", - "all_num_of_each_label: [{'3': 2, '10': 2, '4': 1, '9': 2, '6': 2, '11': 1, '8': 2, '5': 2}, {'10': 1, '18': 1, '19': 1, '9': 2, '17': 2, '6': 3, '8': 2, '16': 2, '15': 2}]\n", - "\n", - " labels: {'3', '10', '4', '5', '9', '6', '11', '8'}\n", + " compressed labels: {0: '17', 1: '17', 2: '15', 3: '19', 4: '19', 5: '18', 6: '16', 7: '16'}\n", + "labels_comp: ['17', '17', '15', '19', '19', '18', '16', '16']\n", + "all_labels_ori: {'18', '19', '12', '13', '17', '11', '14', '16', '15'}\n", + "num_of_each_label: {'15': 1, '17': 2, '19': 2, '16': 2, '18': 1}\n", + "all_num_of_each_label: [{'14': 1, '13': 2, '12': 2, '11': 2}, {'15': 1, '17': 2, '19': 2, '16': 2, '18': 1}]\n", "\n", - " vector1: [[2 2 1 2 2 2 1 2]]\n", + " all_num_of_labels_occured: 19\n", "\n", - " vector2: [[2 2 1 2 2 2 1 2]]\n", + " --- calculating kernel matrix ---\n", "\n", - " labels: {'3', '10', '4', '18', '5', '19', '9', '17', '6', '11', '8', '16', '15'}\n", + " labels: {'14', '12', '11', '13'}\n", + "vector1: [[1 2 2 2]]\n", + "vector2: [[1 2 2 2]]\n", "\n", - " vector1: [[2 2 1 0 2 0 
2 0 2 1 2 0 0]]\n", + " labels: {'18', '19', '12', '13', '17', '11', '14', '16', '15'}\n", + "vector1: [[0 0 2 2 0 2 1 0 0]]\n", + "vector2: [[1 2 0 0 2 0 0 2 1]]\n", "\n", - " vector2: [[0 1 0 1 0 1 2 2 3 0 2 2 2]]\n", + " labels: {'18', '17', '15', '16', '19'}\n", + "vector1: [[1 2 1 2 2]]\n", + "vector2: [[1 2 1 2 2]]\n", "\n", - " labels: {'10', '18', '19', '9', '17', '6', '8', '16', '15'}\n", - "\n", - " vector1: [[1 1 1 2 2 3 2 2 2]]\n", - "\n", - " vector2: [[1 1 1 2 2 3 2 2 2]]\n", - "\n", - " all_num_of_labels_occured: 10\n", + " Kmatrix: [[ 55. 36.]\n", + " [ 36. 72.]]\n", "\n", - " Kmatrix: [[ 97. 82.]\n", - " [ 82. 130.]]\n", - "\n", - " --- Weisfeiler-Lehman subtree kernel built in 0.003629922866821289 seconds ---\n" + " --- Weisfeiler-Lehman subtree kernel built in 0.0034377574920654297 seconds ---\n" ] }, { "data": { "text/plain": [ - "array([[ 97., 82.],\n", - " [ 82., 130.]])" + "array([[ 55., 36.],\n", + " [ 36., 72.]])" ] }, - "execution_count": 6, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -635,6 +610,8 @@ " print('\\n --- for graph %d --- \\n' % (idx))\n", " labels_ori = list(nx.get_node_attributes(G, 'label').values())\n", " print('labels_ori: %s' % (labels_ori))\n", + " all_labels_ori.update(labels_ori)\n", + " print('all_labels_ori: %s' % (all_labels_ori))\n", " num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph\n", " print('num_of_each_label: %s' % (num_of_each_label))\n", " all_num_of_each_label.append(num_of_each_label)\n", @@ -646,46 +623,48 @@ " all_labels_ori.update(labels_ori)\n", " print('all_labels_ori: %s' % (all_labels_ori))\n", " \n", + " all_num_of_labels_occured += len(all_labels_ori)\n", + " print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n", + " \n", " # calculate subtree kernel with the 0th iteration and add it to the final kernel\n", + " print('\\n --- calculating kernel matrix ---')\n", " for i in range(0, len(Gn)):\n", " 
for j in range(i, len(Gn)):\n", " labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n", " print('\\n labels: %s' % (labels))\n", " vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n", " vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n", - " print('\\n vector1: %s' % (vector1))\n", - " print('\\n vector2: %s' % (vector2))\n", + " print('vector1: %s' % (vector1))\n", + " print('vector2: %s' % (vector2))\n", " Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n", " Kmatrix[j][i] = Kmatrix[i][j]\n", - " print('\\n Kmatrix: %s' % (Kmatrix))\n", - " \n", - " \n", + " print('Kmatrix: %s' % (Kmatrix))\n", + "\n", " \n", " # iterate each height\n", - " for h in range(height + 1):\n", + " for h in range(1, height + 1):\n", " print('\\n --- height = %d --- ' % (h))\n", - " all_labels_ori = set() # all unique orignal labels in all graphs in this iteration\n", - "# all_labels_comp = set() # all unique compressed labels in all graphs in this iteration\n", - " all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration\n", " all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration\n", " num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs\n", + " all_labels_ori = set()\n", + " all_num_of_each_label = []\n", " \n", " # for each graph\n", " for idx, G in enumerate(Gn):\n", - " # get the set of original labels\n", + "# # get the set of original labels\n", " print('\\n --- for graph %d --- \\n' % (idx))\n", - " labels_ori = list(nx.get_node_attributes(G, 'label').values())\n", - " print('labels_ori: %s' % (labels_ori))\n", - " num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each 
label in graph\n", - " print('num_of_each_label: %s' % (num_of_each_label))\n", - " num_of_labels = len(num_of_each_label) # number of all unique labels\n", - " print('num_of_labels: %s' % (num_of_labels))\n", + "# labels_ori = list(nx.get_node_attributes(G, 'label').values())\n", + "# print('labels_ori: %s' % (labels_ori))\n", + "# num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph\n", + "# print('num_of_each_label: %s' % (num_of_each_label))\n", + "# num_of_labels = len(num_of_each_label) # number of all unique labels\n", + "# print('num_of_labels: %s' % (num_of_labels))\n", " \n", - " all_labels_ori.update(labels_ori)\n", - " print('all_labels_ori: %s' % (all_labels_ori))\n", - " # num_of_labels_occured += num_of_labels #@todo not precise\n", - " num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)\n", - " print('num_of_labels_occured: %s' % (num_of_labels_occured))\n", + "# all_labels_ori.update(labels_ori)\n", + "# print('all_labels_ori: %s' % (all_labels_ori))\n", + "# # num_of_labels_occured += num_of_labels #@todo not precise\n", + "# num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)\n", + "# print('num_of_labels_occured: %s' % (num_of_labels_occured))\n", " \n", " set_multisets = []\n", " for node in G.nodes(data = True):\n", @@ -718,7 +697,6 @@ "# num_of_labels_occured += len(set_compressed) #@todo not precise\n", " print('num_of_labels_occured: %s' % (num_of_labels_occured))\n", " \n", - "\n", " # relabel nodes\n", " # nx.relabel_nodes(G, set_compressed, copy = False)\n", " for node in G.nodes(data = True):\n", @@ -728,25 +706,29 @@ " # get the set of compressed labels\n", " labels_comp = list(nx.get_node_attributes(G, 'label').values())\n", " print('labels_comp: %s' % (labels_comp))\n", - " num_of_each_label.update(dict(Counter(labels_comp)))\n", + " all_labels_ori.update(labels_comp)\n", + " print('all_labels_ori: %s' % 
(all_labels_ori))\n", + " num_of_each_label = dict(Counter(labels_comp))\n", " print('num_of_each_label: %s' % (num_of_each_label))\n", " all_num_of_each_label.append(num_of_each_label)\n", " print('all_num_of_each_label: %s' % (all_num_of_each_label))\n", + " \n", + " all_num_of_labels_occured += len(all_labels_ori)\n", + " print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n", " \n", " # calculate subtree kernel with h iterations and add it to the final kernel\n", + " print('\\n --- calculating kernel matrix ---')\n", " for i in range(0, len(Gn)):\n", " for j in range(i, len(Gn)):\n", " labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n", " print('\\n labels: %s' % (labels))\n", " vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n", " vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n", - " print('\\n vector1: %s' % (vector1))\n", - " print('\\n vector2: %s' % (vector2))\n", + " print('vector1: %s' % (vector1))\n", + " print('vector2: %s' % (vector2))\n", " Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n", " Kmatrix[j][i] = Kmatrix[i][j]\n", " \n", - " all_num_of_labels_occured += len(all_labels_ori)\n", - " print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n", " print('\\n Kmatrix: %s' % (Kmatrix))\n", "\n", " return Kmatrix\n", @@ -766,13 +748,13 @@ "G2 = dataset[80]\n", "print(nx.get_node_attributes(G2, 'label'))\n", "\n", - "weisfeilerlehmankernel(G1, G2, height = 1)\n", + "weisfeilerlehmankernel(G1, G2, height = 2)\n", "# Kmatrix = weisfeilerlehmankernel(G1, G2)" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -790,7 +772,7 @@ "correspond to the average of the performances on the test sets. 
\n", "\n", "@references\n", - " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", + " Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", "\n", "\n", " --- calculating kernel matrix when subtree height = 0 ---\n", @@ -819,22 +801,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.49373626708984375 seconds ---\n", - "[[ 10. 10. 4. ..., 20. 20. 20.]\n", - " [ 10. 16. 4. ..., 20. 20. 20.]\n", - " [ 4. 4. 10. ..., 22. 22. 24.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.3845643997192383 seconds ---\n", + "[[ 5. 6. 4. ..., 20. 20. 20.]\n", + " [ 6. 8. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 5. ..., 21. 21. 21.]\n", " ..., \n", - " [ 20. 20. 22. ..., 130. 130. 122.]\n", - " [ 20. 20. 22. ..., 130. 130. 122.]\n", - " [ 20. 20. 24. ..., 122. 122. 154.]]\n", + " [ 20. 20. 21. ..., 101. 101. 101.]\n", + " [ 20. 20. 21. ..., 101. 101. 101.]\n", + " [ 20. 20. 21. ..., 101. 101. 101.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 12.761978\n", - "With standard deviation: 10.086502\n", + " Mean performance on train set: 141.418957\n", + "With standard deviation: 1.082842\n", "\n", - " Mean performance on test set: 9.014031\n", - "With standard deviation: 6.357865\n", + " Mean performance on test set: 36.210792\n", + "With standard deviation: 7.331787\n", "\n", " --- calculating kernel matrix when subtree height = 1 ---\n", "\n", @@ -862,22 +844,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.0043864250183105 seconds ---\n", - "[[ 20. 14. 8. ..., 20. 20. 22.]\n", - " [ 14. 32. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 20. ..., 25. 25. 
30.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.853447437286377 seconds ---\n", + "[[ 10. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 16. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 10. ..., 22. 22. 24.]\n", " ..., \n", - " [ 20. 28. 25. ..., 188. 180. 145.]\n", - " [ 20. 28. 25. ..., 180. 182. 145.]\n", - " [ 22. 22. 30. ..., 145. 145. 238.]]\n", + " [ 20. 20. 22. ..., 130. 130. 122.]\n", + " [ 20. 20. 22. ..., 130. 130. 122.]\n", + " [ 20. 20. 24. ..., 122. 122. 154.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 22.747869\n", - "With standard deviation: 7.561365\n", + " Mean performance on train set: 140.065309\n", + "With standard deviation: 0.877976\n", "\n", - " Mean performance on test set: 19.457133\n", - "With standard deviation: 5.057464\n", + " Mean performance on test set: 9.000982\n", + "With standard deviation: 6.371454\n", "\n", " --- calculating kernel matrix when subtree height = 2 ---\n", "\n", @@ -905,22 +887,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.602942705154419 seconds ---\n", - "[[ 30. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 48. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 30. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.374389410018921 seconds ---\n", + "[[ 15. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 24. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 15. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 246. 209. 147.]\n", - " [ 20. 28. 25. ..., 209. 220. 147.]\n", - " [ 23. 22. 32. ..., 147. 147. 286.]]\n", + " [ 20. 20. 22. ..., 159. 151. 124.]\n", + " [ 20. 20. 22. ..., 151. 153. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
185.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 21.407092\n", - "With standard deviation: 6.415967\n", + " Mean performance on train set: 140.074983\n", + "With standard deviation: 0.928821\n", "\n", - " Mean performance on test set: 23.466810\n", - "With standard deviation: 5.836831\n", + " Mean performance on test set: 19.811299\n", + "With standard deviation: 4.049105\n", "\n", " --- calculating kernel matrix when subtree height = 3 ---\n", "\n", @@ -954,22 +936,22 @@ "output_type": "stream", "text": [ "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.2096023559570312 seconds ---\n", - "[[ 40. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 64. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 40. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.9141185283660889 seconds ---\n", + "[[ 20. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 32. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 20. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 304. 217. 147.]\n", - " [ 20. 28. 25. ..., 217. 250. 147.]\n", - " [ 23. 22. 32. ..., 147. 147. 314.]]\n", + " [ 20. 20. 22. ..., 188. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 168. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 202.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 24.747018\n", - "With standard deviation: 6.547340\n", + " Mean performance on train set: 140.197806\n", + "With standard deviation: 0.873857\n", "\n", - " Mean performance on test set: 27.961360\n", - "With standard deviation: 6.291821\n", + " Mean performance on test set: 25.045500\n", + "With standard deviation: 4.942763\n", "\n", " --- calculating kernel matrix when subtree height = 4 ---\n", "\n", @@ -997,22 +979,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.7832393646240234 seconds ---\n", - "[[ 50. 14. 8. ..., 20. 20. 
23.]\n", - " [ 14. 80. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 50. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.393263578414917 seconds ---\n", + "[[ 25. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 40. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 25. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 362. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 280. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 336.]]\n", + " [ 20. 20. 22. ..., 217. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 183. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 213.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 27.197367\n", - "With standard deviation: 5.980185\n", + " Mean performance on train set: 140.272421\n", + "With standard deviation: 0.838915\n", "\n", - " Mean performance on test set: 30.614531\n", - "With standard deviation: 6.852841\n", + " Mean performance on test set: 28.225454\n", + "With standard deviation: 6.521196\n", "\n", " --- calculating kernel matrix when subtree height = 5 ---\n", "\n", @@ -1040,22 +1022,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.276118040084839 seconds ---\n", - "[[ 60. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 96. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 60. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.893545389175415 seconds ---\n", + "[[ 30. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 48. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 30. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 420. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 310. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 358.]]\n", + " [ 20. 20. 22. ..., 246. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 198. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
224.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 29.010593\n", - "With standard deviation: 6.073672\n", + " Mean performance on train set: 140.247025\n", + "With standard deviation: 0.863630\n", "\n", - " Mean performance on test set: 32.130815\n", - "With standard deviation: 7.062947\n", + " Mean performance on test set: 30.635436\n", + "With standard deviation: 6.736466\n", "\n", " --- calculating kernel matrix when subtree height = 6 ---\n", "\n", @@ -1083,22 +1065,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.779860496520996 seconds ---\n", - "[[ 70. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 112. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 70. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.216407299041748 seconds ---\n", + "[[ 35. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 56. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 35. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 478. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 340. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 380.]]\n", + " [ 20. 20. 22. ..., 275. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 213. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
235.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 30.909632\n", - "With standard deviation: 6.490001\n", + " Mean performance on train set: 140.239201\n", + "With standard deviation: 0.872475\n", "\n", - " Mean performance on test set: 33.117974\n", - "With standard deviation: 7.069399\n", + " Mean performance on test set: 32.102695\n", + "With standard deviation: 6.856006\n", "\n", " --- calculating kernel matrix when subtree height = 7 ---\n", "\n", @@ -1124,18 +1106,7 @@ "\n", " --- This is a regression problem ---\n", "\n", - " Calculating kernel matrix, this could take a while...\n", - "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.446576833724976 seconds ---\n", - "[[ 80. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 128. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 80. ..., 25. 25. 32.]\n", - " ..., \n", - " [ 20. 28. 25. ..., 536. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 370. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 402.]]\n", - "\n", - " Saving kernel matrix to file...\n" + " Calculating kernel matrix, this could take a while...\n" ] }, { @@ -1143,11 +1114,22 @@ "output_type": "stream", "text": [ "\n", - " Mean performance on val set: 31.870406\n", - "With standard deviation: 6.522032\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.8147408962249756 seconds ---\n", + "[[ 40. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 64. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 40. ..., 22. 22. 26.]\n", + " ..., \n", + " [ 20. 20. 22. ..., 304. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 228. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
246.]]\n", + "\n", + " Saving kernel matrix to file...\n", + "\n", + " Mean performance on train set: 140.094026\n", + "With standard deviation: 0.917704\n", "\n", - " Mean performance on test set: 33.964633\n", - "With standard deviation: 7.270535\n", + " Mean performance on test set: 32.970919\n", + "With standard deviation: 6.896061\n", "\n", " --- calculating kernel matrix when subtree height = 8 ---\n", "\n", @@ -1175,22 +1157,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.85552978515625 seconds ---\n", - "[[ 90. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 144. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 90. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.3765342235565186 seconds ---\n", + "[[ 45. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 72. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 45. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 594. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 400. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 424.]]\n", + " [ 20. 20. 22. ..., 333. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 243. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 257.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 32.192715\n", - "With standard deviation: 6.389616\n", + " Mean performance on train set: 140.076304\n", + "With standard deviation: 0.931866\n", "\n", - " Mean performance on test set: 34.325288\n", - "With standard deviation: 7.375800\n", + " Mean performance on test set: 33.511228\n", + "With standard deviation: 6.907530\n", "\n", " --- calculating kernel matrix when subtree height = 9 ---\n", "\n", @@ -1218,22 +1200,22 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.650352239608765 seconds ---\n", - "[[ 100. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 160. 4. ..., 28. 28. 
22.]\n", - " [ 8. 4. 100. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.885462284088135 seconds ---\n", + "[[ 50. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 80. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 50. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 652. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 430. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 446.]]\n", + " [ 20. 20. 22. ..., 362. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 258. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 268.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 32.842545\n", - "With standard deviation: 6.213069\n", + " Mean performance on train set: 139.913361\n", + "With standard deviation: 0.928974\n", "\n", - " Mean performance on test set: 34.675515\n", - "With standard deviation: 7.314709\n", + " Mean performance on test set: 33.850152\n", + "With standard deviation: 6.914269\n", "\n", " --- calculating kernel matrix when subtree height = 10 ---\n", "\n", @@ -1261,42 +1243,41 @@ "\n", " Calculating kernel matrix, this could take a while...\n", "\n", - " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.818731069564819 seconds ---\n", - "[[ 110. 14. 8. ..., 20. 20. 23.]\n", - " [ 14. 176. 4. ..., 28. 28. 22.]\n", - " [ 8. 4. 110. ..., 25. 25. 32.]\n", + " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.313802719116211 seconds ---\n", + "[[ 55. 10. 4. ..., 20. 20. 20.]\n", + " [ 10. 88. 4. ..., 20. 20. 20.]\n", + " [ 4. 4. 55. ..., 22. 22. 26.]\n", " ..., \n", - " [ 20. 28. 25. ..., 710. 217. 151.]\n", - " [ 20. 28. 25. ..., 217. 460. 147.]\n", - " [ 23. 22. 32. ..., 151. 147. 468.]]\n", + " [ 20. 20. 22. ..., 391. 159. 124.]\n", + " [ 20. 20. 22. ..., 159. 273. 124.]\n", + " [ 20. 20. 26. ..., 124. 124. 
279.]]\n", "\n", " Saving kernel matrix to file...\n", "\n", - " Mean performance on val set: 33.151974\n", - "With standard deviation: 6.196320\n", - "\n", - " Mean performance on test set: 34.867215\n", - "With standard deviation: 7.324672\n", - "\n", - "\n", - " std height RMSE\n", - "------- -------- --------\n", - "6.35786 1 9.01403\n", - "5.05746 2.1 19.4571\n", - "5.83683 3.2 23.4668\n", - "6.29182 4.3 27.9614\n", - "6.85284 5.4 30.6145\n", - "7.06295 6.5 32.1308\n", - "7.0694 7.6 33.118\n", - "7.27054 8.7 33.9646\n", - "7.3758 9.8 34.3253\n", - "7.31471 10.9 34.6755\n", - "7.32467 12 34.8672\n" + " Mean performance on train set: 139.894176\n", + "With standard deviation: 0.942612\n", + "\n", + " Mean performance on test set: 34.096283\n", + "With standard deviation: 6.931154\n", + "\n", + "\n", + " height RMSE_test std_test RMSE_train std_train k_time\n", + "-------- ----------- ---------- ------------ ----------- --------\n", + " 0 36.2108 7.33179 141.419 1.08284 0.384564\n", + " 1 9.00098 6.37145 140.065 0.877976 0.853447\n", + " 2 19.8113 4.04911 140.075 0.928821 1.37439\n", + " 3 25.0455 4.94276 140.198 0.873857 1.91412\n", + " 4 28.2255 6.5212 140.272 0.838915 2.39326\n", + " 5 30.6354 6.73647 140.247 0.86363 2.89355\n", + " 6 32.1027 6.85601 140.239 0.872475 3.21641\n", + " 7 32.9709 6.89606 140.094 0.917704 3.81474\n", + " 8 33.5112 6.90753 140.076 0.931866 4.37653\n", + " 9 33.8502 6.91427 139.913 0.928974 4.88546\n", + " 10 34.0963 6.93115 139.894 0.942612 5.3138\n" ] } ], "source": [ - "# Author: Elisabetta Ghisu\n", "# test of WL subtree kernel\n", "\n", "\"\"\"\n", @@ -1310,7 +1291,7 @@ "correspond to the average of the performances on the test sets. 
\n", "\n", "@references\n", - " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", + " Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n", "\"\"\"\n", "\n", "print(__doc__)\n", @@ -1318,6 +1299,7 @@ "import sys\n", "import os\n", "import pathlib\n", + "from collections import OrderedDict\n", "sys.path.insert(0, \"../\")\n", "from tabulate import tabulate\n", "\n", @@ -1332,11 +1314,11 @@ "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel\n", "from pygraph.utils.graphfiles import loadDataset\n", "\n", - "val_means_height = []\n", - "val_stds_height = []\n", + "train_means_height = []\n", + "train_stds_height = []\n", "test_means_height = []\n", "test_stds_height = []\n", - "\n", + "kernel_build_time = []\n", "\n", "for height in np.linspace(0, 10, 11):\n", " print('\\n --- calculating kernel matrix when subtree height = %d ---' % height)\n", @@ -1378,13 +1360,14 @@ " print(Kmatrix)\n", " else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = weisfeilerlehmankernel(dataset, height = int(height))\n", + " Kmatrix, run_time = weisfeilerlehmankernel(dataset, node_label = 'atom', height = int(height))\n", + " kernel_build_time.append(run_time)\n", " print(Kmatrix)\n", " print('\\n Saving kernel matrix to file...')\n", " # np.savetxt(kernel_file, Kmatrix)\n", "\n", - " # Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n", - " val_split = []\n", + " # Initialize the performance of the best parameter trial on train with the corresponding performance on test\n", + " train_split = []\n", " test_split = []\n", "\n", " # For each split of the data\n", @@ -1404,17 +1387,14 @@ " # print(Kmatrix_perm)\n", " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n", "\n", - " # Set the training, validation and 
test\n", + " # Set the training, test\n", " # Note: the percentage can be set up by the user\n", - " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n", - " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n", - " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n", - " num_val = num_train_val - num_train # 10% (of train + val) for validation\n", + " num_train = int((datasize * 90) / 100) # 90% (of entire dataset) for training\n", + " num_test = datasize - num_train # 10% (of entire dataset) for test\n", "\n", " # Split the kernel matrix\n", " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n", - " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n", - " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n", + " Kmatrix_test = Kmatrix_perm[num_train:datasize, 0:num_train]\n", "\n", " # Split the targets\n", " y_train = y_perm[0:num_train]\n", @@ -1427,11 +1407,10 @@ " y_train = (y_train - y_train_mean) / float(y_train_std)\n", " # print(y)\n", "\n", - " y_val = y_perm[num_train:(num_train + num_val)]\n", - " y_test = y_perm[(num_train + num_val):datasize]\n", + " y_test = y_perm[num_train:datasize]\n", "\n", - " # Record the performance for each parameter trial respectively on validation and test set\n", - " perf_all_val = []\n", + " # Record the performance for each parameter trial respectively on train and test set\n", + " perf_all_train = []\n", " perf_all_test = []\n", "\n", " # For each parameter trial\n", @@ -1445,81 +1424,69 @@ " # KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n", " KR.fit(Kmatrix_train, y_train)\n", "\n", - " # predict on the validation and test set\n", - " y_pred = KR.predict(Kmatrix_val)\n", + " # predict on the train and test set\n", + " y_pred_train = KR.predict(Kmatrix_train)\n", " y_pred_test = KR.predict(Kmatrix_test)\n", " # print(y_pred)\n", "\n", " # adjust prediction: 
needed because the training targets have been normalizaed\n", - " y_pred = y_pred * float(y_train_std) + y_train_mean\n", - " # print(y_pred)\n", + " y_pred_train = y_pred_train * float(y_train_std) + y_train_mean\n", " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n", " # print(y_pred_test)\n", "\n", - " # root mean squared error on validation\n", - " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n", - " perf_all_val.append(rmse)\n", - "\n", - " # root mean squared error in test \n", + " # root mean squared error in train set\n", + " rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))\n", + " perf_all_train.append(rmse_train)\n", + " # root mean squared error in test set\n", " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n", " perf_all_test.append(rmse_test)\n", - "\n", - " # print('The performance on the validation set is: %3f' % rmse)\n", " # print('The performance on the test set is: %3f' % rmse_test)\n", "\n", " # --- FIND THE OPTIMAL PARAMETERS --- #\n", " # For regression: minimise the mean squared error\n", " if model_type == 'regression':\n", "\n", - " # get optimal parameter on validation (argmin mean squared error)\n", + " # get optimal parameter on test (argmin mean squared error)\n", " min_idx = np.argmin(perf_all_test)\n", " alpha_opt = alpha_grid[min_idx]\n", "\n", - " # performance corresponding to optimal parameter on val\n", - " perf_val_opt = perf_all_val[min_idx]\n", - "\n", - " # corresponding performance on test for the same parameter\n", + " # corresponding performance on train and test set for the same parameter\n", + " perf_train_opt = perf_all_train[min_idx]\n", " perf_test_opt = perf_all_test[min_idx]\n", - "\n", " # print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n", - " # print('The best performance on the validation set is: %3f' % perf_val_opt)\n", " # print('The corresponding performance on test set is: %3f' % perf_test_opt)\n", "\n", - " # 
append the best performance on validation\n", - " # at the current split\n", - " val_split.append(perf_val_opt)\n", - "\n", - " # append the correponding performance on the test set\n", + " # append the correponding performance on the train and test set\n", + " train_split.append(perf_train_opt)\n", " test_split.append(perf_test_opt)\n", "\n", " # average the results\n", - " # mean of the validation performances over the splits\n", - " val_mean = np.mean(np.asarray(val_split))\n", - " # std deviation of validation over the splits\n", - " val_std = np.std(np.asarray(val_split))\n", - "\n", - " # mean of the test performances over the splits\n", + " # mean of the train and test performances over the splits\n", + " train_mean = np.mean(np.asarray(train_split))\n", " test_mean = np.mean(np.asarray(test_split))\n", - " # std deviation of the test oer the splits\n", + " # std deviation of the train and test over the splits\n", + " train_std = np.std(np.asarray(train_split))\n", " test_std = np.std(np.asarray(test_split))\n", "\n", - " print('\\n Mean performance on val set: %3f' % val_mean)\n", - " print('With standard deviation: %3f' % val_std)\n", + " print('\\n Mean performance on train set: %3f' % train_mean)\n", + " print('With standard deviation: %3f' % train_std)\n", " print('\\n Mean performance on test set: %3f' % test_mean)\n", " print('With standard deviation: %3f' % test_std)\n", - " \n", - " val_means_height.append(val_mean)\n", - " val_stds_height.append(val_std)\n", + " \n", + " train_means_height.append(train_mean)\n", + " train_stds_height.append(train_std)\n", " test_means_height.append(test_mean)\n", " test_stds_height.append(test_std)\n", " \n", "print('\\n') \n", - "print(tabulate({'height': np.linspace(1, 12, 11), 'RMSE': test_means_height, 'std': test_stds_height}, headers='keys'))" + "table_dict = {'height': np.linspace(0, 10, 11), 'RMSE_test': test_means_height, 'std_test': test_stds_height, 'RMSE_train': train_means_height, 'std_train': 
train_stds_height, 'k_time': kernel_build_time}\n", + "keyorder = ['height', 'RMSE_test', 'std_test', 'RMSE_train', 'std_train', 'k_time']\n", + "print(tabulate(OrderedDict(sorted(table_dict.items(), key = lambda i:keyorder.index(i[0]))), headers='keys'))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "scrolled": true }, @@ -1566,185 +1533,21 @@ "\n", " --- This is a regression problem ---\n", "\n", - " Calculating kernel matrix, this could take a while...\n", - "\n", - " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 47.540945053100586 seconds ---\n", - "[[ 6. 2. 6. ..., 2. 2. 2.]\n", - " [ 2. 12. 2. ..., 0. 0. 6.]\n", - " [ 6. 2. 6. ..., 2. 2. 2.]\n", - " ..., \n", - " [ 2. 0. 2. ..., 110. 42. 14.]\n", - " [ 2. 0. 2. ..., 42. 110. 14.]\n", - " [ 2. 6. 2. ..., 14. 14. 110.]]\n", - "\n", - " Saving kernel matrix to file...\n", - "\n", - " Mean performance on val set: 38.533318\n", - "With standard deviation: 6.213602\n", - "\n", - " Mean performance on test set: 36.055557\n", - "With standard deviation: 5.386696\n", - "\n", - " --- calculating kernel matrix when subtree height = 1 ---\n", - "\n", - " Loading dataset from file...\n", - "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", - " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n", - " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n", - " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n", - " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n", - " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n", - " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n", - " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n", - " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n", - " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n", - " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n", - " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n", - " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 
188.9\n", - " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n", - " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n", - " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n", - " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n", - " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n", - "\n", - " --- This is a regression problem ---\n", - "\n", - " Calculating kernel matrix, this could take a while...\n", - "\n", - " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 75.94973611831665 seconds ---\n", - "[[ 9. 3. 9. ..., 3. 3. 3.]\n", - " [ 3. 18. 3. ..., 0. 0. 9.]\n", - " [ 9. 3. 9. ..., 3. 3. 3.]\n", - " ..., \n", - " [ 3. 0. 3. ..., 165. 63. 21.]\n", - " [ 3. 0. 3. ..., 63. 165. 21.]\n", - " [ 3. 9. 3. ..., 21. 21. 165.]]\n", - "\n", - " Saving kernel matrix to file...\n", - "\n", - " Mean performance on val set: 38.464684\n", - "With standard deviation: 6.299737\n", - "\n", - " Mean performance on test set: 36.054735\n", - "With standard deviation: 5.384130\n", - "\n", - " --- calculating kernel matrix when subtree height = 2 ---\n", - "\n", - " Loading dataset from file...\n", - "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", - " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n", - " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n", - " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n", - " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n", - " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n", - " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n", - " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n", - " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n", - " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n", - " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n", - " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n", - " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n", - " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 
205.\n", - " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n", - " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n", - " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n", - " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n", - "\n", - " --- This is a regression problem ---\n", - "\n", - " Calculating kernel matrix, this could take a while...\n", - "\n", - " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 98.63305306434631 seconds ---\n", - "[[ 12. 4. 12. ..., 4. 4. 4.]\n", - " [ 4. 24. 4. ..., 0. 0. 12.]\n", - " [ 12. 4. 12. ..., 4. 4. 4.]\n", - " ..., \n", - " [ 4. 0. 4. ..., 220. 84. 28.]\n", - " [ 4. 0. 4. ..., 84. 220. 28.]\n", - " [ 4. 12. 4. ..., 28. 28. 220.]]\n", - "\n", - " Saving kernel matrix to file...\n", - "\n", - " Mean performance on val set: 38.594816\n", - "With standard deviation: 6.106887\n", - "\n", - " Mean performance on test set: 36.069839\n", - "With standard deviation: 5.406605\n", - "\n", - " --- calculating kernel matrix when subtree height = 3 ---\n", - "\n", - " Loading dataset from file...\n", - "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", - " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n", - " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n", - " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n", - " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n", - " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n", - " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n", - " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n", - " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n", - " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n", - " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n", - " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n", - " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n", - " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n", - " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 
159.5\n", - " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n", - " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n", - " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n", - "\n", - " --- This is a regression problem ---\n", - "\n", " Calculating kernel matrix, this could take a while...\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 126.40115857124329 seconds ---\n", - "[[ 15. 5. 15. ..., 5. 5. 5.]\n", - " [ 5. 30. 5. ..., 0. 0. 15.]\n", - " [ 15. 5. 15. ..., 5. 5. 5.]\n", - " ..., \n", - " [ 5. 0. 5. ..., 275. 105. 35.]\n", - " [ 5. 0. 5. ..., 105. 275. 35.]\n", - " [ 5. 15. 5. ..., 35. 35. 275.]]\n", - "\n", - " Saving kernel matrix to file...\n", - "\n", - " Mean performance on val set: 38.545772\n", - "With standard deviation: 6.200795\n", - "\n", - " Mean performance on test set: 36.055164\n", - "With standard deviation: 5.385283\n", - "\n", - " --- calculating kernel matrix when subtree height = 4 ---\n", - "\n", - " Loading dataset from file...\n", - "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n", - " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n", - " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n", - " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n", - " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n", - " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n", - " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n", - " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n", - " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n", - " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n", - " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n", - " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n", - " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n", - " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n", - " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 
159.5\n", - " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n", - " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n", - " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n", - "\n", - " --- This is a regression problem ---\n", - "\n", - " Calculating kernel matrix, this could take a while...\n" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n Calculating kernel matrix, this could take a while...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0mKmatrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mweisfeilerlehmankernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase_kernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'sp'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKmatrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n Saving kernel matrix to file...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py\u001b[0m in \u001b[0;36mweisfeilerlehmankernel\u001b[0;34m(height, base_kernel, 
*args)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_weisfeilerlehmankernel_do\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGn\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mheight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py\u001b[0m in \u001b[0;36m_weisfeilerlehmankernel_do\u001b[0;34m(G1, G2, height)\u001b[0m\n\u001b[1;32m 241\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0;31m# calculate kernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 243\u001b[0;31m \u001b[0mkernel\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mspkernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mG2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# change your base kernel here (and one more before)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 244\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[0;31m# get label sets of both graphs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spkernel.py\u001b[0m in \u001b[0;36mspkernel\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0me1\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mG1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medges\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0me2\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mG2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medges\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 64\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m 
\u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 65\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], @@ -1832,7 +1635,7 @@ " print(Kmatrix)\n", " else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = weisfeilerlehmankernel(dataset, height = int(height), base_kernel = 'sp')\n", + " Kmatrix = weisfeilerlehmankernel(dataset, node_label = 'atom', height = int(height), base_kernel = 'sp')\n", " print(Kmatrix)\n", " print('\\n Saving kernel matrix to file...')\n", "# np.savetxt(kernel_file, Kmatrix)\n", @@ -1885,7 +1688,7 @@ " y_test = y_perm[(num_train + 
num_val):datasize]\n", "\n", " # Record the performance for each parameter trial respectively on validation and test set\n", - " perf_all_val = []\n", + " perf_all_train = []\n", " perf_all_test = []\n", "\n", " # For each parameter trial\n", diff --git a/notebooks/run_marginalizedkernel_acyclic.ipynb b/notebooks/run_marginalizedkernel_acyclic.ipynb index 46838bd..08c2d33 100644 --- a/notebooks/run_marginalizedkernel_acyclic.ipynb +++ b/notebooks/run_marginalizedkernel_acyclic.ipynb @@ -357,7 +357,7 @@ " print(Kmatrix)\n", " else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = marginalizedkernel(dataset, p_quit, 20)\n", + " Kmatrix, run_time = marginalizedkernel(dataset, p_quit, 20, node_label = 'atom', edge_label = 'bond_type')\n", " print(Kmatrix)\n", " print('\\n Saving kernel matrix to file...')\n", " np.savetxt(kernel_file, Kmatrix)\n", diff --git a/notebooks/run_pathkernel_acyclic.ipynb b/notebooks/run_pathkernel_acyclic.ipynb index 6913a74..86bd8fc 100644 --- a/notebooks/run_pathkernel_acyclic.ipynb +++ b/notebooks/run_pathkernel_acyclic.ipynb @@ -686,7 +686,7 @@ " print(Kmatrix)\n", "else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = pathkernel(dataset)\n", + " Kmatrix, run_time = pathkernel(dataset, node_label = 'atom', edge_label = 'bond_type')\n", " print(Kmatrix)\n", " print('\\n Saving kernel matrix to file...')\n", " np.savetxt(kernel_file, Kmatrix)\n", diff --git a/notebooks/run_spkernel_acyclic.ipynb b/notebooks/run_spkernel_acyclic.ipynb index 1bf4920..b3e0f40 100644 --- a/notebooks/run_spkernel_acyclic.ipynb +++ b/notebooks/run_spkernel_acyclic.ipynb @@ -182,7 +182,8 @@ " print(Kmatrix)\n", "else:\n", " print('\\n Calculating kernel matrix, this could take a while...')\n", - " Kmatrix = spkernel(dataset)\n", + " #@Q: is it appropriate to use bond type between atoms as the edge weight to calculate shortest path????????\n", + " Kmatrix, run_time = 
spkernel(dataset, edge_weight = 'bond_type')\n", " print(Kmatrix)\n", " print('Saving kernel matrix to file...')\n", " np.savetxt(kernel_file_path, Kmatrix)\n", diff --git a/pygraph/kernels/__pycache__/weisfeilerLehmanKernel.cpython-35.pyc b/pygraph/kernels/__pycache__/weisfeilerLehmanKernel.cpython-35.pyc index 4b92029..242dee9 100644 Binary files a/pygraph/kernels/__pycache__/weisfeilerLehmanKernel.cpython-35.pyc and b/pygraph/kernels/__pycache__/weisfeilerLehmanKernel.cpython-35.pyc differ diff --git a/pygraph/kernels/marginalizedKernel.py b/pygraph/kernels/marginalizedKernel.py index a3fccc7..6e2ec81 100644 --- a/pygraph/kernels/marginalizedKernel.py +++ b/pygraph/kernels/marginalizedKernel.py @@ -8,7 +8,7 @@ import time from pygraph.kernels.deltaKernel import deltakernel -def marginalizedkernel(*args): +def marginalizedkernel(*args, node_label = 'atom', edge_label = 'bond_type'): """Calculate marginalized graph kernels between graphs. Parameters @@ -22,6 +22,10 @@ def marginalizedkernel(*args): the termination probability in the random walks generating step itr : integer time of iterations to calculate R_inf + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. 
Return ------ @@ -34,38 +38,43 @@ def marginalizedkernel(*args): """ if len(args) == 3: # for a list of graphs Gn = args[0] - Kmatrix = np.zeros((len(Gn), len(Gn))) start_time = time.time() for i in range(0, len(Gn)): for j in range(i, len(Gn)): - Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], args[1], args[2]) + Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label, edge_label, args[1], args[2]) Kmatrix[j][i] = Kmatrix[i][j] - print("\n --- marginalized kernel matrix of size %d built in %s seconds ---" % (len(Gn), (time.time() - start_time))) + run_time = time.time() - start_time + print("\n --- marginalized kernel matrix of size %d built in %s seconds ---" % (len(Gn), run_time)) - return Kmatrix + return Kmatrix, run_time else: # for only 2 graphs start_time = time.time() - kernel = _marginalizedkernel_do(args[0], args[1], args[2], args[3]) + kernel = _marginalizedkernel_do(args[0], args[1], node_label, edge_label, args[2], args[3]) - print("\n --- marginalized kernel built in %s seconds ---" % (time.time() - start_time)) + run_time = time.time() - start_time + print("\n --- marginalized kernel built in %s seconds ---" % (run_time)) - return kernel + return kernel, run_time -def _marginalizedkernel_do(G1, G2, p_quit, itr): +def _marginalizedkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type', p_quit, itr): """Calculate marginalized graph kernels between 2 graphs. Parameters ---------- G1, G2 : NetworkX graphs 2 graphs between which the kernel is calculated. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. 
p_quit : integer the termination probability in the random walks generating step itr : integer @@ -106,8 +115,8 @@ def _marginalizedkernel_do(G1, G2, p_quit, itr): for neighbor2 in neighbor_n2: t = p_trans_n1 * p_trans_n2 * \ - deltakernel(G1.node[neighbor1]['label'] == G2.node[neighbor2]['label']) * \ - deltakernel(neighbor_n1[neighbor1]['label'] == neighbor_n2[neighbor2]['label']) + deltakernel(G1.node[neighbor1][node_label] == G2.node[neighbor2][node_label]) * \ + deltakernel(neighbor_n1[neighbor1][edge_label] == neighbor_n2[neighbor2][edge_label]) R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][neighbor2] # ref [1] equation (8) R_inf[:] = R_inf_new @@ -115,7 +124,7 @@ def _marginalizedkernel_do(G1, G2, p_quit, itr): # add elements of R_inf up and calculate kernel for node1 in G1.nodes(data = True): for node2 in G2.nodes(data = True): - s = p_init_G1 * p_init_G2 * deltakernel(node1[1]['label'] == node2[1]['label']) + s = p_init_G1 * p_init_G2 * deltakernel(node1[1][node_label] == node2[1][node_label]) kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6) return kernel \ No newline at end of file diff --git a/pygraph/kernels/pathKernel.py b/pygraph/kernels/pathKernel.py index 5f89751..62d5d5d 100644 --- a/pygraph/kernels/pathKernel.py +++ b/pygraph/kernels/pathKernel.py @@ -8,7 +8,7 @@ import time from pygraph.kernels.deltaKernel import deltakernel -def pathkernel(*args): +def pathkernel(*args, node_label = 'atom', edge_label = 'bond_type'): """Calculate mean average path kernels between graphs. Parameters @@ -18,6 +18,10 @@ def pathkernel(*args): / G1, G2 : NetworkX graphs 2 graphs between which the kernel is calculated. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. Return ------ @@ -29,38 +33,43 @@ def pathkernel(*args): [1] Suard F, Rakotomamonjy A, Bensrhair A. 
Kernel on Bag of Paths For Measuring Similarity of Shapes. InESANN 2007 Apr 25 (pp. 355-360). """ if len(args) == 1: # for a list of graphs - Gn = args[0] - + Gn = args[0] Kmatrix = np.zeros((len(Gn), len(Gn))) start_time = time.time() for i in range(0, len(Gn)): for j in range(i, len(Gn)): - Kmatrix[i][j] = _pathkernel_do(Gn[i], Gn[j]) + Kmatrix[i][j] = _pathkernel_do(Gn[i], Gn[j], node_label, edge_label) Kmatrix[j][i] = Kmatrix[i][j] - print("\n --- mean average path kernel matrix of size %d built in %s seconds ---" % (len(Gn), (time.time() - start_time))) + run_time = time.time() - start_time + print("\n --- mean average path kernel matrix of size %d built in %s seconds ---" % (len(Gn), run_time)) - return Kmatrix + return Kmatrix, run_time else: # for only 2 graphs start_time = time.time() - kernel = _pathkernel_do(args[0], args[1]) + kernel = _pathkernel_do(args[0], args[1], node_label, edge_label) - print("\n --- mean average path kernel built in %s seconds ---" % (time.time() - start_time)) + run_time = time.time() - start_time + print("\n --- mean average path kernel built in %s seconds ---" % (run_time)) - return kernel + return kernel, run_time -def _pathkernel_do(G1, G2): +def _pathkernel_do(G1, G2, node_label = 'atom', edge_label = 'bond_type'): """Calculate mean average path kernels between 2 graphs. Parameters ---------- G1, G2 : NetworkX graphs 2 graphs between which the kernel is calculated. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. 
Return ------ @@ -72,24 +81,24 @@ def _pathkernel_do(G1, G2): num_nodes = G1.number_of_nodes() for node1 in range(num_nodes): for node2 in range(node1 + 1, num_nodes): - sp1.append(nx.shortest_path(G1, node1, node2, weight = 'cost')) + sp1.append(nx.shortest_path(G1, node1, node2, weight = edge_label)) sp2 = [] num_nodes = G2.number_of_nodes() for node1 in range(num_nodes): for node2 in range(node1 + 1, num_nodes): - sp2.append(nx.shortest_path(G2, node1, node2, weight = 'cost')) + sp2.append(nx.shortest_path(G2, node1, node2, weight = edge_label)) # calculate kernel kernel = 0 for path1 in sp1: for path2 in sp2: if len(path1) == len(path2): - kernel_path = deltakernel(G1.node[path1[0]]['label'] == G2.node[path2[0]]['label']) + kernel_path = deltakernel(G1.node[path1[0]][node_label] == G2.node[path2[0]][node_label]) if kernel_path: for i in range(1, len(path1)): # kernel = 1 if all corresponding nodes and edges in the 2 paths have same labels, otherwise 0 - kernel_path *= deltakernel(G1[path1[i - 1]][path1[i]]['label'] == G2[path2[i - 1]][path2[i]]['label']) * deltakernel(G1.node[path1[i]]['label'] == G2.node[path2[i]]['label']) + kernel_path *= deltakernel(G1[path1[i - 1]][path1[i]][edge_label] == G2[path2[i - 1]][path2[i]][edge_label]) * deltakernel(G1.node[path1[i]][node_label] == G2.node[path2[i]][node_label]) kernel += kernel_path # add up kernels of all paths kernel = kernel / (len(sp1) * len(sp2)) # calculate mean average diff --git a/pygraph/kernels/results.md b/pygraph/kernels/results.md new file mode 100644 index 0000000..f61fdfd --- /dev/null +++ b/pygraph/kernels/results.md @@ -0,0 +1,36 @@ +# results with minimal test RMSE for each kernel on dataset Asyclic +-- All the kernels are tested on dataset Asyclic, which consists of 185 molecules (graphs). +-- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression. 
+-- For predition we randomly divide the data in train and test subset, where 90% of entire dataset is for training and rest for testing. 10 splits are performed. For each split, we first train on the train data, then evaluate the performance on the test set. We choose the optimal parameters for the test set and finally provide the corresponding performance. The final results correspond to the average of the performances on the test sets. + +## summary + +| Kernels | RMSE(℃) | std(℃) | parameter | k_time | +|---------------|:---------:|:--------:|-------------:|-------:| +| shortest path | 36.40 | 5.35 | - | - | +| marginalized | 17.90 | 6.59 | p_quit = 0.1 | - | +| path | 14.27 | 6.37 | - | - | +| WL subtree | 9.00 | 6.37 | height = 1 | 0.85" | + +**In each line, paremeter is the one with which the kenrel achieves the best results. +In each line, k_time is the time spent on building the kernel matrix.** + +## detailed results of WL subtree kernel. +The table below shows the results of the WL subtree under different subtree heights. +``` + height RMSE_test std_test RMSE_train std_train k_time +-------- ----------- ---------- ------------ ----------- -------- + 0 36.2108 7.33179 141.419 1.08284 0.392911 + 1 9.00098 6.37145 140.065 0.877976 0.812077 + 2 19.8113 4.04911 140.075 0.928821 1.36955 + 3 25.0455 4.94276 140.198 0.873857 1.78629 + 4 28.2255 6.5212 140.272 0.838915 2.30847 + 5 30.6354 6.73647 140.247 0.86363 2.8258 + 6 32.1027 6.85601 140.239 0.872475 3.1542 + 7 32.9709 6.89606 140.094 0.917704 3.46081 + 8 33.5112 6.90753 140.076 0.931866 4.08857 + 9 33.8502 6.91427 139.913 0.928974 4.25243 + 10 34.0963 6.93115 139.894 0.942612 5.02607 +``` +**The unit of the *RMSEs* and *stds* is *℃*, The unit of the *k_time* is *s*. 
+k_time is the time spent on building the kernel matrix.** diff --git a/pygraph/kernels/spkernel.py b/pygraph/kernels/spkernel.py index cd49212..6136c78 100644 --- a/pygraph/kernels/spkernel.py +++ b/pygraph/kernels/spkernel.py @@ -10,7 +10,7 @@ import time from pygraph.utils.utils import getSPGraph -def spkernel(*args): +def spkernel(*args, edge_weight = 'bond_type'): """Calculate shortest-path kernels between graphs. Parameters @@ -20,6 +20,8 @@ def spkernel(*args): / G1, G2 : NetworkX graphs 2 graphs between which the kernel is calculated. + edge_weight : string + edge attribute corresponding to the edge weight. The default edge weight is bond_type. Return ------ @@ -37,7 +39,7 @@ def spkernel(*args): Sn = [] # get shortest path graphs of Gn for i in range(0, len(Gn)): - Sn.append(getSPGraph(Gn[i])) + Sn.append(getSPGraph(Gn[i], edge_weight = edge_weight)) start_time = time.time() for i in range(0, len(Gn)): @@ -48,13 +50,14 @@ def spkernel(*args): Kmatrix[i][j] += 1 Kmatrix[j][i] += (0 if i == j else 1) - print("--- shortest path kernel matrix of size %d built in %s seconds ---" % (len(Gn), (time.time() - start_time))) + run_time = time.time() - start_time + print("--- shortest path kernel matrix of size %d built in %s seconds ---" % (len(Gn), run_time)) - return Kmatrix + return Kmatrix, run_time else: # for only 2 graphs - G1 = getSPGraph(args[0]) - G2 = getSPGraph(args[1]) + G1 = getSPGraph(args[0], edge_weight = edge_weight) + G2 = getSPGraph(args[1], edge_weight = edge_weight) kernel = 0 diff --git a/pygraph/kernels/weisfeilerLehmanKernel.py b/pygraph/kernels/weisfeilerLehmanKernel.py index 5f37bcc..cc4558f 100644 --- a/pygraph/kernels/weisfeilerLehmanKernel.py +++ b/pygraph/kernels/weisfeilerLehmanKernel.py @@ -23,7 +23,7 @@ import time from pygraph.kernels.spkernel import spkernel from pygraph.kernels.pathKernel import pathkernel -def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'): +def weisfeilerlehmankernel(*args, node_label = 
'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'): """Calculate Weisfeiler-Lehman kernels between graphs. Parameters @@ -32,12 +32,15 @@ def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'): List of graphs between which the kernels are calculated. / G1, G2 : NetworkX graphs - 2 graphs between which the kernel is calculated. - - height : subtree height - - base_kernel : base kernel used in each iteration of WL kernel - the default base kernel is subtree kernel + 2 graphs between which the kernel is calculated. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. + height : int + subtree height + base_kernel : string + base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel. Return ------ @@ -57,7 +60,7 @@ def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'): # for WL subtree kernel if base_kernel == 'subtree': - Kmatrix = _wl_subtreekernel_do(args[0], height = height, base_kernel = 'subtree') + Kmatrix = _wl_subtreekernel_do(args[0], node_label, edge_label, height = height, base_kernel = 'subtree') # for WL edge kernel elif base_kernel == 'edge': @@ -73,9 +76,10 @@ def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'): Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j], height = height) Kmatrix[j][i] = Kmatrix[i][j] - print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), (time.time() - start_time))) + run_time = time.time() - start_time + print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), run_time)) - return Kmatrix + return Kmatrix, run_time else: # for only 2 graphs @@ -85,7 +89,7 @@ def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'): if base_kernel == 'subtree': args = [args[0], args[1]] 
- kernel = _wl_subtreekernel_do(args, height = height, base_kernel = 'subtree') + kernel = _wl_subtreekernel_do(args, node_label, edge_label, height = height, base_kernel = 'subtree') # for WL edge kernel elif base_kernel == 'edge': @@ -97,18 +101,27 @@ def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'): kernel = _pathkernel_do(args[0], args[1]) - print("\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---" % (base_kernel, time.time() - start_time)) + run_time = time.time() - start_time + print("\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---" % (base_kernel, run_time)) - return kernel + return kernel, run_time -def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'): +def _wl_subtreekernel_do(*args, node_label = 'atom', edge_label = 'bond_type', height = 0, base_kernel = 'subtree'): """Calculate Weisfeiler-Lehman subtree kernels between graphs. Parameters ---------- Gn : List of NetworkX graph - List of graphs between which the kernels are calculated. + List of graphs between which the kernels are calculated. + node_label : string + node attribute used as label. The default node label is atom. + edge_label : string + edge attribute used as label. The default edge label is bond_type. + height : int + subtree height + base_kernel : string + base kernel used in each iteration of WL kernel. The default base kernel is subtree kernel. 
Return ------ @@ -120,55 +133,54 @@ def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'): Kmatrix = np.zeros((len(Gn), len(Gn))) all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs - # initial + # initial for height = 0 + all_labels_ori = set() # all unique orignal labels in all graphs in this iteration + all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration + all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration + num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs + # for each graph - for idx, G in enumerate(Gn): + for G in Gn: # get the set of original labels - labels_ori = list(nx.get_node_attributes(G, 'label').values()) + labels_ori = list(nx.get_node_attributes(G, node_label).values()) + all_labels_ori.update(labels_ori) num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph + all_num_of_each_label.append(num_of_each_label) num_of_labels = len(num_of_each_label) # number of all unique labels all_labels_ori.update(labels_ori) - - -# # calculate subtree kernel while h = 0 and add it to the final kernel -# for i in range(0, len(Gn)): -# for j in range(i, len(Gn)): -# labels = set(list(nx.get_node_attributes(Gn[i], 'label').values()) + list(nx.get_node_attributes(Gn[j], 'label').values())) -# vector1 = np.matrix([ (nx.get_node_attributes(Gn[i], 'label').values()[label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ]) -# vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ]) -# Kmatrix[i][j] += np.dot(vector1, vector2.transpose()) -# Kmatrix[j][i] = Kmatrix[i][j] - + + all_num_of_labels_occured += len(all_labels_ori) + + # calculate subtree kernel with the 0th 
iteration and add it to the final kernel + for i in range(0, len(Gn)): + for j in range(i, len(Gn)): + labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys())) + vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ]) + vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ]) + Kmatrix[i][j] += np.dot(vector1, vector2.transpose()) + Kmatrix[j][i] = Kmatrix[i][j] # iterate each height - for h in range(height + 1): - all_labels_ori = set() # all unique orignal labels in all graphs in this iteration - all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration + for h in range(1, height + 1): all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs + all_labels_ori = set() + all_num_of_each_label = [] # for each graph for idx, G in enumerate(Gn): - # get the set of original labels - labels_ori = list(nx.get_node_attributes(G, 'label').values()) - num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph - num_of_labels = len(num_of_each_label) # number of all unique labels - - all_labels_ori.update(labels_ori) - num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed) set_multisets = [] for node in G.nodes(data = True): # Multiset-label determination. 
- multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ] + multiset = [ G.node[neighbors][node_label] for neighbors in G[node[0]] ] # sorting each multiset multiset.sort() - multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix + multiset = node[1][node_label] + ''.join(multiset) # concatenate to a string and add the prefix set_multisets.append(multiset) # label compression - # set_multisets.sort() # this is unnecessary set_unique = list(set(set_multisets)) # set of unique multiset labels # a dictionary mapping original labels to new ones. set_compressed = {} @@ -179,20 +191,20 @@ def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'): else: set_compressed.update({ value : str(num_of_labels_occured + 1) }) num_of_labels_occured += 1 -# set_compressed = { value : (all_set_compressed[value] if value in all_set_compressed.keys() else str(set_unique.index(value) + num_of_labels_occured + 1)) for value in set_unique } all_set_compressed.update(set_compressed) -# num_of_labels_occured += len(set_compressed) #@todo not precise - + # relabel nodes - # nx.relabel_nodes(G, set_compressed, copy = False) for node in G.nodes(data = True): - node[1]['label'] = set_compressed[set_multisets[node[0]]] + node[1][node_label] = set_compressed[set_multisets[node[0]]] # get the set of compressed labels - labels_comp = list(nx.get_node_attributes(G, 'label').values()) - num_of_each_label.update(dict(Counter(labels_comp))) + labels_comp = list(nx.get_node_attributes(G, node_label).values()) + all_labels_ori.update(labels_comp) + num_of_each_label = dict(Counter(labels_comp)) all_num_of_each_label.append(num_of_each_label) + + all_num_of_labels_occured += len(all_labels_ori) # calculate subtree kernel with h iterations and add it to the final kernel for i in range(0, len(Gn)): @@ -203,8 +215,6 @@ def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'): Kmatrix[i][j] += np.dot(vector1, vector2.transpose()) 
Kmatrix[j][i] = Kmatrix[i][j] - all_num_of_labels_occured += len(all_labels_ori) - return Kmatrix diff --git a/pygraph/utils/__pycache__/graphfiles.cpython-35.pyc b/pygraph/utils/__pycache__/graphfiles.cpython-35.pyc index f88e42e..5a901fd 100644 Binary files a/pygraph/utils/__pycache__/graphfiles.cpython-35.pyc and b/pygraph/utils/__pycache__/graphfiles.cpython-35.pyc differ diff --git a/pygraph/utils/utils.py b/pygraph/utils/utils.py index 52a85f1..7a65f34 100644 --- a/pygraph/utils/utils.py +++ b/pygraph/utils/utils.py @@ -5,18 +5,20 @@ import numpy as np def getSPLengths(G1): sp = nx.shortest_path(G1) distances = np.zeros((G1.number_of_nodes(), G1.number_of_nodes())) - for i in np.keys(): - for j in np[i].keys(): + for i in sp.keys(): + for j in sp[i].keys(): distances[i, j] = len(sp[i][j])-1 return distances -def getSPGraph(G): +def getSPGraph(G, edge_weight = 'bond_type'): """Transform graph G to its corresponding shortest-paths graph. Parameters ---------- G : NetworkX graph The graph to be tramsformed. + edge_weight : string + edge attribute corresponding to the edge weight. The default edge weight is bond_type. Return ------ @@ -31,15 +33,17 @@ def getSPGraph(G): ---------- [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE. """ - return floydTransformation(G) + return floydTransformation(G, edge_weight = edge_weight) -def floydTransformation(G): +def floydTransformation(G, edge_weight = 'bond_type'): """Transform graph G to its corresponding shortest-paths graph using Floyd-transformation. Parameters ---------- G : NetworkX graph The graph to be tramsformed. + edge_weight : string + edge attribute corresponding to the edge weight. The default edge weight is bond_type. Return ------ @@ -50,7 +54,7 @@ def floydTransformation(G): ---------- [1] Borgwardt KM, Kriegel HP. Shortest-path kernels on graphs. 
InData Mining, Fifth IEEE International Conference on 2005 Nov 27 (pp. 8-pp). IEEE. """ - spMatrix = nx.floyd_warshall_numpy(G) # @todo weigth label not considered + spMatrix = nx.floyd_warshall_numpy(G, weight = edge_weight) S = nx.Graph() S.add_nodes_from(G.nodes(data=True)) for i in range(0, G.number_of_nodes()):