|
|
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 0.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.38979601860046387 seconds ---\n",
- "[[ 5. 6. 4. ..., 20. 20. 20.]\n",
- " [ 6. 8. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 5. ..., 21. 21. 21.]\n",
- " ..., \n",
- " [ 20. 20. 21. ..., 101. 101. 101.]\n",
- " [ 20. 20. 21. ..., 101. 101. 101.]\n",
- " [ 20. 20. 21. ..., 101. 101. 101.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 17.681582\n",
- "With standard deviation: 0.713183\n",
- "\n",
- " Mean performance on test set: 15.685879\n",
- "With standard deviation: 4.139197\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 1.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.8205692768096924 seconds ---\n",
- "[[ 10. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 16. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 10. ..., 22. 22. 24.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 130. 130. 122.]\n",
- " [ 20. 20. 22. ..., 130. 130. 122.]\n",
- " [ 20. 20. 24. ..., 122. 122. 154.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 6.270014\n",
- "With standard deviation: 0.654734\n",
- "\n",
- " Mean performance on test set: 7.550458\n",
- "With standard deviation: 2.331786\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 2.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.375309705734253 seconds ---\n",
- "[[ 15. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 24. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 15. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 159. 151. 124.]\n",
- " [ 20. 20. 22. ..., 151. 153. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 185.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 4.450682\n",
- "With standard deviation: 0.882129\n",
- "\n",
- " Mean performance on test set: 9.728466\n",
- "With standard deviation: 2.057669\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 3.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.8636789321899414 seconds ---\n",
- "[[ 20. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 32. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 20. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 188. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 168. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 202.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 2.270586\n",
- "With standard deviation: 0.481516\n",
- "\n",
- " Mean performance on test set: 11.296110\n",
- "With standard deviation: 2.799944\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 4.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.5077457427978516 seconds ---\n",
- "[[ 25. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 40. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 25. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 217. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 183. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 213.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 1.074035\n",
- "With standard deviation: 0.637823\n",
- "\n",
- " Mean performance on test set: 12.808303\n",
- "With standard deviation: 3.446939\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 5.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.8235607147216797 seconds ---\n",
- "[[ 30. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 48. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 30. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 246. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 198. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 224.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.700602\n",
- "With standard deviation: 0.572640\n",
- "\n",
- " Mean performance on test set: 14.017923\n",
- "With standard deviation: 3.675042\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 6.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.458494186401367 seconds ---\n",
- "[[ 35. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 56. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 35. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 275. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 213. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 235.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691515\n",
- "With standard deviation: 0.564620\n",
- "\n",
- " Mean performance on test set: 14.918434\n",
- "With standard deviation: 3.805352\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 7.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.861224889755249 seconds ---\n",
- "[[ 40. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 64. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 40. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 304. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 228. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 246.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691516\n",
- "With standard deviation: 0.564620\n",
- "\n",
- " Mean performance on test set: 15.629476\n",
- "With standard deviation: 3.865387\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 8.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.295838117599487 seconds ---\n",
- "[[ 45. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 72. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 45. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 333. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 243. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 257.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691515\n",
- "With standard deviation: 0.564620\n",
- "\n",
- " Mean performance on test set: 16.214369\n",
- "With standard deviation: 3.928756\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 9.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.008287668228149 seconds ---\n",
- "[[ 50. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 80. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 50. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 362. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 258. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 268.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691515\n",
- "With standard deviation: 0.564620\n",
- "\n",
- " Mean performance on test set: 16.725744\n",
- "With standard deviation: 3.993095\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when height = 10.0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 5.347799301147461 seconds ---\n",
- "[[ 55. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 88. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 55. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 391. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 273. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 279.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691516\n",
- "With standard deviation: 0.564621\n",
- "\n",
- " Mean performance on test set: 17.186401\n",
- "With standard deviation: 4.056724\n",
- "\n",
- "\n",
- " height RMSE_test std_test RMSE_train std_train k_time\n",
- "-------- ----------- ---------- ------------ ----------- --------\n",
- " 0 15.6859 4.1392 17.6816 0.713183 0.389796\n",
- " 1 7.55046 2.33179 6.27001 0.654734 0.820569\n",
- " 2 9.72847 2.05767 4.45068 0.882129 1.37531\n",
- " 3 11.2961 2.79994 2.27059 0.481516 1.86368\n",
- " 4 12.8083 3.44694 1.07403 0.637823 2.50775\n",
- " 5 14.0179 3.67504 0.700602 0.57264 2.82356\n",
- " 6 14.9184 3.80535 0.691515 0.56462 3.45849\n",
- " 7 15.6295 3.86539 0.691516 0.56462 3.86122\n",
- " 8 16.2144 3.92876 0.691515 0.56462 4.29584\n",
- " 9 16.7257 3.9931 0.691515 0.56462 5.00829\n",
- " 10 17.1864 4.05672 0.691516 0.564621 5.3478\n"
- ]
- }
- ],
- "source": [
- "# wl subtree kernel\n",
- "%load_ext line_profiler\n",
- "\n",
- "import numpy as np\n",
- "import sys\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.utils import kernel_train_test\n",
- "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel, _wl_subtreekernel_do\n",
- "\n",
- "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n",
- "kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n",
- "\n",
- "kernel_para = dict(node_label = 'atom', edge_label = 'bond_type')\n",
- "\n",
- "kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n",
- " hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)\n",
- "\n",
- "# %lprun -f _wl_subtreekernel_do \\\n",
- "# kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n",
- "# hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "ename": "ImportError",
- "evalue": "cannot import name 'NUMPY_MKL'",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m<ipython-input-1-e7b9d5ef03e3>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minsert\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"../\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mpygraph\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mutils\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mkernel_train_test\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpygraph\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkernels\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mweisfeilerLehmanKernel\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mweisfeilerlehmankernel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_wl_subtreekernel_do\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32mE:\\课程及课件\\Doctorant\\py-graph\\pygraph\\utils\\utils.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 183\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 184\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mrandom\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 185\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkernel_ridge\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mKernelRidge\u001b[0m \u001b[1;31m# 0.17\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 186\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmetrics\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmean_squared_error\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0msvm\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32md:\\python\\python36\\lib\\site-packages\\sklearn\\__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 132\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m__check_build\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 134\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mbase\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mclone\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 135\u001b[0m \u001b[0m__check_build\u001b[0m \u001b[1;31m# avoid flakes unused variable error\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 136\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32md:\\python\\python36\\lib\\site-packages\\sklearn\\base.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 11\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0msparse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 12\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mexternals\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0msix\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfixes\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0msignature\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;32md:\\python\\python36\\lib\\site-packages\\scipy\\__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 59\u001b[0m \u001b[0m__all__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'test'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 61\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_distributor_init\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mNUMPY_MKL\u001b[0m \u001b[1;31m# requires numpy+mkl\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 62\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 63\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mshow_config\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mshow_numpy_config\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
- "\u001b[1;31mImportError\u001b[0m: cannot import name 'NUMPY_MKL'"
- ]
- }
- ],
- "source": [
- "# WL sp kernel\n",
- "%load_ext line_profiler\n",
- "\n",
- "import numpy as np\n",
- "import sys\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.utils import kernel_train_test\n",
- "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel, _wl_subtreekernel_do\n",
- "\n",
- "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n",
- "kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n",
- "\n",
- "kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', base_kernel = 'sp')\n",
- "\n",
- "kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n",
- " hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)\n",
- "\n",
- "# %lprun -f _wl_subtreekernel_do \\\n",
- "# kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n",
- "# hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# results\n",
- "\n",
- "# with y normalization\n",
- " height RMSE_test std_test RMSE_train std_train k_time\n",
- "-------- ----------- ---------- ------------ ----------- --------\n",
- " 0 36.2108 7.33179 38.6059 1.57064 0.379475\n",
- " 1 9.00098 6.37145 6.76379 1.96568 0.844898\n",
- " 2 19.8113 4.04911 5.28757 1.81899 1.35308\n",
- " 3 25.0455 4.94276 2.3274 0.805733 1.81136\n",
- " 4 28.2255 6.5212 0.85156 0.423465 2.23098\n",
- " 5 30.6354 6.73647 3.35947 8.17561 2.71575\n",
- " 6 32.1027 6.85601 3.54105 8.71922 3.11459\n",
- " 7 32.9709 6.89606 6.94372 9.94045 3.55571\n",
- " 8 33.5112 6.90753 6.97339 9.76975 3.79657\n",
- " 9 33.8502 6.91427 11.8345 11.6213 4.41555\n",
- " 10 34.0963 6.93115 11.4257 11.2624 4.94888\n",
- "\n",
- "# without y normalization\n",
- " height RMSE_test std_test RMSE_train std_train k_time\n",
- "-------- ----------- ---------- ------------ ----------- --------\n",
- " 0 15.6859 4.1392 17.6816 0.713183 0.360443\n",
- " 1 7.55046 2.33179 6.27001 0.654734 0.837389\n",
- " 2 9.72847 2.05767 4.45068 0.882129 1.25317\n",
- " 3 11.2961 2.79994 2.27059 0.481516 1.79971\n",
- " 4 12.8083 3.44694 1.07403 0.637823 2.35346\n",
- " 5 14.0179 3.67504 0.700602 0.57264 2.78285\n",
- " 6 14.9184 3.80535 0.691515 0.56462 3.20764\n",
- " 7 15.6295 3.86539 0.691516 0.56462 3.71648\n",
- " 8 16.2144 3.92876 0.691515 0.56462 3.99213\n",
- " 9 16.7257 3.9931 0.691515 0.56462 4.26315\n",
- " 10 17.1864 4.05672 0.691516 0.564621 5.00918"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 0 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.3920705318450928 seconds ---\n",
- "[[ 5. 6. 4. ..., 20. 20. 20.]\n",
- " [ 6. 8. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 5. ..., 21. 21. 21.]\n",
- " ..., \n",
- " [ 20. 20. 21. ..., 101. 101. 101.]\n",
- " [ 20. 20. 21. ..., 101. 101. 101.]\n",
- " [ 20. 20. 21. ..., 101. 101. 101.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 17.681582\n",
- "With standard deviation: 0.713183\n",
- "\n",
- " Mean performance on test set: 15.685879\n",
- "With standard deviation: 4.139197\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 1 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.8578901290893555 seconds ---\n",
- "[[ 10. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 16. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 10. ..., 22. 22. 24.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 130. 130. 122.]\n",
- " [ 20. 20. 22. ..., 130. 130. 122.]\n",
- " [ 20. 20. 24. ..., 122. 122. 154.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 6.270014\n",
- "With standard deviation: 0.654734\n",
- "\n",
- " Mean performance on test set: 7.550458\n",
- "With standard deviation: 2.331786\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 2 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.264050006866455 seconds ---\n",
- "[[ 15. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 24. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 15. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 159. 151. 124.]\n",
- " [ 20. 20. 22. ..., 151. 153. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 185.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 4.450682\n",
- "With standard deviation: 0.882129\n",
- "\n",
- " Mean performance on test set: 9.728466\n",
- "With standard deviation: 2.057669\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 3 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.731236219406128 seconds ---\n",
- "[[ 20. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 32. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 20. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 188. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 168. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 202.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 2.270586\n",
- "With standard deviation: 0.481516\n",
- "\n",
- " Mean performance on test set: 11.296110\n",
- "With standard deviation: 2.799944\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 4 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.1112847328186035 seconds ---\n",
- "[[ 25. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 40. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 25. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 217. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 183. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 213.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 1.074035\n",
- "With standard deviation: 0.637823\n",
- "\n",
- " Mean performance on test set: 12.808303\n",
- "With standard deviation: 3.446939\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 5 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.4751319885253906 seconds ---\n",
- "[[ 30. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 48. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 30. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 246. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 198. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 224.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.700602\n",
- "With standard deviation: 0.572640\n",
- "\n",
- " Mean performance on test set: 14.017923\n",
- "With standard deviation: 3.675042\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 6 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.8712213039398193 seconds ---\n",
- "[[ 35. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 56. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 35. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 275. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 213. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 235.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691515\n",
- "With standard deviation: 0.564620\n",
- "\n",
- " Mean performance on test set: 14.918434\n",
- "With standard deviation: 3.805352\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 7 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.554422378540039 seconds ---\n",
- "[[ 40. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 64. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 40. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 304. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 228. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 246.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691516\n",
- "With standard deviation: 0.564620\n",
- "\n",
- " Mean performance on test set: 15.629476\n",
- "With standard deviation: 3.865387\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 8 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.8757314682006836 seconds ---\n",
- "[[ 45. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 72. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 45. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 333. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 243. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 257.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691515\n",
- "With standard deviation: 0.564620\n",
- "\n",
- " Mean performance on test set: 16.214369\n",
- "With standard deviation: 3.928756\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 9 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.205373764038086 seconds ---\n",
- "[[ 50. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 80. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 50. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 362. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 258. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 268.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691515\n",
- "With standard deviation: 0.564620\n",
- "\n",
- " Mean performance on test set: 16.725744\n",
- "With standard deviation: 3.993095\n",
- "\n",
- "\n",
- " #--- calculating kernel matrix when subtree height = 10 ---#\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.737298250198364 seconds ---\n",
- "[[ 55. 10. 4. ..., 20. 20. 20.]\n",
- " [ 10. 88. 4. ..., 20. 20. 20.]\n",
- " [ 4. 4. 55. ..., 22. 22. 26.]\n",
- " ..., \n",
- " [ 20. 20. 22. ..., 391. 159. 124.]\n",
- " [ 20. 20. 22. ..., 159. 273. 124.]\n",
- " [ 20. 20. 26. ..., 124. 124. 279.]]\n",
- "\n",
- " Saving kernel matrix to file...\n",
- "\n",
- " Mean performance on train set: 0.691516\n",
- "With standard deviation: 0.564621\n",
- "\n",
- " Mean performance on test set: 17.186401\n",
- "With standard deviation: 4.056724\n",
- "\n",
- "\n",
- " height RMSE_test std_test RMSE_train std_train k_time\n",
- "-------- ----------- ---------- ------------ ----------- --------\n",
- " 0 15.6859 4.1392 17.6816 0.713183 0.392071\n",
- " 1 7.55046 2.33179 6.27001 0.654734 0.85789\n",
- " 2 9.72847 2.05767 4.45068 0.882129 1.26405\n",
- " 3 11.2961 2.79994 2.27059 0.481516 1.73124\n",
- " 4 12.8083 3.44694 1.07403 0.637823 2.11128\n",
- " 5 14.0179 3.67504 0.700602 0.57264 2.47513\n",
- " 6 14.9184 3.80535 0.691515 0.56462 2.87122\n",
- " 7 15.6295 3.86539 0.691516 0.56462 3.55442\n",
- " 8 16.2144 3.92876 0.691515 0.56462 3.87573\n",
- " 9 16.7257 3.9931 0.691515 0.56462 4.20537\n",
- " 10 17.1864 4.05672 0.691516 0.564621 4.7373\n"
- ]
- }
- ],
- "source": [
- "# test of WL subtree kernel\n",
- "\n",
- "\"\"\"\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " Elisabetta Ghisu, https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import os\n",
- "import pathlib\n",
- "from collections import OrderedDict\n",
- "sys.path.insert(0, \"../\")\n",
- "from tabulate import tabulate\n",
- "\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "from pygraph.utils.utils import split_train_test\n",
- "\n",
- "train_means_list = []\n",
- "train_stds_list = []\n",
- "test_means_list = []\n",
- "test_stds_list = []\n",
- "kernel_time_list = []\n",
- "\n",
- "for height in np.linspace(0, 10, 11):\n",
- " print('\\n\\n #--- calculating kernel matrix when subtree height = %d ---#' % height)\n",
- "\n",
- " print('\\n Loading dataset from file...')\n",
- " dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- " y = np.array(y)\n",
- "# print(y)\n",
- "\n",
- " # setup the parameters\n",
- " model_type = 'regression' # Regression or classification problem\n",
- " print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- "# datasize = len(dataset)\n",
- " trials = 100 # Trials for hyperparameters random search\n",
- " splits = 10 # Number of splits of the data\n",
- " alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- " C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
- "\n",
- "\n",
- " # set the output path\n",
- " kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n",
- " if not os.path.exists(kernel_file_path):\n",
- " os.makedirs(kernel_file_path)\n",
- "\n",
- " \"\"\"\n",
- " - Here starts the main program\n",
- " - First we permute the data, then for each split we evaluate corresponding performances\n",
- " - In the end, the performances are averaged over the test sets\n",
- " \"\"\"\n",
- "\n",
- " # save kernel matrices to files / read kernel matrices from files\n",
- " kernel_file = kernel_file_path + 'km.ds'\n",
- " path = pathlib.Path(kernel_file)\n",
- " # get train set kernel matrix\n",
- " if path.is_file():\n",
- " print('\\n Loading the kernel matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file)# results\n",
- " print(Kmatrix)\n",
- " else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " Kmatrix, run_time = weisfeilerlehmankernel(dataset, node_label = 'atom', height = int(height))\n",
- " kernel_time_list.append(run_time)\n",
- " print(Kmatrix)\n",
- " print('\\n Saving kernel matrix to file...')\n",
- " # np.savetxt(kernel_file, Kmatrix)\n",
- "\n",
- " train_mean, train_std, test_mean, test_std = \\\n",
- " split_train_test(Kmatrix, y, alpha_grid, C_grid, splits, trials, model_type, normalize = False)\n",
- " \n",
- " train_means_list.append(train_mean)\n",
- " train_stds_list.append(train_std)\n",
- " test_means_list.append(test_mean)\n",
- " test_stds_list.append(test_std)\n",
- " \n",
- "print('\\n') \n",
- "table_dict = {'height': np.linspace(0, 10, 11), 'RMSE_test': test_means_list, 'std_test': test_stds_list, \\\n",
- " 'RMSE_train': train_means_list, 'std_train': train_stds_list, 'k_time': kernel_time_list}\n",
- "keyorder = ['height', 'RMSE_test', 'std_test', 'RMSE_train', 'std_train', 'k_time']\n",
- "print(tabulate(OrderedDict(sorted(table_dict.items(), key = lambda i:keyorder.index(i[0]))), headers='keys'))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'O', 'C'}\n",
- "{'O', 'C'}\n",
- "--- shortest path kernel built in 0.0002582073211669922 seconds ---\n",
- "3\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773eab40b8>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': 'C'}), (1, {'label': 'C'}), (2, {'label': 'C'}), (3, {'label': 'C'}), (4, {'label': 'O'})]\n",
- " -> \n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773ca1cc88>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': 'CC'}), (1, {'label': 'CC'}), (2, {'label': 'CO'}), (3, {'label': 'CCCO'}), (4, {'label': 'OCC'})]\n",
- " -> \n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773c9a44e0>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': '0'}), (1, {'label': '0'}), (2, {'label': '3'}), (3, {'label': '1'}), (4, {'label': '2'})]\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773c9957b8>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': 'C'}), (1, {'label': 'C'}), (2, {'label': 'C'}), (3, {'label': 'C'}), (4, {'label': 'C'}), (5, {'label': 'C'}), (6, {'label': 'O'})]\n",
- " -> \n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f7788e0e390>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': 'CC'}), (1, {'label': 'CC'}), (2, {'label': 'CC'}), (3, {'label': 'CO'}), (4, {'label': 'CCCC'}), (5, {'label': 'CCCO'}), (6, {'label': 'OCC'})]\n",
- " -> \n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "<matplotlib.figure.Figure at 0x7f773c95a5f8>"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[(0, {'label': '0'}), (1, {'label': '0'}), (2, {'label': '0'}), (3, {'label': '3'}), (4, {'label': '4'}), (5, {'label': '1'}), (6, {'label': '2'})]\n",
- "--- shortest path kernel built in 0.00026607513427734375 seconds ---\n",
- "6\n"
- ]
- }
- ],
- "source": [
- "import sys\n",
- "import networkx as nx\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "from pygraph.kernels.spkernel import spkernel\n",
- "\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "\n",
- "def weisfeilerlehman_test(G):\n",
- " '''\n",
- " Weisfeiler-Lehman test of graph isomorphism.\n",
- " '''\n",
- "\n",
- " nx.draw_networkx(G)\n",
- " plt.show()\n",
- " nx.draw_networkx_labels(G, nx.spring_layout(G), labels = nx.get_node_attributes(G,'label'))\n",
- " print(G.nodes(data = True))\n",
- " \n",
- " set_multisets = []\n",
- " for node in G.nodes(data = True):\n",
- " # Multiset-label determination.\n",
- " multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]\n",
- " # sorting each multiset\n",
- " multiset.sort()\n",
- " multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix \n",
- " set_multisets.append(multiset)\n",
- " \n",
- " # label compression\n",
- "# set_multisets.sort() # this is unnecessary\n",
- " set_unique = list(set(set_multisets)) # set of unique multiset labels\n",
- " set_compressed = { value : str(set_unique.index(value)) for value in set_unique } # assign indices as the new labels\n",
- "# print(set_compressed)\n",
- "# print(set_multisets)\n",
- " \n",
- " # relabel nodes with multisets\n",
- " for node in G.nodes(data = True):\n",
- " node[1]['label'] = set_multisets[node[0]]\n",
- " print(' -> ')\n",
- " nx.draw_networkx(G)\n",
- " plt.show()\n",
- " print(G.nodes(data = True))\n",
- "\n",
- " \n",
- " # relabel nodes\n",
- " for node in G.nodes(data = True):\n",
- " node[1]['label'] = set_compressed[set_multisets[node[0]]]\n",
- " \n",
- " print(' -> ')\n",
- " nx.draw_networkx(G)\n",
- " plt.show()\n",
- " print(G.nodes(data = True))\n",
- "\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "G1 = dataset[12]\n",
- "G2 = dataset[55]\n",
- "\n",
- "# init.\n",
- "kernel = 0 # init kernel\n",
- "num_nodes1 = G1.number_of_nodes()\n",
- "num_nodes2 = G2.number_of_nodes()\n",
- "\n",
- "# the first iteration.\n",
- "labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
- "labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
- "print(labelset1)\n",
- "print(labelset2)\n",
- "kernel += spkernel(G1, G2)\n",
- "print(kernel)\n",
- "\n",
- "\n",
- "\n",
- "for height in range(0, min(num_nodes1, num_nodes2)): #Q how to determine the upper bound of the height?\n",
- " if labelset1 != labelset2:\n",
- " break\n",
- " \n",
- " # Weisfeiler-Lehman test of graph isomorphism.\n",
- " weisfeilerlehman_test(G1)\n",
- " weisfeilerlehman_test(G2)\n",
- " \n",
- " # calculate kernel\n",
- " kernel += spkernel(G1, G2)\n",
- " \n",
- " # get label sets of both graphs\n",
- " labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
- " labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
- "# print(labelset1)\n",
- "# print(labelset2)\n",
- "\n",
- "print(kernel)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{0: 'C', 1: 'C', 2: 'C', 3: 'C', 4: 'C', 5: 'O', 6: 'O'}\n",
- "{0: 'C', 1: 'C', 2: 'C', 3: 'C', 4: 'C', 5: 'C', 6: 'S', 7: 'S'}\n",
- "\n",
- " --- height = 0 --- \n",
- "\n",
- " --- for graph 0 --- \n",
- "\n",
- "labels_ori: ['C', 'C', 'C', 'C', 'C', 'O', 'O']\n",
- "all_labels_ori: {'C', 'O'}\n",
- "num_of_each_label: {'C': 5, 'O': 2}\n",
- "all_num_of_each_label: [{'C': 5, 'O': 2}]\n",
- "num_of_labels: 2\n",
- "all_labels_ori: {'C', 'O'}\n",
- "\n",
- " --- for graph 1 --- \n",
- "\n",
- "labels_ori: ['C', 'C', 'C', 'C', 'C', 'C', 'S', 'S']\n",
- "all_labels_ori: {'C', 'O', 'S'}\n",
- "num_of_each_label: {'C': 6, 'S': 2}\n",
- "all_num_of_each_label: [{'C': 5, 'O': 2}, {'C': 6, 'S': 2}]\n",
- "num_of_labels: 2\n",
- "all_labels_ori: {'C', 'O', 'S'}\n",
- "\n",
- " all_num_of_labels_occured: 3\n",
- "\n",
- " --- calculating kernel matrix ---\n",
- "\n",
- " labels: {'C', 'O'}\n",
- "vector1: [[5 2]]\n",
- "vector2: [[5 2]]\n",
- "Kmatrix: [[ 29. 0.]\n",
- " [ 0. 0.]]\n",
- "\n",
- " labels: {'C', 'O', 'S'}\n",
- "vector1: [[5 2 0]]\n",
- "vector2: [[6 0 2]]\n",
- "Kmatrix: [[ 29. 30.]\n",
- " [ 30. 0.]]\n",
- "\n",
- " labels: {'C', 'S'}\n",
- "vector1: [[6 2]]\n",
- "vector2: [[6 2]]\n",
- "Kmatrix: [[ 29. 30.]\n",
- " [ 30. 40.]]\n",
- "\n",
- " --- height = 1 --- \n",
- "\n",
- " --- for graph 0 --- \n",
- "\n",
- "multiset: ['CC', 'CC', 'CCO', 'CCO', 'COO', 'OCC', 'OCC']\n",
- "set_unique: ['OCC', 'COO', 'CCO', 'CC']\n",
- "set_compressed: {'OCC': '4', 'COO': '5', 'CCO': '6', 'CC': '7'}\n",
- "all_set_compressed: {'OCC': '4', 'COO': '5', 'CCO': '6', 'CC': '7'}\n",
- "num_of_labels_occured: 7\n",
- "\n",
- " compressed labels: {0: '7', 1: '7', 2: '6', 3: '6', 4: '5', 5: '4', 6: '4'}\n",
- "labels_comp: ['7', '7', '6', '6', '5', '4', '4']\n",
- "all_labels_ori: {'5', '4', '6', '7'}\n",
- "num_of_each_label: {'5': 1, '4': 2, '6': 2, '7': 2}\n",
- "all_num_of_each_label: [{'5': 1, '4': 2, '6': 2, '7': 2}]\n",
- "\n",
- " --- for graph 1 --- \n",
- "\n",
- "multiset: ['CC', 'CC', 'CC', 'CCS', 'CCS', 'CCSS', 'SCC', 'SCC']\n",
- "set_unique: ['SCC', 'CC', 'CCS', 'CCSS']\n",
- "set_compressed: {'SCC': '8', 'CC': '7', 'CCS': '9', 'CCSS': '10'}\n",
- "all_set_compressed: {'SCC': '8', 'COO': '5', 'CCS': '9', 'OCC': '4', 'CCO': '6', 'CCSS': '10', 'CC': '7'}\n",
- "num_of_labels_occured: 10\n",
- "\n",
- " compressed labels: {0: '7', 1: '7', 2: '7', 3: '9', 4: '9', 5: '10', 6: '8', 7: '8'}\n",
- "labels_comp: ['7', '7', '7', '9', '9', '10', '8', '8']\n",
- "all_labels_ori: {'10', '4', '7', '9', '6', '5', '8'}\n",
- "num_of_each_label: {'10': 1, '9': 2, '7': 3, '8': 2}\n",
- "all_num_of_each_label: [{'5': 1, '4': 2, '6': 2, '7': 2}, {'10': 1, '9': 2, '7': 3, '8': 2}]\n",
- "\n",
- " all_num_of_labels_occured: 10\n",
- "\n",
- " --- calculating kernel matrix ---\n",
- "\n",
- " labels: {'5', '4', '6', '7'}\n",
- "vector1: [[1 2 2 2]]\n",
- "vector2: [[1 2 2 2]]\n",
- "\n",
- " labels: {'10', '4', '7', '9', '6', '5', '8'}\n",
- "vector1: [[0 2 2 0 2 1 0]]\n",
- "vector2: [[1 0 3 2 0 0 2]]\n",
- "\n",
- " labels: {'8', '10', '7', '9'}\n",
- "vector1: [[2 1 3 2]]\n",
- "vector2: [[2 1 3 2]]\n",
- "\n",
- " Kmatrix: [[ 42. 36.]\n",
- " [ 36. 58.]]\n",
- "\n",
- " --- height = 2 --- \n",
- "\n",
- " --- for graph 0 --- \n",
- "\n",
- "multiset: ['76', '76', '647', '647', '544', '456', '456']\n",
- "set_unique: ['647', '76', '456', '544']\n",
- "set_compressed: {'647': '11', '76': '12', '544': '14', '456': '13'}\n",
- "all_set_compressed: {'647': '11', '76': '12', '456': '13', '544': '14'}\n",
- "num_of_labels_occured: 14\n",
- "\n",
- " compressed labels: {0: '12', 1: '12', 2: '11', 3: '11', 4: '14', 5: '13', 6: '13'}\n",
- "labels_comp: ['12', '12', '11', '11', '14', '13', '13']\n",
- "all_labels_ori: {'14', '12', '11', '13'}\n",
- "num_of_each_label: {'14': 1, '13': 2, '12': 2, '11': 2}\n",
- "all_num_of_each_label: [{'14': 1, '13': 2, '12': 2, '11': 2}]\n",
- "\n",
- " --- for graph 1 --- \n",
- "\n",
- "multiset: ['79', '79', '710', '978', '978', '10788', '8109', '8109']\n",
- "set_unique: ['710', '8109', '79', '10788', '978']\n",
- "set_compressed: {'710': '15', '79': '17', '8109': '16', '978': '19', '10788': '18'}\n",
- "all_set_compressed: {'710': '15', '79': '17', '978': '19', '10788': '18', '8109': '16', '456': '13', '544': '14', '647': '11', '76': '12'}\n",
- "num_of_labels_occured: 19\n",
- "\n",
- " compressed labels: {0: '17', 1: '17', 2: '15', 3: '19', 4: '19', 5: '18', 6: '16', 7: '16'}\n",
- "labels_comp: ['17', '17', '15', '19', '19', '18', '16', '16']\n",
- "all_labels_ori: {'18', '19', '12', '13', '17', '11', '14', '16', '15'}\n",
- "num_of_each_label: {'15': 1, '17': 2, '19': 2, '16': 2, '18': 1}\n",
- "all_num_of_each_label: [{'14': 1, '13': 2, '12': 2, '11': 2}, {'15': 1, '17': 2, '19': 2, '16': 2, '18': 1}]\n",
- "\n",
- " all_num_of_labels_occured: 19\n",
- "\n",
- " --- calculating kernel matrix ---\n",
- "\n",
- " labels: {'14', '12', '11', '13'}\n",
- "vector1: [[1 2 2 2]]\n",
- "vector2: [[1 2 2 2]]\n",
- "\n",
- " labels: {'18', '19', '12', '13', '17', '11', '14', '16', '15'}\n",
- "vector1: [[0 0 2 2 0 2 1 0 0]]\n",
- "vector2: [[1 2 0 0 2 0 0 2 1]]\n",
- "\n",
- " labels: {'18', '17', '15', '16', '19'}\n",
- "vector1: [[1 2 1 2 2]]\n",
- "vector2: [[1 2 1 2 2]]\n",
- "\n",
- " Kmatrix: [[ 55. 36.]\n",
- " [ 36. 72.]]\n",
- "\n",
- " --- Weisfeiler-Lehman subtree kernel built in 0.0034377574920654297 seconds ---\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "array([[ 55., 36.],\n",
- " [ 36., 72.]])"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# test of WL subtree kernel on many graphs\n",
- "\n",
- "import sys\n",
- "import pathlib\n",
- "from collections import Counter\n",
- "sys.path.insert(0, \"../\")\n",
- "\n",
- "import networkx as nx\n",
- "import numpy as np\n",
- "import time\n",
- "\n",
- "from pygraph.kernels.spkernel import spkernel\n",
- "from pygraph.kernels.pathKernel import pathkernel\n",
- "\n",
- "def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'):\n",
- " \"\"\"Calculate Weisfeiler-Lehman kernels between graphs.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " Gn : List of NetworkX graph\n",
- " List of graphs between which the kernels are calculated.\n",
- " /\n",
- " G1, G2 : NetworkX graphs\n",
- " 2 graphs between which the kernel is calculated.\n",
- " \n",
- " height : subtree height\n",
- " \n",
- " base_kernel : base kernel used in each iteration of WL kernel\n",
- " the default base kernel is subtree kernel\n",
- " \n",
- " Return\n",
- " ------\n",
- " Kmatrix/Kernel : Numpy matrix/int\n",
- " Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. / Weisfeiler-Lehman Kernel between 2 graphs.\n",
- " \n",
- " Notes\n",
- " -----\n",
- " This function now supports WL subtree kernel and WL shortest path kernel.\n",
- " \n",
- " References\n",
- " ----------\n",
- " [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.\n",
- " \"\"\"\n",
- " if len(args) == 1: # for a list of graphs\n",
- "\n",
- "# print(args)\n",
- " start_time = time.time()\n",
- " \n",
- " # for WL subtree kernel\n",
- " if base_kernel == 'subtree': \n",
- " Kmatrix = _wl_subtreekernel_do(args[0], height = height, base_kernel = 'subtree')\n",
- " \n",
- " # for WL edge kernel\n",
- " elif base_kernel == 'edge':\n",
- " print('edge')\n",
- " \n",
- " # for WL shortest path kernel\n",
- " elif base_kernel == 'sp':\n",
- " Gn = args[0]\n",
- " Kmatrix = np.zeros((len(Gn), len(Gn)))\n",
- " \n",
- " for i in range(0, len(Gn)):\n",
- " for j in range(i, len(Gn)):\n",
- " Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j])\n",
- " Kmatrix[j][i] = Kmatrix[i][j]\n",
- "\n",
- " print(\"\\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---\" % (base_kernel, len(args[0]), (time.time() - start_time)))\n",
- " \n",
- " return Kmatrix\n",
- " \n",
- " else: # for only 2 graphs\n",
- " \n",
- " start_time = time.time()\n",
- " \n",
- " # for WL subtree kernel\n",
- " if base_kernel == 'subtree':\n",
- " \n",
- " args = [args[0], args[1]]\n",
- "# print(args)\n",
- " kernel = _wl_subtreekernel_do(args, height = height, base_kernel = 'subtree')\n",
- " \n",
- " # for WL edge kernel\n",
- " elif base_kernel == 'edge':\n",
- " print('edge')\n",
- " \n",
- " # for WL shortest path kernel\n",
- " elif base_kernel == 'sp':\n",
- " \n",
- "\n",
- " kernel = _pathkernel_do(args[0], args[1])\n",
- "\n",
- " print(\"\\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---\" % (base_kernel, time.time() - start_time))\n",
- " \n",
- " return kernel\n",
- " \n",
- " \n",
- "def _weisfeilerlehmankernel_do(G1, G2):\n",
- " \"\"\"Calculate Weisfeiler-Lehman kernels between 2 graphs. This kernel use shortest path kernel to calculate kernel between two graphs in each iteration.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " G1, G2 : NetworkX graphs\n",
- " 2 graphs between which the kernel is calculated.\n",
- " \n",
- " Return\n",
- " ------\n",
- " Kernel : int\n",
- " Weisfeiler-Lehman Kernel between 2 graphs.\n",
- " \"\"\"\n",
- " \n",
- " # init.\n",
- " kernel = 0 # init kernel\n",
- " num_nodes1 = G1.number_of_nodes()\n",
- " num_nodes2 = G2.number_of_nodes()\n",
- " height = 12 #min(num_nodes1, num_nodes2)) #Q how to determine the upper bound of the height?\n",
- " \n",
- " # the first iteration.\n",
- " labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
- " labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
- " kernel += pathkernel(G1, G2) # change your base kernel here (and one more below)\n",
- " \n",
- " for h in range(0, height):\n",
- "# if labelset1 != labelset2:\n",
- "# break\n",
- "\n",
- " # Weisfeiler-Lehman test of graph isomorphism.\n",
- " relabel(G1)\n",
- " relabel(G2)\n",
- "\n",
- " # calculate kernel\n",
- " kernel += pathkernel(G1, G2) # change your base kernel here (and one more before)\n",
- "\n",
- " # get label sets of both graphs\n",
- " labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
- " labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
- " \n",
- " return kernel\n",
- "\n",
- "\n",
- "def relabel(G):\n",
- " '''\n",
- " Relabel nodes in graph G in one iteration of the 1-dim. WL test of graph isomorphism.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " G : NetworkX graph\n",
- " The graphs whose nodes are relabeled.\n",
- " '''\n",
- " \n",
- " # get the set of original labels\n",
- " labels_ori = list(nx.get_node_attributes(G, 'label').values())\n",
- " print(labels_ori)\n",
- " num_of_each_label = dict(Counter(labels_ori))\n",
- " print(num_of_each_label)\n",
- " num_of_labels = len(num_of_each_label)\n",
- " print(num_of_labels)\n",
- " \n",
- " set_multisets = []\n",
- " for node in G.nodes(data = True):\n",
- " # Multiset-label determination.\n",
- " multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]\n",
- " # sorting each multiset\n",
- " multiset.sort()\n",
- " multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix \n",
- " set_multisets.append(multiset)\n",
- " print(set_multisets)\n",
- " \n",
- " # label compression\n",
- "# set_multisets.sort() # this is unnecessary\n",
- " set_unique = list(set(set_multisets)) # set of unique multiset labels\n",
- " print(set_unique)\n",
- " set_compressed = { value : str(set_unique.index(value) + num_of_labels + 1) for value in set_unique } # assign new labels\n",
- " print(set_compressed)\n",
- " \n",
- " # relabel nodes\n",
- "# nx.relabel_nodes(G, set_compressed, copy = False)\n",
- " for node in G.nodes(data = True):\n",
- " node[1]['label'] = set_compressed[set_multisets[node[0]]]\n",
- " print(nx.get_node_attributes(G, 'label'))\n",
- "\n",
- " # get the set of compressed labels\n",
- " labels_comp = list(nx.get_node_attributes(G, 'label').values())\n",
- " print(labels_comp)\n",
- " num_of_each_label.update(dict(Counter(labels_comp)))\n",
- " print(num_of_each_label)\n",
- " \n",
- " \n",
- "def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'):\n",
- " \"\"\"Calculate Weisfeiler-Lehman subtree kernels between graphs.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " Gn : List of NetworkX graph\n",
- " List of graphs between which the kernels are calculated.\n",
- " \n",
- " Return\n",
- " ------\n",
- " Kmatrix/Kernel : Numpy matrix/int\n",
- " Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs.\n",
- " \"\"\"\n",
- " \n",
- "# print(args)\n",
- " Gn = args[0]\n",
- "# print(Gn)\n",
- "\n",
- " Kmatrix = np.zeros((len(Gn), len(Gn)))\n",
- " all_num_of_labels_occured = 0 # number of the set of letters that occur before as node labels at least once in all graphs\n",
- " \n",
- " # initial for height = 0\n",
- " print('\\n --- height = 0 --- ')\n",
- " all_labels_ori = set() # all unique orignal labels in all graphs in this iteration\n",
- " all_num_of_each_label = [] # number of occurence of each label in each graph in this iteration\n",
- " all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration\n",
- " num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs\n",
- "\n",
- " # for each graph\n",
- " for idx, G in enumerate(Gn):\n",
- " # get the set of original labels\n",
- " print('\\n --- for graph %d --- \\n' % (idx))\n",
- " labels_ori = list(nx.get_node_attributes(G, 'label').values())\n",
- " print('labels_ori: %s' % (labels_ori))\n",
- " all_labels_ori.update(labels_ori)\n",
- " print('all_labels_ori: %s' % (all_labels_ori))\n",
- " num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph\n",
- " print('num_of_each_label: %s' % (num_of_each_label))\n",
- " all_num_of_each_label.append(num_of_each_label)\n",
- " print('all_num_of_each_label: %s' % (all_num_of_each_label))\n",
- " num_of_labels = len(num_of_each_label) # number of all unique labels\n",
- " print('num_of_labels: %s' % (num_of_labels))\n",
- " \n",
- "\n",
- " all_labels_ori.update(labels_ori)\n",
- " print('all_labels_ori: %s' % (all_labels_ori))\n",
- " \n",
- " all_num_of_labels_occured += len(all_labels_ori)\n",
- " print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n",
- " \n",
- " # calculate subtree kernel with the 0th iteration and add it to the final kernel\n",
- " print('\\n --- calculating kernel matrix ---')\n",
- " for i in range(0, len(Gn)):\n",
- " for j in range(i, len(Gn)):\n",
- " labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n",
- " print('\\n labels: %s' % (labels))\n",
- " vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n",
- " vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n",
- " print('vector1: %s' % (vector1))\n",
- " print('vector2: %s' % (vector2))\n",
- " Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n",
- " Kmatrix[j][i] = Kmatrix[i][j]\n",
- " print('Kmatrix: %s' % (Kmatrix))\n",
- "\n",
- " \n",
- " # iterate each height\n",
- " for h in range(1, height + 1):\n",
- " print('\\n --- height = %d --- ' % (h))\n",
- " all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration\n",
- " num_of_labels_occured = all_num_of_labels_occured # number of the set of letters that occur before as node labels at least once in all graphs\n",
- " all_labels_ori = set()\n",
- " all_num_of_each_label = []\n",
- " \n",
- " # for each graph\n",
- " for idx, G in enumerate(Gn):\n",
- "# # get the set of original labels\n",
- " print('\\n --- for graph %d --- \\n' % (idx))\n",
- "# labels_ori = list(nx.get_node_attributes(G, 'label').values())\n",
- "# print('labels_ori: %s' % (labels_ori))\n",
- "# num_of_each_label = dict(Counter(labels_ori)) # number of occurence of each label in graph\n",
- "# print('num_of_each_label: %s' % (num_of_each_label))\n",
- "# num_of_labels = len(num_of_each_label) # number of all unique labels\n",
- "# print('num_of_labels: %s' % (num_of_labels))\n",
- " \n",
- "# all_labels_ori.update(labels_ori)\n",
- "# print('all_labels_ori: %s' % (all_labels_ori))\n",
- "# # num_of_labels_occured += num_of_labels #@todo not precise\n",
- "# num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)\n",
- "# print('num_of_labels_occured: %s' % (num_of_labels_occured))\n",
- " \n",
- " set_multisets = []\n",
- " for node in G.nodes(data = True):\n",
- " # Multiset-label determination.\n",
- " multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]\n",
- " # sorting each multiset\n",
- " multiset.sort()\n",
- " multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix \n",
- " set_multisets.append(multiset)\n",
- " print('multiset: %s' % (set_multisets))\n",
- "\n",
- " # label compression\n",
- " # set_multisets.sort() # this is unnecessary\n",
- " set_unique = list(set(set_multisets)) # set of unique multiset labels\n",
- " print('set_unique: %s' % (set_unique))\n",
- " # a dictionary mapping original labels to new ones. \n",
- " set_compressed = {}\n",
- " # if a label occured before, assign its former compressed label, else assign the number of labels occured + 1 as the compressed label \n",
- " for value in set_unique:\n",
- " if value in all_set_compressed.keys():\n",
- " set_compressed.update({ value : all_set_compressed[value] })\n",
- " else:\n",
- " set_compressed.update({ value : str(num_of_labels_occured + 1) })\n",
- " num_of_labels_occured += 1\n",
- "# set_compressed = { value : (all_set_compressed[value] if value in all_set_compressed.keys() else str(set_unique.index(value) + num_of_labels_occured + 1)) for value in set_unique }\n",
- " print('set_compressed: %s' % (set_compressed))\n",
- " \n",
- " all_set_compressed.update(set_compressed)\n",
- " print('all_set_compressed: %s' % (all_set_compressed))\n",
- "# num_of_labels_occured += len(set_compressed) #@todo not precise\n",
- " print('num_of_labels_occured: %s' % (num_of_labels_occured))\n",
- " \n",
- " # relabel nodes\n",
- " # nx.relabel_nodes(G, set_compressed, copy = False)\n",
- " for node in G.nodes(data = True):\n",
- " node[1]['label'] = set_compressed[set_multisets[node[0]]]\n",
- " print('\\n compressed labels: %s' % (nx.get_node_attributes(G, 'label')))\n",
- "\n",
- " # get the set of compressed labels\n",
- " labels_comp = list(nx.get_node_attributes(G, 'label').values())\n",
- " print('labels_comp: %s' % (labels_comp))\n",
- " all_labels_ori.update(labels_comp)\n",
- " print('all_labels_ori: %s' % (all_labels_ori))\n",
- " num_of_each_label = dict(Counter(labels_comp))\n",
- " print('num_of_each_label: %s' % (num_of_each_label))\n",
- " all_num_of_each_label.append(num_of_each_label)\n",
- " print('all_num_of_each_label: %s' % (all_num_of_each_label))\n",
- " \n",
- " all_num_of_labels_occured += len(all_labels_ori)\n",
- " print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n",
- " \n",
- " # calculate subtree kernel with h iterations and add it to the final kernel\n",
- " print('\\n --- calculating kernel matrix ---')\n",
- " for i in range(0, len(Gn)):\n",
- " for j in range(i, len(Gn)):\n",
- " labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n",
- " print('\\n labels: %s' % (labels))\n",
- " vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n",
- " vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n",
- " print('vector1: %s' % (vector1))\n",
- " print('vector2: %s' % (vector2))\n",
- " Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n",
- " Kmatrix[j][i] = Kmatrix[i][j]\n",
- " \n",
- " print('\\n Kmatrix: %s' % (Kmatrix))\n",
- "\n",
- " return Kmatrix\n",
- "\n",
- " \n",
- "# main\n",
- "import sys\n",
- "from collections import Counter\n",
- "import networkx as nx\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "from pygraph.kernels.spkernel import spkernel\n",
- "\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "G1 = dataset[15]\n",
- "print(nx.get_node_attributes(G1, 'label'))\n",
- "G2 = dataset[80]\n",
- "print(nx.get_node_attributes(G2, 'label'))\n",
- "\n",
- "weisfeilerlehmankernel(G1, G2, height = 2)\n",
- "# Kmatrix = weisfeilerlehmankernel(G1, G2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "185"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "\n",
- "len(dataset)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " --- calculating kernel matrix when subtree height = 0 ---\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n"
- ]
- },
- {
- "ename": "KeyboardInterrupt",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-1-2ce8cff340bc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n Calculating kernel matrix, this could take a while...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0mKmatrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mweisfeilerlehmankernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase_kernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'sp'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mKmatrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n Saving kernel matrix to file...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py\u001b[0m in \u001b[0;36mweisfeilerlehmankernel\u001b[0;34m(height, base_kernel, *args)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_weisfeilerlehmankernel_do\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGn\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mheight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/weisfeilerLehmanKernel.py\u001b[0m in \u001b[0;36m_weisfeilerlehmankernel_do\u001b[0;34m(G1, G2, height)\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0;31m# calculate kernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 243\u001b[0;31m \u001b[0mkernel\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mspkernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mG2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# change your base kernel here (and one more before)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 244\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[0;31m# get label sets of both graphs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spkernel.py\u001b[0m in \u001b[0;36mspkernel\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0me1\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mG1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medges\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0me2\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mG2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medges\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 64\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'cost'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0me2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 65\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "# test of WL subtree kernel\n",
- "\n",
- "\"\"\"\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import os\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../\")\n",
- "from tabulate import tabulate\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "\n",
- "val_means_height = []\n",
- "val_stds_height = []\n",
- "test_means_height = []\n",
- "test_stds_height = []\n",
- "\n",
- "\n",
- "for height in np.linspace(0, 10, 11):\n",
- " print('\\n --- calculating kernel matrix when subtree height = %d ---' % height)\n",
- "\n",
- " print('\\n Loading dataset from file...')\n",
- " dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- " y = np.array(y)\n",
- " print(y)\n",
- "\n",
- " # setup the parameters\n",
- " model_type = 'regression' # Regression or classification problem\n",
- " print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- " datasize = len(dataset)\n",
- " trials = 100 # Trials for hyperparameters random search\n",
- " splits = 10 # Number of splits of the data\n",
- " alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- " C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
- " random.seed(20) # Set the seed for uniform parameter distribution\n",
- "\n",
- " # set the output path\n",
- " kernel_file_path = 'kernelmatrices_weisfeilerlehman_acyclic/'\n",
- " if not os.path.exists(kernel_file_path):\n",
- " os.makedirs(kernel_file_path)\n",
- "\n",
- "\n",
- " \"\"\"\n",
- " - Here starts the main program\n",
- " - First we permute the data, then for each split we evaluate corresponding performances\n",
- " - In the end, the performances are averaged over the test sets\n",
- " \"\"\"\n",
- "\n",
- " # save kernel matrices to files / read kernel matrices from files\n",
- " kernel_file = kernel_file_path + 'km.ds'\n",
- " path = pathlib.Path(kernel_file)\n",
- " # get train set kernel matrix\n",
- " if path.is_file():\n",
- " print('\\n Loading the kernel matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file)\n",
- " print(Kmatrix)\n",
- " else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " Kmatrix = weisfeilerlehmankernel(dataset, node_label = 'atom', height = int(height), base_kernel = 'sp')\n",
- " print(Kmatrix)\n",
- " print('\\n Saving kernel matrix to file...')\n",
- "# np.savetxt(kernel_file, Kmatrix)\n",
- "\n",
- " # Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- " val_split = []\n",
- " test_split = []\n",
- "\n",
- " # For each split of the data\n",
- " for j in range(10, 10 + splits):\n",
- " # print('\\n Starting split %d...' % j)\n",
- "\n",
- " # Set the random set for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- " # print(idx_perm)\n",
- "\n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- " # print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- " # print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- "\n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- "\n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " if model_type == 'regression':\n",
- " # print('\\n Normalizing output y...')\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- " # print(y)\n",
- "\n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- "\n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_train = []\n",
- " perf_all_test = []\n",
- "\n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- " # print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- " # KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the validation and test set\n",
- " y_pred = KR.predict(Kmatrix_val)\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- " # print(y_pred)\n",
- "\n",
- " # adjust prediction: needed because the training targets have been normalizaed\n",
- " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
- " # print(y_pred)\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- " # print(y_pred_test)\n",
- "\n",
- " # root mean squared error on validation\n",
- " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
- " perf_all_val.append(rmse)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- " # print('The performance on the validation set is: %3f' % rmse)\n",
- " # print('The performance on the test set is: %3f' % rmse_test)\n",
- "\n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
- " # get optimal parameter on validation (argmin mean squared error)\n",
- " min_idx = np.argmin(perf_all_test)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # performance corresponding to optimal parameter on val\n",
- " perf_val_opt = perf_all_val[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- " # print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- " # print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
- " # print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- "\n",
- " # append the best performance on validation\n",
- " # at the current split\n",
- " val_split.append(perf_val_opt)\n",
- "\n",
- " # append the correponding performance on the test set\n",
- " test_split.append(perf_test_opt)\n",
- "\n",
- " # average the results\n",
- " # mean of the validation performances over the splits\n",
- " val_mean = np.mean(np.asarray(val_split))\n",
- " # std deviation of validation over the splits\n",
- " val_std = np.std(np.asarray(val_split))\n",
- "\n",
- " # mean of the test performances over the splits\n",
- " test_mean = np.mean(np.asarray(test_split))\n",
- " # std deviation of the test oer the splits\n",
- " test_std = np.std(np.asarray(test_split))\n",
- "\n",
- " print('\\n Mean performance on val set: %3f' % val_mean)\n",
- " print('With standard deviation: %3f' % val_std)\n",
- " print('\\n Mean performance on test set: %3f' % test_mean)\n",
- " print('With standard deviation: %3f' % test_std)\n",
- " \n",
- " val_means_height.append(val_mean)\n",
- " val_stds_height.append(val_std)\n",
- " test_means_height.append(test_mean)\n",
- " test_stds_height.append(test_std)\n",
- " \n",
- "print('\\n') \n",
- "print(tabulate({'height': np.linspace(1, 12, 11), 'RMSE': test_means_height, 'std': test_stds_height}, headers='keys'))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{0: 'C', 1: 'C', 2: 'C', 3: 'C', 4: 'C', 5: 'O', 6: 'O'}"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# a = [0, 1, 3, 2]\n",
- "# b = [3, 2, 1, 0]\n",
- "# print(1 if a == b else 0)\n",
- "\n",
- "# max(1 ,2)\n",
- "\n",
- "# x = [ 'r', 'a', 's' ]\n",
- "# x.sort()\n",
- "# print(x)\n",
- "\n",
- "# def test1(*args, base = 'subtree'):\n",
- "# if base == 'subtree':\n",
- "# print('subtree')\n",
- "# elif base == 'edge':\n",
- "# print('edge')\n",
- "# else:\n",
- "# print('sp')\n",
- "\n",
- "# # function parameter usage test\n",
- "# test1('hello', 'hi', base = 'edge')\n",
- "\n",
- "# # python matrix calculation speed test\n",
- "# import numpy as np\n",
- "# import time\n",
- "\n",
- "# size = 100\n",
- "# m1 = np.random.random((size, size))\n",
- "# m2 = np.random.random((size, size))\n",
- "# itr = 1\n",
- "\n",
- "# start_time = time.time()\n",
- "# for i in range(itr):\n",
- "# np.dot(m1, m2)\n",
- "# print(time.time() - start_time)\n",
- "\n",
- "# start_time = time.time()\n",
- "# for j in range(itr):\n",
- "# result = np.zeros((size, size))\n",
- "# for i1 in range(size):\n",
- "# for i2 in range(size):\n",
- "# for i3 in range(size):\n",
- "# result[i1][i2] += m1[i1][i3] * m2[i3][i2]\n",
- "# print(time.time() - start_time)\n",
- "\n",
- "# start_time = time.time()\n",
- "# for i in range(itr):\n",
- "# print(np.dot(m1, m2))\n",
- "# print(time.time() - start_time)\n",
- "\n",
- "# start_time = time.time()\n",
- "# for j in range(itr):\n",
- "# result = np.zeros((size, size))\n",
- "# for i1 in range(size):\n",
- "# for i2 in range(size):\n",
- "# for i3 in range(size):\n",
- "# result[i1][i2] += m1[i1][i3] * m2[i3][i2]\n",
- "# print(result)\n",
- "# print(time.time() - start_time)\n",
- "\n",
- "# help(np.sum)\n",
- "\n",
- "# test dict\n",
- "import sys\n",
- "from collections import Counter\n",
- "import networkx as nx\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "from pygraph.kernels.spkernel import spkernel\n",
- "\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "G1 = dataset[15]\n",
- "nx.get_node_attributes(G1, 'label')\n",
- "listhqhq = list(nx.get_node_attributes(G1, 'label').values())\n",
- "dicthaha = dict(Counter(listhqhq))\n",
- "len(dicthaha)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|