{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " --- This is a regression problem ---\n", "\n", "\n", " #--- calculating kernel matrix when height = 0.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.3646550178527832 seconds ---\n", "[[ 5. 6. 4. ... 20. 20. 20.]\n", " [ 6. 8. 4. ... 20. 20. 20.]\n", " [ 4. 4. 5. ... 21. 21. 21.]\n", " ...\n", " [ 20. 20. 21. ... 101. 101. 101.]\n", " [ 20. 20. 21. ... 101. 101. 101.]\n", " [ 20. 20. 21. ... 101. 101. 101.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 98%|█████████▊| 985/1000 [00:01<00:00, 664.77it/s]\n", " Mean performance on train set: 17.681582\n", "With standard deviation: 0.713183\n", "\n", " Mean performance on test set: 15.685879\n", "With standard deviation: 4.139197\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 681.36it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 1.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 0.7535510063171387 seconds ---\n", "[[ 10. 10. 4. ... 20. 20. 20.]\n", " [ 10. 16. 4. ... 20. 20. 20.]\n", " [ 4. 4. 10. ... 22. 22. 24.]\n", " ...\n", " [ 20. 20. 22. ... 130. 130. 122.]\n", " [ 20. 20. 22. ... 130. 130. 122.]\n", " [ 20. 20. 24. ... 122. 122. 154.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 94%|█████████▍| 945/1000 [00:01<00:00, 713.00it/s]\n", " Mean performance on train set: 6.270014\n", "With standard deviation: 0.654734\n", "\n", " Mean performance on test set: 7.550458\n", "With standard deviation: 2.331786\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 719.46it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 2.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.3278343677520752 seconds ---\n", "[[ 15. 10. 4. ... 20. 20. 20.]\n", " [ 10. 24. 4. ... 20. 20. 20.]\n", " [ 4. 4. 15. ... 22. 22. 26.]\n", " ...\n", " [ 20. 20. 22. ... 159. 151. 124.]\n", " [ 20. 20. 22. ... 151. 153. 124.]\n", " [ 20. 20. 26. ... 124. 124. 185.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 95%|█████████▍| 949/1000 [00:01<00:00, 736.38it/s]\n", " Mean performance on train set: 4.450682\n", "With standard deviation: 0.882129\n", "\n", " Mean performance on test set: 9.728466\n", "With standard deviation: 2.057669\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 709.22it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 3.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 1.7653727531433105 seconds ---\n", "[[ 20. 10. 4. ... 20. 20. 20.]\n", " [ 10. 32. 4. ... 20. 20. 20.]\n", " [ 4. 4. 20. ... 22. 22. 26.]\n", " ...\n", " [ 20. 20. 22. ... 188. 159. 124.]\n", " [ 20. 20. 22. ... 159. 168. 124.]\n", " [ 20. 20. 26. ... 124. 124. 
202.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 96%|█████████▌| 959/1000 [00:01<00:00, 724.60it/s]\n", " Mean performance on train set: 2.270586\n", "With standard deviation: 0.481516\n", "\n", " Mean performance on test set: 11.296110\n", "With standard deviation: 2.799944\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 670.29it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 4.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.2821996212005615 seconds ---\n", "[[ 25. 10. 4. ... 20. 20. 20.]\n", " [ 10. 40. 4. ... 20. 20. 20.]\n", " [ 4. 4. 25. ... 22. 22. 26.]\n", " ...\n", " [ 20. 20. 22. ... 217. 159. 124.]\n", " [ 20. 20. 22. ... 159. 183. 124.]\n", " [ 20. 20. 26. ... 124. 124. 213.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 98%|█████████▊| 983/1000 [00:01<00:00, 709.28it/s]\n", " Mean performance on train set: 1.074035\n", "With standard deviation: 0.637823\n", "\n", " Mean performance on test set: 12.808303\n", "With standard deviation: 3.446939\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 646.12it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 5.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 2.706934928894043 seconds ---\n", "[[ 30. 10. 4. ... 20. 20. 20.]\n", " [ 10. 48. 4. ... 20. 20. 20.]\n", " [ 4. 4. 30. ... 22. 22. 26.]\n", " ...\n", " [ 20. 20. 22. ... 246. 159. 124.]\n", " [ 20. 20. 22. ... 159. 198. 124.]\n", " [ 20. 20. 26. ... 124. 124. 224.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 95%|█████████▌| 953/1000 [00:01<00:00, 553.49it/s]\n", " Mean performance on train set: 0.700602\n", "With standard deviation: 0.572640\n", "\n", " Mean performance on test set: 14.017923\n", "With standard deviation: 3.675042\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 621.01it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 6.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.1140964031219482 seconds ---\n", "[[ 35. 10. 4. ... 20. 20. 20.]\n", " [ 10. 56. 4. ... 20. 20. 20.]\n", " [ 4. 4. 35. ... 22. 22. 26.]\n", " ...\n", " [ 20. 20. 22. ... 275. 159. 124.]\n", " [ 20. 20. 22. ... 159. 213. 124.]\n", " [ 20. 20. 26. ... 124. 124. 235.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 100%|█████████▉| 997/1000 [00:01<00:00, 595.50it/s]\n", " Mean performance on train set: 0.691515\n", "With standard deviation: 0.564620\n", "\n", " Mean performance on test set: 14.918434\n", "With standard deviation: 3.805352\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 586.05it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 7.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 3.5894455909729004 seconds ---\n", "[[ 40. 10. 4. ... 20. 20. 20.]\n", " [ 10. 64. 4. ... 20. 20. 20.]\n", " [ 4. 4. 40. ... 22. 22. 26.]\n", " ...\n", " [ 20. 20. 22. ... 304. 
159. 124.]\n", " [ 20. 20. 22. ... 159. 228. 124.]\n", " [ 20. 20. 26. ... 124. 124. 246.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 99%|█████████▉| 991/1000 [00:01<00:00, 663.55it/s]\n", " Mean performance on train set: 0.691516\n", "With standard deviation: 0.564620\n", "\n", " Mean performance on test set: 15.629476\n", "With standard deviation: 3.865387\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 627.59it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 8.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.081295967102051 seconds ---\n", "[[ 45. 10. 4. ... 20. 20. 20.]\n", " [ 10. 72. 4. ... 20. 20. 20.]\n", " [ 4. 4. 45. ... 22. 22. 26.]\n", " ...\n", " [ 20. 20. 22. ... 333. 159. 124.]\n", " [ 20. 20. 22. ... 159. 243. 124.]\n", " [ 20. 20. 26. ... 124. 124. 257.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 96%|█████████▌| 961/1000 [00:01<00:00, 601.33it/s]\n", " Mean performance on train set: 0.691515\n", "With standard deviation: 0.564620\n", "\n", " Mean performance on test set: 16.214369\n", "With standard deviation: 3.928756\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 603.90it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 9.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.497286796569824 seconds ---\n", "[[ 50. 10. 4. ... 20. 20. 20.]\n", " [ 10. 80. 4. ... 20. 20. 20.]\n", " [ 4. 4. 50. ... 22. 22. 26.]\n", " ...\n", " [ 20. 20. 22. ... 362. 159. 124.]\n", " [ 20. 20. 22. ... 159. 258. 124.]\n", " [ 20. 20. 26. ... 124. 124. 268.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 93%|█████████▎| 931/1000 [00:01<00:00, 511.55it/s]\n", " Mean performance on train set: 0.691515\n", "With standard deviation: 0.564620\n", "\n", " Mean performance on test set: 16.725744\n", "With standard deviation: 3.993095\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 550.66it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 10.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", " --- Weisfeiler-Lehman subtree kernel matrix of size 185 built in 4.984841585159302 seconds ---\n", "[[ 55. 10. 4. ... 20. 20. 20.]\n", " [ 10. 88. 4. ... 20. 20. 20.]\n", " [ 4. 4. 55. ... 22. 22. 26.]\n", " ...\n", " [ 20. 20. 22. ... 391. 159. 124.]\n", " [ 20. 20. 22. ... 159. 273. 124.]\n", " [ 20. 20. 26. ... 124. 124. 
279.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 94%|█████████▍| 942/1000 [00:01<00:00, 708.78it/s]\n", " Mean performance on train set: 0.691516\n", "With standard deviation: 0.564621\n", "\n", " Mean performance on test set: 17.186401\n", "With standard deviation: 4.056724\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 711.43it/s]\n", "\n", "\n", " height rmse_test std_test rmse_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- --------\n", " 0 15.6859 4.1392 17.6816 0.713183 0.364655\n", " 1 7.55046 2.33179 6.27001 0.654734 0.753551\n", " 2 9.72847 2.05767 4.45068 0.882129 1.32783\n", " 3 11.2961 2.79994 2.27059 0.481516 1.76537\n", " 4 12.8083 3.44694 1.07403 0.637823 2.2822\n", " 5 14.0179 3.67504 0.700602 0.57264 2.70693\n", " 6 14.9184 3.80535 0.691515 0.56462 3.1141\n", " 7 15.6295 3.86539 0.691516 0.56462 3.58945\n", " 8 16.2144 3.92876 0.691515 0.56462 4.0813\n", " 9 16.7257 3.9931 0.691515 0.56462 4.49729\n", " 10 17.1864 4.05672 0.691516 0.564621 4.98484\n" ] } ], "source": [ "# wl subtree kernel\n", "%load_ext line_profiler\n", "\n", "import numpy as np\n", "import sys\n", "sys.path.insert(0, \"../\")\n", "from pygraph.utils.utils import kernel_train_test\n", "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel, _wl_subtreekernel_do\n", "\n", "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n", "kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n", "\n", "kernel_para = dict(node_label = 'atom', edge_label = 'bond_type')\n", "\n", "kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n", " hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)\n", "\n", "# %lprun -f _wl_subtreekernel_do \\\n", "# kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n", "# hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " --- This is a regression problem ---\n", "\n", "\n", " #--- calculating kernel matrix when height = 0.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 13.504083633422852 seconds ---\n", "[[ 3. 1. 3. ... 1. 1. 1.]\n", " [ 1. 6. 1. ... 0. 0. 3.]\n", " [ 3. 1. 3. ... 1. 1. 1.]\n", " ...\n", " [ 1. 0. 1. ... 55. 21. 7.]\n", " [ 1. 0. 1. ... 21. 55. 7.]\n", " [ 1. 3. 1. ... 7. 7. 55.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 98%|█████████▊| 980/1000 [00:01<00:00, 773.79it/s]\n", " Mean performance on train set: 28.360361\n", "With standard deviation: 1.357183\n", "\n", " Mean performance on test set: 35.191954\n", "With standard deviation: 4.495767\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 743.82it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 1.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 26.82917618751526 seconds ---\n", "[[ 6. 2. 6. ... 2. 2. 2.]\n", " [ 2. 12. 2. ... 0. 0. 6.]\n", " [ 6. 2. 6. ... 2. 2. 2.]\n", " ...\n", " [ 2. 0. 2. ... 110. 42. 14.]\n", " [ 2. 0. 2. ... 42. 110. 14.]\n", " [ 2. 6. 2. ... 14. 14. 
110.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 98%|█████████▊| 983/1000 [00:01<00:00, 751.78it/s]\n", " Mean performance on train set: 27.933534\n", "With standard deviation: 1.448359\n", "\n", " Mean performance on test set: 35.180815\n", "With standard deviation: 4.500453\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 744.44it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 2.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 40.235626220703125 seconds ---\n", "[[ 9. 3. 9. ... 3. 3. 3.]\n", " [ 3. 18. 3. ... 0. 0. 9.]\n", " [ 9. 3. 9. ... 3. 3. 3.]\n", " ...\n", " [ 3. 0. 3. ... 165. 63. 21.]\n", " [ 3. 0. 3. ... 63. 165. 21.]\n", " [ 3. 9. 3. ... 21. 21. 165.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 94%|█████████▎| 936/1000 [00:01<00:00, 694.10it/s]\n", " Mean performance on train set: 28.111311\n", "With standard deviation: 1.508915\n", "\n", " Mean performance on test set: 35.163150\n", "With standard deviation: 4.502054\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 695.02it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 3.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 54.67040753364563 seconds ---\n", "[[ 12. 4. 12. ... 4. 4. 4.]\n", " [ 4. 24. 4. ... 0. 0. 12.]\n", " [ 12. 4. 12. ... 4. 4. 4.]\n", " ...\n", " [ 4. 0. 4. ... 220. 84. 28.]\n", " [ 4. 0. 4. ... 84. 220. 28.]\n", " [ 4. 12. 4. ... 28. 28. 220.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 95%|█████████▌| 954/1000 [00:01<00:00, 748.03it/s]\n", " Mean performance on train set: 28.390274\n", "With standard deviation: 1.365711\n", "\n", " Mean performance on test set: 35.194634\n", "With standard deviation: 4.498007\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 726.68it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 4.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 67.15217232704163 seconds ---\n", "[[ 15. 5. 15. ... 5. 5. 5.]\n", " [ 5. 30. 5. ... 0. 0. 15.]\n", " [ 15. 5. 15. ... 5. 5. 5.]\n", " ...\n", " [ 5. 0. 5. ... 275. 105. 35.]\n", " [ 5. 0. 5. ... 105. 275. 35.]\n", " [ 5. 15. 5. ... 35. 35. 275.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 95%|█████████▌| 950/1000 [00:01<00:00, 737.07it/s]\n", " Mean performance on train set: 27.974611\n", "With standard deviation: 1.462223\n", "\n", " Mean performance on test set: 35.175314\n", "With standard deviation: 4.501113\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 719.71it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 5.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 80.08806300163269 seconds ---\n", "[[ 18. 6. 18. ... 6. 6. 6.]\n", " [ 6. 36. 6. ... 0. 0. 18.]\n", " [ 18. 6. 18. ... 6. 6. 6.]\n", " ...\n", " [ 6. 0. 6. ... 330. 126. 42.]\n", " [ 6. 0. 6. ... 126. 330. 42.]\n", " [ 6. 18. 6. ... 42. 42. 
330.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 98%|█████████▊| 985/1000 [00:01<00:00, 735.71it/s]\n", " Mean performance on train set: 28.018415\n", "With standard deviation: 1.455644\n", "\n", " Mean performance on test set: 35.199713\n", "With standard deviation: 4.507104\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 738.55it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 6.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 92.19254112243652 seconds ---\n", "[[ 21. 7. 21. ... 7. 7. 7.]\n", " [ 7. 42. 7. ... 0. 0. 21.]\n", " [ 21. 7. 21. ... 7. 7. 7.]\n", " ...\n", " [ 7. 0. 7. ... 385. 147. 49.]\n", " [ 7. 0. 7. ... 147. 385. 49.]\n", " [ 7. 21. 7. ... 49. 49. 385.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 98%|█████████▊| 975/1000 [00:01<00:00, 721.42it/s]\n", " Mean performance on train set: 28.373079\n", "With standard deviation: 1.600565\n", "\n", " Mean performance on test set: 35.164471\n", "With standard deviation: 4.498487\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 727.58it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 7.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 105.81170415878296 seconds ---\n", "[[ 24. 8. 24. ... 8. 8. 8.]\n", " [ 8. 48. 8. ... 0. 0. 24.]\n", " [ 24. 8. 24. ... 8. 8. 8.]\n", " ...\n", " [ 8. 0. 8. ... 440. 168. 56.]\n", " [ 8. 0. 8. ... 168. 440. 56.]\n", " [ 8. 24. 8. ... 56. 56. 440.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 97%|█████████▋| 968/1000 [00:01<00:00, 739.67it/s]\n", " Mean performance on train set: 27.960421\n", "With standard deviation: 1.457425\n", "\n", " Mean performance on test set: 35.177115\n", "With standard deviation: 4.500904\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 733.61it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 8.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 119.0216612815857 seconds ---\n", "[[ 27. 9. 27. ... 9. 9. 9.]\n", " [ 9. 54. 9. ... 0. 0. 27.]\n", " [ 27. 9. 27. ... 9. 9. 9.]\n", " ...\n", " [ 9. 0. 9. ... 495. 189. 63.]\n", " [ 9. 0. 9. ... 189. 495. 63.]\n", " [ 9. 27. 9. ... 63. 63. 495.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 93%|█████████▎| 931/1000 [00:01<00:00, 752.10it/s]\n", " Mean performance on train set: 28.199059\n", "With standard deviation: 1.514897\n", "\n", " Mean performance on test set: 35.196848\n", "With standard deviation: 4.505256\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 768.54it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 9.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 131.22810459136963 seconds ---\n", "[[ 30. 10. 30. ... 10. 10. 10.]\n", " [ 10. 60. 10. ... 0. 0. 30.]\n", " [ 30. 10. 30. ... 10. 10. 10.]\n", " ...\n", " [ 10. 0. 10. ... 550. 210. 70.]\n", " [ 10. 0. 10. ... 210. 550. 70.]\n", " [ 10. 30. 10. ... 70. 70. 
550.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 93%|█████████▎| 932/1000 [00:01<00:00, 763.55it/s]\n", " Mean performance on train set: 28.266520\n", "With standard deviation: 1.307686\n", "\n", " Mean performance on test set: 35.195635\n", "With standard deviation: 4.501972\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 764.12it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 10.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", " --- Weisfeiler-Lehman sp kernel matrix of size 185 built in 144.96362161636353 seconds ---\n", "[[ 33. 11. 33. ... 11. 11. 11.]\n", " [ 11. 66. 11. ... 0. 0. 33.]\n", " [ 33. 11. 33. ... 11. 11. 11.]\n", " ...\n", " [ 11. 0. 11. ... 605. 231. 77.]\n", " [ 11. 0. 11. ... 231. 605. 77.]\n", " [ 11. 33. 11. ... 77. 77. 605.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 100%|█████████▉| 996/1000 [00:01<00:00, 820.73it/s]\n", " Mean performance on train set: 28.416280\n", "With standard deviation: 1.615957\n", "\n", " Mean performance on test set: 35.167588\n", "With standard deviation: 4.497227\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 822.53it/s]\n", "\n", "\n", " height rmse_test std_test rmse_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- --------\n", " 0 35.192 4.49577 28.3604 1.35718 13.5041\n", " 1 35.1808 4.50045 27.9335 1.44836 26.8292\n", " 2 35.1632 4.50205 28.1113 1.50891 40.2356\n", " 3 35.1946 4.49801 28.3903 1.36571 54.6704\n", " 4 35.1753 4.50111 27.9746 1.46222 67.1522\n", " 5 35.1997 4.5071 28.0184 1.45564 80.0881\n", " 6 35.1645 4.49849 28.3731 1.60057 92.1925\n", " 7 35.1771 4.5009 27.9604 1.45742 105.812\n", " 8 35.1968 4.50526 28.1991 1.5149 119.022\n", " 9 35.1956 4.50197 28.2665 1.30769 131.228\n", " 10 35.1676 4.49723 28.4163 1.61596 144.964\n" ] } ], "source": [ "# WL sp kernel\n", "%load_ext line_profiler\n", "\n", "import numpy as np\n", "import sys\n", "sys.path.insert(0, \"../\")\n", "from pygraph.utils.utils import kernel_train_test\n", "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel, _wl_subtreekernel_do\n", "\n", "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n", "kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n", "\n", "kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', base_kernel = 'sp')\n", "\n", "kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n", " hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)\n", "\n", "# %lprun -f _wl_subtreekernel_do \\\n", "# kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n", "# hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The line_profiler extension is already loaded. 
To reload it, use:\n", " %reload_ext line_profiler\n", "\n", " --- This is a regression problem ---\n", "\n", "\n", " #--- calculating kernel matrix when height = 0.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 0.8530018329620361 seconds ---\n", "[[ 2. 1. 2. ... 0. 0. 1.]\n", " [ 1. 3. 1. ... 0. 0. 2.]\n", " [ 2. 1. 2. ... 0. 0. 1.]\n", " ...\n", " [ 0. 0. 0. ... 10. 7. 0.]\n", " [ 0. 0. 0. ... 7. 10. 1.]\n", " [ 1. 2. 1. ... 0. 1. 10.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 95%|█████████▍| 947/1000 [00:01<00:00, 719.29it/s]\n", " Mean performance on train set: 29.997498\n", "With standard deviation: 0.902340\n", "\n", " Mean performance on test set: 33.407740\n", "With standard deviation: 4.732717\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 653.54it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 1.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 1.717505931854248 seconds ---\n", "[[ 4. 2. 4. ... 0. 0. 2.]\n", " [ 2. 6. 2. ... 0. 0. 4.]\n", " [ 4. 2. 4. ... 0. 0. 2.]\n", " ...\n", " [ 0. 0. 0. ... 20. 14. 0.]\n", " [ 0. 0. 0. ... 14. 20. 2.]\n", " [ 2. 4. 2. ... 0. 2. 20.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 96%|█████████▌| 956/1000 [00:01<00:00, 721.27it/s]\n", " Mean performance on train set: 30.160338\n", "With standard deviation: 1.094235\n", "\n", " Mean performance on test set: 33.423458\n", "With standard deviation: 4.721311\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 723.53it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 2.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 2.6603214740753174 seconds ---\n", "[[ 6. 3. 6. ... 0. 0. 3.]\n", " [ 3. 9. 3. ... 0. 0. 6.]\n", " [ 6. 3. 6. ... 0. 0. 3.]\n", " ...\n", " [ 0. 0. 0. ... 30. 21. 0.]\n", " [ 0. 0. 0. ... 21. 30. 3.]\n", " [ 3. 6. 3. ... 0. 3. 30.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 94%|█████████▍| 944/1000 [00:01<00:00, 650.98it/s]\n", " Mean performance on train set: 29.928570\n", "With standard deviation: 0.787941\n", "\n", " Mean performance on test set: 33.433014\n", "With standard deviation: 4.724408\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 688.71it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 3.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 3.477631092071533 seconds ---\n", "[[ 8. 4. 8. ... 0. 0. 4.]\n", " [ 4. 12. 4. ... 0. 0. 8.]\n", " [ 8. 4. 8. ... 0. 0. 4.]\n", " ...\n", " [ 0. 0. 0. ... 40. 28. 0.]\n", " [ 0. 0. 0. ... 28. 40. 4.]\n", " [ 4. 8. 4. ... 0. 4. 
40.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 95%|█████████▌| 954/1000 [00:01<00:00, 725.15it/s]\n", " Mean performance on train set: 30.011409\n", "With standard deviation: 0.909674\n", "\n", " Mean performance on test set: 33.407319\n", "With standard deviation: 4.732434\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 720.71it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 4.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 4.5436692237854 seconds ---\n", "[[10. 5. 10. ... 0. 0. 5.]\n", " [ 5. 15. 5. ... 0. 0. 10.]\n", " [10. 5. 10. ... 0. 0. 5.]\n", " ...\n", " [ 0. 0. 0. ... 50. 35. 0.]\n", " [ 0. 0. 0. ... 35. 50. 5.]\n", " [ 5. 10. 5. ... 0. 5. 50.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 94%|█████████▎| 936/1000 [00:01<00:00, 568.04it/s]\n", " Mean performance on train set: 30.184162\n", "With standard deviation: 1.108902\n", "\n", " Mean performance on test set: 33.425625\n", "With standard deviation: 4.721660\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 564.24it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 5.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 5.6617820262908936 seconds ---\n", "[[12. 6. 12. ... 0. 0. 6.]\n", " [ 6. 18. 6. ... 0. 0. 12.]\n", " [12. 6. 12. ... 0. 0. 6.]\n", " ...\n", " [ 0. 0. 0. ... 60. 42. 0.]\n", " [ 0. 0. 0. ... 42. 60. 6.]\n", " [ 6. 12. 6. ... 0. 6. 60.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 99%|█████████▉| 993/1000 [00:01<00:00, 519.25it/s]\n", " Mean performance on train set: 30.041068\n", "With standard deviation: 1.018451\n", "\n", " Mean performance on test set: 33.406717\n", "With standard deviation: 4.726409\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 548.91it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 6.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 6.148027420043945 seconds ---\n", "[[14. 7. 14. ... 0. 0. 7.]\n", " [ 7. 21. 7. ... 0. 0. 14.]\n", " [14. 7. 14. ... 0. 0. 7.]\n", " ...\n", " [ 0. 0. 0. ... 70. 49. 0.]\n", " [ 0. 0. 0. ... 49. 70. 7.]\n", " [ 7. 14. 7. ... 0. 7. 70.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 98%|█████████▊| 985/1000 [00:01<00:00, 498.31it/s]\n", " Mean performance on train set: 29.905596\n", "With standard deviation: 0.782179\n", "\n", " Mean performance on test set: 33.418992\n", "With standard deviation: 4.730753\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 534.86it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 7.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 7.603543519973755 seconds ---\n", "[[16. 8. 16. ... 0. 0. 8.]\n", " [ 8. 24. 8. ... 0. 0. 16.]\n", " [16. 8. 16. ... 0. 0. 8.]\n", " ...\n", " [ 0. 0. 0. ... 80. 56. 0.]\n", " [ 0. 0. 0. ... 56. 80. 8.]\n", " [ 8. 16. 8. ... 0. 8. 
80.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 95%|█████████▌| 953/1000 [00:01<00:00, 586.15it/s]\n", " Mean performance on train set: 30.175921\n", "With standard deviation: 1.103820\n", "\n", " Mean performance on test set: 33.424820\n", "With standard deviation: 4.721550\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 546.00it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 8.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 7.972221612930298 seconds ---\n", "[[18. 9. 18. ... 0. 0. 9.]\n", " [ 9. 27. 9. ... 0. 0. 18.]\n", " [18. 9. 18. ... 0. 0. 9.]\n", " ...\n", " [ 0. 0. 0. ... 90. 63. 0.]\n", " [ 0. 0. 0. ... 63. 90. 9.]\n", " [ 9. 18. 9. ... 0. 9. 90.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 98%|█████████▊| 980/1000 [00:01<00:00, 490.30it/s]\n", " Mean performance on train set: 30.136537\n", "With standard deviation: 1.074854\n", "\n", " Mean performance on test set: 33.412196\n", "With standard deviation: 4.715539\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 536.66it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 9.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 9.070842504501343 seconds ---\n", "[[ 20. 10. 20. ... 0. 0. 10.]\n", " [ 10. 30. 10. ... 0. 0. 20.]\n", " [ 20. 10. 20. ... 0. 0. 10.]\n", " ...\n", " [ 0. 0. 0. ... 100. 70. 0.]\n", " [ 0. 0. 0. ... 70. 100. 10.]\n", " [ 10. 20. 10. ... 0. 10. 100.]]\n", "\n", " Starting calculate accuracy/rmse...\n", "calculate performance: 98%|█████████▊| 975/1000 [00:01<00:00, 527.13it/s]\n", " Mean performance on train set: 30.032887\n", "With standard deviation: 0.921065\n", "\n", " Mean performance on test set: 33.407050\n", "With standard deviation: 4.731928\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 600.62it/s]\n", "\n", "\n", " #--- calculating kernel matrix when height = 10.0 ---#\n", "\n", " Loading dataset from file...\n", "\n", " Calculating kernel matrix, this could take a while...\n", "\n", " --- Weisfeiler-Lehman edge kernel matrix of size 185 built in 10.02536916732788 seconds ---\n", "[[ 22. 11. 22. ... 0. 0. 11.]\n", " [ 11. 33. 11. ... 0. 0. 22.]\n", " [ 22. 11. 22. ... 0. 0. 11.]\n", " ...\n", " [ 0. 0. 0. ... 110. 77. 0.]\n", " [ 0. 0. 0. ... 77. 110. 11.]\n", " [ 11. 22. 11. ... 0. 11. 
110.]]\n", "\n", " Starting calculate accuracy/rmse...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "calculate performance: 97%|█████████▋| 970/1000 [00:01<00:00, 694.38it/s]\n", " Mean performance on train set: 29.924232\n", "With standard deviation: 0.790843\n", "\n", " Mean performance on test set: 33.416469\n", "With standard deviation: 4.731694\n", "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 678.72it/s]\n", "\n", "\n", " height rmse_test std_test rmse_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- ---------\n", " 0 33.4077 4.73272 29.9975 0.90234 0.853002\n", " 1 33.4235 4.72131 30.1603 1.09423 1.71751\n", " 2 33.433 4.72441 29.9286 0.787941 2.66032\n", " 3 33.4073 4.73243 30.0114 0.909674 3.47763\n", " 4 33.4256 4.72166 30.1842 1.1089 4.54367\n", " 5 33.4067 4.72641 30.0411 1.01845 5.66178\n", " 6 33.419 4.73075 29.9056 0.782179 6.14803\n", " 7 33.4248 4.72155 30.1759 1.10382 7.60354\n", " 8 33.4122 4.71554 30.1365 1.07485 7.97222\n", " 9 33.4071 4.73193 30.0329 0.921065 9.07084\n", " 10 33.4165 4.73169 29.9242 0.790843 10.0254\n" ] } ], "source": [ "# WL edge kernel\n", "%load_ext line_profiler\n", "\n", "import numpy as np\n", "import sys\n", "sys.path.insert(0, \"../\")\n", "from pygraph.utils.utils import kernel_train_test\n", "from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel, _wl_subtreekernel_do\n", "\n", "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n", "kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n", "\n", "kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', base_kernel = 'edge')\n", "\n", "kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n", " hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)\n", "\n", "# %lprun -f _wl_subtreekernel_do \\\n", "# kernel_train_test(datafile, kernel_file_path, weisfeilerlehmankernel, kernel_para, \\\n", "# hyper_name = 'height', hyper_range = np.linspace(0, 10, 11), normalize = False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# results\n", "\n", "# subtree with y normalization\n", " height RMSE_test std_test RMSE_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- --------\n", " 0 36.2108 7.33179 38.6059 1.57064 0.379475\n", " 1 9.00098 6.37145 6.76379 1.96568 0.844898\n", " 2 19.8113 4.04911 5.28757 1.81899 1.35308\n", " 3 25.0455 4.94276 2.3274 0.805733 1.81136\n", " 4 28.2255 6.5212 0.85156 0.423465 2.23098\n", " 5 30.6354 6.73647 3.35947 8.17561 2.71575\n", " 6 32.1027 6.85601 3.54105 8.71922 3.11459\n", " 7 32.9709 6.89606 6.94372 9.94045 3.55571\n", " 8 33.5112 6.90753 6.97339 9.76975 3.79657\n", " 9 33.8502 6.91427 11.8345 11.6213 4.41555\n", " 10 34.0963 6.93115 11.4257 11.2624 4.94888\n", "\n", "# subtree without y normalization\n", " height RMSE_test std_test RMSE_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- --------\n", " 0 15.6859 4.1392 17.6816 0.713183 0.360443\n", " 1 7.55046 2.33179 6.27001 0.654734 0.837389\n", " 2 9.72847 2.05767 4.45068 0.882129 1.25317\n", " 3 11.2961 2.79994 2.27059 0.481516 1.79971\n", " 4 12.8083 3.44694 1.07403 0.637823 2.35346\n", " 5 14.0179 3.67504 0.700602 0.57264 2.78285\n", " 6 14.9184 3.80535 0.691515 0.56462 3.20764\n", " 7 15.6295 3.86539 0.691516 0.56462 3.71648\n", " 8 16.2144 3.92876 0.691515 0.56462 3.99213\n", " 9 16.7257 3.9931 0.691515 
0.56462 4.26315\n", " 10 17.1864 4.05672 0.691516 0.564621 5.00918\n", " \n", "# sp\n", " height rmse_test std_test rmse_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- --------\n", " 0 35.192 4.49577 28.3604 1.35718 13.5041\n", " 1 35.1808 4.50045 27.9335 1.44836 26.8292\n", " 2 35.1632 4.50205 28.1113 1.50891 40.2356\n", " 3 35.1946 4.49801 28.3903 1.36571 54.6704\n", " 4 35.1753 4.50111 27.9746 1.46222 67.1522\n", " 5 35.1997 4.5071 28.0184 1.45564 80.0881\n", " 6 35.1645 4.49849 28.3731 1.60057 92.1925\n", " 7 35.1771 4.5009 27.9604 1.45742 105.812\n", " 8 35.1968 4.50526 28.1991 1.5149 119.022\n", " 9 35.1956 4.50197 28.2665 1.30769 131.228\n", " 10 35.1676 4.49723 28.4163 1.61596 144.964\n", " \n", "# path\n", " height rmse_test std_test rmse_train std_train k_time\n", "-------- ----------- ---------- ------------ ----------- ---------\n", " 0 33.4077 4.73272 29.9975 0.90234 0.853002\n", " 1 33.4235 4.72131 30.1603 1.09423 1.71751\n", " 2 33.433 4.72441 29.9286 0.787941 2.66032\n", " 3 33.4073 4.73243 30.0114 0.909674 3.47763\n", " 4 33.4256 4.72166 30.1842 1.1089 4.54367\n", " 5 33.4067 4.72641 30.0411 1.01845 5.66178\n", " 6 33.419 4.73075 29.9056 0.782179 6.14803\n", " 7 33.4248 4.72155 30.1759 1.10382 7.60354\n", " 8 33.4122 4.71554 30.1365 1.07485 7.97222\n", " 9 33.4071 4.73193 30.0329 0.921065 9.07084\n", " 10 33.4165 4.73169 29.9242 0.790843 10.0254" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'O', 'C'}\n", "{'O', 'C'}\n" ] }, { "ename": "TypeError", "evalue": "'int' object is not iterable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabelset1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabelset2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mkernel\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mspkernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mG2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py\u001b[0m in \u001b[0;36mspkernel\u001b[0;34m(edge_weight, *args)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 41\u001b[0;31m \u001b[0mGn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m \u001b[0mgetSPGraph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0medge_weight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0medge_weight\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mG\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m]\u001b[0m \u001b[0;31m# get shortest path graphs of Gn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 42\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 41\u001b[0;31m \u001b[0mGn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m \u001b[0mgetSPGraph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0medge_weight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0medge_weight\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mG\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m]\u001b[0m \u001b[0;31m# get shortest path graphs of Gn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 42\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/utils/utils.py\u001b[0m in \u001b[0;36mgetSPGraph\u001b[0;34m(G, edge_weight)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0mBorgwardt\u001b[0m \u001b[0mKM\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mKriegel\u001b[0m \u001b[0mHP\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mShortest\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mpath\u001b[0m \u001b[0mkernels\u001b[0m \u001b[0mon\u001b[0m \u001b[0mgraphs\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mInData\u001b[0m \u001b[0mMining\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mFifth\u001b[0m \u001b[0mIEEE\u001b[0m \u001b[0mInternational\u001b[0m \u001b[0mConference\u001b[0m \u001b[0mon\u001b[0m \u001b[0;36m2005\u001b[0m \u001b[0mNov\u001b[0m \u001b[0;36m27\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0;36m8\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mIEEE\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \"\"\"\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfloydTransformation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0medge_weight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0medge_weight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0mfloydTransformation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0medge_weight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'bond_type'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/utils/utils.py\u001b[0m in \u001b[0;36mfloydTransformation\u001b[0;34m(G, edge_weight)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0mBorgwardt\u001b[0m \u001b[0mKM\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mKriegel\u001b[0m \u001b[0mHP\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mShortest\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mpath\u001b[0m \u001b[0mkernels\u001b[0m \u001b[0mon\u001b[0m \u001b[0mgraphs\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mInData\u001b[0m \u001b[0mMining\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mFifth\u001b[0m \u001b[0mIEEE\u001b[0m \u001b[0mInternational\u001b[0m \u001b[0mConference\u001b[0m \u001b[0mon\u001b[0m \u001b[0;36m2005\u001b[0m \u001b[0mNov\u001b[0m \u001b[0;36m27\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0;36m8\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mIEEE\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \"\"\"\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0mspMatrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloyd_warshall_numpy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0medge_weight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGraph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_nodes_from\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnodes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/networkx/algorithms/shortest_paths/dense.py\u001b[0m in \u001b[0;36mfloyd_warshall_numpy\u001b[0;34m(G, nodelist, weight)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0;31m# nonedges are not given the value 0 as well.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m A = nx.to_numpy_matrix(G, nodelist=nodelist, multigraph_weight=min,\n\u001b[0;32m---> 54\u001b[0;31m weight=weight, nonedge=np.inf)\n\u001b[0m\u001b[1;32m 55\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0mI\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0midentity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/networkx/convert_matrix.py\u001b[0m in \u001b[0;36mto_numpy_matrix\u001b[0;34m(G, nodelist, dtype, order, multigraph_weight, weight, nonedge)\u001b[0m\n\u001b[1;32m 446\u001b[0m A = to_numpy_array(G, nodelist=nodelist, dtype=dtype, 
order=order,\n\u001b[1;32m 447\u001b[0m \u001b[0mmultigraph_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmultigraph_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 448\u001b[0;31m nonedge=nonedge)\n\u001b[0m\u001b[1;32m 449\u001b[0m \u001b[0mM\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mM\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/networkx/convert_matrix.py\u001b[0m in \u001b[0;36mto_numpy_array\u001b[0;34m(G, nodelist, dtype, order, multigraph_weight, weight, nonedge)\u001b[0m\n\u001b[1;32m 1061\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1062\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnodelist\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1063\u001b[0;31m \u001b[0mnodelist\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mG\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1064\u001b[0m \u001b[0mnodeset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnodelist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1065\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnodelist\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnodeset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: 'int' object is not iterable" ] } ], "source": [ "import sys\n", "import networkx as nx\n", "sys.path.insert(0, \"../\")\n", "from pygraph.utils.graphfiles import loadDataset\n", "from pygraph.kernels.spkernel import spkernel\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "\n", "def weisfeilerlehman_test(G):\n", " '''\n", " Weisfeiler-Lehman test of graph isomorphism.\n", " '''\n", "\n", " nx.draw_networkx(G)\n", " plt.show()\n", " nx.draw_networkx_labels(G, nx.spring_layout(G), labels = nx.get_node_attributes(G,'label'))\n", " print(G.nodes(data = True))\n", " \n", " set_multisets = []\n", " for node in G.nodes(data = True):\n", " # Multiset-label determination.\n", " multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]\n", " # sorting each multiset\n", " multiset.sort()\n", " multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix \n", " set_multisets.append(multiset)\n", " \n", " # label compression\n", "# set_multisets.sort() # this is unnecessary\n", " set_unique = list(set(set_multisets)) # set of unique multiset labels\n", " set_compressed = { value : str(set_unique.index(value)) for value in set_unique } # assign indices as the new labels\n", "# print(set_compressed)\n", "# print(set_multisets)\n", " \n", " # relabel nodes with multisets\n", " for node in G.nodes(data = True):\n", " node[1]['label'] = set_multisets[node[0]]\n", " print(' -> ')\n", " nx.draw_networkx(G)\n", " plt.show()\n", " print(G.nodes(data = True))\n", "\n", " \n", " # relabel nodes\n", " for node in G.nodes(data = 
True):\n", " node[1]['label'] = set_compressed[set_multisets[node[0]]]\n", " \n", " print(' -> ')\n", " nx.draw_networkx(G)\n", " plt.show()\n", " print(G.nodes(data = True))\n", "\n", "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n", "G1 = dataset[12]\n", "G2 = dataset[55]\n", "\n", "# init.\n", "kernel = 0 # init kernel\n", "num_nodes1 = G1.number_of_nodes()\n", "num_nodes2 = G2.number_of_nodes()\n", "\n", "# the first iteration.\n", "labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n", "labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n", "print(labelset1)\n", "print(labelset2)\n", "kernel += spkernel(G1, G2)\n", "print(kernel)\n", "\n", "\n", "\n", "for height in range(0, min(num_nodes1, num_nodes2)): #Q how to determine the upper bound of the height?\n", " if labelset1 != labelset2:\n", " break\n", " \n", " # Weisfeiler-Lehman test of graph isomorphism.\n", " weisfeilerlehman_test(G1)\n", " weisfeilerlehman_test(G2)\n", " \n", " # calculate kernel\n", " kernel += spkernel(G1, G2)\n", " \n", " # get label sets of both graphs\n", " labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n", " labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n", "# print(labelset1)\n", "# print(labelset2)\n", "\n", "print(kernel)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{0: 'C', 1: 'C', 2: 'C', 3: 'C', 4: 'C', 5: 'O', 6: 'O'}\n", "{0: 'C', 1: 'C', 2: 'C', 3: 'C', 4: 'C', 5: 'C', 6: 'S', 7: 'S'}\n", "\n", " --- height = 0 --- \n", "\n", " --- for graph 0 --- \n", "\n", "labels_ori: ['C', 'C', 'C', 'C', 'C', 'O', 'O']\n", "all_labels_ori: {'C', 'O'}\n", "num_of_each_label: {'C': 5, 'O': 2}\n", "all_num_of_each_label: [{'C': 5, 'O': 2}]\n", "num_of_labels: 2\n", "all_labels_ori: {'C', 'O'}\n", "\n", " --- for graph 1 --- \n", "\n", "labels_ori: ['C', 'C', 'C', 'C', 'C', 'C', 'S', 'S']\n", "all_labels_ori: {'C', 'O', 'S'}\n", "num_of_each_label: {'C': 6, 'S': 2}\n", "all_num_of_each_label: [{'C': 5, 'O': 2}, {'C': 6, 'S': 2}]\n", "num_of_labels: 2\n", "all_labels_ori: {'C', 'O', 'S'}\n", "\n", " all_num_of_labels_occured: 3\n", "\n", " --- calculating kernel matrix ---\n", "\n", " labels: {'C', 'O'}\n", "vector1: [[5 2]]\n", "vector2: [[5 2]]\n", "Kmatrix: [[ 29. 0.]\n", " [ 0. 0.]]\n", "\n", " labels: {'C', 'O', 'S'}\n", "vector1: [[5 2 0]]\n", "vector2: [[6 0 2]]\n", "Kmatrix: [[ 29. 30.]\n", " [ 30. 0.]]\n", "\n", " labels: {'C', 'S'}\n", "vector1: [[6 2]]\n", "vector2: [[6 2]]\n", "Kmatrix: [[ 29. 30.]\n", " [ 30. 
40.]]\n", "\n", " --- height = 1 --- \n", "\n", " --- for graph 0 --- \n", "\n", "multiset: ['CC', 'CC', 'CCO', 'CCO', 'COO', 'OCC', 'OCC']\n", "set_unique: ['OCC', 'COO', 'CCO', 'CC']\n", "set_compressed: {'OCC': '4', 'COO': '5', 'CCO': '6', 'CC': '7'}\n", "all_set_compressed: {'OCC': '4', 'COO': '5', 'CCO': '6', 'CC': '7'}\n", "num_of_labels_occured: 7\n", "\n", " compressed labels: {0: '7', 1: '7', 2: '6', 3: '6', 4: '5', 5: '4', 6: '4'}\n", "labels_comp: ['7', '7', '6', '6', '5', '4', '4']\n", "all_labels_ori: {'5', '4', '6', '7'}\n", "num_of_each_label: {'5': 1, '4': 2, '6': 2, '7': 2}\n", "all_num_of_each_label: [{'5': 1, '4': 2, '6': 2, '7': 2}]\n", "\n", " --- for graph 1 --- \n", "\n", "multiset: ['CC', 'CC', 'CC', 'CCS', 'CCS', 'CCSS', 'SCC', 'SCC']\n", "set_unique: ['SCC', 'CC', 'CCS', 'CCSS']\n", "set_compressed: {'SCC': '8', 'CC': '7', 'CCS': '9', 'CCSS': '10'}\n", "all_set_compressed: {'SCC': '8', 'COO': '5', 'CCS': '9', 'OCC': '4', 'CCO': '6', 'CCSS': '10', 'CC': '7'}\n", "num_of_labels_occured: 10\n", "\n", " compressed labels: {0: '7', 1: '7', 2: '7', 3: '9', 4: '9', 5: '10', 6: '8', 7: '8'}\n", "labels_comp: ['7', '7', '7', '9', '9', '10', '8', '8']\n", "all_labels_ori: {'10', '4', '7', '9', '6', '5', '8'}\n", "num_of_each_label: {'10': 1, '9': 2, '7': 3, '8': 2}\n", "all_num_of_each_label: [{'5': 1, '4': 2, '6': 2, '7': 2}, {'10': 1, '9': 2, '7': 3, '8': 2}]\n", "\n", " all_num_of_labels_occured: 10\n", "\n", " --- calculating kernel matrix ---\n", "\n", " labels: {'5', '4', '6', '7'}\n", "vector1: [[1 2 2 2]]\n", "vector2: [[1 2 2 2]]\n", "\n", " labels: {'10', '4', '7', '9', '6', '5', '8'}\n", "vector1: [[0 2 2 0 2 1 0]]\n", "vector2: [[1 0 3 2 0 0 2]]\n", "\n", " labels: {'8', '10', '7', '9'}\n", "vector1: [[2 1 3 2]]\n", "vector2: [[2 1 3 2]]\n", "\n", " Kmatrix: [[ 42. 36.]\n", " [ 36. 
58.]]\n", "\n", " --- height = 2 --- \n", "\n", " --- for graph 0 --- \n", "\n", "multiset: ['76', '76', '647', '647', '544', '456', '456']\n", "set_unique: ['647', '76', '456', '544']\n", "set_compressed: {'647': '11', '76': '12', '544': '14', '456': '13'}\n", "all_set_compressed: {'647': '11', '76': '12', '456': '13', '544': '14'}\n", "num_of_labels_occured: 14\n", "\n", " compressed labels: {0: '12', 1: '12', 2: '11', 3: '11', 4: '14', 5: '13', 6: '13'}\n", "labels_comp: ['12', '12', '11', '11', '14', '13', '13']\n", "all_labels_ori: {'14', '12', '11', '13'}\n", "num_of_each_label: {'14': 1, '13': 2, '12': 2, '11': 2}\n", "all_num_of_each_label: [{'14': 1, '13': 2, '12': 2, '11': 2}]\n", "\n", " --- for graph 1 --- \n", "\n", "multiset: ['79', '79', '710', '978', '978', '10788', '8109', '8109']\n", "set_unique: ['710', '8109', '79', '10788', '978']\n", "set_compressed: {'710': '15', '79': '17', '8109': '16', '978': '19', '10788': '18'}\n", "all_set_compressed: {'710': '15', '79': '17', '978': '19', '10788': '18', '8109': '16', '456': '13', '544': '14', '647': '11', '76': '12'}\n", "num_of_labels_occured: 19\n", "\n", " compressed labels: {0: '17', 1: '17', 2: '15', 3: '19', 4: '19', 5: '18', 6: '16', 7: '16'}\n", "labels_comp: ['17', '17', '15', '19', '19', '18', '16', '16']\n", "all_labels_ori: {'18', '19', '12', '13', '17', '11', '14', '16', '15'}\n", "num_of_each_label: {'15': 1, '17': 2, '19': 2, '16': 2, '18': 1}\n", "all_num_of_each_label: [{'14': 1, '13': 2, '12': 2, '11': 2}, {'15': 1, '17': 2, '19': 2, '16': 2, '18': 1}]\n", "\n", " all_num_of_labels_occured: 19\n", "\n", " --- calculating kernel matrix ---\n", "\n", " labels: {'14', '12', '11', '13'}\n", "vector1: [[1 2 2 2]]\n", "vector2: [[1 2 2 2]]\n", "\n", " labels: {'18', '19', '12', '13', '17', '11', '14', '16', '15'}\n", "vector1: [[0 0 2 2 0 2 1 0 0]]\n", "vector2: [[1 2 0 0 2 0 0 2 1]]\n", "\n", " labels: {'18', '17', '15', '16', '19'}\n", "vector1: [[1 2 1 2 2]]\n", "vector2: [[1 2 1 2 2]]\n", "\n", " Kmatrix: [[ 55. 36.]\n", " [ 36. 72.]]\n", "\n", " --- Weisfeiler-Lehman subtree kernel built in 0.0034377574920654297 seconds ---\n" ] }, { "data": { "text/plain": [ "array([[ 55., 36.],\n", " [ 36., 72.]])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# test of WL subtree kernel on many graphs\n", "\n", "import sys\n", "import pathlib\n", "from collections import Counter\n", "sys.path.insert(0, \"../\")\n", "\n", "import networkx as nx\n", "import numpy as np\n", "import time\n", "\n", "from pygraph.kernels.spkernel import spkernel\n", "from pygraph.kernels.pathKernel import pathkernel\n", "\n", "def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'):\n", " \"\"\"Calculate Weisfeiler-Lehman kernels between graphs.\n", " \n", " Parameters\n", " ----------\n", " Gn : List of NetworkX graph\n", " List of graphs between which the kernels are calculated.\n", " /\n", " G1, G2 : NetworkX graphs\n", " 2 graphs between which the kernel is calculated.\n", " \n", " height : subtree height\n", " \n", " base_kernel : base kernel used in each iteration of WL kernel\n", " the default base kernel is subtree kernel\n", " \n", " Return\n", " ------\n", " Kmatrix/Kernel : Numpy matrix/int\n", " Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. 
/ Weisfeiler-Lehman kernel between the 2 graphs.\n",
"    \n",
"    Notes\n",
"    -----\n",
"    This function now supports the WL subtree kernel and the WL shortest path kernel.\n",
"    \n",
"    References\n",
"    ----------\n",
"    [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-Lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.\n",
"    \"\"\"\n",
"    if len(args) == 1: # for a list of graphs\n",
"#         print(args)\n",
"        start_time = time.time()\n",
"        \n",
"        # for WL subtree kernel\n",
"        if base_kernel == 'subtree':\n",
"            Kmatrix = _wl_subtreekernel_do(args[0], height = height, base_kernel = 'subtree')\n",
"        \n",
"        # for WL edge kernel\n",
"        elif base_kernel == 'edge':\n",
"            print('edge')\n",
"        \n",
"        # for WL shortest path kernel\n",
"        elif base_kernel == 'sp':\n",
"            Gn = args[0]\n",
"            Kmatrix = np.zeros((len(Gn), len(Gn)))\n",
"            for i in range(0, len(Gn)):\n",
"                for j in range(i, len(Gn)):\n",
"                    Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j])\n",
"                    Kmatrix[j][i] = Kmatrix[i][j]\n",
"\n",
"        print(\"\\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---\" % (base_kernel, len(args[0]), (time.time() - start_time)))\n",
"        \n",
"        return Kmatrix\n",
"    \n",
"    else: # for only 2 graphs\n",
"        \n",
"        start_time = time.time()\n",
"        \n",
"        # for WL subtree kernel\n",
"        if base_kernel == 'subtree':\n",
"            args = [args[0], args[1]]\n",
"#             print(args)\n",
"            kernel = _wl_subtreekernel_do(args, height = height, base_kernel = 'subtree')\n",
"        \n",
"        # for WL edge kernel\n",
"        elif base_kernel == 'edge':\n",
"            print('edge')\n",
"        \n",
"        # for WL shortest path kernel\n",
"        elif base_kernel == 'sp':\n",
"            # use the same pairwise WL-sp helper as the list branch above\n",
"            kernel = _weisfeilerlehmankernel_do(args[0], args[1])\n",
"\n",
"        print(\"\\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---\" % (base_kernel, time.time() - start_time))\n",
"        \n",
"        return kernel\n",
"    \n",
"    \n",
"def _weisfeilerlehmankernel_do(G1, G2):\n",
"    \"\"\"Calculate the Weisfeiler-Lehman kernel between 2 graphs. In each iteration this kernel applies a base kernel (here the path kernel) to the relabelled graphs.\n",
"    \n",
"    Parameters\n",
"    ----------\n",
"    G1, G2 : NetworkX graphs\n",
"        2 graphs between which the kernel is calculated.\n",
"        \n",
"    Return\n",
"    ------\n",
"    Kernel : int\n",
"        Weisfeiler-Lehman kernel between the 2 graphs.\n",
"    \"\"\"\n",
"    \n",
"    # init.\n",
"    kernel = 0 # init kernel\n",
"    num_nodes1 = G1.number_of_nodes()\n",
"    num_nodes2 = G2.number_of_nodes()\n",
"    height = 12 # min(num_nodes1, num_nodes2) #Q: how to determine the upper bound of the height?\n",
"    \n",
"    # the first iteration.\n",
"    labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
"    labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
"    kernel += pathkernel(G1, G2) # change your base kernel here (and one more below)\n",
"    \n",
"    for h in range(0, height):\n",
"#         if labelset1 != labelset2:\n",
"#             break\n",
"\n",
"        # Weisfeiler-Lehman test of graph isomorphism.\n",
"        relabel(G1)\n",
"        relabel(G2)\n",
"\n",
"        # calculate kernel\n",
"        kernel += pathkernel(G1, G2) # change your base kernel here (and one more before)\n",
"\n",
"        # get label sets of both graphs\n",
"        labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }\n",
"        labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }\n",
"    \n",
"    return kernel\n",
"\n",
"\n",
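"# The two functions above follow the generic WL scheme of [1]: writing G^(0) = G and\n",
"# G^(h) for the graph after h relabelling rounds, the kernel accumulates a base kernel\n",
"# over the relabelled graphs,\n",
"#     k_WL(G1, G2) = sum_{h = 0}^{H} k_base(G1^(h), G2^(h)),\n",
"# with pathkernel as k_base in _weisfeilerlehmankernel_do and a dot product of\n",
"# label-count vectors as k_base in the subtree variant below.\n",
"#\n",
"# Minimal usage sketch (a hypothetical helper, defined but not called so the recorded\n",
"# output of this cell stays unchanged): compare two tiny labelled graphs with both\n",
"# calling conventions; with the subtree base kernel both calls return a 2x2 matrix,\n",
"# as in the recorded output of this cell.\n",
"def _wl_usage_sketch():\n",
"    g1 = nx.Graph()\n",
"    g1.add_nodes_from([(0, {'label': 'C'}), (1, {'label': 'O'})])\n",
"    g1.add_edge(0, 1)\n",
"    g2 = nx.Graph()\n",
"    g2.add_nodes_from([(0, {'label': 'C'}), (1, {'label': 'C'}), (2, {'label': 'O'})])\n",
"    g2.add_edges_from([(0, 1), (1, 2)])\n",
"    k_pair = weisfeilerlehmankernel(g1, g2, height = 1)    # pairwise call\n",
"    K_list = weisfeilerlehmankernel([g1, g2], height = 1)  # list-of-graphs call\n",
"    return k_pair, K_list\n",
"\n",
"\n",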
"def relabel(G):\n",
"    '''\n",
"    Relabel nodes in graph G in one iteration of the 1-dim. WL test of graph isomorphism.\n",
"    \n",
"    Parameters\n",
"    ----------\n",
"    G : NetworkX graph\n",
"        The graph whose nodes are relabeled.\n",
"    '''\n",
"    \n",
"    # get the set of original labels\n",
"    labels_ori = list(nx.get_node_attributes(G, 'label').values())\n",
"    print(labels_ori)\n",
"    num_of_each_label = dict(Counter(labels_ori))\n",
"    print(num_of_each_label)\n",
"    num_of_labels = len(num_of_each_label)\n",
"    print(num_of_labels)\n",
"    \n",
"    set_multisets = []\n",
"    for node in G.nodes(data = True):\n",
"        # Multiset-label determination.\n",
"        multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]\n",
"        # sorting each multiset\n",
"        multiset.sort()\n",
"        multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the node's own label as prefix\n",
"        set_multisets.append(multiset)\n",
"    print(set_multisets)\n",
"    \n",
"    # label compression\n",
"#     set_multisets.sort() # this is unnecessary\n",
"    set_unique = list(set(set_multisets)) # set of unique multiset labels\n",
"    print(set_unique)\n",
"    set_compressed = { value : str(set_unique.index(value) + num_of_labels + 1) for value in set_unique } # assign new labels\n",
"    print(set_compressed)\n",
"    \n",
"    # relabel nodes\n",
"#     nx.relabel_nodes(G, set_compressed, copy = False)\n",
"    for node in G.nodes(data = True):\n",
"        node[1]['label'] = set_compressed[set_multisets[node[0]]]\n",
"    print(nx.get_node_attributes(G, 'label'))\n",
"\n",
"    # get the set of compressed labels\n",
"    labels_comp = list(nx.get_node_attributes(G, 'label').values())\n",
"    print(labels_comp)\n",
"    num_of_each_label.update(dict(Counter(labels_comp)))\n",
"    print(num_of_each_label)\n",
"\n",
"\n",
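"# Worked example of one relabelling round, read off the recorded output above\n",
"# (graph 0, height = 1): a node labelled 'C' whose two neighbours are both labelled\n",
"# 'O' gets the multiset-label 'COO', and the unique multiset-labels\n",
"# {'OCC', 'COO', 'CCO', 'CC'} are compressed to the new labels '4'..'7'.\n",
"#\n",
"# Tiny demo (a hypothetical helper, not called here): relabel a labelled path C-C-O\n",
"# once; the multiset-labels 'CC', 'CCO', 'OC' should be compressed to '3'..'5'\n",
"# (their order depends on set iteration order).\n",
"def _relabel_toy_demo():\n",
"    G = nx.Graph()\n",
"    G.add_nodes_from([(0, {'label': 'C'}), (1, {'label': 'C'}), (2, {'label': 'O'})])\n",
"    G.add_edges_from([(0, 1), (1, 2)])\n",
"    relabel(G)\n",
"    return nx.get_node_attributes(G, 'label')\n",
"\n",
"\n",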
"def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'):\n",
"    \"\"\"Calculate Weisfeiler-Lehman subtree kernels between graphs.\n",
"    \n",
"    Parameters\n",
"    ----------\n",
"    Gn : List of NetworkX graph\n",
"        List of graphs between which the kernels are calculated.\n",
"        \n",
"    Return\n",
"    ------\n",
"    Kmatrix : Numpy matrix\n",
"        Kernel matrix, each element of which is the Weisfeiler-Lehman subtree kernel between 2 graphs.\n",
"    \"\"\"\n",
"    \n",
"#     print(args)\n",
"    Gn = args[0]\n",
"#     print(Gn)\n",
"\n",
"    Kmatrix = np.zeros((len(Gn), len(Gn)))\n",
"    all_num_of_labels_occured = 0 # total number of distinct labels that have occurred as node labels in any graph so far\n",
"    \n",
"    # initial for height = 0\n",
"    print('\\n --- height = 0 --- ')\n",
"    all_labels_ori = set() # all unique original labels in all graphs in this iteration\n",
"    all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration\n",
"    all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration\n",
"    num_of_labels_occured = all_num_of_labels_occured # number of distinct labels that have occurred so far\n",
"\n",
"    # for each graph\n",
"    for idx, G in enumerate(Gn):\n",
"        # get the set of original labels\n",
"        print('\\n --- for graph %d --- \\n' % (idx))\n",
"        labels_ori = list(nx.get_node_attributes(G, 'label').values())\n",
"        print('labels_ori: %s' % (labels_ori))\n",
"        all_labels_ori.update(labels_ori)\n",
"        print('all_labels_ori: %s' % (all_labels_ori))\n",
"        num_of_each_label = dict(Counter(labels_ori)) # number of occurrences of each label in this graph\n",
"        print('num_of_each_label: %s' % (num_of_each_label))\n",
"        all_num_of_each_label.append(num_of_each_label)\n",
"        print('all_num_of_each_label: %s' % (all_num_of_each_label))\n",
"        num_of_labels = len(num_of_each_label) # number of all unique labels\n",
"        print('num_of_labels: %s' % (num_of_labels))\n",
"        \n",
"        all_labels_ori.update(labels_ori)\n",
"        print('all_labels_ori: %s' % (all_labels_ori))\n",
"    \n",
"    all_num_of_labels_occured += len(all_labels_ori)\n",
"    print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n",
"    \n",
"    # calculate subtree kernel with the 0th iteration and add it to the final kernel\n",
"    print('\\n --- calculating kernel matrix ---')\n",
"    for i in range(0, len(Gn)):\n",
"        for j in range(i, len(Gn)):\n",
"            labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n",
"            print('\\n labels: %s' % (labels))\n",
"            vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n",
"            vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n",
"            print('vector1: %s' % (vector1))\n",
"            print('vector2: %s' % (vector2))\n",
"            Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n",
"            Kmatrix[j][i] = Kmatrix[i][j]\n",
"    print('Kmatrix: %s' % (Kmatrix))\n",
"\n",
"    # iterate each height\n",
"    for h in range(1, height + 1):\n",
"        print('\\n --- height = %d --- ' % (h))\n",
"        all_set_compressed = {} # a dictionary mapping multiset labels to new ones in all graphs in this iteration\n",
"        num_of_labels_occured = all_num_of_labels_occured # number of distinct labels that have occurred so far\n",
"        all_labels_ori = set()\n",
"        all_num_of_each_label = []\n",
"        \n",
"        # for each graph\n",
"        for idx, G in enumerate(Gn):\n",
"#             # get the set of original labels\n",
"            print('\\n --- for graph %d --- \\n' % (idx))\n",
"#             labels_ori = list(nx.get_node_attributes(G, 'label').values())\n",
"#             print('labels_ori: %s' % (labels_ori))\n",
"#             num_of_each_label = dict(Counter(labels_ori)) # number of occurrences of each label in this graph\n",
"#             print('num_of_each_label: %s' % (num_of_each_label))\n",
"#             num_of_labels = len(num_of_each_label) # number of all unique labels\n",
"#             print('num_of_labels: %s' % (num_of_labels))\n",
"            \n",
"#             all_labels_ori.update(labels_ori)\n",
"#             print('all_labels_ori: %s' % (all_labels_ori))\n",
"# #             num_of_labels_occured += num_of_labels #@todo not precise\n",
"#             num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)\n",
"#             print('num_of_labels_occured: %s' % (num_of_labels_occured))\n",
"            \n",
"            set_multisets = []\n",
"            for node in G.nodes(data = True):\n",
"                # Multiset-label determination.\n",
"                multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]\n",
"                # sorting each multiset\n",
"                multiset.sort()\n",
"                multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the node's own label as prefix\n",
"                set_multisets.append(multiset)\n",
"            print('multiset: %s' % (set_multisets))\n",
"\n",
"            # label compression\n",
"            # set_multisets.sort() # this is unnecessary\n",
"            set_unique = list(set(set_multisets)) # set of unique multiset labels\n",
"            print('set_unique: %s' % (set_unique))\n",
"            # a dictionary mapping original labels to new ones\n",
"            set_compressed = {}\n",
"            # if a label occurred before, assign its former compressed label; else assign (number of labels occurred so far + 1) as the compressed label\n",
"            for value in set_unique:\n",
"                if value in all_set_compressed.keys():\n",
"                    set_compressed.update({ value : all_set_compressed[value] })\n",
"                else:\n",
"                    set_compressed.update({ value : str(num_of_labels_occured + 1) })\n",
"                    num_of_labels_occured += 1\n",
"#             set_compressed = { value : (all_set_compressed[value] if value in all_set_compressed.keys() else str(set_unique.index(value) + num_of_labels_occured + 1)) for value in set_unique }\n",
"            print('set_compressed: %s' % (set_compressed))\n",
"            \n",
"            all_set_compressed.update(set_compressed)\n",
"            print('all_set_compressed: %s' % (all_set_compressed))\n",
"#             num_of_labels_occured += len(set_compressed) #@todo not precise\n",
"            print('num_of_labels_occured: %s' % (num_of_labels_occured))\n",
"            \n",
"            # relabel nodes\n",
"            # nx.relabel_nodes(G, set_compressed, copy = False)\n",
"            for node in G.nodes(data = True):\n",
"                node[1]['label'] = set_compressed[set_multisets[node[0]]]\n",
"            print('\\n compressed labels: %s' % (nx.get_node_attributes(G, 'label')))\n",
"\n",
"            # get the set of compressed labels\n",
"            labels_comp = list(nx.get_node_attributes(G, 'label').values())\n",
"            print('labels_comp: %s' % (labels_comp))\n",
"            all_labels_ori.update(labels_comp)\n",
"            print('all_labels_ori: %s' % (all_labels_ori))\n",
"            num_of_each_label = dict(Counter(labels_comp))\n",
"            print('num_of_each_label: %s' % (num_of_each_label))\n",
"            all_num_of_each_label.append(num_of_each_label)\n",
"            print('all_num_of_each_label: %s' % (all_num_of_each_label))\n",
"        \n",
"        all_num_of_labels_occured += len(all_labels_ori)\n",
"        print('\\n all_num_of_labels_occured: %s' % (all_num_of_labels_occured))\n",
"        \n",
"        # calculate subtree kernel with h iterations and add it to the final kernel\n",
"        print('\\n --- calculating kernel matrix ---')\n",
"        for i in range(0, len(Gn)):\n",
"            for j in range(i, len(Gn)):\n",
"                labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))\n",
"                print('\\n labels: %s' % (labels))\n",
"                vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])\n",
"                vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])\n",
"                print('vector1: %s' % (vector1))\n",
"                print('vector2: %s' % (vector2))\n",
"                Kmatrix[i][j] += np.dot(vector1, vector2.transpose())\n",
"                Kmatrix[j][i] = Kmatrix[i][j]\n",
"        \n",
"        print('\\n Kmatrix: %s' % (Kmatrix))\n",
"\n",
"    return Kmatrix\n",
"\n",
"\n",
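"# Sanity check against the recorded output of this cell: each height contributes the\n",
"# dot product of the two graphs' label-count vectors, so for the pair tested below\n",
"#     Kmatrix[0][0] = 29 + 13 + 13 = 55,   Kmatrix[0][1] = 30 + 6 + 0 = 36,\n",
"#     Kmatrix[1][1] = 40 + 18 + 14 = 72,\n",
"# where e.g. 30 = 5 * 6 comes from the shared label 'C' at height 0 (5 C nodes in\n",
"# graph 0 vs 6 C nodes in graph 1, read off the height-1 multisets above).\n",
"#\n",
"# The same per-iteration contribution as a plain Counter dot product -- a hypothetical\n",
"# helper, not used by the code above:\n",
"def _label_count_dot(counts1, counts2):\n",
"    # sum over the labels the two graphs share, weighted by their counts\n",
"    return sum(c * counts2[lbl] for lbl, c in counts1.items() if lbl in counts2)\n",
"\n",
"# e.g. _label_count_dot(Counter('CCCCCOO'), Counter('CCCCCCSS')) == 30\n",
"\n",
"\n",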
"# main\n",
"import sys\n",
"from collections import Counter\n",
"import networkx as nx\n",
"sys.path.insert(0, \"../\")\n",
"from pygraph.utils.graphfiles import loadDataset\n",
"from pygraph.kernels.spkernel import spkernel\n",
"\n",
"dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
"G1 = dataset[15]\n",
"print(nx.get_node_attributes(G1, 'label'))\n",
"G2 = dataset[80]\n",
"print(nx.get_node_attributes(G2, 'label'))\n",
"\n",
"weisfeilerlehmankernel(G1, G2, height = 2)\n",
"# Kmatrix = weisfeilerlehmankernel(G1, G2)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }