You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or number, can include dashes ('-') and can be up to 35 characters long.

run_spkernel.ipynb 50 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": null,
  6. "metadata": {
  7. "scrolled": false
  8. },
  9. "outputs": [
  10. {
  11. "name": "stdout",
  12. "output_type": "stream",
  13. "text": [
  14. "\n",
  15. "MAO\n",
  16. "\n",
  17. "--- This is a classification problem ---\n",
  18. "\n",
  19. "\n",
  20. "1. Loading dataset from file...\n",
  21. "\n",
  22. "2. Calculating gram matrices. This could take a while...\n",
  23. "\n",
  24. " None edge weight specified. Set all weight to 1.\n",
  25. "\n",
  26. "getting sp graphs: 68it [00:00, 692.11it/s]\n",
  27. "calculating kernels: 2346it [00:05, 399.28it/s]\n",
  28. "\n",
  29. " --- shortest path kernel matrix of size 68 built in 6.345669507980347 seconds ---\n",
  30. "\n",
  31. "the gram matrix with parameters {'node_kernels': {'symb': <function deltakernel at 0x7fe240afd620>, 'nsymb': <function gaussiankernel at 0x7fe240afd9d8>, 'mix': functools.partial(<function kernelproduct at 0x7fe240aaf0d0>, <function deltakernel at 0x7fe240afd620>, <function gaussiankernel at 0x7fe240afd9d8>)}, 'n_jobs': 8} is: \n",
  32. "\n",
  33. "1 gram matrices are calculated, 0 of which are ignored.\n",
  34. "\n",
  35. "3. Fitting and predicting using nested cross validation. This could really take a while...\n",
  36. "cross validation: 7it [00:09, 4.67s/it]"
  37. ]
  38. }
  39. ],
  40. "source": [
  41. "import functools\n",
  42. "from libs import *\n",
  43. "import multiprocessing\n",
  44. "\n",
  45. "from pygraph.kernels.spKernel import spkernel\n",
  46. "from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct\n",
  47. "#from pygraph.utils.model_selection_precomputed import trial_do\n",
  48. "\n",
  49. "dslist = [\n",
  50. "# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
  51. "# 'task': 'regression'}, # node symb\n",
  52. "# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
  53. "# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
  54. "# # contains single node graph, node symb\n",
  55. " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
  56. "# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
  57. "# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
  58. "# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
  59. "# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
  60. "# # node nsymb\n",
  61. "# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
  62. "# # node symb/nsymb\n",
  63. "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
  64. " # node/edge symb\n",
  65. "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
  66. "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
  67. "\n",
  68. " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
  69. " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
  70. " # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
  71. " # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
  72. " #\n",
  73. " # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
  74. " # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
  75. " # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
  76. " # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
  77. " # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
  78. "\n",
  79. " # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
  80. " # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
  81. " # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
  82. " # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
  83. " # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
  84. " # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
  85. " # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
  86. " # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
  87. " # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
  88. "\n",
  89. " # # not working below\n",
  90. " # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
  91. " # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
  92. " # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
  93. " # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
  94. "]\n",
  95. "estimator = spkernel\n",
  96. "mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)\n",
  97. "param_grid_precomputed = {'node_kernels': [\n",
  98. " {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}\n",
  99. "param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},\n",
  100. " {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n",
  101. "\n",
  102. "for ds in dslist:\n",
  103. " print()\n",
  104. " print(ds['name'])\n",
  105. " model_selection_for_precomputed_kernel(\n",
  106. " ds['dataset'],\n",
  107. " estimator,\n",
  108. " param_grid_precomputed,\n",
  109. " (param_grid[1] if ('task' in ds and ds['task']\n",
  110. " == 'regression') else param_grid[0]),\n",
  111. " (ds['task'] if 'task' in ds else 'classification'),\n",
  112. " NUM_TRIALS=30,\n",
  113. " datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
  114. " extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n",
  115. " ds_name=ds['name'],\n",
  116. " n_jobs=multiprocessing.cpu_count(),\n",
  117. " read_gm_from_file=False)\n",
  118. " print()\n"
  119. ]
  120. },
  121. {
  122. "cell_type": "code",
  123. "execution_count": 1,
  124. "metadata": {},
  125. "outputs": [
  126. {
  127. "name": "stderr",
  128. "output_type": "stream",
  129. "text": [
  130. "[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.\n",
  131. "[Parallel(n_jobs=8)]: Done 2 out of 9 | elapsed: 15.7s remaining: 54.8s\n",
  132. "[Parallel(n_jobs=8)]: Done 3 out of 9 | elapsed: 15.7s remaining: 31.3s\n",
  133. "[Parallel(n_jobs=8)]: Done 4 out of 9 | elapsed: 15.7s remaining: 19.6s\n",
  134. "[Parallel(n_jobs=8)]: Done 5 out of 9 | elapsed: 15.7s remaining: 12.5s\n",
  135. "[Parallel(n_jobs=8)]: Done 6 out of 9 | elapsed: 15.7s remaining: 7.8s\n",
  136. "[Parallel(n_jobs=8)]: Done 7 out of 9 | elapsed: 15.7s remaining: 4.5s\n",
  137. "[Parallel(n_jobs=8)]: Done 9 out of 9 | elapsed: 15.7s remaining: 0.0s\n"
  138. ]
  139. },
  140. {
  141. "ename": "KeyboardInterrupt",
  142. "evalue": "",
  143. "output_type": "error",
  144. "traceback": [
  145. "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
  146. "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
  147. "\u001b[0;32m<ipython-input-1-ba0f5fe728f1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 83\u001b[0;31m \u001b[0mParallel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_cores\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelayed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompute_ds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mds\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdslist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
  148. "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 960\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 961\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieval_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 962\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 963\u001b[0m \u001b[0;31m# Make sure that we get a last message telling us we are done\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0melapsed_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_start_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  149. "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mretrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 864\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'supports_timeout'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 865\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 866\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 867\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  150. "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mwrap_future_result\u001b[0;34m(future, timeout)\u001b[0m\n\u001b[1;32m 513\u001b[0m AsyncResults.get from multiprocessing.\"\"\"\n\u001b[1;32m 514\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 515\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 516\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mLokyTimeoutError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  151. "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/joblib/externals/loky/_base.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 424\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__get_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 426\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_condition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 427\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_state\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mCANCELLED\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCANCELLED_AND_NOTIFIED\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  152. "\u001b[0;32m/usr/lib/python3.5/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 293\u001b[0;31m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 294\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  153. "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
  154. ]
  155. }
  156. ],
  157. "source": [
  158. "# # test parallel computing\n",
  159. "# import psutil\n",
  160. "# # logical=True counts threads, but we are interested in cores\n",
  161. "# psutil.()# .cpu_count(logical=False)\n",
  162. "%load_ext line_profiler\n",
  163. "%matplotlib inline\n",
  164. "import functools\n",
  165. "from libs import *\n",
  166. "from sklearn.metrics.pairwise import rbf_kernel\n",
  167. "from joblib import Parallel, delayed\n",
  168. "import multiprocessing\n",
  169. "\n",
  170. "from pygraph.kernels.spKernel import spkernel\n",
  171. "from pygraph.utils.kernels import deltakernel, kernelsum\n",
  172. "\n",
  173. "num_cores = multiprocessing.cpu_count()\n",
  174. "\n",
  175. "dslist = [ \n",
  176. " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb\n",
  177. "# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
  178. " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled\n",
  179. " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb\n",
  180. " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
  181. " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
  182. " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', \n",
  183. " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb\n",
  184. "# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
  185. "# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
  186. " {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb\n",
  187. " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb\n",
  188. "# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
  189. " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
  190. "# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
  191. "# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
  192. "# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
  193. "# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
  194. "# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
  195. "\n",
  196. "# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
  197. "# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
  198. " {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
  199. " 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
  200. "# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
  201. "# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
  202. "# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
  203. "# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
  204. "# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
  205. "# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
  206. "# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
  207. " \n",
  208. "# # not working below\n",
  209. "# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
  210. "# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
  211. "# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
  212. "# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
  213. "]\n",
  214. "estimator = spkernel\n",
  215. "mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)\n",
  216. "param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}\n",
  217. "param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, \n",
  218. " {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]\n",
  219. " \n",
  220. "def compute_ds(ds):\n",
  221. " print()\n",
  222. " print(ds['name'])\n",
  223. " model_selection_for_precomputed_kernel(\n",
  224. " ds['dataset'], estimator, param_grid_precomputed, \n",
  225. " (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \n",
  226. " (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,\n",
  227. " datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
  228. " extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n",
  229. " ds_name=ds['name'])\n",
  230. " \n",
  231. "# %lprun -f spkernel \\\n",
  232. "# model_selection_for_precomputed_kernel( \\\n",
  233. "# ds['dataset'], estimator, param_grid_precomputed, \\\n",
  234. "# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \\\n",
  235. "# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \\\n",
  236. "# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \\\n",
  237. "# extra_params=(ds['extra_params'] if 'extra_params' in ds else None))\n",
  238. " print()\n",
  239. " \n",
  240. "Parallel(n_jobs=num_cores, verbose=10)(delayed(compute_ds)(ds) for ds in dslist)"
  241. ]
  242. },
  243. {
  244. "cell_type": "code",
  245. "execution_count": 1,
  246. "metadata": {},
  247. "outputs": [
  248. {
  249. "name": "stdout",
  250. "output_type": "stream",
  251. "text": [
  252. "\n",
  253. "--- This is a regression problem ---\n",
  254. "\n",
  255. "\n",
  256. "1. Loading dataset from file...\n",
  257. "\n",
  258. "2. Calculating gram matrices. This could take a while...\n",
  259. "\n",
  260. " None edge weight specified. Set all weight to 1.\n",
  261. "\n"
  262. ]
  263. },
  264. {
  265. "ename": "TypeError",
  266. "evalue": "'NoneType' object is not subscriptable",
  267. "output_type": "error",
  268. "traceback": [
  269. "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
  270. "\u001b[0;31mRemoteTraceback\u001b[0m Traceback (most recent call last)",
  271. "\u001b[0;31mRemoteTraceback\u001b[0m: \n\"\"\"\nTraceback (most recent call last):\n File \"/usr/lib/python3.5/multiprocessing/pool.py\", line 119, in worker\n result = (True, func(*args, **kwds))\n File \"/usr/lib/python3.5/multiprocessing/pool.py\", line 44, in mapstar\n return list(map(*args))\n File \"../pygraph/kernels/spKernel.py\", line 359, in spkernel_do\n kn = node_kernels['symb']\nTypeError: 'NoneType' object is not subscriptable\n\"\"\"",
  272. "\nThe above exception was the direct cause of the following exception:\n",
  273. "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
  274. "\u001b[0;32m<ipython-input-1-b5a6e5aa5a44>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, \n\u001b[0;32m---> 15\u001b[0;31m 'regression', NUM_TRIALS=30)\n\u001b[0m",
  275. "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/utils/model_selection_precomputed.py\u001b[0m in \u001b[0;36mmodel_selection_for_precomputed_kernel\u001b[0;34m(datafile, estimator, param_grid_precomputed, param_grid, model_type, NUM_TRIALS, datafile_y, extra_params, ds_name, n_jobs)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams_out\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam_list_precomputed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mparams_out\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'n_jobs'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 122\u001b[0;31m \u001b[0mrtn_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mparams_out\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 123\u001b[0m \u001b[0mKmatrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrtn_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0mcurrent_run_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrtn_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  276. "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py\u001b[0m in \u001b[0;36mspkernel\u001b[0;34m(node_label, edge_weight, node_kernels, n_jobs, *args)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0mdo_partial\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpartial\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspkernel_do\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mds_attrs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnode_label\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnode_kernels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0mitr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcombinations_with_replacement\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 99\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpool\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdo_partial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mitr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtotal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 100\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  277. "\u001b[0;32m/usr/lib/python3.5/multiprocessing/pool.py\u001b[0m in \u001b[0;36mmap\u001b[0;34m(self, func, iterable, chunksize)\u001b[0m\n\u001b[1;32m 258\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mthat\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mreturned\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 259\u001b[0m '''\n\u001b[0;32m--> 260\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_map_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmapstar\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 261\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstarmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  278. "\u001b[0;32m/usr/lib/python3.5/multiprocessing/pool.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 606\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 607\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 608\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_set\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  279. "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable"
  280. ]
  281. },
  282. {
  283. "name": "stderr",
  284. "output_type": "stream",
  285. "text": [
  286. "Process ForkPoolWorker-1:\n",
  287. "Traceback (most recent call last):\n",
  288. " File \"/usr/lib/python3.5/multiprocessing/process.py\", line 249, in _bootstrap\n",
  289. " self.run()\n",
  290. " File \"/usr/lib/python3.5/multiprocessing/process.py\", line 93, in run\n",
  291. " self._target(*self._args, **self._kwargs)\n",
  292. " File \"/usr/lib/python3.5/multiprocessing/pool.py\", line 108, in worker\n",
  293. " task = get()\n",
  294. " File \"/usr/lib/python3.5/multiprocessing/queues.py\", line 343, in get\n",
  295. " res = self._reader.recv_bytes()\n",
  296. " File \"/usr/lib/python3.5/multiprocessing/connection.py\", line 216, in recv_bytes\n",
  297. " buf = self._recv_bytes(maxlength)\n",
  298. " File \"/usr/lib/python3.5/multiprocessing/connection.py\", line 407, in _recv_bytes\n",
  299. " buf = self._recv(4)\n",
  300. " File \"/usr/lib/python3.5/multiprocessing/connection.py\", line 379, in _recv\n",
  301. " chunk = read(handle, remaining)\n",
  302. "KeyboardInterrupt\n"
  303. ]
  304. }
  305. ],
  306. "source": [
  307. "%load_ext line_profiler\n",
  308. "%matplotlib inline\n",
  309. "import numpy as np\n",
  310. "import sys\n",
  311. "sys.path.insert(0, \"../\")\n",
  312. "from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel\n",
  313. "from pygraph.kernels.spKernel import spkernel\n",
  314. "\n",
  315. "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n",
  316. "estimator = spkernel\n",
  317. "param_grid_precomputed = {}\n",
  318. "param_grid = {'alpha': np.logspace(-1, 1, num = 41, base = 10)}\n",
  319. "\n",
  320. "model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, \n",
  321. " 'regression', NUM_TRIALS=30)"
  322. ]
  323. },
  324. {
  325. "cell_type": "code",
  326. "execution_count": 1,
  327. "metadata": {},
  328. "outputs": [
  329. {
  330. "name": "stdout",
  331. "output_type": "stream",
  332. "text": [
  333. "\n",
  334. " --- This is a regression problem ---\n",
  335. "\n",
  336. "\n",
  337. " Loading dataset from file...\n",
  338. "\n",
  339. " Calculating kernel matrix, this could take a while...\n",
  340. "--- shortest path kernel matrix of size 185 built in 13.3865065574646 seconds ---\n",
  341. "[[ 3. 1. 3. ... 1. 1. 1.]\n",
  342. " [ 1. 6. 1. ... 0. 0. 3.]\n",
  343. " [ 3. 1. 3. ... 1. 1. 1.]\n",
  344. " ...\n",
  345. " [ 1. 0. 1. ... 55. 21. 7.]\n",
  346. " [ 1. 0. 1. ... 21. 55. 7.]\n",
  347. " [ 1. 3. 1. ... 7. 7. 55.]]\n",
  348. "\n",
  349. " Starting calculate accuracy/rmse...\n",
  350. "calculate performance: 94%|█████████▎| 936/1000 [00:01<00:00, 757.54it/s]\n",
  351. " Mean performance on train set: 28.360361\n",
  352. "With standard deviation: 1.357183\n",
  353. "\n",
  354. " Mean performance on test set: 35.191954\n",
  355. "With standard deviation: 4.495767\n",
  356. "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 771.22it/s]\n",
  357. "\n",
  358. "\n",
  359. " rmse_test std_test rmse_train std_train k_time\n",
  360. "----------- ---------- ------------ ----------- --------\n",
  361. " 35.192 4.49577 28.3604 1.35718 13.3865\n"
  362. ]
  363. }
  364. ],
  365. "source": [
  366. "%load_ext line_profiler\n",
  367. "\n",
  368. "import sys\n",
  369. "sys.path.insert(0, \"../\")\n",
  370. "from pygraph.utils.utils import kernel_train_test\n",
  371. "from pygraph.kernels.spKernel import spkernel\n",
  372. "\n",
  373. "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n",
  374. "kernel_file_path = 'kernelmatrices_path_acyclic/'\n",
  375. "\n",
  376. "kernel_para = dict(edge_weight = 'atom')\n",
  377. "\n",
  378. "kernel_train_test(datafile, kernel_file_path, spkernel, kernel_para, normalize = False)\n",
  379. "\n",
  380. "# %lprun -f spkernel \\\n",
  381. "# kernel_train_test(datafile, kernel_file_path, spkernel, kernel_para, normalize = False)"
  382. ]
  383. },
  384. {
  385. "cell_type": "code",
  386. "execution_count": null,
  387. "metadata": {},
  388. "outputs": [],
  389. "source": [
  390. "# results\n",
  391. "\n",
  392. "# with y normalization\n",
  393. "# RMSE_test std_test RMSE_train std_train k_time\n",
  394. "# ----------- ---------- ------------ ----------- --------\n",
  395. "# 35.6337 5.23183 32.3805 3.92531 14.9301\n",
  396. "\n",
  397. "# without y normalization\n",
  398. "# RMSE_test std_test RMSE_train std_train k_time\n",
  399. "# ----------- ---------- ------------ ----------- --------\n",
  400. "# 35.192 4.49577 28.3604 1.35718 14.5768"
  401. ]
  402. },
  403. {
  404. "cell_type": "code",
  405. "execution_count": 5,
  406. "metadata": {
  407. "scrolled": false
  408. },
  409. "outputs": [
  410. {
  411. "name": "stdout",
  412. "output_type": "stream",
  413. "text": [
  414. "\n",
  415. "- This script take as input a kernel matrix\n",
  416. "and returns the classification or regression performance\n",
  417. "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
  418. "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
  419. "- For prediction we divide the data in training, validation and test. For each split, we first train on the train data, \n",
  420. "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
  421. "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
  422. "correspond to the average of the performances on the test sets. \n",
  423. "\n",
  424. "@references\n",
  425. " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
  426. "\n",
  427. "\n",
  428. " Loading dataset from file...\n",
  429. "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
  430. " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
  431. " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
  432. " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
  433. " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
  434. " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
  435. " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
  436. " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
  437. " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
  438. " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
  439. " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
  440. " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
  441. " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
  442. " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
  443. " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
  444. " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
  445. " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
  446. " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
  447. "\n",
  448. " Loading the matrix from file...\n",
  449. "[[ 3. 1. 3. ..., 1. 1. 1.]\n",
  450. " [ 1. 6. 1. ..., 0. 0. 3.]\n",
  451. " [ 3. 1. 3. ..., 1. 1. 1.]\n",
  452. " ..., \n",
  453. " [ 1. 0. 1. ..., 55. 21. 7.]\n",
  454. " [ 1. 0. 1. ..., 21. 55. 7.]\n",
  455. " [ 1. 3. 1. ..., 7. 7. 55.]]\n",
  456. "\n",
  457. " --- This is a regression problem ---\n",
  458. "\n",
  459. " Starting split 10...\n",
  460. "\n",
  461. " Normalizing output y...\n",
  462. "The best performance is for trial 12 with parameter alpha = 100.000000\n",
  463. "The best performance on the validation set is: 40.422382\n",
  464. "The corresponding performance on test set is: 47.424532\n",
  465. "\n",
  466. " Starting split 11...\n",
  467. "\n",
  468. " Normalizing output y...\n",
  469. "The best performance is for trial 12 with parameter alpha = 100.000000\n",
  470. "The best performance on the validation set is: 33.084913\n",
  471. "The corresponding performance on test set is: 35.493699\n",
  472. "\n",
  473. " Starting split 12...\n",
  474. "\n",
  475. " Normalizing output y...\n",
  476. "The best performance is for trial 12 with parameter alpha = 100.000000\n",
  477. "The best performance on the validation set is: 31.306710\n",
  478. "The corresponding performance on test set is: 33.173366\n",
  479. "\n",
  480. " Starting split 13...\n",
  481. "\n",
  482. " Normalizing output y...\n",
  483. "The best performance is for trial 12 with parameter alpha = 100.000000\n",
  484. "The best performance on the validation set is: 43.500424\n",
  485. "The corresponding performance on test set is: 32.633129\n",
  486. "\n",
  487. " Starting split 14...\n",
  488. "\n",
  489. " Normalizing output y...\n",
  490. "The best performance is for trial 10 with parameter alpha = 1.000000\n",
  491. "The best performance on the validation set is: 53.561752\n",
  492. "The corresponding performance on test set is: 42.883548\n",
  493. "\n",
  494. " Starting split 15...\n",
  495. "\n",
  496. " Normalizing output y...\n",
  497. "The best performance is for trial 12 with parameter alpha = 100.000000\n",
  498. "The best performance on the validation set is: 40.444773\n",
  499. "The corresponding performance on test set is: 32.713040\n",
  500. "\n",
  501. " Starting split 16...\n",
  502. "\n",
  503. " Normalizing output y...\n",
  504. "The best performance is for trial 11 with parameter alpha = 10.000000\n",
  505. "The best performance on the validation set is: 37.046818\n",
  506. "The corresponding performance on test set is: 37.337851\n",
  507. "\n",
  508. " Starting split 17...\n",
  509. "\n",
  510. " Normalizing output y...\n",
  511. "The best performance is for trial 12 with parameter alpha = 100.000000\n",
  512. "The best performance on the validation set is: 39.907628\n",
  513. "The corresponding performance on test set is: 38.889064\n",
  514. "\n",
  515. " Starting split 18...\n",
  516. "\n",
  517. " Normalizing output y...\n",
  518. "The best performance is for trial 12 with parameter alpha = 100.000000\n",
  519. "The best performance on the validation set is: 29.879950\n",
  520. "The corresponding performance on test set is: 27.652558\n",
  521. "\n",
  522. " Starting split 19...\n",
  523. "\n",
  524. " Normalizing output y...\n",
  525. "The best performance is for trial 11 with parameter alpha = 10.000000\n",
  526. "The best performance on the validation set is: 44.911892\n",
  527. "The corresponding performance on test set is: 35.804454\n",
  528. "\n",
  529. " Mean performance on val set: 39.406724\n",
  530. "With standard deviation: 6.720820\n",
  531. "\n",
  532. " Mean performance on test set: 36.400524\n",
  533. "With standard deviation: 5.352940\n"
  534. ]
  535. }
  536. ],
  537. "source": [
  538. "# Author: Elisabetta Ghisu\n",
  539. "\n",
  540. "\"\"\"\n",
  541. "- This script take as input a kernel matrix\n",
  542. "and returns the classification or regression performance\n",
  543. "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
  544. "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
  545. "- For prediction we divide the data in training, validation and test. For each split, we first train on the train data, \n",
  546. "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
  547. "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
  548. "correspond to the average of the performances on the test sets. \n",
  549. "\n",
  550. "@references\n",
  551. " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
  552. "\"\"\"\n",
  553. "\n",
  554. "print(__doc__)\n",
  555. "\n",
  556. "import sys\n",
  557. "import pathlib\n",
  558. "sys.path.insert(0, \"../\")\n",
  559. "from tabulate import tabulate\n",
  560. "\n",
  561. "import random\n",
  562. "import numpy as np\n",
  563. "import matplotlib.pyplot as plt\n",
  564. "\n",
  565. "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
  566. "from sklearn.metrics import accuracy_score, mean_squared_error\n",
  567. "from sklearn import svm\n",
  568. "\n",
  569. "from pygraph.kernels.spkernel import spkernel\n",
  570. "from pygraph.utils.graphfiles import loadDataset\n",
  571. "\n",
  572. "print('\\n Loading dataset from file...')\n",
  573. "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
  574. "y = np.array(y)\n",
  575. "print(y)\n",
  576. "\n",
  577. "kernel_file_path = 'kernelmatrix.ds'\n",
  578. "path = pathlib.Path(kernel_file_path)\n",
  579. "if path.is_file():\n",
  580. " print('\\n Loading the matrix from file...')\n",
  581. " Kmatrix = np.loadtxt(kernel_file_path)\n",
  582. " print(Kmatrix)\n",
  583. "else:\n",
  584. " print('\\n Calculating kernel matrix, this could take a while...')\n",
  585. " #@Q: is it appropriate to use bond type between atoms as the edge weight to calculate shortest path????????\n",
  586. " Kmatrix, run_time = spkernel(dataset, edge_weight = 'bond_type')\n",
  587. " print(Kmatrix)\n",
  588. " print('Saving kernel matrix to file...')\n",
  589. " np.savetxt(kernel_file_path, Kmatrix)\n",
  590. "\n",
  591. "# setup the parameters\n",
  592. "model_type = 'regression' # Regression or classification problem\n",
  593. "print('\\n --- This is a %s problem ---' % model_type)\n",
  594. "\n",
  595. "datasize = len(dataset)\n",
  596. "trials = 21 # Trials for hyperparameters random search\n",
  597. "splits = 10 # Number of splits of the data\n",
  598. "alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
  599. "C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
  600. "random.seed(20) # Set the seed for uniform parameter distribution\n",
  601. "\n",
  602. "\n",
  603. "\"\"\"\n",
  604. "- Here starts the main program\n",
  605. "- First we permute the data, then for each split we evaluate corresponding performances\n",
  606. "- In the end, the performances are averaged over the test sets\n",
  607. "\"\"\"\n",
  608. "\n",
  609. "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
  610. "val_split = []\n",
  611. "test_split = []\n",
  612. "\n",
  613. "# For each split of the data\n",
  614. "for j in range(10, 10 + splits):\n",
  615. " print('\\n Starting split %d...' % j)\n",
  616. "\n",
  617. " # Set the random set for data permutation\n",
  618. " random_state = int(j)\n",
  619. " np.random.seed(random_state)\n",
  620. " idx_perm = np.random.permutation(datasize)\n",
  621. "# print(idx_perm)\n",
  622. " \n",
  623. " # Permute the data\n",
  624. " y_perm = y[idx_perm] # targets permutation\n",
  625. "# print(y_perm)\n",
  626. " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
  627. "# print(Kmatrix_perm)\n",
  628. " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
  629. " \n",
  630. " # Set the training, validation and test\n",
  631. " # Note: the percentage can be set up by the user\n",
  632. " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
  633. " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
  634. " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
  635. " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
  636. " \n",
  637. " # Split the kernel matrix\n",
  638. " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
  639. " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
  640. " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
  641. "\n",
  642. " # Split the targets\n",
  643. " y_train = y_perm[0:num_train]\n",
  644. "\n",
  645. " # Normalization step (for real valued targets only)\n",
  646. " print('\\n Normalizing output y...')\n",
  647. " if model_type == 'regression':\n",
  648. " y_train_mean = np.mean(y_train)\n",
  649. " y_train_std = np.std(y_train)\n",
  650. " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
  651. "# print(y)\n",
  652. " \n",
  653. " y_val = y_perm[num_train:(num_train + num_val)]\n",
  654. " y_test = y_perm[(num_train + num_val):datasize]\n",
  655. " \n",
  656. " # Record the performance for each parameter trial respectively on validation and test set\n",
  657. " perf_all_val = []\n",
  658. " perf_all_test = []\n",
  659. " \n",
  660. " # For each parameter trial\n",
  661. " for i in range(trials):\n",
  662. " # For regression use the Kernel Ridge method\n",
  663. " if model_type == 'regression':\n",
  664. "# print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
  665. "\n",
  666. " # Fit the kernel ridge model\n",
  667. " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
  668. "# KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n",
  669. " KR.fit(Kmatrix_train, y_train)\n",
  670. "\n",
  671. " # predict on the validation and test set\n",
  672. " y_pred = KR.predict(Kmatrix_val)\n",
  673. " y_pred_test = KR.predict(Kmatrix_test)\n",
  674. "# print(y_pred)\n",
  675. "\n",
  676. " # adjust prediction: needed because the training targets have been normalized\n",
  677. " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
  678. "# print(y_pred)\n",
  679. " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
  680. "# print(y_pred_test)\n",
  681. "\n",
  682. " # root mean squared error on validation\n",
  683. " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
  684. " perf_all_val.append(rmse)\n",
  685. "\n",
  686. " # root mean squared error in test \n",
  687. " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
  688. " perf_all_test.append(rmse_test)\n",
  689. "\n",
  690. "# print('The performance on the validation set is: %3f' % rmse)\n",
  691. "# print('The performance on the test set is: %3f' % rmse_test)\n",
  692. " \n",
  693. " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
  694. " # For regression: minimise the mean squared error\n",
  695. " if model_type == 'regression':\n",
  696. "\n",
  697. " # get optimal parameter on validation (argmin mean squared error)\n",
  698. " min_idx = np.argmin(perf_all_test)\n",
  699. " alpha_opt = alpha_grid[min_idx]\n",
  700. "\n",
  701. " # performance corresponding to optimal parameter on val\n",
  702. " perf_val_opt = perf_all_val[min_idx]\n",
  703. "\n",
  704. " # corresponding performance on test for the same parameter\n",
  705. " perf_test_opt = perf_all_test[min_idx]\n",
  706. "\n",
  707. " print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
  708. " print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
  709. " print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
  710. "\n",
  711. " # append the best performance on validation\n",
  712. " # at the current split\n",
  713. " val_split.append(perf_val_opt)\n",
  714. "\n",
  715. " # append the correponding performance on the test set\n",
  716. " test_split.append(perf_test_opt)\n",
  717. "\n",
  718. "# average the results\n",
  719. "# mean of the validation performances over the splits\n",
  720. "val_mean = np.mean(np.asarray(val_split))\n",
  721. "# std deviation of validation over the splits\n",
  722. "val_std = np.std(np.asarray(val_split))\n",
  723. "\n",
  724. "# mean of the test performances over the splits\n",
  725. "test_mean = np.mean(np.asarray(test_split))\n",
  726. "# std deviation of the test over the splits\n",
  727. "test_std = np.std(np.asarray(test_split))\n",
  728. "\n",
  729. "print('\\n Mean performance on val set: %3f' % val_mean)\n",
  730. "print('With standard deviation: %3f' % val_std)\n",
  731. "print('\\n Mean performance on test set: %3f' % test_mean)\n",
  732. "print('With standard deviation: %3f' % test_std)"
  733. ]
  734. }
  735. ],
  736. "metadata": {
  737. "kernelspec": {
  738. "display_name": "Python 3",
  739. "language": "python",
  740. "name": "python3"
  741. },
  742. "language_info": {
  743. "codemirror_mode": {
  744. "name": "ipython",
  745. "version": 3
  746. },
  747. "file_extension": ".py",
  748. "mimetype": "text/x-python",
  749. "name": "python",
  750. "nbconvert_exporter": "python",
  751. "pygments_lexer": "ipython3",
  752. "version": "3.6.6"
  753. }
  754. },
  755. "nbformat": 4,
  756. "nbformat_minor": 2
  757. }

A Python package for graph kernels, graph edit distances and graph pre-image problem.