|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "MAO\n",
- "\n",
- "--- This is a classification problem ---\n",
- "\n",
- "\n",
- "1. Loading dataset from file...\n",
- "\n",
- "2. Calculating gram matrices. This could take a while...\n",
- "\n",
- " None edge weight specified. Set all weight to 1.\n",
- "\n",
- "getting sp graphs: 68it [00:00, 692.11it/s]\n",
- "calculating kernels: 2346it [00:05, 399.28it/s]\n",
- "\n",
- " --- shortest path kernel matrix of size 68 built in 6.345669507980347 seconds ---\n",
- "\n",
- "the gram matrix with parameters {'node_kernels': {'symb': <function deltakernel at 0x7fe240afd620>, 'nsymb': <function gaussiankernel at 0x7fe240afd9d8>, 'mix': functools.partial(<function kernelproduct at 0x7fe240aaf0d0>, <function deltakernel at 0x7fe240afd620>, <function gaussiankernel at 0x7fe240afd9d8>)}, 'n_jobs': 8} is: \n",
- "\n",
- "1 gram matrices are calculated, 0 of which are ignored.\n",
- "\n",
- "3. Fitting and predicting using nested cross validation. This could really take a while...\n",
- "cross validation: 7it [00:09, 4.67s/it]"
- ]
- }
- ],
- "source": [
- "import functools\n",
- "from libs import *\n",
- "import multiprocessing\n",
- "\n",
- "from pygraph.kernels.spKernel import spkernel\n",
- "from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct\n",
- "#from pygraph.utils.model_selection_precomputed import trial_do\n",
- "\n",
- "dslist = [\n",
- "# {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',\n",
- "# 'task': 'regression'}, # node symb\n",
- "# {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',\n",
- "# 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, \n",
- "# # contains single node graph, node symb\n",
- " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb\n",
- "# {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled\n",
- "# {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
- "# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
- "# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
- "# # node nsymb\n",
- "# {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
- "# # node symb/nsymb\n",
- "# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
- " # node/edge symb\n",
- "# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
- "# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
- "\n",
- " # {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
- " # # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
- " # # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
- " # {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
- " #\n",
- " # # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
- " # # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
- " # # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
- " # # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
- " # # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
- "\n",
- " # # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
- " # # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
- " # # {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
- " # {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
- " # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
- " # {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
- " # 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
- " # {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
- " # 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
- "\n",
- " # # not working below\n",
- " # {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
- " # {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
- " # {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
- " # {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
- "]\n",
- "estimator = spkernel\n",
- "mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)\n",
- "param_grid_precomputed = {'node_kernels': [\n",
- " {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}\n",
- "param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},\n",
- " {'alpha': np.logspace(-10, 10, num=41, base=10)}]\n",
- "\n",
- "for ds in dslist:\n",
- " print()\n",
- " print(ds['name'])\n",
- " model_selection_for_precomputed_kernel(\n",
- " ds['dataset'],\n",
- " estimator,\n",
- " param_grid_precomputed,\n",
- " (param_grid[1] if ('task' in ds and ds['task']\n",
- " == 'regression') else param_grid[0]),\n",
- " (ds['task'] if 'task' in ds else 'classification'),\n",
- " NUM_TRIALS=30,\n",
- " datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
- " extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n",
- " ds_name=ds['name'],\n",
- " n_jobs=multiprocessing.cpu_count(),\n",
- " read_gm_from_file=False)\n",
- " print()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.\n",
- "[Parallel(n_jobs=8)]: Done 2 out of 9 | elapsed: 15.7s remaining: 54.8s\n",
- "[Parallel(n_jobs=8)]: Done 3 out of 9 | elapsed: 15.7s remaining: 31.3s\n",
- "[Parallel(n_jobs=8)]: Done 4 out of 9 | elapsed: 15.7s remaining: 19.6s\n",
- "[Parallel(n_jobs=8)]: Done 5 out of 9 | elapsed: 15.7s remaining: 12.5s\n",
- "[Parallel(n_jobs=8)]: Done 6 out of 9 | elapsed: 15.7s remaining: 7.8s\n",
- "[Parallel(n_jobs=8)]: Done 7 out of 9 | elapsed: 15.7s remaining: 4.5s\n",
- "[Parallel(n_jobs=8)]: Done 9 out of 9 | elapsed: 15.7s remaining: 0.0s\n"
- ]
- },
- {
- "ename": "KeyboardInterrupt",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-1-ba0f5fe728f1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 83\u001b[0;31m \u001b[0mParallel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_cores\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelayed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompute_ds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mds\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdslist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 960\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 961\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieval_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 962\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 963\u001b[0m \u001b[0;31m# Make sure that we get a last message telling us we are done\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0melapsed_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_start_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mretrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 864\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'supports_timeout'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 865\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 866\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 867\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mwrap_future_result\u001b[0;34m(future, timeout)\u001b[0m\n\u001b[1;32m 513\u001b[0m AsyncResults.get from multiprocessing.\"\"\"\n\u001b[1;32m 514\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 515\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 516\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mLokyTimeoutError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/joblib/externals/loky/_base.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 424\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__get_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 426\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_condition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 427\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_state\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mCANCELLED\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCANCELLED_AND_NOTIFIED\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/lib/python3.5/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 293\u001b[0;31m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 294\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
- ]
- }
- ],
- "source": [
- "# # test parallel computing\n",
- "# import psutil\n",
- "# # logical=True counts threads, but we are interested in cores\n",
-    "# psutil.cpu_count(logical=False)\n",
- "%load_ext line_profiler\n",
- "%matplotlib inline\n",
- "import functools\n",
- "from libs import *\n",
- "from sklearn.metrics.pairwise import rbf_kernel\n",
- "from joblib import Parallel, delayed\n",
- "import multiprocessing\n",
- "\n",
- "from pygraph.kernels.spKernel import spkernel\n",
- "from pygraph.utils.kernels import deltakernel, kernelsum\n",
- "\n",
- "num_cores = multiprocessing.cpu_count()\n",
- "\n",
- "dslist = [ \n",
- " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', 'task': 'regression'}, # node symb\n",
- "# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb\n",
- " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',}, # unlabeled\n",
- " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',}, # node/edge symb\n",
- " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
- " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb\n",
- " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', \n",
- " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',}, # contains single node graph, node symb\n",
- "# {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb\n",
- "# {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb\n",
- " {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, # node/edge symb\n",
- " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, # node symb/nsymb\n",
- "# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},\n",
- " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
- "# {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb\n",
- "# {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb\n",
- "# {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb\n",
- "# {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb\n",
- "# {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb\n",
- "\n",
- "# {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb\n",
- "# {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb\n",
- " {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
- " 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb\n",
- "# {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb\n",
- "# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
- "# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
- "# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
- "# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb\n",
- "# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
- "# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb\n",
- " \n",
- "# # not working below\n",
- "# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
- "# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
- "# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
- "# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
- "]\n",
- "estimator = spkernel\n",
- "mixkernel = functools.partial(kernelsum, deltakernel, rbf_kernel)\n",
- "param_grid_precomputed = {'node_kernels': [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}\n",
- "param_grid = [{'C': np.logspace(-10, 10, num = 41, base = 10)}, \n",
- " {'alpha': np.logspace(-10, 10, num = 41, base = 10)}]\n",
- " \n",
- "def compute_ds(ds):\n",
- " print()\n",
- " print(ds['name'])\n",
- " model_selection_for_precomputed_kernel(\n",
- " ds['dataset'], estimator, param_grid_precomputed, \n",
- " (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \n",
- " (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30,\n",
- " datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
- " extra_params=(ds['extra_params'] if 'extra_params' in ds else None),\n",
- " ds_name=ds['name'])\n",
- " \n",
- "# %lprun -f spkernel \\\n",
- "# model_selection_for_precomputed_kernel( \\\n",
- "# ds['dataset'], estimator, param_grid_precomputed, \\\n",
- "# (param_grid[1] if ('task' in ds and ds['task'] == 'regression') else param_grid[0]), \\\n",
- "# (ds['task'] if 'task' in ds else 'classification'), NUM_TRIALS=30, \\\n",
- "# datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), \\\n",
- "# extra_params=(ds['extra_params'] if 'extra_params' in ds else None))\n",
- " print()\n",
- " \n",
- "Parallel(n_jobs=num_cores, verbose=10)(delayed(compute_ds)(ds) for ds in dslist)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "--- This is a regression problem ---\n",
- "\n",
- "\n",
- "I. Loading dataset from file...\n",
- "\n",
- "2. Calculating gram matrices. This could take a while...\n",
- "\n",
- " None edge weight specified. Set all weight to 1.\n",
- "\n"
- ]
- },
- {
- "ename": "TypeError",
- "evalue": "'NoneType' object is not subscriptable",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mRemoteTraceback\u001b[0m Traceback (most recent call last)",
- "\u001b[0;31mRemoteTraceback\u001b[0m: \n\"\"\"\nTraceback (most recent call last):\n File \"/usr/lib/python3.5/multiprocessing/pool.py\", line 119, in worker\n result = (True, func(*args, **kwds))\n File \"/usr/lib/python3.5/multiprocessing/pool.py\", line 44, in mapstar\n return list(map(*args))\n File \"../pygraph/kernels/spKernel.py\", line 359, in spkernel_do\n kn = node_kernels['symb']\nTypeError: 'NoneType' object is not subscriptable\n\"\"\"",
- "\nThe above exception was the direct cause of the following exception:\n",
- "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-1-b5a6e5aa5a44>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, \n\u001b[0;32m---> 15\u001b[0;31m 'regression', NUM_TRIALS=30)\n\u001b[0m",
- "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/utils/model_selection_precomputed.py\u001b[0m in \u001b[0;36mmodel_selection_for_precomputed_kernel\u001b[0;34m(datafile, estimator, param_grid_precomputed, param_grid, model_type, NUM_TRIALS, datafile_y, extra_params, ds_name, n_jobs)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams_out\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam_list_precomputed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mparams_out\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'n_jobs'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 122\u001b[0;31m \u001b[0mrtn_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mparams_out\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 123\u001b[0m \u001b[0mKmatrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrtn_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0mcurrent_run_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrtn_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/media/ljia/DATA/research-repo/codes/Linlin/py-graph/pygraph/kernels/spKernel.py\u001b[0m in \u001b[0;36mspkernel\u001b[0;34m(node_label, edge_weight, node_kernels, n_jobs, *args)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0mdo_partial\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpartial\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspkernel_do\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mds_attrs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnode_label\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnode_kernels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0mitr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcombinations_with_replacement\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 99\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkernel\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpool\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdo_partial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mitr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtotal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 100\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0mKmatrix\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/lib/python3.5/multiprocessing/pool.py\u001b[0m in \u001b[0;36mmap\u001b[0;34m(self, func, iterable, chunksize)\u001b[0m\n\u001b[1;32m 258\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mthat\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mreturned\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 259\u001b[0m '''\n\u001b[0;32m--> 260\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_map_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmapstar\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 261\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstarmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/lib/python3.5/multiprocessing/pool.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 606\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 607\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 608\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_set\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Process ForkPoolWorker-1:\n",
- "Traceback (most recent call last):\n",
- " File \"/usr/lib/python3.5/multiprocessing/process.py\", line 249, in _bootstrap\n",
- " self.run()\n",
- " File \"/usr/lib/python3.5/multiprocessing/process.py\", line 93, in run\n",
- " self._target(*self._args, **self._kwargs)\n",
- " File \"/usr/lib/python3.5/multiprocessing/pool.py\", line 108, in worker\n",
- " task = get()\n",
- " File \"/usr/lib/python3.5/multiprocessing/queues.py\", line 343, in get\n",
- " res = self._reader.recv_bytes()\n",
- " File \"/usr/lib/python3.5/multiprocessing/connection.py\", line 216, in recv_bytes\n",
- " buf = self._recv_bytes(maxlength)\n",
- " File \"/usr/lib/python3.5/multiprocessing/connection.py\", line 407, in _recv_bytes\n",
- " buf = self._recv(4)\n",
- " File \"/usr/lib/python3.5/multiprocessing/connection.py\", line 379, in _recv\n",
- " chunk = read(handle, remaining)\n",
- "KeyboardInterrupt\n"
- ]
- }
- ],
- "source": [
- "%load_ext line_profiler\n",
- "%matplotlib inline\n",
- "import numpy as np\n",
- "import sys\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.model_selection_precomputed import model_selection_for_precomputed_kernel\n",
- "from pygraph.kernels.spKernel import spkernel\n",
- "\n",
- "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n",
- "estimator = spkernel\n",
- "param_grid_precomputed = {}\n",
- "param_grid = {'alpha': np.logspace(-1, 1, num = 41, base = 10)}\n",
- "\n",
- "model_selection_for_precomputed_kernel(datafile, estimator, param_grid_precomputed, param_grid, \n",
- " 'regression', NUM_TRIALS=30)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- "\n",
- " Loading dataset from file...\n",
- "\n",
- " Calculating kernel matrix, this could take a while...\n",
- "--- shortest path kernel matrix of size 185 built in 13.3865065574646 seconds ---\n",
- "[[ 3. 1. 3. ... 1. 1. 1.]\n",
- " [ 1. 6. 1. ... 0. 0. 3.]\n",
- " [ 3. 1. 3. ... 1. 1. 1.]\n",
- " ...\n",
- " [ 1. 0. 1. ... 55. 21. 7.]\n",
- " [ 1. 0. 1. ... 21. 55. 7.]\n",
- " [ 1. 3. 1. ... 7. 7. 55.]]\n",
- "\n",
- " Starting calculate accuracy/rmse...\n",
- "calculate performance: 94%|█████████▎| 936/1000 [00:01<00:00, 757.54it/s]\n",
- " Mean performance on train set: 28.360361\n",
- "With standard deviation: 1.357183\n",
- "\n",
- " Mean performance on test set: 35.191954\n",
- "With standard deviation: 4.495767\n",
- "calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 771.22it/s]\n",
- "\n",
- "\n",
- " rmse_test std_test rmse_train std_train k_time\n",
- "----------- ---------- ------------ ----------- --------\n",
- " 35.192 4.49577 28.3604 1.35718 13.3865\n"
- ]
- }
- ],
- "source": [
- "%load_ext line_profiler\n",
- "\n",
- "import sys\n",
- "sys.path.insert(0, \"../\")\n",
- "from pygraph.utils.utils import kernel_train_test\n",
- "from pygraph.kernels.spKernel import spkernel\n",
- "\n",
- "datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n",
- "kernel_file_path = 'kernelmatrices_path_acyclic/'\n",
- "\n",
- "kernel_para = dict(edge_weight = 'atom')\n",
- "\n",
- "kernel_train_test(datafile, kernel_file_path, spkernel, kernel_para, normalize = False)\n",
- "\n",
- "# %lprun -f spkernel \\\n",
- "# kernel_train_test(datafile, kernel_file_path, spkernel, kernel_para, normalize = False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# results\n",
- "\n",
- "# with y normalization\n",
- " RMSE_test std_test RMSE_train std_train k_time\n",
- "----------- ---------- ------------ ----------- --------\n",
- " 35.6337 5.23183 32.3805 3.92531 14.9301\n",
- "\n",
- "# without y normalization\n",
- " RMSE_test std_test RMSE_train std_train k_time\n",
- "----------- ---------- ------------ ----------- --------\n",
- " 35.192 4.49577 28.3604 1.35718 14.5768"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\n",
- "\n",
- " Loading dataset from file...\n",
- "[ -23.7 14. 37.3 109.7 10.8 39. 42. 66.6 135. 148.5\n",
- " 40. 34.6 32. 63. 53.5 67. 64.4 84.7 95.5 92.\n",
- " 84.4 154. 156. 166. 183. 70.3 63.6 52.5 59. 59.5\n",
- " 55.2 88. 83. 104.5 102. 92. 107.4 123.2 112.5 118.5\n",
- " 101.5 173.7 165.5 181. 99.5 92.3 90.1 80.2 82. 91.2\n",
- " 91.5 81.2 93. 69. 86.3 82. 103. 103.5 96. 112. 104.\n",
- " 132.5 123.5 120.3 145. 144.2 142.8 132. 134.2 137. 139.\n",
- " 133.6 120.4 120. 137. 195.8 177.2 181. 185.9 175.7 186. 211.\n",
- " 125. 118. 117.1 107. 102.5 112. 97.4 91.5 87.6 106.5\n",
- " 101. 99.3 90. 137. 114. 126. 124. 140.5 157.5 146. 145.\n",
- " 141. 171. 166. 155. 145. 159. 138. 142. 159. 163.5\n",
- " 229.5 142. 125. 132. 130.5 125. 122. 121. 122.2 112. 106.\n",
- " 114.5 151. 128.5 109.5 126. 147. 158. 147. 165. 188.9\n",
- " 170. 178. 148.5 165. 177. 167. 195. 226. 215. 201. 205.\n",
- " 151.5 165.5 157. 139. 163. 153.5 139. 162. 173. 159.5\n",
- " 159.5 155.5 141. 126. 164. 163. 166.5 146. 165. 159. 195.\n",
- " 218. 250. 235. 186.5 156.5 162. 162. 170.2 173.2 186.8\n",
- " 173. 187. 174. 188.5 199. 228. 215. 216. 240. ]\n",
- "\n",
- " Loading the matrix from file...\n",
- "[[ 3. 1. 3. ..., 1. 1. 1.]\n",
- " [ 1. 6. 1. ..., 0. 0. 3.]\n",
- " [ 3. 1. 3. ..., 1. 1. 1.]\n",
- " ..., \n",
- " [ 1. 0. 1. ..., 55. 21. 7.]\n",
- " [ 1. 0. 1. ..., 21. 55. 7.]\n",
- " [ 1. 3. 1. ..., 7. 7. 55.]]\n",
- "\n",
- " --- This is a regression problem ---\n",
- "\n",
- " Starting split 10...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 40.422382\n",
- "The corresponding performance on test set is: 47.424532\n",
- "\n",
- " Starting split 11...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 33.084913\n",
- "The corresponding performance on test set is: 35.493699\n",
- "\n",
- " Starting split 12...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 31.306710\n",
- "The corresponding performance on test set is: 33.173366\n",
- "\n",
- " Starting split 13...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 43.500424\n",
- "The corresponding performance on test set is: 32.633129\n",
- "\n",
- " Starting split 14...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 10 with parameter alpha = 1.000000\n",
- "The best performance on the validation set is: 53.561752\n",
- "The corresponding performance on test set is: 42.883548\n",
- "\n",
- " Starting split 15...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 40.444773\n",
- "The corresponding performance on test set is: 32.713040\n",
- "\n",
- " Starting split 16...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 11 with parameter alpha = 10.000000\n",
- "The best performance on the validation set is: 37.046818\n",
- "The corresponding performance on test set is: 37.337851\n",
- "\n",
- " Starting split 17...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 39.907628\n",
- "The corresponding performance on test set is: 38.889064\n",
- "\n",
- " Starting split 18...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 12 with parameter alpha = 100.000000\n",
- "The best performance on the validation set is: 29.879950\n",
- "The corresponding performance on test set is: 27.652558\n",
- "\n",
- " Starting split 19...\n",
- "\n",
- " Normalizing output y...\n",
- "The best performance is for trial 11 with parameter alpha = 10.000000\n",
- "The best performance on the validation set is: 44.911892\n",
- "The corresponding performance on test set is: 35.804454\n",
- "\n",
- " Mean performance on val set: 39.406724\n",
- "With standard deviation: 6.720820\n",
- "\n",
- " Mean performance on test set: 36.400524\n",
- "With standard deviation: 5.352940\n"
- ]
- }
- ],
- "source": [
- "# Author: Elisabetta Ghisu\n",
- "\n",
- "\"\"\"\n",
- "- This script take as input a kernel matrix\n",
- "and returns the classification or regression performance\n",
- "- The kernel matrix can be calculated using any of the graph kernels approaches\n",
- "- The criteria used for prediction are SVM for classification and kernel Ridge regression for regression\n",
- "- For predition we divide the data in training, validation and test. For each split, we first train on the train data, \n",
- "then evaluate the performance on the validation. We choose the optimal parameters for the validation set and finally\n",
- "provide the corresponding performance on the test set. If more than one split is performed, the final results \n",
- "correspond to the average of the performances on the test sets. \n",
- "\n",
- "@references\n",
- " https://github.com/eghisu/GraphKernels/blob/master/GraphKernelsCollection/python_scripts/compute_perf_gk.py\n",
- "\"\"\"\n",
- "\n",
- "print(__doc__)\n",
- "\n",
- "import sys\n",
- "import pathlib\n",
- "sys.path.insert(0, \"../\")\n",
- "from tabulate import tabulate\n",
- "\n",
- "import random\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from sklearn.kernel_ridge import KernelRidge # 0.17\n",
- "from sklearn.metrics import accuracy_score, mean_squared_error\n",
- "from sklearn import svm\n",
- "\n",
- "from pygraph.kernels.spkernel import spkernel\n",
- "from pygraph.utils.graphfiles import loadDataset\n",
- "\n",
- "print('\\n Loading dataset from file...')\n",
- "dataset, y = loadDataset(\"../../../../datasets/acyclic/Acyclic/dataset_bps.ds\")\n",
- "y = np.array(y)\n",
- "print(y)\n",
- "\n",
- "kernel_file_path = 'kernelmatrix.ds'\n",
- "path = pathlib.Path(kernel_file_path)\n",
- "if path.is_file():\n",
- " print('\\n Loading the matrix from file...')\n",
- " Kmatrix = np.loadtxt(kernel_file_path)\n",
- " print(Kmatrix)\n",
- "else:\n",
- " print('\\n Calculating kernel matrix, this could take a while...')\n",
- " #@Q: is it appropriate to use bond type between atoms as the edge weight to calculate shortest path????????\n",
- " Kmatrix, run_time = spkernel(dataset, edge_weight = 'bond_type')\n",
- " print(Kmatrix)\n",
- " print('Saving kernel matrix to file...')\n",
- " np.savetxt(kernel_file_path, Kmatrix)\n",
- "\n",
- "# setup the parameters\n",
- "model_type = 'regression' # Regression or classification problem\n",
- "print('\\n --- This is a %s problem ---' % model_type)\n",
- "\n",
- "datasize = len(dataset)\n",
- "trials = 21 # Trials for hyperparameters random search\n",
- "splits = 10 # Number of splits of the data\n",
- "alpha_grid = np.logspace(-10, 10, num = trials, base = 10) # corresponds to (2*C)^-1 in other linear models such as LogisticRegression\n",
- "C_grid = np.logspace(-10, 10, num = trials, base = 10)\n",
- "random.seed(20) # Set the seed for uniform parameter distribution\n",
- "\n",
- "\n",
- "\"\"\"\n",
- "- Here starts the main program\n",
- "- First we permute the data, then for each split we evaluate corresponding performances\n",
- "- In the end, the performances are averaged over the test sets\n",
- "\"\"\"\n",
- "\n",
- "# Initialize the performance of the best parameter trial on validation with the corresponding performance on test\n",
- "val_split = []\n",
- "test_split = []\n",
- "\n",
- "# For each split of the data\n",
- "for j in range(10, 10 + splits):\n",
- " print('\\n Starting split %d...' % j)\n",
- "\n",
- " # Set the random set for data permutation\n",
- " random_state = int(j)\n",
- " np.random.seed(random_state)\n",
- " idx_perm = np.random.permutation(datasize)\n",
- "# print(idx_perm)\n",
- " \n",
- " # Permute the data\n",
- " y_perm = y[idx_perm] # targets permutation\n",
- "# print(y_perm)\n",
- " Kmatrix_perm = Kmatrix[:, idx_perm] # inputs permutation\n",
- "# print(Kmatrix_perm)\n",
- " Kmatrix_perm = Kmatrix_perm[idx_perm, :] # inputs permutation\n",
- " \n",
- " # Set the training, validation and test\n",
- " # Note: the percentage can be set up by the user\n",
- " num_train_val = int((datasize * 90) / 100) # 90% (of entire dataset) for training and validation\n",
- " num_test = datasize - num_train_val # 10% (of entire dataset) for test\n",
- " num_train = int((num_train_val * 90) / 100) # 90% (of train + val) for training\n",
- " num_val = num_train_val - num_train # 10% (of train + val) for validation\n",
- " \n",
- " # Split the kernel matrix\n",
- " Kmatrix_train = Kmatrix_perm[0:num_train, 0:num_train]\n",
- " Kmatrix_val = Kmatrix_perm[num_train:(num_train + num_val), 0:num_train]\n",
- " Kmatrix_test = Kmatrix_perm[(num_train + num_val):datasize, 0:num_train]\n",
- "\n",
- " # Split the targets\n",
- " y_train = y_perm[0:num_train]\n",
- "\n",
- " # Normalization step (for real valued targets only)\n",
- " print('\\n Normalizing output y...')\n",
- " if model_type == 'regression':\n",
- " y_train_mean = np.mean(y_train)\n",
- " y_train_std = np.std(y_train)\n",
- " y_train = (y_train - y_train_mean) / float(y_train_std)\n",
- "# print(y)\n",
- " \n",
- " y_val = y_perm[num_train:(num_train + num_val)]\n",
- " y_test = y_perm[(num_train + num_val):datasize]\n",
- " \n",
- " # Record the performance for each parameter trial respectively on validation and test set\n",
- " perf_all_val = []\n",
- " perf_all_test = []\n",
- " \n",
- " # For each parameter trial\n",
- " for i in range(trials):\n",
- " # For regression use the Kernel Ridge method\n",
- " if model_type == 'regression':\n",
- "# print('\\n Starting experiment for trial %d and parameter alpha = %3f\\n ' % (i, alpha_grid[i]))\n",
- "\n",
- " # Fit the kernel ridge model\n",
- " KR = KernelRidge(kernel = 'precomputed', alpha = alpha_grid[i])\n",
- "# KR = svm.SVR(kernel = 'precomputed', C = C_grid[i])\n",
- " KR.fit(Kmatrix_train, y_train)\n",
- "\n",
- " # predict on the validation and test set\n",
- " y_pred = KR.predict(Kmatrix_val)\n",
- " y_pred_test = KR.predict(Kmatrix_test)\n",
- "# print(y_pred)\n",
- "\n",
- " # adjust prediction: needed because the training targets have been normalizaed\n",
- " y_pred = y_pred * float(y_train_std) + y_train_mean\n",
- "# print(y_pred)\n",
- " y_pred_test = y_pred_test * float(y_train_std) + y_train_mean\n",
- "# print(y_pred_test)\n",
- "\n",
- " # root mean squared error on validation\n",
- " rmse = np.sqrt(mean_squared_error(y_val, y_pred))\n",
- " perf_all_val.append(rmse)\n",
- "\n",
- " # root mean squared error in test \n",
- " rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))\n",
- " perf_all_test.append(rmse_test)\n",
- "\n",
- "# print('The performance on the validation set is: %3f' % rmse)\n",
- "# print('The performance on the test set is: %3f' % rmse_test)\n",
- " \n",
- " # --- FIND THE OPTIMAL PARAMETERS --- #\n",
- " # For regression: minimise the mean squared error\n",
- " if model_type == 'regression':\n",
- "\n",
- " # get optimal parameter on validation (argmin mean squared error)\n",
- " min_idx = np.argmin(perf_all_test)\n",
- " alpha_opt = alpha_grid[min_idx]\n",
- "\n",
- " # performance corresponding to optimal parameter on val\n",
- " perf_val_opt = perf_all_val[min_idx]\n",
- "\n",
- " # corresponding performance on test for the same parameter\n",
- " perf_test_opt = perf_all_test[min_idx]\n",
- "\n",
- " print('The best performance is for trial %d with parameter alpha = %3f' % (min_idx, alpha_opt))\n",
- " print('The best performance on the validation set is: %3f' % perf_val_opt)\n",
- " print('The corresponding performance on test set is: %3f' % perf_test_opt)\n",
- "\n",
- " # append the best performance on validation\n",
- " # at the current split\n",
- " val_split.append(perf_val_opt)\n",
- "\n",
- " # append the correponding performance on the test set\n",
- " test_split.append(perf_test_opt)\n",
- "\n",
- "# average the results\n",
- "# mean of the validation performances over the splits\n",
- "val_mean = np.mean(np.asarray(val_split))\n",
- "# std deviation of validation over the splits\n",
- "val_std = np.std(np.asarray(val_split))\n",
- "\n",
- "# mean of the test performances over the splits\n",
- "test_mean = np.mean(np.asarray(test_split))\n",
- "# std deviation of the test oer the splits\n",
- "test_std = np.std(np.asarray(test_split))\n",
- "\n",
- "print('\\n Mean performance on val set: %3f' % val_mean)\n",
- "print('With standard deviation: %3f' % val_std)\n",
- "print('\\n Mean performance on test set: %3f' % test_mean)\n",
- "print('With standard deviation: %3f' % test_std)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|