|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Axolotl CSV manipulation [Binary Classification]."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this example, we are showcasing different components of the system.\n",
- "- Loading a CSV table and converting it into a dataset and problem description for a binary classification task.\n",
- "- Easy use of the backend.\n",
- "- Use of the simple interface to a predefined search method.\n",
- "- Exploring searched pipelines."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Import the utilities we will be using"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2020-07-12 15:23:25,435\tINFO resource_spec.py:212 -- Starting Ray with 4.39 GiB memory available for workers and up to 2.2 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).\n",
- "2020-07-12 15:23:25,965\tINFO services.py:1170 -- View the Ray dashboard at localhost:8265\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "from pprint import pprint\n",
- "import pandas as pd\n",
- "from sklearn.datasets import make_regression\n",
- "\n",
- "from d3m import container\n",
- "from d3m.metadata.pipeline import Pipeline\n",
- "\n",
- "from axolotl.utils import data_problem, pipeline as pipeline_utils\n",
- "from axolotl.backend.ray import RayRunner\n",
- "from axolotl.algorithms.random_search import RandomSearch\n",
- "\n",
- "# init runner\n",
- "backend = RayRunner(random_seed=42, volumes_dir=None, n_workers=3)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Load the CSV file and transform it into a dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "table_path = os.path.join('..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv')\n",
- "df = pd.read_csv(table_path)\n",
- "dataset, problem_description = data_problem.generate_dataset_problem(df, task='binary_classification', target_index=5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create an instance of the search and fit with the input_data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# The search_fit method searches for the best pipeline within the time budget, then fits the best-ranked pipeline on the input_data.\n",
- "search = RandomSearch(problem_description=problem_description, backend=backend)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 47ec5c86-46b8-4dee-9562-1e5ebc3d0824 failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 64da5190-c2ee-4b8e-abef-697b54cfa32b failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 9e03188f-2120-49ac-a087-1e4fb1b29754 failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline af32bc20-64fa-44a5-ab34-bbe810b671b1 failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 5dbc9e87-19be-4cda-ac51-c1d7ea9328c1 failed.',)]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 918c088e-58dd-4991-8336-deb0b41cb5eb failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 41dfec8f-0b07-4f8e-8ff3-cdbb1dab11c7 failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline d465a878-1ea5-4b72-b8a7-3a4122d1a482 failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 8c39e981-f446-4fde-8744-5606c35a7fdf failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline df127bce-11af-4fae-b8bb-722cb0666484 failed.',)]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n",
- "(pid=85426) The parameter 'presort' is deprecated and has no effect. It will be removed in v0.24. You can suppress this warning by not passing any value to the 'presort' parameter. We also recommend using HistGradientBoosting models instead.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 0985e11e-8db0-4c1c-9f34-3ce8fbc626c1 failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 8977a9c0-dd79-4771-9dc1-455586b80947 failed.',)]\n",
- "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline c0238551-5fbb-41cd-8187-d3d23bc5571d failed.',)]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n"
- ]
- }
- ],
- "source": [
- "fitted_pipeline, fitted_pipelineine_result = search.search_fit(input_data=[dataset], time_limit=30)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "produce_results = search.produce(fitted_pipeline, [dataset])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>d3mIndex</th>\n",
- " <th>species</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>0</td>\n",
- " <td>Iris-setosa</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>1</td>\n",
- " <td>Iris-setosa</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>2</td>\n",
- " <td>Iris-setosa</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>3</td>\n",
- " <td>Iris-setosa</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>4</td>\n",
- " <td>Iris-setosa</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>...</th>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>145</th>\n",
- " <td>145</td>\n",
- " <td>Iris-virginica</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>146</th>\n",
- " <td>146</td>\n",
- " <td>Iris-virginica</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>147</th>\n",
- " <td>147</td>\n",
- " <td>Iris-virginica</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>148</th>\n",
- " <td>148</td>\n",
- " <td>Iris-virginica</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>149</th>\n",
- " <td>149</td>\n",
- " <td>Iris-virginica</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>150 rows × 2 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " d3mIndex species\n",
- "0 0 Iris-setosa\n",
- "1 1 Iris-setosa\n",
- "2 2 Iris-setosa\n",
- "3 3 Iris-setosa\n",
- "4 4 Iris-setosa\n",
- ".. ... ...\n",
- "145 145 Iris-virginica\n",
- "146 146 Iris-virginica\n",
- "147 147 Iris-virginica\n",
- "148 148 Iris-virginica\n",
- "149 149 Iris-virginica\n",
- "\n",
- "[150 rows x 2 columns]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "produce_results.output"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Print information about the scores of the succeeded pipelines."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "----------------------------------------------------\n",
- "Pipeline id: 676360d8-71ac-401c-b44a-31a810c4e8d3\n",
- "Rank: 0.22667216466666668\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.773333 0.773333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 85d44359-0dac-4260-aea8-c78950025c3f\n",
- "Rank: 0.33333446433333336\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.666667 0.666667 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 3efb07be-28ff-45d8-b1fb-1c49f96b3381\n",
- "Rank: 0.6666653826666668\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.333333 0.333333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: abd9eb99-a4ba-4210-bb34-c2dec7c3ccfa\n",
- "Rank: 0.6666606186666667\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.333333 0.333333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 8948a194-0dfe-4d07-a7c8-d1f5136f68c6\n",
- "Rank: 0.21333939733333337\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.786667 0.786667 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 22866f54-ba68-49e5-8f84-a2a6aba98253\n",
- "Rank: 0.16000235200000004\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.84 0.84 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 37a1c72a-9efd-4b0a-9d3d-811d47571b45\n",
- "Rank: 0.6666753326666668\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.333333 0.333333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 2d3cae0f-66f6-46e0-9fa5-128bf02b4d7e\n",
- "Rank: 0.6666655736666668\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.333333 0.333333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: d1e5a59d-be50-42f3-a71b-cf8ba59b3c47\n",
- "Rank: 0.08666869166666667\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.913333 0.913333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 35d47611-bded-4669-9803-9d259f686ec1\n",
- "Rank: 0.35999672099999996\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.64 0.64 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 7398d17f-e91f-4c75-9a95-c9f85763c858\n",
- "Rank: 0.6666598006666667\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.333333 0.333333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 5293503b-4cb6-4b8b-bf8e-8b9d981c3b03\n",
- "Rank: 0.04666429966666663\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.953333 0.953333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 756e2a15-3315-4aa1-8620-f73ffc69f8a4\n",
- "Rank: 0.6666748276666667\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.333333 0.333333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 46633510-6f46-479e-982e-263aaa2e187a\n",
- "Rank: 0.17999182400000005\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.82 0.82 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 49a750b0-5c86-4ff3-9b2d-c58c6390dd0d\n",
- "Rank: 0.6666588986666667\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.333333 0.333333 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 84c24452-b2cf-41a2-813c-a135eaeef480\n",
- "Rank: 0.36000324699999997\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.64 0.64 42 0\n",
- "----------------------------------------------------\n",
- "Pipeline id: 82117b6b-6960-48bb-b1f4-91355acf51d6\n",
- "Rank: 0.026667331666666617\n",
- " metric value normalized randomSeed fold\n",
- "0 ACCURACY 0.973333 0.973333 42 0\n"
- ]
- }
- ],
- "source": [
- "for pipeline_result in search.history:\n",
- " print('-' * 52)\n",
- " print('Pipeline id:', pipeline_result.pipeline.id)\n",
- " print('Rank:', pipeline_result.rank)\n",
- " print(pipeline_result.scores)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
- }
|