
load_csv.ipynb 16 kB

first commit
4 years ago
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Axolotl CSV manipulation [Binary Classification]."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this example, we showcase different components of the system:\n",
"- Loading CSV data for a binary classification task.\n",
"- Easy use of the backend.\n",
"- Use of the simple search interface with a predefined search method.\n",
"- Exploring the searched pipelines."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import the utilities we will be using"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-07-12 15:23:25,435\tINFO resource_spec.py:212 -- Starting Ray with 4.39 GiB memory available for workers and up to 2.2 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).\n",
"2020-07-12 15:23:25,965\tINFO services.py:1170 -- View the Ray dashboard at localhost:8265\n"
]
}
],
"source": [
"import os\n",
"from pprint import pprint\n",
"import pandas as pd\n",
"from sklearn.datasets import make_regression\n",
"\n",
"from d3m import container\n",
"from d3m.metadata.pipeline import Pipeline\n",
"\n",
"from axolotl.utils import data_problem, pipeline as pipeline_utils\n",
"from axolotl.backend.ray import RayRunner\n",
"from axolotl.algorithms.random_search import RandomSearch\n",
"\n",
"# init the runner\n",
"backend = RayRunner(random_seed=42, volumes_dir=None, n_workers=3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load a CSV file and transform it into a dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"table_path = os.path.join('..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv')\n",
"df = pd.read_csv(table_path)\n",
"dataset, problem_description = data_problem.generate_dataset_problem(df, task='binary_classification', target_index=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an instance of the search and fit it with the input_data."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# search_fit searches for the best pipeline within the time budget, then fits the best pipeline (by rank) on the input_data.\n",
"search = RandomSearch(problem_description=problem_description, backend=backend)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 47ec5c86-46b8-4dee-9562-1e5ebc3d0824 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 64da5190-c2ee-4b8e-abef-697b54cfa32b failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 9e03188f-2120-49ac-a087-1e4fb1b29754 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline af32bc20-64fa-44a5-ab34-bbe810b671b1 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 5dbc9e87-19be-4cda-ac51-c1d7ea9328c1 failed.',)]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 918c088e-58dd-4991-8336-deb0b41cb5eb failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 41dfec8f-0b07-4f8e-8ff3-cdbb1dab11c7 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline d465a878-1ea5-4b72-b8a7-3a4122d1a482 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 8c39e981-f446-4fde-8744-5606c35a7fdf failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline df127bce-11af-4fae-b8bb-722cb0666484 failed.',)]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n",
"(pid=85426) The parameter 'presort' is deprecated and has no effect. It will be removed in v0.24. You can suppress this warning by not passing any value to the 'presort' parameter. We also recommend using HistGradientBoosting models instead.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 0985e11e-8db0-4c1c-9f34-3ce8fbc626c1 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 8977a9c0-dd79-4771-9dc1-455586b80947 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline c0238551-5fbb-41cd-8187-d3d23bc5571d failed.',)]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n"
]
}
],
"source": [
"fitted_pipeline, fitted_pipeline_result = search.search_fit(input_data=[dataset], time_limit=30)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"produce_results = search.produce(fitted_pipeline, [dataset])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>d3mIndex</th>\n",
" <th>species</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>145</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>146</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>147</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>148</th>\n",
" <td>148</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>149</th>\n",
" <td>149</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>150 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" d3mIndex species\n",
"0 0 Iris-setosa\n",
"1 1 Iris-setosa\n",
"2 2 Iris-setosa\n",
"3 3 Iris-setosa\n",
"4 4 Iris-setosa\n",
".. ... ...\n",
"145 145 Iris-virginica\n",
"146 146 Iris-virginica\n",
"147 147 Iris-virginica\n",
"148 148 Iris-virginica\n",
"149 149 Iris-virginica\n",
"\n",
"[150 rows x 2 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"produce_results.output"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Print information about the scores of the succeeded pipelines."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"----------------------------------------------------\n",
"Pipeline id: 676360d8-71ac-401c-b44a-31a810c4e8d3\n",
"Rank: 0.22667216466666668\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.773333 0.773333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 85d44359-0dac-4260-aea8-c78950025c3f\n",
"Rank: 0.33333446433333336\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.666667 0.666667 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 3efb07be-28ff-45d8-b1fb-1c49f96b3381\n",
"Rank: 0.6666653826666668\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: abd9eb99-a4ba-4210-bb34-c2dec7c3ccfa\n",
"Rank: 0.6666606186666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 8948a194-0dfe-4d07-a7c8-d1f5136f68c6\n",
"Rank: 0.21333939733333337\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.786667 0.786667 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 22866f54-ba68-49e5-8f84-a2a6aba98253\n",
"Rank: 0.16000235200000004\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.84 0.84 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 37a1c72a-9efd-4b0a-9d3d-811d47571b45\n",
"Rank: 0.6666753326666668\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 2d3cae0f-66f6-46e0-9fa5-128bf02b4d7e\n",
"Rank: 0.6666655736666668\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: d1e5a59d-be50-42f3-a71b-cf8ba59b3c47\n",
"Rank: 0.08666869166666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.913333 0.913333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 35d47611-bded-4669-9803-9d259f686ec1\n",
"Rank: 0.35999672099999996\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.64 0.64 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 7398d17f-e91f-4c75-9a95-c9f85763c858\n",
"Rank: 0.6666598006666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 5293503b-4cb6-4b8b-bf8e-8b9d981c3b03\n",
"Rank: 0.04666429966666663\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.953333 0.953333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 756e2a15-3315-4aa1-8620-f73ffc69f8a4\n",
"Rank: 0.6666748276666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 46633510-6f46-479e-982e-263aaa2e187a\n",
"Rank: 0.17999182400000005\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.82 0.82 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 49a750b0-5c86-4ff3-9b2d-c58c6390dd0d\n",
"Rank: 0.6666588986666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 84c24452-b2cf-41a2-813c-a135eaeef480\n",
"Rank: 0.36000324699999997\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.64 0.64 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 82117b6b-6960-48bb-b1f4-91355acf51d6\n",
"Rank: 0.026667331666666617\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.973333 0.973333 42 0\n"
]
}
],
"source": [
"for pipeline_result in search.history:\n",
"    print('-' * 52)\n",
"    print('Pipeline id:', pipeline_result.pipeline.id)\n",
"    print('Rank:', pipeline_result.rank)\n",
"    print(pipeline_result.scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

A full-stack automated machine learning system, focused on anomaly detection for multivariate time-series data. TODS provides comprehensive modules for building machine-learning-based anomaly detection systems, including: data processing, time series processing, feature analysis, detection algorithms, and a reinforcement module. The functionality these modules provide includes common data preprocessing, smoothing and transformation of time-series data, feature extraction from the time and frequency domains, and a wide variety of detection algorithms.
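To make the module pipeline above concrete, here is a minimal, self-contained sketch of the kind of detector such a system composes: time series processing (smoothing), feature analysis (residual extraction), and detection (thresholding). This is an illustrative example using only NumPy, not the TODS API; the function name and parameters are hypothetical.

```python
import numpy as np

def moving_average_residual_scores(series, window=5, threshold=3.0):
    """Score each point by its deviation from a trailing moving average.

    Returns (scores, flags): z-score-like residual magnitudes and a
    boolean anomaly mask where the score exceeds `threshold`.
    """
    series = np.asarray(series, dtype=float)
    # Time series processing: smooth with a centered moving average.
    kernel = np.ones(window) / window
    smoothed = np.convolve(series, kernel, mode="same")
    # Feature analysis: residual between the raw signal and its smoothed version.
    residual = series - smoothed
    # Detection: flag points whose residual is extreme relative to the rest.
    std = residual.std() or 1.0
    scores = np.abs(residual - residual.mean()) / std
    return scores, scores > threshold

# Usage: a sine wave with one injected spike; the spike should be flagged.
t = np.linspace(0, 4 * np.pi, 200)
signal = np.sin(t)
signal[100] += 5.0  # injected anomaly
scores, flags = moving_average_residual_scores(signal)
print(np.nonzero(flags)[0])
```

A real system would chain many such steps (and search over them automatically), but the data flow — preprocess, transform, extract features, score, threshold — is the same.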
