Refine examples

Commit f6e6a38012 by Daochen Zha on branch master, 4 years ago
6 changed files with 117 additions and 51 deletions

1. README.md (+64, -0)
2. examples/evaluate_default_pipeline.py (+23, -0)
3. examples/run_predefined_pipeline.py (+0, -51)
4. tods/tods/resources/default_pipeline.json (+1, -0)
5. tods/tods/schemas.py (+10, -0)
6. tods/tods/utils.py (+19, -0)

README.md (+64, -0)

@@ -44,6 +44,70 @@ cd ..


There could be some missing dependencies that are not listed above. Try to fix them yourself if you run into any.


# Examples
Examples are available in [/examples](examples/). For basic usage, you can evaluate a pipeline on a given dataset. Here, we provide an example that loads our default pipeline and evaluates it on a subset of the Yahoo dataset.
```
import pandas as pd

from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

table_path = 'datasets/yahoo_sub_5.csv'
target_index = 6 # Index of the target column
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # Macro-averaged F1 over labels 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
```
We also provide AutoML support to help you automatically find a good pipeline for your data.
```
import pandas as pd

from axolotl.backend.simple import SimpleRunner

from tods.utils import generate_dataset_problem
from tods.search import BruteForceSearch

# Some information
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # The path of the dataset
#target_index = 2 # Index of the target column

table_path = 'datasets/yahoo_sub_5.csv'
target_index = 6 # Index of the target column
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
time_limit = 30 # Time budget for the search, in seconds
#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # Macro-averaged F1 over labels 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Start backend
backend = SimpleRunner(random_seed=0)

# Start search algorithm
search = BruteForceSearch(problem_description=problem_description, backend=backend)

# Find the best pipeline
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
best_pipeline = best_runtime.pipeline
best_output = best_pipeline_result.output

# Evaluate the best pipeline
best_scores = search.evaluate(best_pipeline).scores
```

# Dataset
Datasets are located in `datasets/anomaly`. `raw_data` is the raw time series data. `transform.py` is a script that transforms the raw data into D3M format. `template` includes some templates for generating D3M data. If you run `transform.py`, the script will load the raw `kpi` data and create a folder named `kpi` in D3M format.
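For instance, a minimal sketch of driving the conversion from Python (running `python transform.py` directly inside `datasets/anomaly` works just as well; the paths here are assumptions based on the layout above):
```
import subprocess

# Run the conversion script from its own directory so its relative paths resolve.
subprocess.run(['python', 'transform.py'], cwd='datasets/anomaly', check=True)

# On success, datasets/anomaly/kpi/ holds the kpi data in D3M format.
```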




examples/evaluate_default_pipeline.py (+23, -0)

@@ -0,0 +1,23 @@
import pandas as pd

from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

table_path = 'datasets/yahoo_sub_5.csv'
target_index = 6 # Index of the target column
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # Macro-averaged F1 over labels 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
print(pipeline_result)


examples/run_predefined_pipeline.py (+0, -51)

@@ -1,51 +0,0 @@
import uuid
import random
import pandas as pd
import json
from pprint import pprint
from sklearn.datasets import make_classification

from d3m import container
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

from axolotl.utils import data_problem
from axolotl.backend.simple import SimpleRunner
# from axolotl.backend.ray import RayRunner
# from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils

import tods
from tods.search import BruteForceSearch

table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv'
df = pd.read_csv(table_path)
dataset, problem_description = data_problem.generate_dataset_problem(df,
                                                                      target_index=7,
                                                                      task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
                                                                      performance_metrics=[{'metric': PerformanceMetric.F1}])

print(dataset)
print(problem_description)

metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}},
           ]

pipeline_path = 'example_pipeline.json'
pipeline = pipeline_utils.load_pipeline(pipeline_path)
print(pipeline)

data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
scoring_pipeline = schemas_utils.get_scoring_pipeline()
data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

backend = SimpleRunner(random_seed=0)
pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                            pipeline=pipeline,
                                            input_data=[dataset],
                                            metrics=metrics,
                                            data_preparation_pipeline=data_preparation_pipeline,
                                            scoring_pipeline=scoring_pipeline,
                                            data_preparation_params=data_preparation_params)
print(pipeline_result)


tods/tods/resources/default_pipeline.json (+1, -0)

@@ -0,0 +1 @@
{"id": "384bbfab-4f6d-4001-9f90-684ea5681f5d", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-09-09T23:40:01.756164Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "b94ee59ccf8db678d506adddbc238fb2049fb664a1e3f3f3f6a6517c0c4f8e5f"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "256f0155c7185d747b3b23096e46c40d15844106f9ed6346453f6010891f1896"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "642de2e7-5590-3cab-9266-2a53c326c461", "version": "0.0.1", "python_path": "d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler", "name": "Axis_wise_scale"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "eaff2f35-978c-4530-a12e-061a5f0beacd", "version": "0.1.0", "python_path": "d3m.primitives.tods.feature_analysis.statistical_mean", "name": "Time Series Decompostional", "digest": "2f2a8c07878643fe29c346096b91b5ba91477baa1e7e78684f07e53d29766ca4"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "67e7fcdf-d645-3417-9aa4-85cd369487d9", "version": "0.0.1", "python_path": "d3m.primitives.tods.detection_algorithm.pyod_vae", "name": "TODS.anomaly_detection_primitives.VariationalAutoEncoder"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": 
"d5384857f75090844f367504befb1a854e5088589f6aae0795f66ccf10403e19"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "8d969800816d9596e94cb045aacce43dc3d49e8c5bedb403e35af6c9b8339990"}

tods/tods/schemas.py (+10, -0)

@@ -0,0 +1,10 @@
import os

resource_dir = os.path.dirname(__file__)

DEFAULT_PIPELINE_DIR = os.path.join(resource_dir, 'resources', 'default_pipeline.json')

def load_default_pipeline():
    # Import lazily so that merely importing tods.schemas does not require axolotl.
    from axolotl.utils import pipeline as pipeline_utils
    pipeline = pipeline_utils.load_pipeline(DEFAULT_PIPELINE_DIR)
    return pipeline

tods/tods/utils.py (+19, -0)

@@ -30,3 +30,22 @@ def generate_dataset_problem(df, target_index, metric):


    return dataset, problem_description


def evaluate_pipeline(problem_description, dataset, pipeline):
    from axolotl.utils import schemas as schemas_utils
    from axolotl.backend.simple import SimpleRunner
    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    scoring_pipeline = schemas_utils.get_scoring_pipeline()
    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
    metrics = problem_description['problem']['performance_metrics']

    backend = SimpleRunner(random_seed=0)
    pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                                pipeline=pipeline,
                                                input_data=[dataset],
                                                metrics=metrics,
                                                data_preparation_pipeline=data_preparation_pipeline,
                                                scoring_pipeline=scoring_pipeline,
                                                data_preparation_params=data_preparation_params)
    return pipeline_result


