Refine examples

Daochen Zha committed f6e6a38012 on master · 4 years ago
6 changed files with 117 additions and 51 deletions
  1. README.md (+64 -0)
  2. examples/evaluate_default_pipeline.py (+23 -0)
  3. examples/run_predefined_pipeline.py (+0 -51)
  4. tods/tods/resources/default_pipeline.json (+1 -0)
  5. tods/tods/schemas.py (+10 -0)
  6. tods/tods/utils.py (+19 -0)

README.md (+64 -0)

@@ -44,6 +44,70 @@ cd ..

Some dependencies may be missing from the list above; if you encounter one, please install it manually.

# Examples
Examples are available in [/examples](examples/). For basic usage, you can evaluate a pipeline on a given dataset. The example below loads our default pipeline and evaluates it on a subset of the Yahoo dataset.
```python
import pandas as pd

from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

table_path = 'datasets/yahoo_sub_5.csv'  # Path of the dataset
target_index = 6  # Index of the target column
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv'  # Alternative dataset
#metric = 'F1'  # F1 on label 1 only
metric = 'F1_MACRO'  # F1 on both label 0 and label 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
```
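
The returned `pipeline_result` carries the evaluation outcome. A minimal sketch of inspecting it, assuming the result object exposes a `scores` attribute (the search example below reads `search.evaluate(...).scores`, so this is a reasonable guess):
```python
# Print the raw result, then the metric table (assumed attribute)
print(pipeline_result)
print(pipeline_result.scores)
```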
We also provide AutoML support to help you automatically find a good pipeline for your data.
```python
import pandas as pd

from axolotl.backend.simple import SimpleRunner

from tods.utils import generate_dataset_problem
from tods.search import BruteForceSearch

table_path = 'datasets/yahoo_sub_5.csv'  # Path of the dataset
target_index = 6  # Index of the target column
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv'  # Alternative dataset
#target_index = 2
time_limit = 30  # Search time budget, in seconds
#metric = 'F1'  # F1 on label 1 only
metric = 'F1_MACRO'  # F1 on both label 0 and label 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Start backend
backend = SimpleRunner(random_seed=0)

# Start search algorithm
search = BruteForceSearch(problem_description=problem_description, backend=backend)

# Find the best pipeline
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
best_pipeline = best_runtime.pipeline
best_output = best_pipeline_result.output

# Evaluate the best pipeline
best_scores = search.evaluate(best_pipeline).scores
```
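
To reuse the discovered pipeline later, you can serialize it. A minimal sketch, assuming `best_pipeline` is a standard `d3m` `Pipeline` object providing `to_json()`:
```python
# Persist the best pipeline description; it can then be re-loaded,
# e.g. with axolotl's pipeline_utils.load_pipeline (assumed workflow)
with open('best_pipeline.json', 'w') as f:
    f.write(best_pipeline.to_json())
```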

# Dataset
Datasets are located in `datasets/anomaly`. `raw_data` contains the raw time series data, `transform.py` is a script that transforms the raw data into D3M format, and `template` includes templates for generating D3M data. Running `transform.py` loads the raw `kpi` data and creates a folder named `kpi` in D3M format.
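
A minimal sketch of invoking the conversion, assuming `transform.py` takes no arguments and is run from inside `datasets/anomaly`:
```python
import subprocess
import sys

# Run the converter from datasets/anomaly; it is assumed to read the raw
# `kpi` data and create a D3M-formatted folder named `kpi` next to it.
subprocess.run([sys.executable, 'transform.py'], cwd='datasets/anomaly', check=True)
```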



examples/evaluate_default_pipeline.py (+23 -0)

@@ -0,0 +1,23 @@
import pandas as pd

from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

table_path = 'datasets/yahoo_sub_5.csv'  # Path of the dataset
target_index = 6  # Index of the target column
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv'  # Alternative dataset
#metric = 'F1'  # F1 on label 1 only
metric = 'F1_MACRO'  # F1 on both label 0 and label 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
print(pipeline_result)


examples/run_predefined_pipeline.py (+0 -51)

@@ -1,51 +0,0 @@
import uuid
import random
import pandas as pd
import json
from pprint import pprint
from sklearn.datasets import make_classification

from d3m import container
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

from axolotl.utils import data_problem
from axolotl.backend.simple import SimpleRunner
# from axolotl.backend.ray import RayRunner
# from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils

import tods
from tods.search import BruteForceSearch

table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv'
df = pd.read_csv(table_path)
dataset, problem_description = data_problem.generate_dataset_problem(df,
                                                                     target_index=7,
                                                                     task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
                                                                     performance_metrics=[{'metric': PerformanceMetric.F1}])

print(dataset)
print(problem_description)

metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]

pipeline_path = 'example_pipeline.json'
pipeline = pipeline_utils.load_pipeline(pipeline_path)
print(pipeline)

data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
scoring_pipeline = schemas_utils.get_scoring_pipeline()
data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

backend = SimpleRunner(random_seed=0)
pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                            pipeline=pipeline,
                                            input_data=[dataset],
                                            metrics=metrics,
                                            data_preparation_pipeline=data_preparation_pipeline,
                                            scoring_pipeline=scoring_pipeline,
                                            data_preparation_params=data_preparation_params)
print(pipeline_result)


tods/tods/resources/default_pipeline.json (+1 -0)

@@ -0,0 +1 @@
{"id": "384bbfab-4f6d-4001-9f90-684ea5681f5d", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-09-09T23:40:01.756164Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "b94ee59ccf8db678d506adddbc238fb2049fb664a1e3f3f3f6a6517c0c4f8e5f"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "256f0155c7185d747b3b23096e46c40d15844106f9ed6346453f6010891f1896"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "642de2e7-5590-3cab-9266-2a53c326c461", "version": "0.0.1", "python_path": "d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler", "name": "Axis_wise_scale"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "eaff2f35-978c-4530-a12e-061a5f0beacd", "version": "0.1.0", "python_path": "d3m.primitives.tods.feature_analysis.statistical_mean", "name": "Time Series Decompostional", "digest": "2f2a8c07878643fe29c346096b91b5ba91477baa1e7e78684f07e53d29766ca4"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "67e7fcdf-d645-3417-9aa4-85cd369487d9", "version": "0.0.1", "python_path": "d3m.primitives.tods.detection_algorithm.pyod_vae", "name": "TODS.anomaly_detection_primitives.VariationalAutoEncoder"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": 
"d5384857f75090844f367504befb1a854e5088589f6aae0795f66ccf10403e19"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "8d969800816d9596e94cb045aacce43dc3d49e8c5bedb403e35af6c9b8339990"}

tods/tods/schemas.py (+10 -0)

@@ -0,0 +1,10 @@
import os

resource_dir = os.path.dirname(__file__)

DEFAULT_PIPELINE_DIR = os.path.join(resource_dir, 'resources', 'default_pipeline.json')

def load_default_pipeline():
    """Load the default pipeline shipped at resources/default_pipeline.json."""
    from axolotl.utils import pipeline as pipeline_utils
    pipeline = pipeline_utils.load_pipeline(DEFAULT_PIPELINE_DIR)
    return pipeline

tods/tods/utils.py (+19 -0)

@@ -30,3 +30,22 @@ def generate_dataset_problem(df, target_index, metric):

return dataset, problem_description

def evaluate_pipeline(problem_description, dataset, pipeline):
    """Evaluate a pipeline on a dataset with the default scoring setup (no split)."""
    from axolotl.utils import schemas as schemas_utils
    from axolotl.backend.simple import SimpleRunner

    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    scoring_pipeline = schemas_utils.get_scoring_pipeline()
    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
    metrics = problem_description['problem']['performance_metrics']

    backend = SimpleRunner(random_seed=0)
    pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                                pipeline=pipeline,
                                                input_data=[dataset],
                                                metrics=metrics,
                                                data_preparation_pipeline=data_preparation_pipeline,
                                                scoring_pipeline=scoring_pipeline,
                                                data_preparation_params=data_preparation_params)
    return pipeline_result


