Refine examples

Daochen Zha committed f6e6a38012 on master · 4 years ago
6 changed files with 117 additions and 51 deletions
  1. README.md (+64 -0)
  2. examples/evaluate_default_pipeline.py (+23 -0)
  3. examples/run_predefined_pipeline.py (+0 -51)
  4. tods/tods/resources/default_pipeline.json (+1 -0)
  5. tods/tods/schemas.py (+10 -0)
  6. tods/tods/utils.py (+19 -0)

README.md (+64 -0)

@@ -44,6 +44,70 @@ cd ..

Some dependencies may be missing from the list above; if you encounter one, please install it manually.

# Examples
Examples are available in [/examples](examples/). For basic usage, you can evaluate a pipeline on a given dataset. The example below loads our default pipeline and evaluates it on a subset of the Yahoo dataset.
```python
import pandas as pd

from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

table_path = 'datasets/yahoo_sub_5.csv'  # Path of the dataset
target_index = 6  # Index of the target column
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv'  # Alternative dataset
#metric = 'F1'  # F1 on label 1 only
metric = 'F1_MACRO'  # F1 on both label 0 and label 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
```
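
The returned `pipeline_result` carries the evaluation outcome. A minimal sketch of inspecting it, assuming the result object exposes a `scores` attribute (the search example below reads `search.evaluate(...).scores`, so this is a reasonable guess):
```python
# Print the raw result, then the metric table (assumed attribute)
print(pipeline_result)
print(pipeline_result.scores)
```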
We also provide AutoML support to help you automatically find a good pipeline for your data.
```python
import pandas as pd

from axolotl.backend.simple import SimpleRunner

from tods.utils import generate_dataset_problem
from tods.search import BruteForceSearch

table_path = 'datasets/yahoo_sub_5.csv'  # Path of the dataset
target_index = 6  # Index of the target column
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv'  # Alternative dataset
#target_index = 2
time_limit = 30  # Search time budget, in seconds
#metric = 'F1'  # F1 on label 1 only
metric = 'F1_MACRO'  # F1 on both label 0 and label 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Start backend
backend = SimpleRunner(random_seed=0)

# Start search algorithm
search = BruteForceSearch(problem_description=problem_description, backend=backend)

# Find the best pipeline
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
best_pipeline = best_runtime.pipeline
best_output = best_pipeline_result.output

# Evaluate the best pipeline
best_scores = search.evaluate(best_pipeline).scores
```
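
To reuse the discovered pipeline later, you can serialize it. A minimal sketch, assuming `best_pipeline` is a standard `d3m` `Pipeline` object providing `to_json()`:
```python
# Persist the best pipeline description; it can then be re-loaded,
# e.g. with axolotl's pipeline_utils.load_pipeline (assumed workflow)
with open('best_pipeline.json', 'w') as f:
    f.write(best_pipeline.to_json())
```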

# Dataset
Datasets are located in `datasets/anomaly`. `raw_data` contains the raw time series data, `transform.py` is a script that transforms the raw data into D3M format, and `template` includes templates for generating D3M data. Running `transform.py` loads the raw `kpi` data and creates a folder named `kpi` in D3M format.
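
A minimal sketch of invoking the conversion, assuming `transform.py` takes no arguments and is run from inside `datasets/anomaly`:
```python
import subprocess
import sys

# Run the converter from datasets/anomaly; it is assumed to read the raw
# `kpi` data and create a D3M-formatted folder named `kpi` next to it.
subprocess.run([sys.executable, 'transform.py'], cwd='datasets/anomaly', check=True)
```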



examples/evaluate_default_pipeline.py (+23 -0)

@@ -0,0 +1,23 @@
import pandas as pd

from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

table_path = 'datasets/yahoo_sub_5.csv'  # Path of the dataset
target_index = 6  # Index of the target column
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv'  # Alternative dataset
#metric = 'F1'  # F1 on label 1 only
metric = 'F1_MACRO'  # F1 on both label 0 and label 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
print(pipeline_result)


examples/run_predefined_pipeline.py (+0 -51)

@@ -1,51 +0,0 @@
import uuid
import random
import pandas as pd
import json
from pprint import pprint
from sklearn.datasets import make_classification

from d3m import container
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

from axolotl.utils import data_problem
from axolotl.backend.simple import SimpleRunner
# from axolotl.backend.ray import RayRunner
# from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils

import tods
from tods.search import BruteForceSearch

table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv'
df = pd.read_csv(table_path)
dataset, problem_description = data_problem.generate_dataset_problem(df,
                                                                     target_index=7,
                                                                     task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
                                                                     performance_metrics=[{'metric': PerformanceMetric.F1}])

print(dataset)
print(problem_description)

metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]

pipeline_path = 'example_pipeline.json'
pipeline = pipeline_utils.load_pipeline(pipeline_path)
print(pipeline)

data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
scoring_pipeline = schemas_utils.get_scoring_pipeline()
data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

backend = SimpleRunner(random_seed=0)
pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                            pipeline=pipeline,
                                            input_data=[dataset],
                                            metrics=metrics,
                                            data_preparation_pipeline=data_preparation_pipeline,
                                            scoring_pipeline=scoring_pipeline,
                                            data_preparation_params=data_preparation_params)
print(pipeline_result)


tods/tods/resources/default_pipeline.json (+1 -0)

@@ -0,0 +1 @@
{"id": "384bbfab-4f6d-4001-9f90-684ea5681f5d", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-09-09T23:40:01.756164Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "b94ee59ccf8db678d506adddbc238fb2049fb664a1e3f3f3f6a6517c0c4f8e5f"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "256f0155c7185d747b3b23096e46c40d15844106f9ed6346453f6010891f1896"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "642de2e7-5590-3cab-9266-2a53c326c461", "version": "0.0.1", "python_path": "d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler", "name": "Axis_wise_scale"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "eaff2f35-978c-4530-a12e-061a5f0beacd", "version": "0.1.0", "python_path": "d3m.primitives.tods.feature_analysis.statistical_mean", "name": "Time Series Decompostional", "digest": "2f2a8c07878643fe29c346096b91b5ba91477baa1e7e78684f07e53d29766ca4"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "67e7fcdf-d645-3417-9aa4-85cd369487d9", "version": "0.0.1", "python_path": "d3m.primitives.tods.detection_algorithm.pyod_vae", "name": "TODS.anomaly_detection_primitives.VariationalAutoEncoder"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": 
"d5384857f75090844f367504befb1a854e5088589f6aae0795f66ccf10403e19"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "8d969800816d9596e94cb045aacce43dc3d49e8c5bedb403e35af6c9b8339990"}

tods/tods/schemas.py (+10 -0)

@@ -0,0 +1,10 @@
import os

resource_dir = os.path.dirname(__file__)

DEFAULT_PIPELINE_DIR = os.path.join(resource_dir, 'resources', 'default_pipeline.json')

def load_default_pipeline():
    """Load the default pipeline shipped at resources/default_pipeline.json."""
    from axolotl.utils import pipeline as pipeline_utils
    pipeline = pipeline_utils.load_pipeline(DEFAULT_PIPELINE_DIR)
    return pipeline

tods/tods/utils.py (+19 -0)

@@ -30,3 +30,22 @@ def generate_dataset_problem(df, target_index, metric):

return dataset, problem_description

def evaluate_pipeline(problem_description, dataset, pipeline):
    """Evaluate a pipeline on a dataset with the default scoring setup (no split)."""
    from axolotl.utils import schemas as schemas_utils
    from axolotl.backend.simple import SimpleRunner

    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    scoring_pipeline = schemas_utils.get_scoring_pipeline()
    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
    metrics = problem_description['problem']['performance_metrics']

    backend = SimpleRunner(random_seed=0)
    pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                                pipeline=pipeline,
                                                input_data=[dataset],
                                                metrics=metrics,
                                                data_preparation_pipeline=data_preparation_pipeline,
                                                scoring_pipeline=scoring_pipeline,
                                                data_preparation_params=data_preparation_params)
    return pipeline_result


