@@ -44,6 +44,70 @@ cd ..
There could be some missing dependencies that are not listed above. Please try to resolve them yourself if you encounter any.
# Examples
Examples are available in [/examples](examples/). For basic usage, you can evaluate a pipeline on a given dataset. Here, we provide an example that loads our default pipeline and evaluates it on a subset of the Yahoo dataset.
```
import pandas as pd

from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

table_path = 'datasets/yahoo_sub_5.csv' # The path of the dataset
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv'
target_index = 6 # Which column is the target

#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # F1 on both label 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
```
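The returned `pipeline_result` can be printed to inspect the evaluation scores, as the example script later in this change does:
```
# Printing the result shows the evaluation scores. If you only want the
# score table, the result's `scores` attribute (the same result type that
# `search.evaluate` returns in the AutoML example below) should hold it.
print(pipeline_result)
```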
We also provide AutoML support to help you automatically find a good pipeline for your data.
```
import pandas as pd

from axolotl.backend.simple import SimpleRunner

from tods.utils import generate_dataset_problem
from tods.search import BruteForceSearch

# Some information
table_path = 'datasets/yahoo_sub_5.csv' # The path of the dataset
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv'
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv'
target_index = 6 # Which column is the target
#target_index = 2
time_limit = 30 # How many seconds to search

#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # F1 on both label 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Start backend
backend = SimpleRunner(random_seed=0)

# Start search algorithm
search = BruteForceSearch(problem_description=problem_description, backend=backend)

# Find the best pipeline
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
best_pipeline = best_runtime.pipeline
best_output = best_pipeline_result.output

# Evaluate the best pipeline
best_scores = search.evaluate(best_pipeline).scores
```
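Once the search finishes, the objects above carry everything you need. A minimal sketch of inspecting the result, using the attribute names from the example plus the standard d3m `Pipeline.id` field:
```
# Inspect what the search found: the winning pipeline's id and its scores.
print('Best pipeline id:', best_pipeline.id)
print(best_scores)
```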
# Dataset
Datasets are located in `datasets/anomaly`. `raw_data` contains the raw time series data. `transform.py` is a script that transforms the raw data into D3M format. `template` includes some templates for generating D3M data. If you run `transform.py`, the script will load the raw `kpi` data and create a folder named `kpi` in D3M format.
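As a quick sanity check of the converted output, you can load the generated table with pandas. This is a minimal sketch; the `kpi` path below is an assumption, inferred from the `yahoo_sub_5` layout (`datasets/anomaly/<name>/<name>_dataset/tables/learningData.csv`) used elsewhere in this change.
```
import pandas as pd

# Hypothetical path, assuming transform.py mirrors the yahoo_sub_5 layout:
# datasets/anomaly/<name>/<name>_dataset/tables/learningData.csv
kpi_table = 'datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv'
df = pd.read_csv(kpi_table)
print(df.head())
```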
@@ -0,0 +1,23 @@
import pandas as pd

from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

table_path = 'datasets/yahoo_sub_5.csv' # The path of the dataset
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv'
target_index = 6 # Which column is the target

#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # F1 on both label 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
print(pipeline_result)
@@ -1,51 +0,0 @@
import uuid
import random
import pandas as pd
import json
from pprint import pprint

from sklearn.datasets import make_classification

from d3m import container
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

from axolotl.utils import data_problem
from axolotl.backend.simple import SimpleRunner
# from axolotl.backend.ray import RayRunner
# from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils

import tods
from tods.search import BruteForceSearch

table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv'
df = pd.read_csv(table_path)
dataset, problem_description = data_problem.generate_dataset_problem(df,
                                                                     target_index=7,
                                                                     task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
                                                                     performance_metrics=[{'metric': PerformanceMetric.F1}])
print(dataset)
print(problem_description)

metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}},
           ]

pipeline_path = 'example_pipeline.json'
pipeline = pipeline_utils.load_pipeline(pipeline_path)
print(pipeline)

data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
scoring_pipeline = schemas_utils.get_scoring_pipeline()
data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

backend = SimpleRunner(random_seed=0)
pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                            pipeline=pipeline,
                                            input_data=[dataset],
                                            metrics=metrics,
                                            data_preparation_pipeline=data_preparation_pipeline,
                                            scoring_pipeline=scoring_pipeline,
                                            data_preparation_params=data_preparation_params)
print(pipeline_result)
@@ -0,0 +1 @@
{"id": "384bbfab-4f6d-4001-9f90-684ea5681f5d", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-09-09T23:40:01.756164Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "b94ee59ccf8db678d506adddbc238fb2049fb664a1e3f3f3f6a6517c0c4f8e5f"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "256f0155c7185d747b3b23096e46c40d15844106f9ed6346453f6010891f1896"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "642de2e7-5590-3cab-9266-2a53c326c461", "version": "0.0.1", "python_path": "d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler", "name": "Axis_wise_scale"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "eaff2f35-978c-4530-a12e-061a5f0beacd", "version": "0.1.0", "python_path": "d3m.primitives.tods.feature_analysis.statistical_mean", "name": "Time Series Decompostional", "digest": "2f2a8c07878643fe29c346096b91b5ba91477baa1e7e78684f07e53d29766ca4"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "67e7fcdf-d645-3417-9aa4-85cd369487d9", "version": "0.0.1", "python_path": "d3m.primitives.tods.detection_algorithm.pyod_vae", "name": "TODS.anomaly_detection_primitives.VariationalAutoEncoder"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": 
"d5384857f75090844f367504befb1a854e5088589f6aae0795f66ccf10403e19"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "8d969800816d9596e94cb045aacce43dc3d49e8c5bedb403e35af6c9b8339990"} |
@@ -0,0 +1,10 @@
import os

resource_dir = os.path.dirname(__file__)
DEFAULT_PIPELINE_DIR = os.path.join(resource_dir, 'resources', 'default_pipeline.json')

def load_default_pipeline():
    """Load the default pipeline shipped in the package resources."""
    from axolotl.utils import pipeline as pipeline_utils
    pipeline = pipeline_utils.load_pipeline(DEFAULT_PIPELINE_DIR)
    return pipeline
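For reference, this helper is what the README example above calls; a minimal usage sketch mirroring that example:
```
from tods import schemas as schemas_utils

# Load the packaged default pipeline (a d3m Pipeline object) and inspect it.
pipeline = schemas_utils.load_default_pipeline()
print(pipeline)
```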
@@ -30,3 +30,22 @@ def generate_dataset_problem(df, target_index, metric):
    return dataset, problem_description
def evaluate_pipeline(problem_description, dataset, pipeline):
    from axolotl.utils import schemas as schemas_utils
    from axolotl.backend.simple import SimpleRunner

    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    scoring_pipeline = schemas_utils.get_scoring_pipeline()
    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

    metrics = problem_description['problem']['performance_metrics']

    backend = SimpleRunner(random_seed=0)
    pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                                pipeline=pipeline,
                                                input_data=[dataset],
                                                metrics=metrics,
                                                data_preparation_pipeline=data_preparation_pipeline,
                                                scoring_pipeline=scoring_pipeline,
                                                data_preparation_params=data_preparation_params)
    return pipeline_result