diff --git a/README.md b/README.md
index 202a0dd..571ef23 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,70 @@ cd ..
 
 There could be some missing dependencies that are not listed above. Try to fix it by yourself if you meet any.
 
+# Examples
+Examples are available in [/examples](examples/). For basic usage, you can evaluate a pipeline on a given dataset. Here, we provide an example that loads our default pipeline and evaluates it on a subset of the Yahoo dataset.
+```
+import pandas as pd
+
+from tods import schemas as schemas_utils
+from tods.utils import generate_dataset_problem, evaluate_pipeline
+
+table_path = 'datasets/yahoo_sub_5.csv'
+target_index = 6 # what column is the target
+#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
+time_limit = 30 # How many seconds to search
+#metric = 'F1' # F1 on label 1
+metric = 'F1_MACRO' # F1 on both label 0 and 1
+
+# Read data and generate dataset and problem
+df = pd.read_csv(table_path)
+dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
+
+# Load the default pipeline
+pipeline = schemas_utils.load_default_pipeline()
+
+# Run the pipeline
+pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
+```
+We also provide AutoML support to help you automatically find a good pipeline for your data.
+```
+import pandas as pd
+
+from axolotl.backend.simple import SimpleRunner
+
+from tods.utils import generate_dataset_problem
+from tods.search import BruteForceSearch
+
+# Some information
+#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # The path of the dataset
+#target_index = 2 # what column is the target
+
+table_path = 'datasets/yahoo_sub_5.csv'
+target_index = 6 # what column is the target
+#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
+time_limit = 30 # How many seconds to search
+#metric = 'F1' # F1 on label 1
+metric = 'F1_MACRO' # F1 on both label 0 and 1
+
+# Read data and generate dataset and problem
+df = pd.read_csv(table_path)
+dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
+
+# Start backend
+backend = SimpleRunner(random_seed=0)
+
+# Start search algorithm
+search = BruteForceSearch(problem_description=problem_description, backend=backend)
+
+# Find the best pipeline
+best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
+best_pipeline = best_runtime.pipeline
+best_output = best_pipeline_result.output
+
+# Evaluate the best pipeline
+best_scores = search.evaluate(best_pipeline).scores
+
 # Dataset
 Datasets are located in `datasets/anomaly`. `raw_data` is the raw time series data. `transform.py` is script to transform the raw data to D3M format. `template` includes some templates for generating D3M data. If you run `transform.py`, the script will load the raw `kpi` data and create a folder named `kpi` in D3M format.
diff --git a/examples/evaluate_default_pipeline.py b/examples/evaluate_default_pipeline.py
new file mode 100644
index 0000000..0bf9e3e
--- /dev/null
+++ b/examples/evaluate_default_pipeline.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+from tods import schemas as schemas_utils
+from tods.utils import generate_dataset_problem, evaluate_pipeline
+
+table_path = 'datasets/yahoo_sub_5.csv'
+target_index = 6 # what column is the target
+#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
+time_limit = 30 # How many seconds to search
+#metric = 'F1' # F1 on label 1
+metric = 'F1_MACRO' # F1 on both label 0 and 1
+
+# Read data and generate dataset and problem
+df = pd.read_csv(table_path)
+dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
+
+# Load the default pipeline
+pipeline = schemas_utils.load_default_pipeline()
+
+# Run the pipeline
+pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
+print(pipeline_result)
+
diff --git a/examples/run_predefined_pipeline.py b/examples/run_predefined_pipeline.py
deleted file mode 100644
index 0fb6779..0000000
--- a/examples/run_predefined_pipeline.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import uuid
-import random
-import pandas as pd
-import json
-from pprint import pprint
-from sklearn.datasets import make_classification
-
-from d3m import container
-from d3m.metadata.pipeline import Pipeline
-from d3m.metadata.problem import TaskKeyword, PerformanceMetric
-
-from axolotl.utils import data_problem
-from axolotl.backend.simple import SimpleRunner
-# from axolotl.backend.ray import RayRunner
-# from axolotl.algorithms.base import PipelineSearchBase
-from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils
-
-import tods
-from tods.search import BruteForceSearch
-
-table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv'
-df = pd.read_csv(table_path)
-dataset, problem_description = data_problem.generate_dataset_problem(df,
-                                                                     target_index=7,
-                                                                     task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
-                                                                     performance_metrics=[{'metric': PerformanceMetric.F1}])
-
-print(dataset)
-print(problem_description)
-
-metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}},
-           ]
-
-pipeline_path = 'example_pipeline.json'
-pipeline = pipeline_utils.load_pipeline(pipeline_path)
-print(pipeline)
-
-data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
-scoring_pipeline = schemas_utils.get_scoring_pipeline()
-data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
-
-backend = SimpleRunner(random_seed=0)
-pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
-                                            pipeline=pipeline,
-                                            input_data=[dataset],
-                                            metrics=metrics,
-                                            data_preparation_pipeline=data_preparation_pipeline,
-                                            scoring_pipeline=scoring_pipeline,
-                                            data_preparation_params=data_preparation_params)
-print(pipeline_result)
-
diff --git a/tods/tods/resources/default_pipeline.json b/tods/tods/resources/default_pipeline.json
new file mode 100644
index 0000000..bff2c98
--- /dev/null
+++ b/tods/tods/resources/default_pipeline.json
@@ -0,0 +1 @@
+{"id": "384bbfab-4f6d-4001-9f90-684ea5681f5d", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-09-09T23:40:01.756164Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "b94ee59ccf8db678d506adddbc238fb2049fb664a1e3f3f3f6a6517c0c4f8e5f"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "256f0155c7185d747b3b23096e46c40d15844106f9ed6346453f6010891f1896"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "642de2e7-5590-3cab-9266-2a53c326c461", "version": "0.0.1", "python_path": "d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler", "name": "Axis_wise_scale"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "eaff2f35-978c-4530-a12e-061a5f0beacd", "version": "0.1.0", "python_path": "d3m.primitives.tods.feature_analysis.statistical_mean", "name": "Time Series Decompostional", "digest": "2f2a8c07878643fe29c346096b91b5ba91477baa1e7e78684f07e53d29766ca4"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "67e7fcdf-d645-3417-9aa4-85cd369487d9", "version": "0.0.1", "python_path": "d3m.primitives.tods.detection_algorithm.pyod_vae", "name": "TODS.anomaly_detection_primitives.VariationalAutoEncoder"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "d5384857f75090844f367504befb1a854e5088589f6aae0795f66ccf10403e19"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "8d969800816d9596e94cb045aacce43dc3d49e8c5bedb403e35af6c9b8339990"}
diff --git a/tods/tods/schemas.py b/tods/tods/schemas.py
new file mode 100644
index 0000000..02d76ad
--- /dev/null
+++ b/tods/tods/schemas.py
@@ -0,0 +1,10 @@
+import os
+
+resource_dir = os.path.dirname(__file__)
+
+DEFAULT_PIPELINE_DIR = os.path.join(resource_dir, 'resources', 'default_pipeline.json')
+
+def load_default_pipeline():
+    from axolotl.utils import pipeline as pipeline_utils
+    pipeline = pipeline_utils.load_pipeline(DEFAULT_PIPELINE_DIR)
+    return pipeline
diff --git a/tods/tods/utils.py b/tods/tods/utils.py
index 6e3af4f..e375354 100644
--- a/tods/tods/utils.py
+++ b/tods/tods/utils.py
@@ -30,3 +30,22 @@ def generate_dataset_problem(df, target_index, metric):
 
     return dataset, problem_description
 
+def evaluate_pipeline(problem_description, dataset, pipeline):
+    from axolotl.utils import schemas as schemas_utils
+    from axolotl.backend.simple import SimpleRunner
+    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
+    scoring_pipeline = schemas_utils.get_scoring_pipeline()
+    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
+    metrics = problem_description['problem']['performance_metrics']
+
+    backend = SimpleRunner(random_seed=0)
+    pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
+                                                pipeline=pipeline,
+                                                input_data=[dataset],
+                                                metrics=metrics,
+                                                data_preparation_pipeline=data_preparation_pipeline,
+                                                scoring_pipeline=scoring_pipeline,
+                                                data_preparation_params=data_preparation_params)
+    return pipeline_result
+
+