From f6e6a380125233d8a600c00f7d6d224f6d6651e9 Mon Sep 17 00:00:00 2001
From: Daochen Zha
Date: Wed, 9 Sep 2020 19:09:30 -0500
Subject: [PATCH] Refine examples
---
 README.md                                 | 64 +++++++++++++++++++++++++++++++
 examples/evaluate_default_pipeline.py     | 23 +++++++++++
 examples/run_predefined_pipeline.py       | 51 ------------------------
 tods/tods/resources/default_pipeline.json |  1 +
 tods/tods/schemas.py                      | 10 +++++
 tods/tods/utils.py                        | 19 +++++++++
 6 files changed, 117 insertions(+), 51 deletions(-)
 create mode 100644 examples/evaluate_default_pipeline.py
 delete mode 100644 examples/run_predefined_pipeline.py
 create mode 100644 tods/tods/resources/default_pipeline.json
 create mode 100644 tods/tods/schemas.py

diff --git a/README.md b/README.md
index 202a0dd..571ef23 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,70 @@ cd ..
 
 There could be some missing dependencies that are not listed above. Try to fix them yourself if you meet any.
 
+# Examples
+Examples are available in [/examples](examples/). For basic usage, you can evaluate a pipeline on a given dataset. Here, we provide an example that loads our default pipeline and evaluates it on a subset of the Yahoo dataset.
+```
+import pandas as pd
+
+from tods import schemas as schemas_utils
+from tods.utils import generate_dataset_problem, evaluate_pipeline
+
+table_path = 'datasets/yahoo_sub_5.csv'
+target_index = 6 # which column is the target
+#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # the path of the dataset
+time_limit = 30 # time budget in seconds (only used when searching for pipelines)
+#metric = 'F1' # F1 on label 1
+metric = 'F1_MACRO' # F1 on both label 0 and label 1
+
+# Read data and generate dataset and problem
+df = pd.read_csv(table_path)
+dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
+
+# Load the default pipeline
+pipeline = schemas_utils.load_default_pipeline()
+
+# Run the pipeline
+pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
+```
+We also provide AutoML support to help you automatically find a good pipeline for your data.
+```
+import pandas as pd
+
+from axolotl.backend.simple import SimpleRunner
+
+from tods.utils import generate_dataset_problem
+from tods.search import BruteForceSearch
+
+# Some information
+#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # the path of the dataset
+#target_index = 2 # which column is the target
+
+table_path = 'datasets/yahoo_sub_5.csv'
+target_index = 6 # which column is the target
+#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # the path of the dataset
+time_limit = 30 # how many seconds to search
+#metric = 'F1' # F1 on label 1
+metric = 'F1_MACRO' # F1 on both label 0 and label 1
+
+# Read data and generate dataset and problem
+df = pd.read_csv(table_path)
+dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
+
+# Start backend
+backend = SimpleRunner(random_seed=0)
+
+# Start search algorithm
+search = BruteForceSearch(problem_description=problem_description, backend=backend)
+
+# Find the best pipeline
+best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
+best_pipeline = best_runtime.pipeline
+best_output = best_pipeline_result.output
+
+# Evaluate the best pipeline
+best_scores = search.evaluate(best_pipeline).scores
+```
+
 # Dataset
 Datasets are located in `datasets/anomaly`. `raw_data` is the raw time series data. `transform.py` is a script to transform the raw data to D3M format. `template` includes some templates for generating D3M data. If you run `transform.py`, the script will load the raw `kpi` data and create a folder named `kpi` in D3M format.
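Note: the `best_pipeline` found by the AutoML search in the README example above can also be scored with the `evaluate_pipeline` helper that this patch adds to `tods/tods/utils.py`. A minimal sketch, assuming the search snippet has already run in the same session:
```
from tods.utils import evaluate_pipeline

# `problem_description`, `dataset`, and `best_pipeline` come from the AutoML snippet above
pipeline_result = evaluate_pipeline(problem_description, dataset, best_pipeline)
print(pipeline_result)
```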
diff --git a/examples/evaluate_default_pipeline.py b/examples/evaluate_default_pipeline.py
new file mode 100644
index 0000000..0bf9e3e
--- /dev/null
+++ b/examples/evaluate_default_pipeline.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+from tods import schemas as schemas_utils
+from tods.utils import generate_dataset_problem, evaluate_pipeline
+
+table_path = 'datasets/yahoo_sub_5.csv'
+target_index = 6 # which column is the target
+#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # the path of the dataset
+time_limit = 30 # time budget in seconds (only used when searching for pipelines)
+#metric = 'F1' # F1 on label 1
+metric = 'F1_MACRO' # F1 on both label 0 and label 1
+
+# Read data and generate dataset and problem
+df = pd.read_csv(table_path)
+dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)
+
+# Load the default pipeline
+pipeline = schemas_utils.load_default_pipeline()
+
+# Run the pipeline
+pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
+print(pipeline_result)
+
diff --git a/examples/run_predefined_pipeline.py b/examples/run_predefined_pipeline.py
deleted file mode 100644
index 0fb6779..0000000
--- a/examples/run_predefined_pipeline.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import uuid
-import random
-import pandas as pd
-import json
-from pprint import pprint
-from sklearn.datasets import make_classification
-
-from d3m import container
-from d3m.metadata.pipeline import Pipeline
-from d3m.metadata.problem import TaskKeyword, PerformanceMetric
-
-from axolotl.utils import data_problem
-from axolotl.backend.simple import SimpleRunner
-# from axolotl.backend.ray import RayRunner
-# from axolotl.algorithms.base import PipelineSearchBase
-from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils
-
-import tods
-from tods.search import BruteForceSearch
-
-table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv'
-df = pd.read_csv(table_path)
-dataset, problem_description = data_problem.generate_dataset_problem(df,
-                                                                     target_index=7,
-                                                                     task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
-                                                                     performance_metrics=[{'metric': PerformanceMetric.F1}])
-
-print(dataset)
-print(problem_description)
-
-metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}},
-           ]
-
-pipeline_path = 'example_pipeline.json'
-pipeline = pipeline_utils.load_pipeline(pipeline_path)
-print(pipeline)
-
-data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
-scoring_pipeline = schemas_utils.get_scoring_pipeline()
-data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
-
-backend = SimpleRunner(random_seed=0)
-pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
-                                            pipeline=pipeline,
-                                            input_data=[dataset],
-                                            metrics=metrics,
-                                            data_preparation_pipeline=data_preparation_pipeline,
-                                            scoring_pipeline=scoring_pipeline,
-                                            data_preparation_params=data_preparation_params)
-print(pipeline_result)
-
diff --git a/tods/tods/resources/default_pipeline.json b/tods/tods/resources/default_pipeline.json
new file mode 100644
index 0000000..bff2c98
--- /dev/null
+++ b/tods/tods/resources/default_pipeline.json
@@ -0,0 +1 @@
+{"id": "384bbfab-4f6d-4001-9f90-684ea5681f5d", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-09-09T23:40:01.756164Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "b94ee59ccf8db678d506adddbc238fb2049fb664a1e3f3f3f6a6517c0c4f8e5f"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "256f0155c7185d747b3b23096e46c40d15844106f9ed6346453f6010891f1896"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "642de2e7-5590-3cab-9266-2a53c326c461", "version": "0.0.1", "python_path": "d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler", "name": "Axis_wise_scale"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "eaff2f35-978c-4530-a12e-061a5f0beacd", "version": "0.1.0", "python_path": "d3m.primitives.tods.feature_analysis.statistical_mean", "name": "Time Series Decompostional", "digest": "2f2a8c07878643fe29c346096b91b5ba91477baa1e7e78684f07e53d29766ca4"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "67e7fcdf-d645-3417-9aa4-85cd369487d9", "version": "0.0.1", "python_path": "d3m.primitives.tods.detection_algorithm.pyod_vae", "name": "TODS.anomaly_detection_primitives.VariationalAutoEncoder"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "d5384857f75090844f367504befb1a854e5088589f6aae0795f66ccf10403e19"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "8d969800816d9596e94cb045aacce43dc3d49e8c5bedb403e35af6c9b8339990"}
diff --git a/tods/tods/schemas.py b/tods/tods/schemas.py
new file mode 100644
index 0000000..02d76ad
--- /dev/null
+++ b/tods/tods/schemas.py
@@ -0,0 +1,10 @@
+import os
+
+resource_dir = os.path.dirname(__file__)
+
+DEFAULT_PIPELINE_DIR = os.path.join(resource_dir, 'resources', 'default_pipeline.json')
+
+def load_default_pipeline():
+    from axolotl.utils import pipeline as pipeline_utils
+    pipeline = pipeline_utils.load_pipeline(DEFAULT_PIPELINE_DIR)
+    return pipeline
diff --git a/tods/tods/utils.py b/tods/tods/utils.py
index 6e3af4f..e375354 100644
--- a/tods/tods/utils.py
+++ b/tods/tods/utils.py
@@ -30,3 +30,22 @@ def generate_dataset_problem(df, target_index, metric):
 
     return dataset, problem_description
 
+def evaluate_pipeline(problem_description, dataset, pipeline):
+    from axolotl.utils import schemas as schemas_utils
+    from axolotl.backend.simple import SimpleRunner
+    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
+    scoring_pipeline = schemas_utils.get_scoring_pipeline()
+    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
+    metrics = problem_description['problem']['performance_metrics']
+
+    backend = SimpleRunner(random_seed=0)
+    pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
+                                                pipeline=pipeline,
+                                                input_data=[dataset],
+                                                metrics=metrics,
+                                                data_preparation_pipeline=data_preparation_pipeline,
+                                                scoring_pipeline=scoring_pipeline,
+                                                data_preparation_params=data_preparation_params)
+    return pipeline_result
+
+
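Note: the object returned by the new `evaluate_pipeline` helper can be inspected beyond a plain `print`. A minimal sketch, assuming the result exposes a `scores` attribute in the same way as `search.evaluate(best_pipeline).scores` in the README example:
```
import pandas as pd

from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

# Build the dataset and problem from the bundled Yahoo subset
df = pd.read_csv('datasets/yahoo_sub_5.csv')
dataset, problem_description = generate_dataset_problem(df, target_index=6, metric='F1_MACRO')

# Evaluate the default pipeline and print its scores
pipeline = schemas_utils.load_default_pipeline()
result = evaluate_pipeline(problem_description, dataset, pipeline)
print(result.scores)  # assumption: `scores` holds per-metric results, as with search.evaluate(...)
```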