
merge from dev

master
lhenry15 4 years ago
parent
commit
1a37aca30f
100 changed files with 170145 additions and 212 deletions
  1. +0 -0 .install.sh
  2. +57 -90 README.md
  3. +1 -1 axolotl/axolotl/algorithms/base.py
  4. +68 -0 datasets/NAB/README.md
  5. +28 -0 datasets/NAB/add_label.py
  6. +4033 -0 datasets/NAB/artificialNoAnomaly/labeled_art_daily_no_noise.csv
  7. +4033 -0 datasets/NAB/artificialNoAnomaly/labeled_art_daily_perfect_square_wave.csv
  8. +4033 -0 datasets/NAB/artificialNoAnomaly/labeled_art_daily_small_noise.csv
  9. +4033 -0 datasets/NAB/artificialNoAnomaly/labeled_art_flatline.csv
  10. +4033 -0 datasets/NAB/artificialNoAnomaly/labeled_art_noisy.csv
  11. +4033 -0 datasets/NAB/artificialWithAnomaly/labeled_art_daily_flatmiddle.csv
  12. +4033 -0 datasets/NAB/artificialWithAnomaly/labeled_art_daily_jumpsdown.csv
  13. +4033 -0 datasets/NAB/artificialWithAnomaly/labeled_art_daily_jumpsup.csv
  14. +4033 -0 datasets/NAB/artificialWithAnomaly/labeled_art_daily_nojump.csv
  15. +4033 -0 datasets/NAB/artificialWithAnomaly/labeled_art_increase_spike_density.csv
  16. +4033 -0 datasets/NAB/artificialWithAnomaly/labeled_art_load_balancer_spikes.csv
  17. +232 -0 datasets/NAB/combined_labels.json
  18. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_24ae8d.csv
  19. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_53ea38.csv
  20. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_5f5533.csv
  21. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_77c1ca.csv
  22. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_825cc2.csv
  23. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_ac20cd.csv
  24. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_c6585a.csv
  25. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_fe7f93.csv
  26. +4731 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_disk_write_bytes_1ef3de.csv
  27. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_disk_write_bytes_c0d644.csv
  28. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_network_in_257a54.csv
  29. +4731 -0 datasets/NAB/realAWSCloudwatch/labeled_ec2_network_in_5abac7.csv
  30. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_elb_request_count_8c0756.csv
  31. +4622 -0 datasets/NAB/realAWSCloudwatch/labeled_grok_asg_anomaly.csv
  32. +1244 -0 datasets/NAB/realAWSCloudwatch/labeled_iio_us-east-1_i-a2eb1cd9_NetworkIn.csv
  33. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_rds_cpu_utilization_cc0c53.csv
  34. +4033 -0 datasets/NAB/realAWSCloudwatch/labeled_rds_cpu_utilization_e47b3b.csv
  35. +1625 -0 datasets/NAB/realAdExchange/labeled_exchange-2_cpc_results.csv
  36. +1625 -0 datasets/NAB/realAdExchange/labeled_exchange-2_cpm_results.csv
  37. +1539 -0 datasets/NAB/realAdExchange/labeled_exchange-3_cpc_results.csv
  38. +1539 -0 datasets/NAB/realAdExchange/labeled_exchange-3_cpm_results.csv
  39. +1644 -0 datasets/NAB/realAdExchange/labeled_exchange-4_cpc_results.csv
  40. +1644 -0 datasets/NAB/realAdExchange/labeled_exchange-4_cpm_results.csv
  41. +7268 -0 datasets/NAB/realKnownCause/labeled_ambient_temperature_system_failure.csv
  42. +1 -0 datasets/NAB/realKnownCause/labeled_cpu_utilization_asg_misconfiguration.csv.REMOVED.git-id
  43. +4033 -0 datasets/NAB/realKnownCause/labeled_ec2_request_latency_system_failure.csv
  44. +1 -0 datasets/NAB/realKnownCause/labeled_machine_temperature_system_failure.csv.REMOVED.git-id
  45. +10321 -0 datasets/NAB/realKnownCause/labeled_nyc_taxi.csv
  46. +1883 -0 datasets/NAB/realKnownCause/labeled_rogue_agent_key_hold.csv
  47. +5316 -0 datasets/NAB/realKnownCause/labeled_rogue_agent_key_updown.csv
  48. +2501 -0 datasets/NAB/realTraffic/labeled_TravelTime_387.csv
  49. +2163 -0 datasets/NAB/realTraffic/labeled_TravelTime_451.csv
  50. +2381 -0 datasets/NAB/realTraffic/labeled_occupancy_6005.csv
  51. +2501 -0 datasets/NAB/realTraffic/labeled_occupancy_t4013.csv
  52. +2501 -0 datasets/NAB/realTraffic/labeled_speed_6005.csv
  53. +1128 -0 datasets/NAB/realTraffic/labeled_speed_7578.csv
  54. +2496 -0 datasets/NAB/realTraffic/labeled_speed_t4013.csv
  55. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_AAPL.csv.REMOVED.git-id
  56. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_AMZN.csv.REMOVED.git-id
  57. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_CRM.csv.REMOVED.git-id
  58. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_CVS.csv.REMOVED.git-id
  59. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_FB.csv.REMOVED.git-id
  60. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv.REMOVED.git-id
  61. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv.REMOVED.git-id
  62. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_KO.csv.REMOVED.git-id
  63. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_PFE.csv.REMOVED.git-id
  64. +1 -0 datasets/NAB/realTweets/labeled_Twitter_volume_UPS.csv.REMOVED.git-id
  65. +1401 -0 datasets/yahoo_sub_5.csv
  66. +20 -0 docs/Makefile
  67. +35 -0 docs/make.bat
  68. +85 -0 docs/source/conf.py
  69. +31 -0 docs/source/doctree.rst
  70. +595 -0 docs/source/getting_started.rst
  71. BIN docs/source/img/framework.pdf
  72. +28 -0 docs/source/index.rst
  73. +7 -0 docs/source/modules.rst
  74. +101 -0 docs/source/overview.rst
  75. +69 -0 docs/source/tods.data_processing.rst
  76. +189 -0 docs/source/tods.detection_algorithm.rst
  77. +253 -0 docs/source/tods.feature_analysis.rst
  78. +21 -0 docs/source/tods.reinforcement.rst
  79. +24 -0 docs/source/tods.rst
  80. +37 -0 docs/source/tods.searcher.rst
  81. +21 -0 docs/source/tods.searcher.search.rst
  82. +85 -0 docs/source/tods.timeseries_processing.rst
  83. +23 -0 examples/evaluate_default_pipeline.py
  84. +50 -24 examples/run_automl.py
  85. +0 -51 examples/run_predefined_pipeline.py
  86. +2 -2 requirements.txt
  87. +21 -0 setup.py
  88. +2 -2 tods/common-primitives/sklearn-wrap/requirements.txt
  89. +1 -1 tods/common-primitives/sklearn-wrap/setup.py
  90. +0 -2 tods/entry_points.ini
  91. +2 -2 tods/requirements.txt
  92. +0 -0 tods/searcher/__init__.py
  93. +1 -0 tods/searcher/resources/default_pipeline.json
  94. +10 -0 tods/searcher/schemas.py
  95. +0 -0 tods/searcher/search/__init__.py
  96. +294 -0 tods/searcher/search/brute_force_search.py
  97. +59 -0 tods/searcher/tods/utils.py
  98. +51 -0 tods/searcher/utils.py
  99. +1 -1 tods/setup.py
  100. +0 -36 tods/tods/search/brute_force_search.py

install.sh → .install.sh View File


+ 57
- 90
README.md View File

@@ -1,12 +1,13 @@
# Automated Time-series Outlier Detection System
This is a time-series outlier detection system with automated machine learning.
# Time-series Outlier Detection System
TODS is a full-stack automated machine learning system for outlier detection on multivariate time-series data. TODS provides exhaustive modules for building machine learning-based outlier detection systems, including data processing, time series processing, feature analysis (extraction), detection algorithms, and a reinforcement module. The functionalities provided via these modules include data preprocessing for general purposes, time series data smoothing/transformation, extraction of features from the time/frequency domains, various detection algorithms, and involving human expertise to calibrate the system. Three common outlier detection scenarios on time-series data can be performed: point-wise detection (time points as outliers), pattern-wise detection (subsequences as outliers), and system-wise detection (sets of time series as outliers), and a wide range of corresponding algorithms is provided in TODS. This package is developed by [DATA Lab @ Texas A&M University](https://people.engr.tamu.edu/xiahu/index.html).

TODS is featured for:
* **Full Stack Machine Learning System** which supports exhaustive components from preprocessing and feature extraction to detection algorithms and a human-in-the-loop interface.

* **Wide-range of Algorithms**, including all of the point-wise detection algorithms supported by [PyOD](https://github.com/yzhao062/pyod), state-of-the-art pattern-wise (collective) detection algorithms such as [DeepLog](https://www.cs.utah.edu/~lifeifei/papers/deeplog.pdf) and [Telemanom](https://arxiv.org/pdf/1802.04431.pdf), and various ensemble algorithms for performing system-wise detection.

* **Automated Machine Learning**, which aims to provide a knowledge-free process that constructs an optimal pipeline for the given data by automatically searching for the best combination among all of the existing modules.

## Axolotl
Running pre-defined pipeline
```
python examples/build_AutoEncoder_pipeline.py
python examples/run_predefined_pipeline.py
```

## Installation

@@ -44,100 +45,66 @@ cd ..

There could be some missing dependencies that are not listed above. Try to resolve them yourself if you encounter any.

# Dataset
Datasets are located in `datasets/anomaly`. `raw_data` is the raw time series data. `transform.py` is a script that transforms the raw data into D3M format. `template` includes some templates for generating D3M data. If you run `transform.py`, the script will load the raw `kpi` data and create a folder named `kpi` in D3M format.
The generated csv file will have the following columns: `d3mIndex`, `timestamp`, `value`, `ground_truth`. In the example kpi dataset, there is only one value. For other datasets there could be multiple values. The goal of the pipeline is to predict the `ground_truth` based on `timestamp` and the value(s).
# Examples
Examples are available in [/examples](examples/). For basic usage, you can evaluate a pipeline on a given dataset. Here, we provide an example that loads our default pipeline and evaluates it on a subset of the Yahoo dataset.
```python
import pandas as pd

There is a nice script to check whether the dataset is in the right format. Run
```
python3 datasets/validate.py datasets/anomaly/kpi/
```
The expected output is as follows:
```
Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/SCORE/problem_TEST/problemDoc.json'.
Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/SCORE/dataset_TEST/datasetDoc.json'.
Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/kpi_problem/problemDoc.json'.
Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TEST/problem_TEST/problemDoc.json'.
Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json'.
Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/kpi_dataset/datasetDoc.json'.
Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json'.
Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json'.
Validating all datasets and problems.
There are no errors.
```
Of course, you can also create other datasets with `transform.py`. But for now, we can focus on this example dataset since other datasets are usually in the same format.
from tods import schemas as schemas_utils
from tods.utils import generate_dataset_problem, evaluate_pipeline

# Example
In D3M, our goal is to provide a **solution** to a **problem** on a **dataset**. Here, a solution is a pipeline which consists of data processing, classifiers, etc.
table_path = 'datasets/yahoo_sub_5.csv'
target_index = 6 # what column is the target
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
time_limit = 30 # How many seconds you want to search
#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # F1 on both label 0 and 1

Run the example to build the first pipeline with
```
python3 examples/build_iforest_pipline.py
```
Note that we have not implemented iForest yet; this one is actually a Random Forest. This will generate a file `pipeline.yml`, which describes a pipeline. We can run the pipeline on the example data in this repo as follows:
```
python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml
```
Another example, on a subset of the sequences of the Yahoo dataset, is as follows:
```
python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml
```
The above commands will generate two files, `results.csv` and `pipeline_run.yml`.
# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# How to add a new primitive
# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

For new primitives, put them in `/anomaly-primitives`. There is an example for isolation forest (however, this is essentially a RandomForest, although the name is IsolationForest; more effort is needed to change it to a real IsolationForest).

In addition to adding a new file, you need to register the primitive in `anomaly-primitives/setup.py` and rerun `pip install`.

Use the following command to check whether your new primitives are registered:
```
python3 -m d3m index search
```

Test the new primitives:
```
python3 examples/build_iforest_pipline.py
# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
```
We also provide AutoML support to help you automatically find a good pipeline for your data.
```python
import pandas as pd

# Template for meta-data in primitives

* `__author__`: `DATA Lab at Texas A&M University`
* `name`: Just a name. Name your primitive with a few words
* `python_path`: This path should have **5** segments. The first two segments should be `d3m.primitives`. The third segment should be `anomaly_detection`, `data_preprocessing` or `feature_construction` (it should match `primitive_family`). The fourth segment should be your algorithm name, e.g., `isolation_forest`. Note that this name should also be added to [this file](d3m/d3m/metadata/primitive_names.py). The last segment should be one of `Preprocessing`, `Feature`, `Algorithm` (for now).
* `source`: `name` should be `DATA Lab at Texas A&M University`, `contact` should be `mailto:khlai037@tamu.edu`, `uris` should have `https://gitlab.com/lhenry15/tods.git` and the path your py file.
* `algorithms_types`: Name the primitive yourself and add it to [here](d3m/d3m/metadata/schemas/v0/definitions.json#L1957). **Then reinstall d3m.** Fill this field with `metadata_base.PrimitiveAlgorithmType.YOUR_NAME`
* `primitive_family`: For preprocessing primitives, use `metadata_base.PrimitiveFamily.DATA_PREPROCESSING`. For feature analysis primitives, use `metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION`. For anomaly detection primitives, use `metadata_base.PrimitiveFamily.ANOMALY_DETECTION`.
* `id`: Randomly generate one with `import uuid; uuid.uuid4()`
* `hyperparameters_to_tune`: Specify what hyperparameters can be tuned in your primitive
* `version`: `0.0.1`

Notes:
from axolotl.backend.simple import SimpleRunner

1. `installation` is not required. We removed it.
from tods.utils import generate_dataset_problem
from tods.search import BruteForceSearch

2. Try to reinstall everything if it does not work.
# Some information
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # The path of the dataset
#target_index = 2 # what column is the target

3. An example of fake Isolation Forest is [here](anomaly-primitives/anomaly_primitives/SKIsolationForest.py#L294)
table_path = 'datasets/yahoo_sub_5.csv'
target_index = 6 # what column is the target
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
time_limit = 30 # How many seconds you want to search
#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # F1 on both label 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

## Resources of D3M
# Start backend
backend = SimpleRunner(random_seed=0)

If you still have questions, you may refer to the following resources.
# Start search algorithm
search = BruteForceSearch(problem_description=problem_description, backend=backend)

Dataset format [https://gitlab.com/datadrivendiscovery/data-supply](https://gitlab.com/datadrivendiscovery/data-supply)
# Find the best pipeline
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
best_pipeline = best_runtime.pipeline
best_output = best_pipeline_result.output

Instructions for creating primitives [https://docs.datadrivendiscovery.org/v2020.1.9/interfaces.html](https://docs.datadrivendiscovery.org/v2020.1.9/interfaces.html)

We use a stable version of d3m core package at [https://gitlab.com/datadrivendiscovery/d3m/-/tree/v2020.1.9](https://gitlab.com/datadrivendiscovery/d3m/-/tree/v2020.1.9).

The documentation is at [https://docs.datadrivendiscovery.org/](https://docs.datadrivendiscovery.org/).

The core package documentation is at [https://docs.datadrivendiscovery.org/v2020.1.9/index.html](https://docs.datadrivendiscovery.org/v2020.1.9/index.html)

The common-primitives package is v0.8.0, at [https://gitlab.com/datadrivendiscovery/common-primitives/-/tree/v0.8.0/common_primitives](https://gitlab.com/datadrivendiscovery/common-primitives/-/tree/v0.8.0/common_primitives)

The sklearn-wrap package uses the dist branch: [https://gitlab.com/datadrivendiscovery/sklearn-wrap/-/tree/dist](https://gitlab.com/datadrivendiscovery/sklearn-wrap/-/tree/dist)

There are other primitives developed by many universities that are not used in this repo. See [https://gitlab.com/datadrivendiscovery/primitives](https://gitlab.com/datadrivendiscovery/primitives)
# Evaluate the best pipeline
best_scores = search.evaluate(best_pipeline).scores
```
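In both snippets above, `target_index = 6` is a positional column index: it must point at the label column of the table you load. A small optional sanity check with plain pandas (not a TODS API) that prints which column the index selects:
```python
import pandas as pd

# Hypothetical pre-flight check: confirm which column target_index points at.
df = pd.read_csv('datasets/yahoo_sub_5.csv')
target_index = 6

print(list(df.columns))                             # all columns, in positional order
print('target column:', df.columns[target_index])   # the column the search will predict
```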

+ 1
- 1
axolotl/axolotl/algorithms/base.py View File

@@ -151,7 +151,7 @@ class PipelineSearchBase:
logging.error('No solution founded')
pipeline_result = PipelineResult(fitted_pipeline_id='')
pipeline_result.error = RuntimeError("No solution found")
return _, pipeline_result
return None, pipeline_result

return self.fit(best_pipeline.pipeline, input_data, expose_values)



+ 68
- 0
datasets/NAB/README.md View File

@@ -0,0 +1,68 @@
NAB Data Corpus
---

Data are ordered, timestamped, single-valued metrics. All data files contain anomalies, unless otherwise noted.


### Real data
- realAWSCloudwatch/

AWS server metrics as collected by the AmazonCloudwatch service. Example metrics include CPU Utilization, Network Bytes In, and Disk Read Bytes.

- realAdExchange/
Online advertisement clicking rates, where the metrics are cost-per-click (CPC) and cost per thousand impressions (CPM). One of the files is normal, without anomalies.
- realKnownCause/

This is data for which we know the anomaly causes; no hand labeling.
- ambient_temperature_system_failure.csv: The ambient temperature in an office
setting.
- cpu_utilization_asg_misconfiguration.csv: From Amazon Web Services (AWS)
monitoring CPU usage – i.e. average CPU usage across a given cluster. When
usage is high, AWS spins up a new machine, and uses fewer machines when usage
is low.
- ec2_request_latency_system_failure.csv: CPU usage data from a server in
Amazon's East Coast datacenter. The dataset ends with complete system failure
resulting from a documented failure of AWS API servers. There's an interesting
story behind this data in the [Numenta
blog](http://numenta.com/blog/anomaly-of-the-week.html).
- machine_temperature_system_failure.csv: Temperature sensor data of an
internal component of a large, industrial machine. The first anomaly is a
planned shutdown of the machine. The second anomaly is difficult to detect and
directly led to the third anomaly, a catastrophic failure of the machine.
- nyc_taxi.csv: Number of NYC taxi passengers, where the five anomalies occur
during the NYC marathon, Thanksgiving, Christmas, New Year's Day, and a snow
storm. The raw data is from the [NYC Taxi and Limousine Commission](http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml).
The data file included here was created by aggregating the total number of
taxi passengers into 30-minute buckets.
- rogue_agent_key_hold.csv: Timing the key holds for several users of a
computer, where the anomalies represent a change in the user.
- rogue_agent_key_updown.csv: Timing the key strokes for several users of a
computer, where the anomalies represent a change in the user.

- realTraffic/

Real time traffic data from the Twin Cities Metro area in Minnesota, collected
by the
[Minnesota Department of Transportation](http://www.dot.state.mn.us/tmc/trafficinfo/developers.html).
Included metrics include occupancy, speed, and travel time from specific
sensors.

- realTweets/

A collection of Twitter mentions of large publicly-traded companies
such as Google and IBM. The metric value represents the number of mentions
for a given ticker symbol every 5 minutes.


### Artificial data

- artificialNoAnomaly/

Artificially-generated data without any anomalies.

- artificialWithAnomaly/

Artificially-generated data with varying types of anomalies.

+ 28
- 0
datasets/NAB/add_label.py View File

@@ -0,0 +1,28 @@

import pandas as pd
import json
import os
import time
import datetime


# combined_labels.json maps each raw NAB csv path to its annotated anomaly timestamps.
label_file = open('combined_labels.json', 'r')
label_info = json.load(label_file)

for key in label_info.keys():
    df = pd.read_csv(key)
    fpath, fname = key.split('/')[0], key.split('/')[1]
    label = []
    unix_timestamp = []
    for _, row in df.iterrows():
        # Mark the row as anomalous if its timestamp is one of the annotated ones.
        if row['timestamp'] in list(label_info[key]):
            label.append('1')
        else:
            label.append('0')
        # Convert the human-readable timestamp to a Unix epoch.
        timestamp = datetime.datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S').timestamp()
        unix_timestamp.append(timestamp)
    df['label'] = label
    df['timestamp'] = unix_timestamp
    df.to_csv(fpath+"/labeled_"+fname, index=False)
    #os.remove(key)
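
The script resolves `combined_labels.json` and the per-file paths relative to the current working directory, so it is presumably meant to be run from inside `datasets/NAB/`. A small optional cross-check (not part of the commit) that a generated file agrees with the label list:
```python
import json
import pandas as pd

with open('datasets/NAB/combined_labels.json') as f:
    labels = json.load(f)

key = 'realTraffic/speed_7578.csv'
labeled = 'datasets/NAB/' + key.split('/')[0] + '/labeled_' + key.split('/')[1]
df = pd.read_csv(labeled)

# The number of flagged rows should match the number of annotated timestamps.
print(len(labels[key]), 'annotated timestamps;',
      int((df['label'] == 1).sum()), 'rows flagged in', labeled)
```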


+ 4033
- 0
datasets/NAB/artificialNoAnomaly/labeled_art_daily_no_noise.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialNoAnomaly/labeled_art_daily_perfect_square_wave.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialNoAnomaly/labeled_art_daily_small_noise.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialNoAnomaly/labeled_art_flatline.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialNoAnomaly/labeled_art_noisy.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialWithAnomaly/labeled_art_daily_flatmiddle.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialWithAnomaly/labeled_art_daily_jumpsdown.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialWithAnomaly/labeled_art_daily_jumpsup.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialWithAnomaly/labeled_art_daily_nojump.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialWithAnomaly/labeled_art_increase_spike_density.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/artificialWithAnomaly/labeled_art_load_balancer_spikes.csv
File diff suppressed because it is too large
View File


+ 232
- 0
datasets/NAB/combined_labels.json View File

@@ -0,0 +1,232 @@
{
"artificialNoAnomaly/art_daily_no_noise.csv": [],
"artificialNoAnomaly/art_daily_perfect_square_wave.csv": [],
"artificialNoAnomaly/art_daily_small_noise.csv": [],
"artificialNoAnomaly/art_flatline.csv": [],
"artificialNoAnomaly/art_noisy.csv": [],
"artificialWithAnomaly/art_daily_flatmiddle.csv": [
"2014-04-11 00:00:00"
],
"artificialWithAnomaly/art_daily_jumpsdown.csv": [
"2014-04-11 09:00:00"
],
"artificialWithAnomaly/art_daily_jumpsup.csv": [
"2014-04-11 09:00:00"
],
"artificialWithAnomaly/art_daily_nojump.csv": [
"2014-04-11 09:00:00"
],
"artificialWithAnomaly/art_increase_spike_density.csv": [
"2014-04-07 23:10:00"
],
"artificialWithAnomaly/art_load_balancer_spikes.csv": [
"2014-04-11 04:35:00"
],
"realAWSCloudwatch/ec2_cpu_utilization_24ae8d.csv": [
"2014-02-26 22:05:00",
"2014-02-27 17:15:00"
],
"realAWSCloudwatch/ec2_cpu_utilization_53ea38.csv": [
"2014-02-19 19:10:00",
"2014-02-23 20:05:00"
],
"realAWSCloudwatch/ec2_cpu_utilization_5f5533.csv": [
"2014-02-19 00:22:00",
"2014-02-24 18:37:00"
],
"realAWSCloudwatch/ec2_cpu_utilization_77c1ca.csv": [
"2014-04-09 10:15:00"
],
"realAWSCloudwatch/ec2_cpu_utilization_825cc2.csv": [
"2014-04-15 15:44:00",
"2014-04-16 03:34:00"
],
"realAWSCloudwatch/ec2_cpu_utilization_ac20cd.csv": [
"2014-04-15 00:49:00"
],
"realAWSCloudwatch/ec2_cpu_utilization_c6585a.csv": [],
"realAWSCloudwatch/ec2_cpu_utilization_fe7f93.csv": [
"2014-02-17 06:12:00",
"2014-02-22 00:02:00",
"2014-02-23 15:17:00"
],
"realAWSCloudwatch/ec2_disk_write_bytes_1ef3de.csv": [
"2014-03-10 21:09:00"
],
"realAWSCloudwatch/ec2_disk_write_bytes_c0d644.csv": [
"2014-04-09 01:30:00",
"2014-04-10 14:35:00",
"2014-04-13 03:00:00"
],
"realAWSCloudwatch/ec2_network_in_257a54.csv": [
"2014-04-15 16:44:00"
],
"realAWSCloudwatch/ec2_network_in_5abac7.csv": [
"2014-03-10 18:56:00",
"2014-03-12 21:01:00"
],
"realAWSCloudwatch/elb_request_count_8c0756.csv": [
"2014-04-12 17:24:00",
"2014-04-22 19:34:00"
],
"realAWSCloudwatch/grok_asg_anomaly.csv": [
"2014-01-20 08:30:00",
"2014-01-21 10:45:00",
"2014-01-29 00:45:00"
],
"realAWSCloudwatch/iio_us-east-1_i-a2eb1cd9_NetworkIn.csv": [
"2013-10-10 09:35:00",
"2013-10-10 20:40:00"
],
"realAWSCloudwatch/rds_cpu_utilization_cc0c53.csv": [
"2014-02-25 07:15:00",
"2014-02-27 00:50:00"
],
"realAWSCloudwatch/rds_cpu_utilization_e47b3b.csv": [
"2014-04-13 06:52:00",
"2014-04-18 23:27:00"
],
"realAdExchange/exchange-2_cpc_results.csv": [
"2011-07-14 13:00:01"
],
"realAdExchange/exchange-2_cpm_results.csv": [
"2011-07-26 06:00:01",
"2011-08-10 17:00:01"
],
"realAdExchange/exchange-3_cpc_results.csv": [
"2011-07-14 10:15:01",
"2011-07-20 10:15:01",
"2011-08-13 10:15:01"
],
"realAdExchange/exchange-3_cpm_results.csv": [
"2011-08-19 18:15:01"
],
"realAdExchange/exchange-4_cpc_results.csv": [
"2011-07-16 09:15:01",
"2011-08-02 12:15:01",
"2011-08-23 08:15:01"
],
"realAdExchange/exchange-4_cpm_results.csv": [
"2011-07-16 09:15:01",
"2011-08-01 07:15:01",
"2011-08-23 08:15:01",
"2011-08-28 13:15:01"
],
"realKnownCause/ambient_temperature_system_failure.csv": [
"2013-12-22 20:00:00",
"2014-04-13 09:00:00"
],
"realKnownCause/cpu_utilization_asg_misconfiguration.csv": [
"2014-07-12 02:04:00",
"2014-07-14 21:44:00"
],
"realKnownCause/ec2_request_latency_system_failure.csv": [
"2014-03-14 09:06:00",
"2014-03-18 22:41:00",
"2014-03-21 03:01:00"
],
"realKnownCause/machine_temperature_system_failure.csv": [
"2013-12-11 06:00:00",
"2013-12-16 17:25:00",
"2014-01-28 13:55:00",
"2014-02-08 14:30:00"
],
"realKnownCause/nyc_taxi.csv": [
"2014-11-01 19:00:00",
"2014-11-27 15:30:00",
"2014-12-25 15:00:00",
"2015-01-01 01:00:00",
"2015-01-27 00:00:00"
],
"realKnownCause/rogue_agent_key_hold.csv": [
"2014-07-15 08:30:00",
"2014-07-17 09:50:00"
],
"realKnownCause/rogue_agent_key_updown.csv": [
"2014-07-15 04:00:00",
"2014-07-17 08:50:00"
],
"realTraffic/TravelTime_387.csv": [
"2015-07-30 12:29:00",
"2015-08-18 16:26:00",
"2015-09-01 05:34:00"
],
"realTraffic/TravelTime_451.csv": [
"2015-08-11 12:07:00"
],
"realTraffic/occupancy_6005.csv": [
"2015-09-15 06:55:00"
],
"realTraffic/occupancy_t4013.csv": [
"2015-09-16 08:09:00",
"2015-09-17 07:55:00"
],
"realTraffic/speed_6005.csv": [
"2015-09-17 07:00:00"
],
"realTraffic/speed_7578.csv": [
"2015-09-11 16:44:00",
"2015-09-15 14:34:00",
"2015-09-16 14:14:00",
"2015-09-16 17:10:00"
],
"realTraffic/speed_t4013.csv": [
"2015-09-16 08:04:00",
"2015-09-17 08:15:00"
],
"realTweets/Twitter_volume_AAPL.csv": [
"2015-03-03 21:07:53",
"2015-03-09 17:32:53",
"2015-03-16 02:57:53",
"2015-03-31 03:27:53"
],
"realTweets/Twitter_volume_AMZN.csv": [
"2015-03-05 19:47:53",
"2015-03-11 20:57:53",
"2015-04-01 21:57:53",
"2015-04-08 04:52:53"
],
"realTweets/Twitter_volume_CRM.csv": [
"2015-03-09 19:07:53",
"2015-03-19 23:07:53",
"2015-03-26 19:07:53"
],
"realTweets/Twitter_volume_CVS.csv": [
"2015-03-04 16:02:53",
"2015-03-05 19:57:53",
"2015-03-26 14:07:53",
"2015-04-14 22:37:53"
],
"realTweets/Twitter_volume_FB.csv": [
"2015-03-16 07:07:53",
"2015-04-03 17:47:53"
],
"realTweets/Twitter_volume_GOOG.csv": [
"2015-03-13 20:22:53",
"2015-03-14 16:27:53",
"2015-03-22 22:52:53",
"2015-04-01 05:27:53"
],
"realTweets/Twitter_volume_IBM.csv": [
"2015-03-23 22:27:53",
"2015-04-20 20:07:53"
],
"realTweets/Twitter_volume_KO.csv": [
"2015-03-20 13:12:53",
"2015-04-08 23:42:53",
"2015-04-14 14:52:53"
],
"realTweets/Twitter_volume_PFE.csv": [
"2015-03-02 21:22:53",
"2015-03-04 10:32:53",
"2015-03-13 19:57:53",
"2015-04-07 23:42:53"
],
"realTweets/Twitter_volume_UPS.csv": [
"2015-03-03 00:27:53",
"2015-03-04 11:07:53",
"2015-03-05 15:22:53",
"2015-03-24 18:17:53",
"2015-03-29 16:27:53"
]
}
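
The file is a flat mapping from each raw NAB csv path (relative to `datasets/NAB/`) to its list of annotated anomaly timestamps; empty lists mark the anomaly-free series. A quick summary sketch using only the standard library (not part of the commit):
```python
import json
from collections import Counter

with open('datasets/NAB/combined_labels.json') as f:
    labels = json.load(f)

# Count annotated anomalies per top-level category (realAWSCloudwatch, realTweets, ...).
per_category = Counter()
for path, timestamps in labels.items():
    per_category[path.split('/')[0]] += len(timestamps)

print(len(labels), 'series in total')
print(dict(per_category))
```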

+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_24ae8d.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_53ea38.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_5f5533.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_77c1ca.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_825cc2.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_ac20cd.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_c6585a.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_cpu_utilization_fe7f93.csv
File diff suppressed because it is too large
View File


+ 4731
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_disk_write_bytes_1ef3de.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_disk_write_bytes_c0d644.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_network_in_257a54.csv
File diff suppressed because it is too large
View File


+ 4731
- 0
datasets/NAB/realAWSCloudwatch/labeled_ec2_network_in_5abac7.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_elb_request_count_8c0756.csv
File diff suppressed because it is too large
View File


+ 4622
- 0
datasets/NAB/realAWSCloudwatch/labeled_grok_asg_anomaly.csv
File diff suppressed because it is too large
View File


+ 1244
- 0
datasets/NAB/realAWSCloudwatch/labeled_iio_us-east-1_i-a2eb1cd9_NetworkIn.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_rds_cpu_utilization_cc0c53.csv
File diff suppressed because it is too large
View File


+ 4033
- 0
datasets/NAB/realAWSCloudwatch/labeled_rds_cpu_utilization_e47b3b.csv
File diff suppressed because it is too large
View File


+ 1625
- 0
datasets/NAB/realAdExchange/labeled_exchange-2_cpc_results.csv
File diff suppressed because it is too large
View File


+ 1625
- 0
datasets/NAB/realAdExchange/labeled_exchange-2_cpm_results.csv
File diff suppressed because it is too large
View File


+ 1539
- 0
datasets/NAB/realAdExchange/labeled_exchange-3_cpc_results.csv
File diff suppressed because it is too large
View File


+ 1539
- 0
datasets/NAB/realAdExchange/labeled_exchange-3_cpm_results.csv
File diff suppressed because it is too large
View File


+ 1644
- 0
datasets/NAB/realAdExchange/labeled_exchange-4_cpc_results.csv
File diff suppressed because it is too large
View File


+ 1644
- 0
datasets/NAB/realAdExchange/labeled_exchange-4_cpm_results.csv
File diff suppressed because it is too large
View File


+ 7268
- 0
datasets/NAB/realKnownCause/labeled_ambient_temperature_system_failure.csv
File diff suppressed because it is too large
View File


+ 1
- 0
datasets/NAB/realKnownCause/labeled_cpu_utilization_asg_misconfiguration.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
948611b07519538ef036e0ec1c948f6bf97009cf

+ 4033
- 0
datasets/NAB/realKnownCause/labeled_ec2_request_latency_system_failure.csv
File diff suppressed because it is too large
View File


+ 1
- 0
datasets/NAB/realKnownCause/labeled_machine_temperature_system_failure.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
428229640a5466e68014f74649a24f00abb1150b

+ 10321
- 0
datasets/NAB/realKnownCause/labeled_nyc_taxi.csv
File diff suppressed because it is too large
View File


+ 1883
- 0
datasets/NAB/realKnownCause/labeled_rogue_agent_key_hold.csv
File diff suppressed because it is too large
View File


+ 5316
- 0
datasets/NAB/realKnownCause/labeled_rogue_agent_key_updown.csv
File diff suppressed because it is too large
View File


+ 2501
- 0
datasets/NAB/realTraffic/labeled_TravelTime_387.csv
File diff suppressed because it is too large
View File


+ 2163
- 0
datasets/NAB/realTraffic/labeled_TravelTime_451.csv
File diff suppressed because it is too large
View File


+ 2381
- 0
datasets/NAB/realTraffic/labeled_occupancy_6005.csv
File diff suppressed because it is too large
View File


+ 2501
- 0
datasets/NAB/realTraffic/labeled_occupancy_t4013.csv
File diff suppressed because it is too large
View File


+ 2501
- 0
datasets/NAB/realTraffic/labeled_speed_6005.csv
File diff suppressed because it is too large
View File


+ 1128
- 0
datasets/NAB/realTraffic/labeled_speed_7578.csv
File diff suppressed because it is too large
View File


+ 2496
- 0
datasets/NAB/realTraffic/labeled_speed_t4013.csv
File diff suppressed because it is too large
View File


+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_AAPL.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
68dd1084ed091fb9affe45b4e0894250c6c62c07

+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_AMZN.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
7bebf0fe077dda56f789d644090faf1d2484913c

+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_CRM.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
59fdf3c2b8d171704e3de1e10d8ccfca72c8ab9a

+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_CVS.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
d20453833fc13c681f0b5f5a830f3aba52b774cd

+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_FB.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
ed60bba6f53c779335874c39966b7d5e4309e2c3

+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
f01b654d9a6a6ebc7efc65da240f83680de2131d

+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
8e0088d97641d6ab39b808fe03ac0a7ec9ea99b9

+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_KO.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
d72fffb08da82bb70ecc379bb1fa56316efda557

+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_PFE.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
4c2f8543201c0a66e44815dee128d9044a41c382

+ 1
- 0
datasets/NAB/realTweets/labeled_Twitter_volume_UPS.csv.REMOVED.git-id View File

@@ -0,0 +1 @@
25a0dd3110986418d379a887cc575f9fdc45a6da

+ 1401
- 0
datasets/yahoo_sub_5.csv
File diff suppressed because it is too large
View File


+ 20
- 0
docs/Makefile View File

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
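
The two recipes above simply forward a target name to `sphinx-build` in "make mode". An equivalent call from Python (a hedged convenience sketch, not part of the commit; it assumes `sphinx-build` is on `PATH`, which it is once Sphinx is installed):
```python
import subprocess

# Same invocation the Makefile issues for `make html`; the Makefile runs it from
# docs/ with SOURCEDIR=source and BUILDDIR=build, here the paths are given from
# the repository root instead.
subprocess.run(['sphinx-build', '-M', 'html', 'docs/source', 'docs/build'], check=True)
```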

+ 35
- 0
docs/make.bat View File

@@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

+ 85
- 0
docs/source/conf.py View File

@@ -0,0 +1,85 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.append(os.path.abspath('../../tods'))
sys.path.append(os.path.abspath('../../'))

# -- Auto-doc Skip --------------------
def skip_member(app, what, name, obj, skip, opts):
    # we can document otherwise excluded entities here by returning False
    # or skip otherwise included entities by returning True
    if name == "__author" or name == "metadata":
        return True
    return None

def setup(app):
    app.connect('autodoc-skip-member', skip_member)

# -- Project information -----------------------------------------------------

project = 'TODS'
copyright = '2020, DataLab@Texas A&M University'
author = 'DataLab@Texas A&M University'

# The full version, including alpha/beta/rc tags
release = '0.0.1'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.intersphinx',
    'sphinx.ext.todo',
    'sphinx.ext.coverage',
    'sphinx.ext.mathjax',
    'sphinx.ext.githubpages',
    'sphinx.ext.napoleon',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'doctree'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_sidebars = {
    '**': ['fulltoc.html', 'sourcelink.html', 'searchbox.html']
}


+ 31
- 0
docs/source/doctree.rst View File

@@ -0,0 +1,31 @@
.. rlcard documentation master file, created by
   sphinx-quickstart on Thu Sep 5 18:45:31 2019.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

.. toctree::
   :glob:
   :caption: Documentation:

   overview
   getting_started


.. toctree::
   :glob:
   :caption: API Documents:

   tods.data_processing
   tods.timeseries_processing
   tods.feature_analysis
   tods.detection_algorithm
   tods.reinforcement



Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

+ 595
- 0
docs/source/getting_started.rst View File

@@ -0,0 +1,595 @@
Getting Started
===============

In this document, we provide some toy examples for getting started. All
the examples in this document and even more examples are available in
`examples/ <https://github.com/datamllab/rlcard/tree/master/examples>`__.

Playing with Random Agents
--------------------------

We have set up a random agent that can play randomly on each
environment. An example of applying a random agent on Blackjack is as
follow:

.. code:: python

    import rlcard
    from rlcard.agents import RandomAgent
    from rlcard.utils import set_global_seed

    # Make environment
    env = rlcard.make('blackjack', config={'seed': 0})
    episode_num = 2

    # Set a global seed
    set_global_seed(0)

    # Set up agents
    agent_0 = RandomAgent(action_num=env.action_num)
    env.set_agents([agent_0])

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=False)

        # Print out the trajectories
        print('\nEpisode {}'.format(episode))
        for ts in trajectories[0]:
            print('State: {}, Action: {}, Reward: {}, Next State: {}, Done: {}'.format(ts[0], ts[1], ts[2], ts[3], ts[4]))

The expected output should look something like the following:

::

Episode 0
State: {'obs': array([20, 3]), 'legal_actions': [0, 1]}, Action: 0, Reward: 0, Next State: {'obs': array([15, 3]), 'legal_actions': [0, 1]}, Done: False
State: {'obs': array([15, 3]), 'legal_actions': [0, 1]}, Action: 1, Reward: -1, Next State: {'obs': array([15, 20]), 'legal_actions': [0, 1]}, Done: True

Episode 1
State: {'obs': array([15, 5]), 'legal_actions': [0, 1]}, Action: 1, Reward: 1, Next State: {'obs': array([15, 23]), 'legal_actions': [0, 1]}, Done: True

Note that the states and actions are wrapped by ``env`` in Blackjack. In
this example, the ``[20, 3]`` suggests the current player obtains score
20 while the card that faces up in the dealer’s hand has score 3. Action
0 means “hit” while action 1 means “stand”. Reward 1 suggests the player
wins while reward -1 suggests the dealer wins. Reward 0 suggests a tie.
The above data can be directly fed into an RL algorithm for training.

Deep-Q Learning on Blackjack
----------------------------

The second example is to use Deep-Q learning to train an agent on
Blackjack. We aim to use this example to show how reinforcement learning
algorithms can be developed and applied in our toolkit. We design a
``run`` function which plays one complete game and provides the data for
training RL agents. The example is shown below:

.. code:: python

import tensorflow as tf
import os

import rlcard
from rlcard.agents import DQNAgent
from rlcard.utils import set_global_seed, tournament
from rlcard.utils import Logger

# Make environment
env = rlcard.make('blackjack', config={'seed': 0})
eval_env = rlcard.make('blackjack', config={'seed': 0})

# Set the iterations numbers and how frequently we evaluate/save plot
evaluate_every = 100
evaluate_num = 10000
episode_num = 100000

# The initial memory size
memory_init_size = 100

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/blackjack_dqn_result/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

# Initialize a global step
global_step = tf.Variable(0, name='global_step', trainable=False)

# Set up the agents
agent = DQNAgent(sess,
scope='dqn',
action_num=env.action_num,
replay_memory_init_size=memory_init_size,
train_every=train_every,
state_shape=env.state_shape,
mlp_layers=[10,10])
env.set_agents([agent])
eval_env.set_agents([agent])

# Initialize global variables
sess.run(tf.global_variables_initializer())

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):

# Generate data from the environment
trajectories, _ = env.run(is_training=True)

# Feed transitions into agent memory, and train the agent
for ts in trajectories[0]:
agent.feed(ts)

# Evaluate the performance. Play with random agents.
if episode % evaluate_every == 0:
logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()

# Plot the learning curve
logger.plot('DQN')
# Save model
save_dir = 'models/blackjack_dqn'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
saver = tf.train.Saver()
saver.save(sess, os.path.join(save_dir, 'model'))

The expected output is something like below:

::

----------------------------------------
timestep | 1
reward | -0.7342
----------------------------------------
INFO - Agent dqn, step 100, rl-loss: 1.0042707920074463
INFO - Copied model parameters to target network.
INFO - Agent dqn, step 136, rl-loss: 0.7888197302818298
----------------------------------------
timestep | 136
reward | -0.1406
----------------------------------------
INFO - Agent dqn, step 278, rl-loss: 0.6946825981140137
----------------------------------------
timestep | 278
reward | -0.1523
----------------------------------------
INFO - Agent dqn, step 412, rl-loss: 0.62268990278244025
----------------------------------------
timestep | 412
reward | -0.088
----------------------------------------
INFO - Agent dqn, step 544, rl-loss: 0.69050502777099616
----------------------------------------
timestep | 544
reward | -0.08
----------------------------------------
INFO - Agent dqn, step 681, rl-loss: 0.61789089441299444
----------------------------------------
timestep | 681
reward | -0.0793
----------------------------------------

In Blackjack, the player will get a payoff at the end of the game: 1 if
the player wins, -1 if the player loses, and 0 if it is a tie. The
performance is measured by the average payoff the player obtains by
playing 10000 episodes. The above example shows that the agent achieves
better and better performance during training. The logs and learning
curves are saved in ``./experiments/blackjack_dqn_result/``.

Running Multiple Processes
--------------------------

The environments can be run with multiple processes to accelerate the
training. Below is an example to train DQN on Blackjack with multiple
processes.

.. code:: python

''' An example of learning a Deep-Q Agent on Blackjack with multiple processes
Note that we must use if __name__ == '__main__' for multiprocessing
'''

import tensorflow as tf
import os

import rlcard
from rlcard.agents import DQNAgent
from rlcard.utils import set_global_seed, tournament
from rlcard.utils import Logger

def main():
# Make environment
env = rlcard.make('blackjack', config={'seed': 0, 'env_num': 4})
eval_env = rlcard.make('blackjack', config={'seed': 0, 'env_num': 4})

# Set the iterations numbers and how frequently we evaluate performance
evaluate_every = 100
evaluate_num = 10000
iteration_num = 100000

# The initial memory size
memory_init_size = 100

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/blackjack_dqn_result/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

# Initialize a global step
global_step = tf.Variable(0, name='global_step', trainable=False)

# Set up the agents
agent = DQNAgent(sess,
scope='dqn',
action_num=env.action_num,
replay_memory_init_size=memory_init_size,
train_every=train_every,
state_shape=env.state_shape,
mlp_layers=[10,10])
env.set_agents([agent])
eval_env.set_agents([agent])

# Initialize global variables
sess.run(tf.global_variables_initializer())

# Initialize a Logger to plot the learning curve
logger = Logger(log_dir)

for iteration in range(iteration_num):

# Generate data from the environment
trajectories, _ = env.run(is_training=True)

# Feed transitions into agent memory, and train the agent
for ts in trajectories[0]:
agent.feed(ts)

# Evaluate the performance. Play with random agents.
if iteration % evaluate_every == 0:
logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()

# Plot the learning curve
logger.plot('DQN')
# Save model
save_dir = 'models/blackjack_dqn'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
saver = tf.train.Saver()
saver.save(sess, os.path.join(save_dir, 'model'))

if __name__ == '__main__':
main()

Example output is as follows:

::

----------------------------------------
timestep | 17
reward | -0.7378
----------------------------------------

INFO - Copied model parameters to target network.
INFO - Agent dqn, step 1100, rl-loss: 0.40940183401107797
INFO - Copied model parameters to target network.
INFO - Agent dqn, step 2100, rl-loss: 0.44971221685409546
INFO - Copied model parameters to target network.
INFO - Agent dqn, step 2225, rl-loss: 0.65466868877410897
----------------------------------------
timestep | 2225
reward | -0.0658
----------------------------------------
INFO - Agent dqn, step 3100, rl-loss: 0.48663979768753053
INFO - Copied model parameters to target network.
INFO - Agent dqn, step 4100, rl-loss: 0.71293979883193974
INFO - Copied model parameters to target network.
INFO - Agent dqn, step 4440, rl-loss: 0.55871248245239263
----------------------------------------
timestep | 4440
reward | -0.0736
----------------------------------------

Training CFR on Leduc Hold’em
-----------------------------

To show how we can use ``step`` and ``step_back`` to traverse the game
tree, we provide an example of solving Leduc Hold’em with CFR:

.. code:: python

import numpy as np

import rlcard
from rlcard.agents import CFRAgent
from rlcard import models
from rlcard.utils import set_global_seed, tournament
from rlcard.utils import Logger

# Make environment and enable human mode
env = rlcard.make('leduc-holdem', config={'seed': 0, 'allow_step_back':True})
eval_env = rlcard.make('leduc-holdem', config={'seed': 0})

# Set the iterations numbers and how frequently we evaluate/save plot
evaluate_every = 100
save_plot_every = 1000
evaluate_num = 10000
episode_num = 10000

# The paths for saving the logs and learning curves
log_dir = './experiments/leduc_holdem_cfr_result/'

# Set a global seed
set_global_seed(0)

# Initialize CFR Agent
agent = CFRAgent(env)
agent.load() # If we have saved model, we first load the model

# Evaluate CFR against pre-trained NFSP
eval_env.set_agents([agent, models.load('leduc-holdem-nfsp').agents[0]])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
agent.train()
print('\rIteration {}'.format(episode), end='')
# Evaluate the performance. Play with NFSP agents.
if episode % evaluate_every == 0:
agent.save() # Save model
logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close files in the logger
logger.close_files()

# Plot the learning curve
logger.plot('CFR')

In the above example, the performance is measured by playing against a
pre-trained NFSP model. The expected output is as below:

::

Iteration 0
----------------------------------------
timestep | 192
reward | -1.3662
----------------------------------------
Iteration 100
----------------------------------------
timestep | 19392
reward | 0.9462
----------------------------------------
Iteration 200
----------------------------------------
timestep | 38592
reward | 0.8591
----------------------------------------
Iteration 300
----------------------------------------
timestep | 57792
reward | 0.7861
----------------------------------------
Iteration 400
----------------------------------------
timestep | 76992
reward | 0.7752
----------------------------------------
Iteration 500
----------------------------------------
timestep | 96192
reward | 0.7215
----------------------------------------

We observe that CFR achieves better performance than NFSP. However, CFR
requires traversal of the game tree, which is infeasible in large
environments.

Having Fun with Pretrained Leduc Model
--------------------------------------

We have designed simple human interfaces to play against the pretrained
model. Leduc Hold’em is a simplified version of Texas Hold’em. Rules can
be found `here <games.md#leduc-holdem>`__. An example of playing against the
Leduc Hold’em CFR model is as below:

.. code:: python

import rlcard
from rlcard import models
from rlcard.agents import LeducholdemHumanAgent as HumanAgent
from rlcard.utils import print_card

# Make environment
# Set 'record_action' to True because we need it to print results
env = rlcard.make('leduc-holdem', config={'record_action': True})
human_agent = HumanAgent(env.action_num)
cfr_agent = models.load('leduc-holdem-cfr').agents[0]
env.set_agents([human_agent, cfr_agent])

print(">> Leduc Hold'em pre-trained model")

while (True):
    print(">> Start a new game")

    trajectories, payoffs = env.run(is_training=False)
    # If the human does not take the final action, we need to
    # print the other players' actions
    final_state = trajectories[0][-1][-2]
    action_record = final_state['action_record']
    state = final_state['raw_obs']
    _action_list = []
    for i in range(1, len(action_record)+1):
        if action_record[-i][0] == state['current_player']:
            break
        _action_list.insert(0, action_record[-i])
    for pair in _action_list:
        print('>> Player', pair[0], 'chooses', pair[1])

    # Let's take a look at the agent's card
    print('=============== CFR Agent ===============')
    print_card(env.get_perfect_information()['hand_cards'][1])

    print('=============== Result ===============')
    if payoffs[0] > 0:
        print('You win {} chips!'.format(payoffs[0]))
    elif payoffs[0] == 0:
        print('It is a tie.')
    else:
        print('You lose {} chips!'.format(-payoffs[0]))
    print('')

    input("Press any key to continue...")

Example output is as follows:

::

>> Leduc Hold'em pre-trained model

>> Start a new game!
>> Agent 1 chooses raise

=============== Community Card ===============
┌─────────┐
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
└─────────┘
=============== Your Hand ===============
┌─────────┐
│J │
│ │
│ │
│ ♥ │
│ │
│ │
│ J│
└─────────┘
=============== Chips ===============
Yours: +
Agent 1: +++
=========== Actions You Can Choose ===========
0: call, 1: raise, 2: fold

>> You choose action (integer):

We also provide a running demo of a rule-based agent for UNO. Try it by
running ``examples/uno_human.py``.

Leduc Hold’em as Single-Agent Environment
-----------------------------------------

We have wrapped the environment as a single-agent environment by assuming
that the other players play with pre-trained models. The interfaces are
exactly the same as OpenAI Gym, so any single-agent algorithm can be
connected to the environment. An example with Leduc Hold’em is shown below:

.. code:: python

import tensorflow as tf
import os
import numpy as np

import rlcard
from rlcard.agents import DQNAgent
from rlcard.agents import RandomAgent
from rlcard.utils import set_global_seed, tournament
from rlcard.utils import Logger

# Make environment
env = rlcard.make('leduc-holdem', config={'seed': 0, 'single_agent_mode':True})
eval_env = rlcard.make('leduc-holdem', config={'seed': 0, 'single_agent_mode':True})

# Set the iterations numbers and how frequently we evaluate/save plot
evaluate_every = 1000
evaluate_num = 10000
timesteps = 100000

# The initial memory size
memory_init_size = 1000

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/leduc_holdem_single_dqn_result/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[128,128])
    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    state = env.reset()

    for timestep in range(timesteps):
        action = agent.step(state)
        next_state, reward, done = env.step(action)
        ts = (state, action, reward, next_state, done)
        agent.feed(ts)
        state = next_state  # advance to the next state; otherwise the agent keeps acting on the first state

        if timestep % evaluate_every == 0:
            rewards = []
            eval_state = eval_env.reset()  # use a separate variable so the training state is not overwritten
            for _ in range(evaluate_num):
                action, _ = agent.eval_step(eval_state)
                eval_state, reward, done = eval_env.step(action)  # step the evaluation env, not the training env
                if done:
                    rewards.append(reward)
                    eval_state = eval_env.reset()
            logger.log_performance(env.timestep, np.mean(rewards))

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DQN')
    # Save model
    save_dir = 'models/leduc_holdem_single_dqn'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))

BIN
docs/source/img/framework.pdf View File


+ 28
- 0
docs/source/index.rst View File

@@ -0,0 +1,28 @@
.. Time Series Outlier Detection System documentation master file, created by
sphinx-quickstart on Wed Sep 9 22:52:15 2020.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.

Welcome to TODS's documentation!
================================================================

.. toctree::
:maxdepth: 4
:caption: Contents:



API Documents
==================
.. toctree::
:maxdepth: 4
:caption: API Documents:
tods.data_processing
tods.timeseries_processing
tods.feature_analysis
tods.detection_algorithm
tods.reinforcement

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

+ 7
- 0
docs/source/modules.rst View File

@@ -0,0 +1,7 @@
tods
====

.. toctree::
:maxdepth: 4

tods

+ 101
- 0
docs/source/overview.rst View File

@@ -0,0 +1,101 @@
Overview
========

Design Principles
~~~~~~~~~~~~~~~~~

The toolkit wraps each game in an ``Env`` class with easy-to-use
interfaces. The goal of this toolkit is to enable users to focus on
algorithm development without worrying about the environment. The
following design principles are applied when developing the toolkit:

* **Reproducible.** Results on the environments can be reproduced: the same result should be obtained with the same random seed in different runs.
* **Accessible.** The experiences are collected and well organized after each game with easy-to-use interfaces. Users can conveniently configure the state representation, action encoding, reward design, or even the game rules.
* **Scalable.** New card environments can be added to the toolkit conveniently by following the above design principles. We also try to minimize the dependencies in the toolkit so that the code can be easily maintained.

TODS High-level Design
~~~~~~~~~~~~~~~~~~~~~~~~

This document introduces the high-level design for the environments, the
games, and the agents (algorithms).

.. image:: img/framework.pdf
:width: 800



Data-Processing
---------------

We wrap each game with an ``Env`` class. The responsibility of ``Env``
is to help you generate trajectories of the games. For developing
Reinforcement Learning (RL) algorithms, we recommend using the
following interfaces:

- ``set_agents``: This function tells the ``Env`` which agents will be
used to perform actions in the game. Different games may have a
different number of agents. The input of the function is a list of
``Agent`` instances. For example,
``env.set_agents([RandomAgent(), RandomAgent()])`` indicates that two
random agents will be used to generate the trajectories.
- ``run``: After setting the agents, this interface will run a complete
trajectory of the game, calculate the reward for each transition, and
reorganize the data so that it can be directly fed into an RL
algorithm. A minimal sketch of these two interfaces follows this list.
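
The sketch assumes, as in the DQN examples elsewhere in this documentation,
that an agent is constructed with the number of actions of the environment;
``RandomAgent`` is used here only as a placeholder policy:

.. code:: python

import rlcard
from rlcard.agents import RandomAgent

# Make an environment and register two random agents
# (assumption: RandomAgent takes the number of actions, like DQNAgent in the examples)
env = rlcard.make('leduc-holdem')
env.set_agents([RandomAgent(env.action_num), RandomAgent(env.action_num)])

# Run one complete game: `trajectories` holds the collected transitions for
# each player and `payoffs` holds each player's final reward
trajectories, payoffs = env.run(is_training=False)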

For advanced access to the environment, such as traversal of the game
tree, we provide the following interfaces:

- ``step``: Given the current state, the environment takes one step
forward and returns the next state and the next player.
- ``step_back``: Takes one step backward. The environment will restore
to the last state. ``step_back`` is turned off by default since it
requires expensively recording previous states. To turn it on, set
``allow_step_back`` to ``True`` in the config when calling ``make``
(a sketch of this traversal pattern follows this list).
- ``get_payoffs``: At the end of the game, this function can be called
to obtain the payoffs for each player.
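
The fragment below is purely illustrative. It relies only on the behavior
described above (``step`` returns the next state and the next player,
``step_back`` restores the previous state) and assumes that the encoded
state exposes its legal actions, as the agents in the accompanying examples do:

.. code:: python

def expand(env, state):
    """Expand one level of the game tree using step/step_back."""
    for action in state['legal_actions']:   # assumption: legal actions are stored in the state
        next_state, next_player_id = env.step(action)   # go one ply deeper
        # ... evaluate or recurse on next_state here ...
        env.step_back()                                  # restore the state before `action`
    # At a terminal state, env.get_payoffs() returns each player's payoff.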

We also support single-agent mode and human mode. Examples can be found
in ``examples/``.

- Single-agent mode: single-agent environments are developed by
simulating the other players with pre-trained models or rule-based
models. You can enable single-agent mode with
``rlcard.make(ENV_ID, config={'single_agent_mode':True})``. Then the
``step`` function returns ``(next_state, reward, done)`` just as in
common single-agent environments, and ``env.reset()`` resets the game
and returns the first state (see the sketch below).
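
A minimal interaction loop in single-agent mode might look as follows; the
choice of ``RandomAgent`` and its constructor argument are illustrative
assumptions, and any single-agent algorithm can take its place:

.. code:: python

import rlcard
from rlcard.agents import RandomAgent

# The other players are simulated internally by pre-trained or rule-based models
env = rlcard.make('leduc-holdem', config={'single_agent_mode': True})
agent = RandomAgent(env.action_num)   # assumption: constructed with the number of actions

state = env.reset()   # returns the first state
done = False
while not done:
    action = agent.step(state)
    state, reward, done = env.step(action)   # gym-like (next_state, reward, done)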

Games
-----

Card games usually have similar structures. We abstract some concepts in
card games and follow the same design pattern. In this way,
users/developers can easily dig into the code and change the rules for
research purposes. Specifically, the following classes are used in all
the games:

- ``Game``: A game is defined as a complete sequence starting from one
of the non-terminal states to a terminal state.
- ``Round``: A round is a part of the sequence of a game. Most card
games can be naturally divided into multiple rounds.
- ``Dealer``: A dealer is responsible for shuffling and allocating a
deck of cards.
- ``Judger``: A judger is responsible for making major decisions at the
end of a round or a game.
- ``Player``: A player is a role who plays cards following a strategy.

To summarize, in one ``Game``, a ``Dealer`` deals the cards to each
``Player``. In each ``Round`` of the game, a ``Judger`` makes major
decisions about the next round and, at the end of the game, about the payoffs.
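
The skeleton below is a purely illustrative sketch of how these roles
interact; the class and method names are hypothetical and do not reflect the
toolkit's actual API:

.. code:: python

class Dealer:
    def deal(self, players):
        """Shuffle the deck and allocate cards to each player."""

class Judger:
    def judge_round(self, round_state):
        """Make the major decisions at the end of a round."""

    def judge_game(self, game_state):
        """Compute the payoffs at the end of the game."""

class Game:
    """A complete sequence from a starting state to a terminal state."""

    def __init__(self, dealer, judger, players, rounds):
        self.dealer, self.judger = dealer, judger
        self.players, self.rounds = players, rounds

    def play(self):
        self.dealer.deal(self.players)
        for round_state in self.rounds:      # a Round is one part of the game sequence
            # ... each Player acts according to its strategy ...
            self.judger.judge_round(round_state)
        return self.judger.judge_game(self)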

Agents
------

We provide examples of several representative algorithms and wrap them
as ``Agent`` classes to show how a learning algorithm can be connected to
the toolkit. The first example is DQN, a representative of the
Reinforcement Learning (RL) category. The second example is NFSP, a
representative of RL with self-play. We also provide CFR and DeepCFR,
which belong to the Counterfactual Regret Minimization (CFR) category.
Other algorithms from these three categories can be connected in similar
ways.

+ 69
- 0
docs/source/tods.data_processing.rst View File

@@ -0,0 +1,69 @@
tods.data\_processing package
=============================

Submodules
----------

tods.data\_processing.CategoricalToBinary module
------------------------------------------------

.. automodule:: tods.data_processing.CategoricalToBinary
:members:
:noindex:
:show-inheritance:

tods.data\_processing.ColumnFilter module
-----------------------------------------

.. automodule:: tods.data_processing.ColumnFilter
:members:
:noindex:
:show-inheritance:

tods.data\_processing.ContinuityValidation module
-------------------------------------------------

.. automodule:: tods.data_processing.ContinuityValidation
:members:
:noindex:
:show-inheritance:

tods.data\_processing.DatasetToDataframe module
-----------------------------------------------

.. automodule:: tods.data_processing.DatasetToDataframe
:members:
:noindex:
:show-inheritance:

tods.data\_processing.DuplicationValidation module
--------------------------------------------------

.. automodule:: tods.data_processing.DuplicationValidation
:members:
:noindex:
:show-inheritance:

tods.data\_processing.TimeIntervalTransform module
--------------------------------------------------

.. automodule:: tods.data_processing.TimeIntervalTransform
:members:
:noindex:
:show-inheritance:

tods.data\_processing.TimeStampValidation module
------------------------------------------------

.. automodule:: tods.data_processing.TimeStampValidation
:members:
:noindex:
:show-inheritance:

Module contents
---------------

.. automodule:: tods.data_processing
:members:
:noindex:
:show-inheritance:

+ 189
- 0
docs/source/tods.detection_algorithm.rst View File

@@ -0,0 +1,189 @@
tods.detection\_algorithm package
=================================

Submodules
----------

tods.detection\_algorithm.AutoRegODetect module
-----------------------------------------------

.. automodule:: tods.detection_algorithm.AutoRegODetect
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.DeepLog module
----------------------------------------

.. automodule:: tods.detection_algorithm.DeepLog
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.KDiscordODetect module
------------------------------------------------

.. automodule:: tods.detection_algorithm.KDiscordODetect
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.LSTMODetect module
--------------------------------------------

.. automodule:: tods.detection_algorithm.LSTMODetect
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.MatrixProfile module
----------------------------------------------

.. automodule:: tods.detection_algorithm.MatrixProfile
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PCAODetect module
-------------------------------------------

.. automodule:: tods.detection_algorithm.PCAODetect
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodABOD module
-----------------------------------------

.. automodule:: tods.detection_algorithm.PyodABOD
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodAE module
---------------------------------------

.. automodule:: tods.detection_algorithm.PyodAE
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodCBLOF module
------------------------------------------

.. automodule:: tods.detection_algorithm.PyodCBLOF
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodCOF module
----------------------------------------

.. automodule:: tods.detection_algorithm.PyodCOF
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodHBOS module
-----------------------------------------

.. automodule:: tods.detection_algorithm.PyodHBOS
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodIsolationForest module
----------------------------------------------------

.. automodule:: tods.detection_algorithm.PyodIsolationForest
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodKNN module
----------------------------------------

.. automodule:: tods.detection_algorithm.PyodKNN
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodLODA module
-----------------------------------------

.. automodule:: tods.detection_algorithm.PyodLODA
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodLOF module
----------------------------------------

.. automodule:: tods.detection_algorithm.PyodLOF
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodMoGaal module
-------------------------------------------

.. automodule:: tods.detection_algorithm.PyodMoGaal
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodOCSVM module
------------------------------------------

.. automodule:: tods.detection_algorithm.PyodOCSVM
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodSOD module
----------------------------------------

.. automodule:: tods.detection_algorithm.PyodSOD
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodSoGaal module
-------------------------------------------

.. automodule:: tods.detection_algorithm.PyodSoGaal
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.PyodVAE module
----------------------------------------

.. automodule:: tods.detection_algorithm.PyodVAE
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.Telemanom module
------------------------------------------

.. automodule:: tods.detection_algorithm.Telemanom
:members:
:noindex:
:show-inheritance:

tods.detection\_algorithm.UODBasePrimitive module
-------------------------------------------------

.. automodule:: tods.detection_algorithm.UODBasePrimitive
:members:
:noindex:
:show-inheritance:

Module contents
---------------

.. automodule:: tods.detection_algorithm
:members:
:noindex:
:show-inheritance:

+ 253
- 0
docs/source/tods.feature_analysis.rst View File

@@ -0,0 +1,253 @@
tods.feature\_analysis package
==============================

Submodules
----------

tods.feature\_analysis.AutoCorrelation module
---------------------------------------------

.. automodule:: tods.feature_analysis.AutoCorrelation
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.BKFilter module
--------------------------------------

.. automodule:: tods.feature_analysis.BKFilter
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.DiscreteCosineTransform module
-----------------------------------------------------

.. automodule:: tods.feature_analysis.DiscreteCosineTransform
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.FastFourierTransform module
--------------------------------------------------

.. automodule:: tods.feature_analysis.FastFourierTransform
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.HPFilter module
--------------------------------------

.. automodule:: tods.feature_analysis.HPFilter
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.NonNegativeMatrixFactorization module
------------------------------------------------------------

.. automodule:: tods.feature_analysis.NonNegativeMatrixFactorization
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.SKTruncatedSVD module
--------------------------------------------

.. automodule:: tods.feature_analysis.SKTruncatedSVD
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.SpectralResidualTransform module
-------------------------------------------------------

.. automodule:: tods.feature_analysis.SpectralResidualTransform
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalAbsEnergy module
--------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalAbsEnergy
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalAbsSum module
-----------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalAbsSum
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalGmean module
----------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalGmean
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalHmean module
----------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalHmean
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalKurtosis module
-------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalKurtosis
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalMaximum module
------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalMaximum
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalMean module
---------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalMean
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalMeanAbs module
------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalMeanAbs
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalMeanAbsTemporalDerivative module
------------------------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalMeanAbsTemporalDerivative
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalMeanTemporalDerivative module
---------------------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalMeanTemporalDerivative
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalMedian module
-----------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalMedian
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalMedianAbsoluteDeviation module
----------------------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalMedianAbsoluteDeviation
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalMinimum module
------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalMinimum
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalSkew module
---------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalSkew
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalStd module
--------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalStd
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalVar module
--------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalVar
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalVariation module
--------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalVariation
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalVecSum module
-----------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalVecSum
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalWillisonAmplitude module
----------------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalWillisonAmplitude
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.StatisticalZeroCrossing module
-----------------------------------------------------

.. automodule:: tods.feature_analysis.StatisticalZeroCrossing
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.TRMF module
----------------------------------

.. automodule:: tods.feature_analysis.TRMF
:members:
:noindex:
:show-inheritance:

tods.feature\_analysis.WaveletTransform module
----------------------------------------------

.. automodule:: tods.feature_analysis.WaveletTransform
:members:
:noindex:
:show-inheritance:

Module contents
---------------

.. automodule:: tods.feature_analysis
:members:
:noindex:
:show-inheritance:

+ 21
- 0
docs/source/tods.reinforcement.rst View File

@@ -0,0 +1,21 @@
tods.reinforcement package
==========================

Submodules
----------

tods.reinforcement.RuleBasedFilter module
-----------------------------------------

.. automodule:: tods.reinforcement.RuleBasedFilter
:members:
:noindex:
:show-inheritance:

Module contents
---------------

.. automodule:: tods.reinforcement
:members:
:noindex:
:show-inheritance:

+ 24
- 0
docs/source/tods.rst View File

@@ -0,0 +1,24 @@
tods package
============

Subpackages
-----------

.. toctree::
:maxdepth: 2

tods.data_processing
tods.detection_algorithm
tods.feature_analysis
tods.reinforcement
tods.searcher
tods.timeseries_processing


Module contents
---------------

.. automodule:: tods
:members:
:undoc-members:
:show-inheritance:

+ 37
- 0
docs/source/tods.searcher.rst View File

@@ -0,0 +1,37 @@
tods.searcher package
=====================

Subpackages
-----------

.. toctree::
:maxdepth: 4

tods.searcher.search

Submodules
----------

tods.searcher.schemas module
----------------------------

.. automodule:: tods.searcher.schemas
:members:
:undoc-members:
:show-inheritance:

tods.searcher.utils module
--------------------------

.. automodule:: tods.searcher.utils
:members:
:undoc-members:
:show-inheritance:

Module contents
---------------

.. automodule:: tods.searcher
:members:
:undoc-members:
:show-inheritance:

+ 21
- 0
docs/source/tods.searcher.search.rst View File

@@ -0,0 +1,21 @@
tods.searcher.search package
============================

Submodules
----------

tods.searcher.search.brute\_force\_search module
------------------------------------------------

.. automodule:: tods.searcher.search.brute_force_search
:members:
:noindex:
:show-inheritance:

Module contents
---------------

.. automodule:: tods.searcher.search
:members:
:noindex:
:show-inheritance:

+ 85
- 0
docs/source/tods.timeseries_processing.rst View File

@@ -0,0 +1,85 @@
tods.timeseries\_processing package
===================================

Submodules
----------

tods.timeseries\_processing.HoltSmoothing module
------------------------------------------------

.. automodule:: tods.timeseries_processing.HoltSmoothing
:members:
:noindex:
:show-inheritance:

tods.timeseries\_processing.HoltWintersExponentialSmoothing module
------------------------------------------------------------------

.. automodule:: tods.timeseries_processing.HoltWintersExponentialSmoothing
:members:
:noindex:
:show-inheritance:

tods.timeseries\_processing.MovingAverageTransform module
---------------------------------------------------------

.. automodule:: tods.timeseries_processing.MovingAverageTransform
:members:
:noindex:
:show-inheritance:

tods.timeseries\_processing.SKAxiswiseScaler module
---------------------------------------------------

.. automodule:: tods.timeseries_processing.SKAxiswiseScaler
:members:
:noindex:
:show-inheritance:

tods.timeseries\_processing.SKPowerTransformer module
-----------------------------------------------------

.. automodule:: tods.timeseries_processing.SKPowerTransformer
:members:
:noindex:
:show-inheritance:

tods.timeseries\_processing.SKQuantileTransformer module
--------------------------------------------------------

.. automodule:: tods.timeseries_processing.SKQuantileTransformer
:members:
:noindex:
:show-inheritance:

tods.timeseries\_processing.SKStandardScaler module
---------------------------------------------------

.. automodule:: tods.timeseries_processing.SKStandardScaler
:members:
:noindex:
:show-inheritance:

tods.timeseries\_processing.SimpleExponentialSmoothing module
-------------------------------------------------------------

.. automodule:: tods.timeseries_processing.SimpleExponentialSmoothing
:members:
:noindex:
:show-inheritance:

tods.timeseries\_processing.TimeSeriesSeasonalityTrendDecomposition module
--------------------------------------------------------------------------

.. automodule:: tods.timeseries_processing.TimeSeriesSeasonalityTrendDecomposition
:members:
:noindex:
:show-inheritance:

Module contents
---------------

.. automodule:: tods.timeseries_processing
:members:
:noindex:
:show-inheritance:

+ 23
- 0
examples/evaluate_default_pipeline.py View File

@@ -0,0 +1,23 @@
import pandas as pd

from searcher import schemas as schemas_utils
from searcher.utils import generate_dataset_problem, evaluate_pipeline

table_path = 'datasets/yahoo_sub_5.csv'
target_index = 6 # what column is the target
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
time_limit = 30 # How many seconds you wanna search
#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # F1 on both label 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Load the default pipeline
pipeline = schemas_utils.load_default_pipeline()

# Run the pipeline
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline)
print(pipeline_result)


+ 50
- 24
examples/run_automl.py View File

@@ -1,33 +1,59 @@
import uuid
import random
import pandas as pd
from pprint import pprint
from sklearn.datasets import make_classification

from d3m import container
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

from axolotl.utils import data_problem
from axolotl.backend.simple import SimpleRunner
from axolotl.backend.ray import RayRunner
from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils

import tods
from tods.search import BruteForceSearch
from searcher.utils import generate_dataset_problem
from searcher.search import BruteForceSearch

table_path = 'datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv'
df = pd.read_csv(table_path)
dataset, problem_description = data_problem.generate_dataset_problem(df,
target_index=3,
task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
performance_metrics=[{'metric': PerformanceMetric.F1}])
# Some information
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # The path of the dataset
#target_index = 2 # what column is the target

table_path = 'datasets/yahoo_sub_5.csv'
target_index = 6 # what column is the target
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset
time_limit = 30 # How many seconds you wanna search

print(dataset)
print(problem_description)
#metric = 'F1' # F1 on label 1
metric = 'F1_MACRO' # F1 on both label 0 and 1

# Read data and generate dataset and problem
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Start backend
backend = SimpleRunner(random_seed=42)

backend = SimpleRunner(random_seed=0)
# Start search algorithm
search = BruteForceSearch(problem_description=problem_description, backend=backend)
print(search)

# Find the best pipeline
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
best_pipeline = best_runtime.pipeline
best_output = best_pipeline_result.output

# Evaluate the best pipeline
best_scores = search.evaluate(best_pipeline).scores


print('*' * 52)
print('Search History:')
for pipeline_result in search.history:
print('-' * 52)
print('Pipeline id:', pipeline_result.pipeline.id)
print(pipeline_result.scores)
print('*' * 52)

print('')

print('*' * 52)
print('Best pipeline:')
print('-' * 52)
print('Pipeline id:', best_pipeline.id)
print('Pipeline json:', best_pipeline.to_json())
print('Output:')
print(best_output)
print('Scores:')
print(best_scores)
print('*' * 52)


+ 0
- 51
examples/run_predefined_pipeline.py View File

@@ -1,51 +0,0 @@
import uuid
import random
import pandas as pd
import json
from pprint import pprint
from sklearn.datasets import make_classification

from d3m import container
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

from axolotl.utils import data_problem
from axolotl.backend.simple import SimpleRunner
# from axolotl.backend.ray import RayRunner
# from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils

import tods
from tods.search import BruteForceSearch

table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv'
df = pd.read_csv(table_path)
dataset, problem_description = data_problem.generate_dataset_problem(df,
target_index=7,
task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
performance_metrics=[{'metric': PerformanceMetric.F1}])

print(dataset)
print(problem_description)

metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}},
]

pipeline_path = 'example_pipeline.json'
pipeline = pipeline_utils.load_pipeline(pipeline_path)
print(pipeline)

data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
scoring_pipeline = schemas_utils.get_scoring_pipeline()
data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

backend = SimpleRunner(random_seed=0)
pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
pipeline=pipeline,
input_data=[dataset],
metrics=metrics,
data_preparation_pipeline=data_preparation_pipeline,
scoring_pipeline=scoring_pipeline,
data_preparation_params=data_preparation_params)
print(pipeline_result)


+ 2
- 2
requirements.txt View File

@@ -1,4 +1,4 @@
scikit-learn==0.21.3
scikit-learn==0.22.0
pytypes==1.0b5
frozendict==1.2
numpy>=1.15.4,<=1.18.1
@@ -9,7 +9,7 @@ rfc3987==1.3.8
webcolors>=1.8.1,<=1.10
dateparser>=0.7.0,<=0.7.2
python-dateutil==2.8.1
pandas==0.23.4
pandas==0.25
typing-inspect==0.5.0
GitPython==3.1.0
jsonpath-ng==1.4.3


+ 21
- 0
setup.py View File

@@ -0,0 +1,21 @@
from distutils.command.sdist import sdist as sdist_orig
from distutils.errors import DistutilsExecError

from setuptools import setup, find_packages


class install(sdist_orig):
def run(self):
try:
self.spawn(['sh', '.install.sh'])
except DistutilsExecError:
self.warn('lost installation script')
super().run()


setup(name='tods',
version='0.0.1',
cmdclass={
'install': install
},
)

+ 2
- 2
tods/common-primitives/sklearn-wrap/requirements.txt View File

@@ -1,4 +1,4 @@
scikit-learn==0.21.3
scikit-learn==0.22.0
pytypes==1.0b5
frozendict==1.2
numpy>=1.15.4,<=1.18.1
@@ -9,7 +9,7 @@ rfc3987==1.3.8
webcolors>=1.8.1,<=1.10
dateparser>=0.7.0,<=0.7.2
python-dateutil==2.8.1
pandas==0.23.4
pandas==0.25
typing-inspect==0.5.0
GitPython>=2.1.11,<=3.0.5
jsonpath-ng==1.4.3


+ 1
- 1
tods/common-primitives/sklearn-wrap/setup.py View File

@@ -25,7 +25,7 @@ setup(
'd3m',
'Jinja2==2.9.4',
'simplejson==3.12.0',
'scikit-learn==0.21.3',
'scikit-learn==0.22.0',
],
url='https://gitlab.datadrivendiscovery.org/jpl/sklearn-wrapping',
entry_points = {


+ 0
- 2
tods/entry_points.ini View File

@@ -68,8 +68,6 @@ tods.detection_algorithm.pyod_mogaal = detection_algorithm.PyodMoGaal:Mo_GaalPri

tods.detection_algorithm.matrix_profile = detection_algorithm.MatrixProfile:MatrixProfile
tods.detection_algorithm.AutoRegODetector = detection_algorithm.AutoRegODetect:AutoRegODetector
tods.detection_algorithm.KDiscordDetector = detection_algorithm.KDiscordODetect:KDiscordDetector
tods.detection_algorithm.PCADetector = detection_algorithm.PCAODetect:PCADetector

tods.detection_algorithm.LSTMODetector = detection_algorithm.LSTMODetect:LSTMODetector
tods.detection_algorithm.AutoRegODetector = detection_algorithm.AutoRegODetect:AutoRegODetector


+ 2
- 2
tods/requirements.txt View File

@@ -1,4 +1,4 @@
scikit-learn==0.21.3
scikit-learn==0.22.0
pytypes==1.0b5
frozendict==1.2
numpy>=1.15.4,<=1.18.1
@@ -9,7 +9,7 @@ rfc3987==1.3.8
webcolors>=1.8.1,<=1.10
dateparser>=0.7.0,<=0.7.2
python-dateutil==2.8.1
pandas==0.23.4
pandas==0.25.0
typing-inspect==0.5.0
GitPython>=2.1.11,<=3.0.5
jsonpath-ng==1.4.3


tods/tods/__init__.py → tods/searcher/__init__.py View File


+ 1
- 0
tods/searcher/resources/default_pipeline.json View File

@@ -0,0 +1 @@
{"id": "384bbfab-4f6d-4001-9f90-684ea5681f5d", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-09-09T23:40:01.756164Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "b94ee59ccf8db678d506adddbc238fb2049fb664a1e3f3f3f6a6517c0c4f8e5f"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "256f0155c7185d747b3b23096e46c40d15844106f9ed6346453f6010891f1896"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.4.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "85fe81066e85dbb62eacbe8a96be52d08e7aec22a025a29c81feaaaa72d7f7d0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "642de2e7-5590-3cab-9266-2a53c326c461", "version": "0.0.1", "python_path": "d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler", "name": "Axis_wise_scale"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "eaff2f35-978c-4530-a12e-061a5f0beacd", "version": "0.1.0", "python_path": "d3m.primitives.tods.feature_analysis.statistical_mean", "name": "Time Series Decompostional", "digest": "2f2a8c07878643fe29c346096b91b5ba91477baa1e7e78684f07e53d29766ca4"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "67e7fcdf-d645-3417-9aa4-85cd369487d9", "version": "0.0.1", "python_path": "d3m.primitives.tods.detection_algorithm.pyod_vae", "name": "TODS.anomaly_detection_primitives.VariationalAutoEncoder"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": 
"d5384857f75090844f367504befb1a854e5088589f6aae0795f66ccf10403e19"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "8d969800816d9596e94cb045aacce43dc3d49e8c5bedb403e35af6c9b8339990"}

+ 10
- 0
tods/searcher/schemas.py View File

@@ -0,0 +1,10 @@
import os

resource_dir = os.path.dirname(__file__)

DEFAULT_PIPELINE_DIR = os.path.join(resource_dir, 'resources', 'default_pipeline.json')

def load_default_pipeline():
from axolotl.utils import pipeline as pipeline_utils
pipeline = pipeline_utils.load_pipeline(DEFAULT_PIPELINE_DIR)
return pipeline

tods/tods/search/__init__.py → tods/searcher/search/__init__.py View File


+ 294
- 0
tods/searcher/search/brute_force_search.py View File

@@ -0,0 +1,294 @@
# A Brute-Force Search
import uuid
import random

from d3m.metadata.pipeline import Pipeline

from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import schemas as schemas_utils

class BruteForceSearch(PipelineSearchBase):
def __init__(self, problem_description, backend, *, primitives_blocklist=None, ranking_function=None):
super().__init__(problem_description=problem_description, backend=backend,
primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)
if self.ranking_function is None:
self.ranking_function = _rank_first_metric

# Find the candidates
self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords'])
self.available_pipelines = self._return_pipelines(
self.task_description['task_type'], self.task_description['task_subtype'], self.task_description['data_types'])
self.metrics = self.problem_description['problem']['performance_metrics']
self.data_preparation_pipeline = _generate_data_preparation_pipeline()
self.scoring_pipeline = _generate_scoring_pipeline()
self.data_preparation_params = _generate_data_preparation_params()

self.current_pipeline_index = 0
self.offset = 1

def evaluate(self, pipeline_to_eval, input_data=None):
if input_data is None:
input_data = self.input_data
pipeline_result = self.backend.evaluate_pipeline(
problem_description=self.problem_description,
pipeline=pipeline_to_eval,
input_data=input_data,
metrics=self.metrics,
data_preparation_pipeline=self.data_preparation_pipeline,
scoring_pipeline=self.scoring_pipeline,
data_preparation_params=self.data_preparation_params)
return pipeline_result

def _search(self, time_left):
# Read all the pipelines to be evaluated
pipelines_to_eval = self.available_pipelines[self.current_pipeline_index: self.current_pipeline_index+self.offset]
self.current_pipeline_index += 1
pipeline_results = self.backend.evaluate_pipelines(
problem_description=self.problem_description,
pipelines=pipelines_to_eval,
input_data=self.input_data,
metrics=self.metrics,
data_preparation_pipeline=self.data_preparation_pipeline,
scoring_pipeline=self.scoring_pipeline,
data_preparation_params=self.data_preparation_params)

# DEBUG
####################
for pipeline_result in pipeline_results:
try:
for error in pipeline_result.error:
if error is not None:
raise error
except:
import traceback
traceback.print_exc()
####################

return [self.ranking_function(pipeline_result) for pipeline_result in pipeline_results]

def _return_pipelines(self, task_type, task_subtype, data_type):
pipeline_candidates = _generate_pipelines(primitive_python_paths)
return pipeline_candidates

primitive_python_paths = {
'data_processing': [
#'d3m.primitives.tods.data_processing.time_interval_transform',
#'d3m.primitives.tods.data_processing.categorical_to_binary',
'd3m.primitives.tods.data_processing.column_filter',
#'d3m.primitives.tods.data_processing.timestamp_validation',
#'d3m.primitives.tods.data_processing.duplication_validation',
#'d3m.primitives.tods.data_processing.continuity_validation',
],
'timeseries_processing': [
'd3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler',
'd3m.primitives.tods.timeseries_processing.transformation.standard_scaler',
'd3m.primitives.tods.timeseries_processing.transformation.power_transformer',
'd3m.primitives.tods.timeseries_processing.transformation.quantile_transformer',
'd3m.primitives.tods.timeseries_processing.transformation.moving_average_transform',
'd3m.primitives.tods.timeseries_processing.transformation.simple_exponential_smoothing',
#'d3m.primitives.tods.timeseries_processing.transformation.holt_smoothing',
#'d3m.primitives.tods.timeseries_processing.transformation.holt_winters_exponential_smoothing',
#'d3m.primitives.tods.timeseries_processing.decomposition.time_series_seasonality_trend_decomposition',
],
'feature_analysis': [
#'d3m.primitives.tods.feature_analysis.auto_correlation',
'd3m.primitives.tods.feature_analysis.statistical_mean',
'd3m.primitives.tods.feature_analysis.statistical_median',
'd3m.primitives.tods.feature_analysis.statistical_g_mean',
'd3m.primitives.tods.feature_analysis.statistical_abs_energy',
'd3m.primitives.tods.feature_analysis.statistical_abs_sum',
'd3m.primitives.tods.feature_analysis.statistical_h_mean',
'd3m.primitives.tods.feature_analysis.statistical_maximum',
#'d3m.primitives.tods.feature_analysis.statistical_minimum',
#'d3m.primitives.tods.feature_analysis.statistical_mean_abs',
#'d3m.primitives.tods.feature_analysis.statistical_mean_abs_temporal_derivative',
#'d3m.primitives.tods.feature_analysis.statistical_mean_temporal_derivative',
#'d3m.primitives.tods.feature_analysis.statistical_median_abs_deviation',
#'d3m.primitives.tods.feature_analysis.statistical_kurtosis',
#'d3m.primitives.tods.feature_analysis.statistical_skew',
#'d3m.primitives.tods.feature_analysis.statistical_std',
#'d3m.primitives.tods.feature_analysis.statistical_var',
#'d3m.primitives.tods.feature_analysis.statistical_variation',
#'d3m.primitives.tods.feature_analysis.statistical_vec_sum',
#'d3m.primitives.tods.feature_analysis.statistical_willison_amplitude',
#'d3m.primitives.tods.feature_analysis.statistical_zero_crossing',
#'d3m.primitives.tods.feature_analysis.spectral_residual_transform',
#'d3m.primitives.tods.feature_analysis.fast_fourier_transform',
#'d3m.primitives.tods.feature_analysis.discrete_cosine_transform',
#'d3m.primitives.tods.feature_analysis.non_negative_matrix_factorization',
#'d3m.primitives.tods.feature_analysis.bk_filter',
#'d3m.primitives.tods.feature_analysis.hp_filter',
#'d3m.primitives.tods.feature_analysis.truncated_svd',
#'d3m.primitives.tods.feature_analysis.wavelet_transform',
#'d3m.primitives.tods.feature_analysis.trmf',
],
'detection_algorithm': [
'd3m.primitives.tods.detection_algorithm.pyod_ae',
'd3m.primitives.tods.detection_algorithm.pyod_vae',
'd3m.primitives.tods.detection_algorithm.pyod_cof',
'd3m.primitives.tods.detection_algorithm.pyod_sod',
'd3m.primitives.tods.detection_algorithm.pyod_abod',
'd3m.primitives.tods.detection_algorithm.pyod_hbos',
'd3m.primitives.tods.detection_algorithm.pyod_iforest',
#'d3m.primitives.tods.detection_algorithm.pyod_lof',
#'d3m.primitives.tods.detection_algorithm.pyod_knn',
#'d3m.primitives.tods.detection_algorithm.pyod_ocsvm',
#'d3m.primitives.tods.detection_algorithm.pyod_loda',
#'d3m.primitives.tods.detection_algorithm.pyod_cblof',
#'d3m.primitives.tods.detection_algorithm.pyod_sogaal',
#'d3m.primitives.tods.detection_algorithm.pyod_mogaal',
#'d3m.primitives.tods.detection_algorithm.matrix_profile',
#'d3m.primitives.tods.detection_algorithm.AutoRegODetector',
#'d3m.primitives.tods.detection_algorithm.LSTMODetector',
#'d3m.primitives.tods.detection_algorithm.AutoRegODetector',
#'d3m.primitives.tods.detection_algorithm.PCAODetector',
#'d3m.primitives.tods.detection_algorithm.KDiscordODetector',
#'d3m.primitives.tods.detection_algorithm.deeplog',
#'d3m.primitives.tods.detection_algorithm.telemanom',
],
'contamination': [0.01, 0.02, 0.05, 0.07, 0.1, 0.15, 0.2],
}


def _rank_first_metric(pipeline_result):
if pipeline_result.status == 'COMPLETED':
scores = pipeline_result.scores
pipeline_result.rank = -scores['value'][0]
return pipeline_result
else:
# error
pipeline_result.rank = 1
return pipeline_result

def _generate_data_preparation_params():
from axolotl.utils import schemas as schemas_utils
data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
return data_preparation_params
def _generate_scoring_pipeline():
from axolotl.utils import schemas as schemas_utils
scoring_pipeline = schemas_utils.get_scoring_pipeline()
return scoring_pipeline
def _generate_data_preparation_pipeline():
from axolotl.utils import schemas as schemas_utils
data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
return data_preparation_pipeline

def _generate_pipline(combinations):
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

piplines = []
for combination in combinations:
# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')
# The first three steps are fixed
# Step 0: dataset_to_dataframe
step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: extract_columns_by_semantic_types(attributes)
step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
step_2.add_output('produce')
step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
pipeline_description.add_step(step_2)

# Step 3: extract_columns_by_semantic_types(targets)
step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_3.add_output('produce')
step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
pipeline_description.add_step(step_3)

attributes = 'steps.2.produce'
targets = 'steps.3.produce'

tods_step_4 = PrimitiveStep(primitive=index.get_primitive(combination[0]))
tods_step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
tods_step_4.add_output('produce')
pipeline_description.add_step(tods_step_4)

tods_step_5 = PrimitiveStep(primitive=index.get_primitive(combination[1]))
tods_step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
tods_step_5.add_output('produce')
pipeline_description.add_step(tods_step_5)

tods_step_6= PrimitiveStep(primitive=index.get_primitive(combination[2]))
tods_step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
tods_step_6.add_output('produce')
tods_step_6.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=combination[3])
pipeline_description.add_step(tods_step_6)

#tods_step_7 = PrimitiveStep(primitive=index.get_primitive(combination[3]))
#tods_step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce')
#tods_step_7.add_output('produce')
#pipeline_description.add_step(tods_step_7)

# Finalize the pipeline
final_step = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common'))
final_step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce')
final_step.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
final_step.add_output('produce')
pipeline_description.add_step(final_step)

pipeline_description.add_output(name='output predictions', data_reference='steps.7.produce')
pipeline_description.id = str(uuid.uuid4())
pipeline_description.created = Pipeline().created

piplines.append(pipeline_description)
return piplines

def _generate_pipelines(primitive_python_paths, cpu_count=40):
"""
Args:
primitive_python_paths: a dict mapping module names to lists of primitive Python paths
Returns:
a list of pipeline descriptions
"""
import itertools
import multiprocessing as mp

#components = ['data_processing', 'timeseries_processing', 'feature_analysis', 'detection_algorithm']
components = ['timeseries_processing', 'feature_analysis', 'detection_algorithm', 'contamination']
combinations = itertools.product(*(primitive_python_paths[k] for k in components))


return _generate_pipline(combinations)
#pipelines = []

## Allocate tasks
#combination_each_core_list = [[] for i in range(cpu_count)]
#for idx, combination in enumerate(combinations):
# core = idx % cpu_count
# combination_each_core_list[core].append(combination)

## Obtain all the pipelines
#pool = mp.Pool(processes=cpu_count)
#results = [pool.apply_async(_generate_pipline,
# args=(combinations,))
# for combinations in combination_each_core_list]
#piplines = []
#for p in results:
# piplines.extend(p.get())

#return piplines  # unreachable: kept with the commented-out multiprocessing variant above

+ 59
- 0
tods/searcher/tods/utils.py View File

@@ -0,0 +1,59 @@

def generate_dataset_problem(df, target_index, metric):
"""
A wrapper for generating dataset and problem

Args:
df (pandas.DataFrame): dataset
target_index (int): The column index of the target
metric (str): `F1` for computing F1 on label 1, `F1_MACRO` for
macro F1 on both label 0 and 1

Returns:
dataset, problem
"""
from axolotl.utils import data_problem
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

if metric == 'F1':
performance_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
elif metric == 'F1_MACRO':
performance_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}]
else:
raise ValueError('The metric {} not supported.'.format(metric))
dataset, problem_description = data_problem.generate_dataset_problem(df,
target_index=target_index,
task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
performance_metrics=performance_metrics)

return dataset, problem_description

def evaluate_pipeline(problem_description, dataset, pipeline):
from axolotl.utils import schemas as schemas_utils
from axolotl.backend.simple import SimpleRunner
data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
scoring_pipeline = schemas_utils.get_scoring_pipeline()
data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
metrics = problem_description['problem']['performance_metrics']

backend = SimpleRunner(random_seed=0)
pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
pipeline=pipeline,
input_data=[dataset],
metrics=metrics,
data_preparation_pipeline=data_preparation_pipeline,
scoring_pipeline=scoring_pipeline,
data_preparation_params=data_preparation_params)
try:
for error in pipeline_result.error:
if error is not None:
raise error
except:
import traceback
traceback.print_exc()

return pipeline_result



+ 51
- 0
tods/searcher/utils.py View File

@@ -0,0 +1,51 @@

def generate_dataset_problem(df, target_index, metric):
"""
A wrapper for generating dataset and problem

Args:
df (pandas.DataFrame): dataset
target_index (int): The column index of the target
metric (str): `F1` for computing F1 on label 1, `F1_MACRO` for
macro F1 on both label 0 and 1

Returns:
dataset, problem
"""
from axolotl.utils import data_problem
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

if metric == 'F1':
performance_metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
elif metric == 'F1_MACRO':
performance_metrics = [{'metric': PerformanceMetric.F1_MACRO, 'params': {}}]
else:
raise ValueError('The metric {} not supported.'.format(metric))
dataset, problem_description = data_problem.generate_dataset_problem(df,
target_index=target_index,
task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
performance_metrics=performance_metrics)

return dataset, problem_description

def evaluate_pipeline(problem_description, dataset, pipeline):
from axolotl.utils import schemas as schemas_utils
from axolotl.backend.simple import SimpleRunner
data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
scoring_pipeline = schemas_utils.get_scoring_pipeline()
data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
metrics = problem_description['problem']['performance_metrics']

backend = SimpleRunner(random_seed=0)
pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
pipeline=pipeline,
input_data=[dataset],
metrics=metrics,
data_preparation_pipeline=data_preparation_pipeline,
scoring_pipeline=scoring_pipeline,
data_preparation_params=data_preparation_params)
return pipeline_result



+ 1
- 1
tods/setup.py View File

@@ -29,7 +29,7 @@ setup(
'd3m',
'Jinja2',
'simplejson==3.12.0',
'scikit-learn==0.21.3',
'scikit-learn==0.22.0',
'statsmodels==0.11.1',
'PyWavelets>=1.1.1',
'tensorflow', # should be removed later


+ 0
- 36
tods/tods/search/brute_force_search.py View File

@@ -1,36 +0,0 @@
# A Brute-Force Search
import uuid

from d3m.metadata.pipeline import Pipeline

from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils

def random_rank(pipeline_result):
if pipeline_result.status == 'COMPLETED':
pipeline_result.rank = random.uniform(0, 1)
return pipeline_result

class BruteForceSearch(PipelineSearchBase):
def __init__(self, problem_description, backend, *, primitives_blocklist=None, ranking_function=None):
super().__init__(problem_description=problem_description, backend=backend,
primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)
if self.ranking_function is None:
self.ranking_function = random_rank

# Find th candidates
self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords'])
print('task_description:', self.task_description)
self.available_pipelines = self._return_pipelines(
self.task_description['task_type'], self.task_description['task_subtype'], self.task_description['data_types'])
print('available_pipelines:', self.available_pipelines)

def _return_pipelines(self, task_type, task_subtype, data_type):
pipeline_candidates = []
for pipeline_dict in schemas_utils.get_pipelines_db()['CLASSIFICATION']:
pipeline = pipeline_utils.load_pipeline(pipeline_dict)
pipeline.id = str(uuid.uuid4())
pipeline.created = Pipeline().created
pipeline_candidates.append(pipeline)

return pipeline_candidates
