
Merge branch 'dev' of https://github.com/datamllab/tods into dev

YileAllenChen1 committed 4 years ago · commit d4ed084013 · branch: master
21 changed files with 38836 additions and 0 deletions

  1. +71    -0  datasets/anomaly/yahoo_system_sub_5/SCORE/dataset_TEST/datasetDoc.json
  2. +1401  -0  datasets/anomaly/yahoo_system_sub_5/SCORE/dataset_TEST/tables/learningData.csv
  3. +5601  -0  datasets/anomaly/yahoo_system_sub_5/SCORE/problem_TEST/dataSplits.csv
  4. +65    -0  datasets/anomaly/yahoo_system_sub_5/SCORE/problem_TEST/problemDoc.json
  5. +0     -0  datasets/anomaly/yahoo_system_sub_5/SCORE/targets.csv
  6. +71    -0  datasets/anomaly/yahoo_system_sub_5/TEST/dataset_TEST/datasetDoc.json
  7. +1401  -0  datasets/anomaly/yahoo_system_sub_5/TEST/dataset_TEST/tables/learningData.csv
  8. +5601  -0  datasets/anomaly/yahoo_system_sub_5/TEST/problem_TEST/dataSplits.csv
  9. +65    -0  datasets/anomaly/yahoo_system_sub_5/TEST/problem_TEST/problemDoc.json
 10. +71    -0  datasets/anomaly/yahoo_system_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json
 11. +5601  -0  datasets/anomaly/yahoo_system_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv
 12. +5601  -0  datasets/anomaly/yahoo_system_sub_5/TRAIN/problem_TRAIN/dataSplits.csv
 13. +65    -0  datasets/anomaly/yahoo_system_sub_5/TRAIN/problem_TRAIN/problemDoc.json
 14. +71    -0  datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_dataset/datasetDoc.json
 15. +7001  -0  datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_dataset/tables/learningData.csv
 16. +5601  -0  datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_problem/dataSplits.csv
 17. +65    -0  datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_problem/problemDoc.json
 18. +72    -0  examples/build_Ensemble.py
 19. +48    -0  examples/run_pipeline_ensemble.py
 20. +363   -0  tods/detection_algorithm/Ensemble.py
 21. +1     -0  tods/resources/.entry_points.ini

+71 -0  datasets/anomaly/yahoo_system_sub_5/SCORE/dataset_TEST/datasetDoc.json

@@ -0,0 +1,71 @@
{
"about": {
"datasetID": "yahoo_system_sub_5_dataset_TEST",
"datasetName": "NULL",
"description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'",
"citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
"license": " CC Public Domain Mark 1.0 ",
"source": "OpenML",
"sourceURI": "http://www.openml.org/d/185",
"approximateSize": "",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "4.0.0"
},
"dataResources": [
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "timestamp",
"colType": "integer",
"role": [
"attribute"
]
},
{
"colIndex": 2,
"colName": "value_0",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 3,
"colName": "system_id",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 4,
"colName": "ground_truth",
"colType": "integer",
"role": [
"suggestedTarget"
]
}
],
"columnsCount": 5
}
]
}
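
For orientation, the schema above maps onto the CSV as follows. This is a minimal pandas sketch (not part of the commit), assuming the repository root as the working directory:

import pandas as pd

# Path taken from the datasetDoc above; adjust to your checkout location.
df = pd.read_csv("datasets/anomaly/yahoo_system_sub_5/SCORE/dataset_TEST/tables/learningData.csv")

# Column roles per datasetDoc.json: d3mIndex (index), timestamp/value_0/system_id (attributes),
# ground_truth (suggestedTarget, colIndex 4).
features = df[["timestamp", "value_0", "system_id"]]
labels = df["ground_truth"]
print(features.shape, labels.value_counts())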

+1401 -0  datasets/anomaly/yahoo_system_sub_5/SCORE/dataset_TEST/tables/learningData.csv (file diff suppressed because it is too large)


+5601 -0  datasets/anomaly/yahoo_system_sub_5/SCORE/problem_TEST/dataSplits.csv (file diff suppressed because it is too large)


+65 -0  datasets/anomaly/yahoo_system_sub_5/SCORE/problem_TEST/problemDoc.json

@@ -0,0 +1,65 @@
{
"about": {
"problemID": "yahoo_system_sub_5_problem",
"problemName": "yahoo_system_sub_5_problem",
"problemDescription": "Anomaly detection",
"problemVersion": "4.0.0",
"problemSchemaVersion": "4.0.0",
"taskKeywords": [
"classification",
"binary",
"tabular"
]
},
"inputs": {
"data": [
{
"datasetID": "yahoo_system_sub_5_dataset",
"targets": [
{
"targetIndex": 0,
"resID": "learningData",
"colIndex": 4,
"colName": "ground_truth"
}
]
}
],
"dataSplits": {
"method": "holdOut",
"testSize": 0.2,
"stratified": true,
"numRepeats": 0,
"randomSeed": 42,
"splitsFile": "dataSplits.csv",
"datasetViewMaps": {
"train": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_TRAIN"
}
],
"test": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_TEST"
}
],
"score": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_SCORE"
}
]
}
},
"performanceMetrics": [
{
"metric": "f1Macro"
}
]
},
"expectedOutputs": {
"predictionsFile": "predictions.csv"
}
}
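
The dataSplits block above declares a stratified 80/20 hold-out with seed 42 and maps the base dataset to its TRAIN/TEST/SCORE views. The D3M runtime derives the actual split from dataSplits.csv; the sketch below (not part of the commit) only mirrors those settings with scikit-learn for illustration:

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_dataset/tables/learningData.csv")

train_df, test_df = train_test_split(
    df,
    test_size=0.2,                  # "testSize": 0.2
    stratify=df["ground_truth"],    # "stratified": true
    random_state=42,                # "randomSeed": 42
)
print(len(train_df), len(test_df))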

+0 -0  datasets/anomaly/yahoo_system_sub_5/SCORE/targets.csv


+71 -0  datasets/anomaly/yahoo_system_sub_5/TEST/dataset_TEST/datasetDoc.json

@@ -0,0 +1,71 @@
{
"about": {
"datasetID": "yahoo_system_sub_5_dataset_TEST",
"datasetName": "NULL",
"description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'",
"citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
"license": " CC Public Domain Mark 1.0 ",
"source": "OpenML",
"sourceURI": "http://www.openml.org/d/185",
"approximateSize": "",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "4.0.0"
},
"dataResources": [
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "timestamp",
"colType": "integer",
"role": [
"attribute"
]
},
{
"colIndex": 2,
"colName": "value_0",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 3,
"colName": "system_id",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 4,
"colName": "ground_truth",
"colType": "integer",
"role": [
"suggestedTarget"
]
}
],
"columnsCount": 5
}
]
}

+1401 -0  datasets/anomaly/yahoo_system_sub_5/TEST/dataset_TEST/tables/learningData.csv (file diff suppressed because it is too large)


+5601 -0  datasets/anomaly/yahoo_system_sub_5/TEST/problem_TEST/dataSplits.csv (file diff suppressed because it is too large)


+65 -0  datasets/anomaly/yahoo_system_sub_5/TEST/problem_TEST/problemDoc.json

@@ -0,0 +1,65 @@
{
"about": {
"problemID": "yahoo_system_sub_5_problem",
"problemName": "yahoo_system_sub_5_problem",
"problemDescription": "Anomaly detection",
"problemVersion": "4.0.0",
"problemSchemaVersion": "4.0.0",
"taskKeywords": [
"classification",
"binary",
"tabular"
]
},
"inputs": {
"data": [
{
"datasetID": "yahoo_system_sub_5_dataset",
"targets": [
{
"targetIndex": 0,
"resID": "learningData",
"colIndex": 4,
"colName": "ground_truth"
}
]
}
],
"dataSplits": {
"method": "holdOut",
"testSize": 0.2,
"stratified": true,
"numRepeats": 0,
"randomSeed": 42,
"splitsFile": "dataSplits.csv",
"datasetViewMaps": {
"train": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_TRAIN"
}
],
"test": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_TEST"
}
],
"score": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_SCORE"
}
]
}
},
"performanceMetrics": [
{
"metric": "f1Macro"
}
]
},
"expectedOutputs": {
"predictionsFile": "predictions.csv"
}
}

+71 -0  datasets/anomaly/yahoo_system_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json

@@ -0,0 +1,71 @@
{
"about": {
"datasetID": "yahoo_system_sub_5_dataset_TRAIN",
"datasetName": "NULL",
"description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'",
"citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
"license": " CC Public Domain Mark 1.0 ",
"source": "OpenML",
"sourceURI": "http://www.openml.org/d/185",
"approximateSize": "",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "4.0.0"
},
"dataResources": [
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "timestamp",
"colType": "integer",
"role": [
"attribute"
]
},
{
"colIndex": 2,
"colName": "value_0",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 3,
"colName": "system_id",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 4,
"colName": "ground_truth",
"colType": "integer",
"role": [
"suggestedTarget"
]
}
],
"columnsCount": 5
}
]
}

+5601 -0  datasets/anomaly/yahoo_system_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv (file diff suppressed because it is too large)


+5601 -0  datasets/anomaly/yahoo_system_sub_5/TRAIN/problem_TRAIN/dataSplits.csv (file diff suppressed because it is too large)


+65 -0  datasets/anomaly/yahoo_system_sub_5/TRAIN/problem_TRAIN/problemDoc.json

@@ -0,0 +1,65 @@
{
"about": {
"problemID": "yahoo_system_sub_5_problem",
"problemName": "yahoo_system_sub_5_problem",
"problemDescription": "Anomaly detection",
"problemVersion": "4.0.0",
"problemSchemaVersion": "4.0.0",
"taskKeywords": [
"classification",
"binary",
"tabular"
]
},
"inputs": {
"data": [
{
"datasetID": "yahoo_system_sub_5_dataset",
"targets": [
{
"targetIndex": 0,
"resID": "learningData",
"colIndex": 4,
"colName": "ground_truth"
}
]
}
],
"dataSplits": {
"method": "holdOut",
"testSize": 0.2,
"stratified": true,
"numRepeats": 0,
"randomSeed": 42,
"splitsFile": "dataSplits.csv",
"datasetViewMaps": {
"train": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_TRAIN"
}
],
"test": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_TEST"
}
],
"score": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_SCORE"
}
]
}
},
"performanceMetrics": [
{
"metric": "f1Macro"
}
]
},
"expectedOutputs": {
"predictionsFile": "predictions.csv"
}
}

+71 -0  datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_dataset/datasetDoc.json

@@ -0,0 +1,71 @@
{
"about": {
"datasetID": "yahoo_system_sub_5_dataset",
"datasetName": "yahoo_system_sub_5",
"description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'",
"citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
"license": " CC Public Domain Mark 1.0 ",
"source": "OpenML",
"sourceURI": "http://www.openml.org/d/185",
"approximateSize": "",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "4.0.0"
},
"dataResources": [
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "timestamp",
"colType": "integer",
"role": [
"attribute"
]
},
{
"colIndex": 2,
"colName": "value_0",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 3,
"colName": "system_id",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 4,
"colName": "ground_truth",
"colType": "integer",
"role": [
"suggestedTarget"
]
}
],
"columnsCount": 5
}
]
}

+7001 -0  datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_dataset/tables/learningData.csv (file diff suppressed because it is too large)


+5601 -0  datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_problem/dataSplits.csv (file diff suppressed because it is too large)


+65 -0  datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_problem/problemDoc.json

@@ -0,0 +1,65 @@
{
"about": {
"problemID": "yahoo_system_sub_5_problem",
"problemName": "yahoo_system_sub_5_problem",
"problemDescription": "Anomaly detection",
"problemVersion": "4.0.0",
"problemSchemaVersion": "4.0.0",
"taskKeywords": [
"classification",
"binary",
"tabular"
]
},
"inputs": {
"data": [
{
"datasetID": "yahoo_system_sub_5_dataset",
"targets": [
{
"targetIndex": 0,
"resID": "learningData",
"colIndex": 4,
"colName": "ground_truth"
}
]
}
],
"dataSplits": {
"method": "holdOut",
"testSize": 0.2,
"stratified": true,
"numRepeats": 0,
"randomSeed": 42,
"splitsFile": "dataSplits.csv",
"datasetViewMaps": {
"train": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_TRAIN"
}
],
"test": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_TEST"
}
],
"score": [
{
"from": "yahoo_system_sub_5_dataset",
"to": "yahoo_system_sub_5_dataset_SCORE"
}
]
}
},
"performanceMetrics": [
{
"metric": "f1Macro"
}
]
},
"expectedOutputs": {
"predictionsFile": "predictions.csv"
}
}

+72 -0  examples/build_Ensemble.py

@@ -0,0 +1,72 @@
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> pyod_ae -> Ensemble
#    extract_columns_by_semantic_types(targets) -> ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: extract_columns_by_semantic_types(attributes)
step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
step_2.add_output('produce')
step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                          data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
pipeline_description.add_step(step_2)

# Step 3: extract_columns_by_semantic_types(targets)
step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_3.add_output('produce')
step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                          data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
pipeline_description.add_step(step_3)

attributes = 'steps.2.produce'
targets = 'steps.3.produce'

# Step 4: auto encoder
step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_ae'))
step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
step_4.add_output('produce_score')
step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=[0,1,2])
step_4.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True)
step_4.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append')
pipeline_description.add_step(step_4)

# Step 5: ensemble
step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.Ensemble'))
step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce_score')
step_5.add_output('produce')
pipeline_description.add_step(step_5)


# Final Output
pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce')

# Output to YAML
#yaml = pipeline_description.to_yaml()
#with open('pipeline.yml', 'w') as f:
# f.write(yaml)
#prin(yaml)

# Output to json
data = pipeline_description.to_json()
with open('example_pipeline.json', 'w') as f:
    f.write(data)
print(data)

+48 -0  examples/run_pipeline_ensemble.py

@@ -0,0 +1,48 @@
import sys
import argparse
import os
import pandas as pd

from tods import generate_dataset, load_pipeline, evaluate_pipeline

this_path = os.path.dirname(os.path.abspath(__file__))
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset

parser = argparse.ArgumentParser(description='Arguments for running a predefined pipeline.')
#parser.add_argument('--table_path', type=str, default=os.path.join(this_path, '../datasets/yahoo_sub_5.csv'),
# help='Input the path of the input data table')

parser.add_argument('--table_path', type=str, default=os.path.join(this_path, '../datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_dataset/tables/learningData.csv'),
                    help='Input the path of the input data table')

parser.add_argument('--target_index', type=int, default=4,
                    help='Index of the ground truth (for evaluation)')
parser.add_argument('--metric', type=str, default='F1_MACRO',
                    help='Evaluation Metric (F1, F1_MACRO)')

#parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/resources/default_pipeline.json'),
#                    help='Input the path of the pre-built pipeline description')

# Using the pipeline that was built and saved in example_pipeline.json
parser.add_argument('--pipeline_path', default=os.path.join(this_path, './example_pipeline.json'),
                    help='Input the path of the pre-built pipeline description')


args = parser.parse_args()

table_path = args.table_path
target_index = args.target_index # what column is the target
pipeline_path = args.pipeline_path
metric = args.metric # F1 on both label 0 and 1

# Read data and generate dataset
df = pd.read_csv(table_path)
dataset = generate_dataset(df, target_index)

# Load the default pipeline
pipeline = load_pipeline(pipeline_path)

# Run the pipeline
pipeline_result = evaluate_pipeline(dataset, pipeline, metric)
print(pipeline_result)


+363 -0  tods/detection_algorithm/Ensemble.py

@@ -0,0 +1,363 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from collections import OrderedDict
import os
import typing

import numpy
import pandas as pd
import sklearn
from numpy import ndarray
from scipy import sparse

# Custom import commands if any
from sklearn.preprocessing import Normalizer
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

from d3m import container, utils as d3m_utils
from d3m.base import utils as base_utils
from d3m.container import DataFrame as d3m_dataframe
from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.exceptions import PrimitiveNotFittedError
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m.primitive_interfaces import base, transformer
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase

Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
    input_column_names: Optional[Any]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
    # Added by Mia
    endog = hyperparams.Bounded[int](
        lower=2,
        upper=None,
        default=3,
        description='Array like time series.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )

    threshold = hyperparams.Bounded[float](
        lower=0,
        upper=1,
        default=0.5,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )

    # keep previous
    norm = hyperparams.Enumeration[str](
        default='l2',
        values=['l1', 'l2', 'max'],
        description='The norm to use to normalize each non zero sample.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    use_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
    )
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe",
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
    )
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
        default='https://metadata.datadrivendiscovery.org/types/Attribute',
        description='Decides what semantic type to attach to generated attributes',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )

class Ensemble(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Ensemble method.

    Calculates the maximum/minimum/average and the majority vote over the detection
    algorithm's scores, based on the threshold set for the score.
    """
    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "name": "Ensemble",
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ISOLATION_FOREST, ],
        "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION,
        "python_path": "d3m.primitives.tods.detection_algorithm.Ensemble",
        "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/HoltSmoothing.py']},
        "version": "0.0.1",
        "id": "3688b5b4-885c-40bb-9731-fe3969ea81b0",
        "hyperparams_to_tune": ['use_columns'],
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = Normalizer(
            norm=self.hyperparams['norm'],
        )
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._inputs = inputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if self._training_inputs is None:
            return CallResult(None)

        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")
        return CallResult(None)
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:

        outputs = inputs
        outputs.columns = ['timestamp', 'value', 'system_id', 'scores']
        # print(outputs)
        # print('max_score')
        # ensemble_max = outputs.groupby('system_id')[outputs.columns[-1]].max()
        # print(ensemble_max)
        #
        # print('min_score')
        # ensemble_min = outputs.groupby('system_id')[outputs.columns[-1]].min()
        # print(ensemble_min)
        #
        # print('mean_score')
        # outputs_mean = outputs.groupby('system_id')[outputs.columns[3]].mean()
        # print(outputs_mean)

        outputs['results'] = numpy.where(outputs['scores'] > 0.05, 1, 0)
        print(outputs)

        outputs_xy = outputs.groupby('system_id')['results'].sum().reset_index()
        print("*****majority_sum_xy*****")
        print(outputs_xy)

        outputs_sum_x = outputs.groupby(['timestamp', 'system_id'])['results'].sum()
        # outputs_sum_x = outputs.groupby(['system_id','timestamp']).size().reset_index().groupby(['timestamp'])['results'].sum()

        outputs_sum_y = outputs.groupby(['system_id', 'value'])['results'].sum()

        print('*****majority_max_x*****')
        print(outputs_sum_x)
        print('*****majority_max_y*****')
        print(outputs_sum_y)

        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs,)

    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        self._fitted = True



    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
                                                                                   use_columns=hyperparams['use_columns'],
                                                                                   exclude_columns=hyperparams['exclude_columns'],
                                                                                   can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False
        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False

    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()  # must be a set, not a list, for .add() below
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
        return outputs

    @classmethod
    def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int],
                              outputs_metadata: metadata_base.DataMetadata, hyperparams):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in input_indices:
            column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)

            column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        # If outputs has more columns than index, add Attribute Type to all remaining
        if outputs_length > len(input_indices):
            for column_index in range(len(input_indices), outputs_length):
                column_metadata = OrderedDict()
                semantic_types = set()
                semantic_types.add(hyperparams["return_semantic_type"])
                column_name = "output_{}".format(column_index)
                column_metadata["semantic_types"] = list(semantic_types)
                column_metadata["name"] = str(column_name)
                target_columns_metadata.append(column_metadata)

        return target_columns_metadata


Ensemble.__doc__ = Normalizer.__doc__
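
For reference, the per-system aggregation that produce() performs reduces to the following standalone pandas sketch (illustrative only, not part of the commit; the 0.05 cut-off and column names mirror the hard-coded values above, and the majority decision is one possible reading of the printed per-system sums):

import numpy as np
import pandas as pd

def ensemble_vote(outputs: pd.DataFrame, cutoff: float = 0.05) -> pd.DataFrame:
    # Binarize the detector scores, as produce() does.
    outputs = outputs.copy()
    outputs["results"] = np.where(outputs["scores"] > cutoff, 1, 0)
    # Per-system tally of flagged points (the "majority_sum_xy" printout above).
    tally = outputs.groupby("system_id")["results"].agg(["sum", "count"]).reset_index()
    # A system is voted anomalous if more than half of its points are flagged.
    tally["majority_anomalous"] = tally["sum"] > tally["count"] / 2
    return tally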

+1 -0  tods/resources/.entry_points.ini

@@ -76,5 +76,6 @@ tods.detection_algorithm.PCAODetector = tods.detection_algorithm.PCAODetect:PCAO
tods.detection_algorithm.KDiscordODetector = tods.detection_algorithm.KDiscordODetect:KDiscordODetector
tods.detection_algorithm.deeplog = tods.detection_algorithm.DeepLog:DeepLogPrimitive
tods.detection_algorithm.telemanom = tods.detection_algorithm.Telemanom:TelemanomPrimitive
tods.detection_algorithm.Ensemble = tods.detection_algorithm.Ensemble:Ensemble

tods.reinforcement.rule_filter = tods.reinforcement.RuleBasedFilter:RuleBasedFilter
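
Once the entry point above is registered (i.e. the tods package is reinstalled), the new primitive resolves through the d3m index exactly as examples/build_Ensemble.py expects; a quick sanity check, assuming a working tods installation:

from d3m import index

# Assumes tods has been (re)installed so the new entry point is picked up.
ensemble_cls = index.get_primitive('d3m.primitives.tods.detection_algorithm.Ensemble')
print(ensemble_cls.metadata.query()['name'])   # expected: "Ensemble"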
