@@ -0,0 +1,71 @@
{
  "about": {
    "datasetID": "yahoo_system_sub_5_dataset_TEST",
    "datasetName": "NULL",
    "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'",
    "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
    "license": " CC Public Domain Mark 1.0 ",
    "source": "OpenML",
    "sourceURI": "http://www.openml.org/d/185",
    "approximateSize": "",
    "datasetSchemaVersion": "4.0.0",
    "redacted": false,
    "datasetVersion": "4.0.0"
  },
  "dataResources": [
    {
      "resID": "learningData",
      "resPath": "tables/learningData.csv",
      "resType": "table",
      "resFormat": {
        "text/csv": [
          "csv"
        ]
      },
      "isCollection": false,
      "columns": [
        {
          "colIndex": 0,
          "colName": "d3mIndex",
          "colType": "integer",
          "role": [
            "index"
          ]
        },
        {
          "colIndex": 1,
          "colName": "timestamp",
          "colType": "integer",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 2,
          "colName": "value_0",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 3,
          "colName": "system_id",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 4,
          "colName": "ground_truth",
          "colType": "integer",
          "role": [
            "suggestedTarget"
          ]
        }
      ],
      "columnsCount": 5
    }
  ]
}
@@ -0,0 +1,65 @@
{
  "about": {
    "problemID": "yahoo_system_sub_5_problem",
    "problemName": "yahoo_system_sub_5_problem",
    "problemDescription": "Anomaly detection",
    "problemVersion": "4.0.0",
    "problemSchemaVersion": "4.0.0",
    "taskKeywords": [
      "classification",
      "binary",
      "tabular"
    ]
  },
  "inputs": {
    "data": [
      {
        "datasetID": "yahoo_system_sub_5_dataset",
        "targets": [
          {
            "targetIndex": 0,
            "resID": "learningData",
            "colIndex": 4,
            "colName": "ground_truth"
          }
        ]
      }
    ],
    "dataSplits": {
      "method": "holdOut",
      "testSize": 0.2,
      "stratified": true,
      "numRepeats": 0,
      "randomSeed": 42,
      "splitsFile": "dataSplits.csv",
      "datasetViewMaps": {
        "train": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TRAIN"
          }
        ],
        "test": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TEST"
          }
        ],
        "score": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_SCORE"
          }
        ]
      }
    },
    "performanceMetrics": [
      {
        "metric": "f1Macro"
      }
    ]
  },
  "expectedOutputs": {
    "predictionsFile": "predictions.csv"
  }
}
@@ -0,0 +1,71 @@
{
  "about": {
    "datasetID": "yahoo_system_sub_5_dataset_TEST",
    "datasetName": "NULL",
    "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'",
    "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
    "license": " CC Public Domain Mark 1.0 ",
    "source": "OpenML",
    "sourceURI": "http://www.openml.org/d/185",
    "approximateSize": "",
    "datasetSchemaVersion": "4.0.0",
    "redacted": false,
    "datasetVersion": "4.0.0"
  },
  "dataResources": [
    {
      "resID": "learningData",
      "resPath": "tables/learningData.csv",
      "resType": "table",
      "resFormat": {
        "text/csv": [
          "csv"
        ]
      },
      "isCollection": false,
      "columns": [
        {
          "colIndex": 0,
          "colName": "d3mIndex",
          "colType": "integer",
          "role": [
            "index"
          ]
        },
        {
          "colIndex": 1,
          "colName": "timestamp",
          "colType": "integer",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 2,
          "colName": "value_0",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 3,
          "colName": "system_id",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 4,
          "colName": "ground_truth",
          "colType": "integer",
          "role": [
            "suggestedTarget"
          ]
        }
      ],
      "columnsCount": 5
    }
  ]
}
@@ -0,0 +1,65 @@
{
  "about": {
    "problemID": "yahoo_system_sub_5_problem",
    "problemName": "yahoo_system_sub_5_problem",
    "problemDescription": "Anomaly detection",
    "problemVersion": "4.0.0",
    "problemSchemaVersion": "4.0.0",
    "taskKeywords": [
      "classification",
      "binary",
      "tabular"
    ]
  },
  "inputs": {
    "data": [
      {
        "datasetID": "yahoo_system_sub_5_dataset",
        "targets": [
          {
            "targetIndex": 0,
            "resID": "learningData",
            "colIndex": 4,
            "colName": "ground_truth"
          }
        ]
      }
    ],
    "dataSplits": {
      "method": "holdOut",
      "testSize": 0.2,
      "stratified": true,
      "numRepeats": 0,
      "randomSeed": 42,
      "splitsFile": "dataSplits.csv",
      "datasetViewMaps": {
        "train": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TRAIN"
          }
        ],
        "test": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TEST"
          }
        ],
        "score": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_SCORE"
          }
        ]
      }
    },
    "performanceMetrics": [
      {
        "metric": "f1Macro"
      }
    ]
  },
  "expectedOutputs": {
    "predictionsFile": "predictions.csv"
  }
}
@@ -0,0 +1,71 @@
{
  "about": {
    "datasetID": "yahoo_system_sub_5_dataset_TRAIN",
    "datasetName": "NULL",
    "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'",
    "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
    "license": " CC Public Domain Mark 1.0 ",
    "source": "OpenML",
    "sourceURI": "http://www.openml.org/d/185",
    "approximateSize": "",
    "datasetSchemaVersion": "4.0.0",
    "redacted": false,
    "datasetVersion": "4.0.0"
  },
  "dataResources": [
    {
      "resID": "learningData",
      "resPath": "tables/learningData.csv",
      "resType": "table",
      "resFormat": {
        "text/csv": [
          "csv"
        ]
      },
      "isCollection": false,
      "columns": [
        {
          "colIndex": 0,
          "colName": "d3mIndex",
          "colType": "integer",
          "role": [
            "index"
          ]
        },
        {
          "colIndex": 1,
          "colName": "timestamp",
          "colType": "integer",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 2,
          "colName": "value_0",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 3,
          "colName": "system_id",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 4,
          "colName": "ground_truth",
          "colType": "integer",
          "role": [
            "suggestedTarget"
          ]
        }
      ],
      "columnsCount": 5
    }
  ]
}
@@ -0,0 +1,65 @@
{
  "about": {
    "problemID": "yahoo_system_sub_5_problem",
    "problemName": "yahoo_system_sub_5_problem",
    "problemDescription": "Anomaly detection",
    "problemVersion": "4.0.0",
    "problemSchemaVersion": "4.0.0",
    "taskKeywords": [
      "classification",
      "binary",
      "tabular"
    ]
  },
  "inputs": {
    "data": [
      {
        "datasetID": "yahoo_system_sub_5_dataset",
        "targets": [
          {
            "targetIndex": 0,
            "resID": "learningData",
            "colIndex": 4,
            "colName": "ground_truth"
          }
        ]
      }
    ],
    "dataSplits": {
      "method": "holdOut",
      "testSize": 0.2,
      "stratified": true,
      "numRepeats": 0,
      "randomSeed": 42,
      "splitsFile": "dataSplits.csv",
      "datasetViewMaps": {
        "train": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TRAIN"
          }
        ],
        "test": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TEST"
          }
        ],
        "score": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_SCORE"
          }
        ]
      }
    },
    "performanceMetrics": [
      {
        "metric": "f1Macro"
      }
    ]
  },
  "expectedOutputs": {
    "predictionsFile": "predictions.csv"
  }
}
@@ -0,0 +1,71 @@
{
  "about": {
    "datasetID": "yahoo_system_sub_5_dataset",
    "datasetName": "yahoo_system_sub_5",
    "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'",
    "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
    "license": " CC Public Domain Mark 1.0 ",
    "source": "OpenML",
    "sourceURI": "http://www.openml.org/d/185",
    "approximateSize": "",
    "datasetSchemaVersion": "4.0.0",
    "redacted": false,
    "datasetVersion": "4.0.0"
  },
  "dataResources": [
    {
      "resID": "learningData",
      "resPath": "tables/learningData.csv",
      "resType": "table",
      "resFormat": {
        "text/csv": [
          "csv"
        ]
      },
      "isCollection": false,
      "columns": [
        {
          "colIndex": 0,
          "colName": "d3mIndex",
          "colType": "integer",
          "role": [
            "index"
          ]
        },
        {
          "colIndex": 1,
          "colName": "timestamp",
          "colType": "integer",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 2,
          "colName": "value_0",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 3,
          "colName": "system_id",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 4,
          "colName": "ground_truth",
          "colType": "integer",
          "role": [
            "suggestedTarget"
          ]
        }
      ],
      "columnsCount": 5
    }
  ]
}
@@ -0,0 +1,65 @@
{
  "about": {
    "problemID": "yahoo_system_sub_5_problem",
    "problemName": "yahoo_system_sub_5_problem",
    "problemDescription": "Anomaly detection",
    "problemVersion": "4.0.0",
    "problemSchemaVersion": "4.0.0",
    "taskKeywords": [
      "classification",
      "binary",
      "tabular"
    ]
  },
  "inputs": {
    "data": [
      {
        "datasetID": "yahoo_system_sub_5_dataset",
        "targets": [
          {
            "targetIndex": 0,
            "resID": "learningData",
            "colIndex": 4,
            "colName": "ground_truth"
          }
        ]
      }
    ],
    "dataSplits": {
      "method": "holdOut",
      "testSize": 0.2,
      "stratified": true,
      "numRepeats": 0,
      "randomSeed": 42,
      "splitsFile": "dataSplits.csv",
      "datasetViewMaps": {
        "train": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TRAIN"
          }
        ],
        "test": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TEST"
          }
        ],
        "score": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_SCORE"
          }
        ]
      }
    },
    "performanceMetrics": [
      {
        "metric": "f1Macro"
      }
    ]
  },
  "expectedOutputs": {
    "predictionsFile": "predictions.csv"
  }
}
@@ -0,0 +1,72 @@
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> pyod_ae -> Ensemble
#    extract_columns_by_semantic_types(targets) -> ^

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: extract_columns_by_semantic_types(attributes)
step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
step_2.add_output('produce')
step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                          data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
pipeline_description.add_step(step_2)

# Step 3: extract_columns_by_semantic_types(targets)
step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_3.add_output('produce')
step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                          data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
pipeline_description.add_step(step_3)

attributes = 'steps.2.produce'
targets = 'steps.3.produce'

# Step 4: auto encoder (pyod_ae); appends an anomaly score column to the selected attributes
step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_ae'))
step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
step_4.add_output('produce_score')
step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=[0, 1, 2])
step_4.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True)
step_4.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append')
pipeline_description.add_step(step_4)

# Step 5: ensemble over the per-system anomaly scores
step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.Ensemble'))
step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce_score')
step_5.add_output('produce')
pipeline_description.add_step(step_5)

# Final output
pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce')

# Output to YAML
# yaml = pipeline_description.to_yaml()
# with open('pipeline.yml', 'w') as f:
#     f.write(yaml)
# print(yaml)

# Output to JSON
data = pipeline_description.to_json()
with open('example_pipeline.json', 'w') as f:
    f.write(data)
print(data)
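
# A minimal sketch of consuming the saved description with the TODS helpers
# used by the runner script below (the path is illustrative):
#
#   from tods import load_pipeline
#   pipeline = load_pipeline('example_pipeline.json')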
@@ -0,0 +1,48 @@
import sys
import argparse
import os

import pandas as pd

from tods import generate_dataset, load_pipeline, evaluate_pipeline

this_path = os.path.dirname(os.path.abspath(__file__))
# table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv'  # The path of the dataset

parser = argparse.ArgumentParser(description='Arguments for running a predefined pipeline.')
# parser.add_argument('--table_path', type=str, default=os.path.join(this_path, '../datasets/yahoo_sub_5.csv'),
#                     help='Input the path of the input data table')
parser.add_argument('--table_path', type=str, default=os.path.join(this_path, '../datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_dataset/tables/learningData.csv'),
                    help='Input the path of the input data table')
parser.add_argument('--target_index', type=int, default=4,
                    help='Index of the ground truth (for evaluation)')
parser.add_argument('--metric', type=str, default='F1_MACRO',
                    help='Evaluation metric (F1, F1_MACRO)')
# parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/resources/default_pipeline.json'),
#                     help='Input the path of the pre-built pipeline description')
# Use the pipeline that was built and saved in example_pipeline
parser.add_argument('--pipeline_path', default=os.path.join(this_path, './example_pipeline.json'),
                    help='Input the path of the pre-built pipeline description')

args = parser.parse_args()

table_path = args.table_path
target_index = args.target_index  # which column is the target
pipeline_path = args.pipeline_path
metric = args.metric  # F1 on both label 0 and 1

# Read data and generate dataset
df = pd.read_csv(table_path)
dataset = generate_dataset(df, target_index)

# Load the pre-built pipeline
pipeline = load_pipeline(pipeline_path)

# Run the pipeline
pipeline_result = evaluate_pipeline(dataset, pipeline, metric)
print(pipeline_result)
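
# Example invocation (the script name is illustrative; flags are the ones
# defined by the argparse parser above):
#
#   python run_predefined_pipeline.py \
#       --table_path ../datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_dataset/tables/learningData.csv \
#       --target_index 4 --metric F1_MACRO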
@@ -0,0 +1,363 @@
import os
import typing
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from collections import OrderedDict

import numpy
from numpy import ndarray
import pandas as pd
import sklearn
from scipy import sparse

# Custom import commands if any
from sklearn.preprocessing import Normalizer
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

from d3m import container, utils
from d3m.base import utils as base_utils
from d3m.container import DataFrame as d3m_dataframe
from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.exceptions import PrimitiveNotFittedError
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m.primitive_interfaces import base, transformer
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase

Inputs = d3m_dataframe
Outputs = d3m_dataframe

class Params(params.Params):
    input_column_names: Optional[Any]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]

class Hyperparams(hyperparams.Hyperparams):
    # Added by Mia
    endog = hyperparams.Bounded[int](
        lower=2,
        upper=None,
        default=3,
        description='Array-like time series.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    threshold = hyperparams.Bounded[float](
        lower=0,
        upper=1,
        default=0.5,
        description='Score threshold above which a point is flagged as anomalous.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    # keep previous
    norm = hyperparams.Enumeration[str](
        default='l2',
        values=['l1', 'l2', 'max'],
        description='The norm to use to normalize each non-zero sample.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    use_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force the primitive to operate on. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
    )
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if \"use_semantic_types\" is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in the input dataframe. Setting this to false makes the code ignore return_result and produce only the output dataframe.",
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking, set this to False.",
    )
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
        default='https://metadata.datadrivendiscovery.org/types/Attribute',
        description='Decides what semantic type to attach to generated attributes.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )

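
# A minimal sketch (assuming the standard d3m Hyperparams API) of overriding
# these values when instantiating the primitive directly, outside a pipeline:
#
#   hp = Hyperparams.defaults().replace({'threshold': 0.3, 'norm': 'l1'})
#   primitive = Ensemble(hyperparams=hp)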
class Ensemble(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Ensemble method.

    Computes the maximum/minimum/average score and majority voting for the
    detection algorithm, based on the threshold set for the score.
    """

    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "name": "Ensemble",
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ISOLATION_FOREST, ],
        "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION,
        "python_path": "d3m.primitives.tods.detection_algorithm.Ensemble",
        "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/HoltSmoothing.py']},
        "version": "0.0.1",
        "id": "3688b5b4-885c-40bb-9731-fe3969ea81b0",
        "hyperparams_to_tune": ['use_columns'],
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        # Normalizer is only fitted on the selected columns; produce() works on the scores directly.
        self._clf = Normalizer(
            norm=self.hyperparams['norm'],
        )

        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._inputs = inputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        # Check for None before touching .columns, otherwise this would raise AttributeError.
        if self._training_inputs is None:
            return CallResult(None)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        outputs = inputs
        # Expects the parsed attribute columns plus the anomaly score column
        # appended by the upstream detection primitive.
        outputs.columns = ['timestamp', 'value', 'system_id', 'scores']

        # print(outputs)
        # print('max_score')
        # ensemble_max = outputs.groupby('system_id')[outputs.columns[-1]].max()
        # print(ensemble_max)
        #
        # print('min_score')
        # ensemble_min = outputs.groupby('system_id')[outputs.columns[-1]].min()
        # print(ensemble_min)
        #
        # print('mean_score')
        # outputs_mean = outputs.groupby('system_id')[outputs.columns[3]].mean()
        # print(outputs_mean)

        # Binarize the anomaly scores. Note that the cut-off is hard-coded to
        # 0.05 here and does not read the 'threshold' hyperparameter.
        outputs['results'] = numpy.where(outputs['scores'] > 0.05, 1, 0)
        print(outputs)

        # Majority voting: per-system sums of the binary results
        outputs_xy = outputs.groupby('system_id')['results'].sum().reset_index()
        print("*****majority_sum_xy*****")
        print(outputs_xy)

        outputs_sum_x = outputs.groupby(['timestamp', 'system_id'])['results'].sum()
        # outputs_sum_x = outputs.groupby(['system_id', 'timestamp']).size().reset_index().groupby(['timestamp'])['results'].sum()
        outputs_sum_y = outputs.groupby(['system_id', 'value'])['results'].sum()
        print('*****majority_max_x*****')
        print(outputs_sum_x)
        print('*****majority_max_y*****')
        print(outputs_sum_y)

        return base.CallResult(outputs)
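
    # Illustrative walk-through (hypothetical values): with the hard-coded 0.05
    # cut-off, scores [0.01, 0.20, 0.70] for one system_id binarize to
    # results [0, 1, 1], so that system's majority sum in 'outputs_xy' is 2.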
    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs)

    def get_params(self) -> Params:
        # The same parameters are returned whether or not the primitive is fitted.
        return Params(
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        self._fitted = True

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
                                                                                   use_columns=hyperparams['use_columns'],
                                                                                   exclude_columns=hyperparams['exclude_columns'],
                                                                                   can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")

        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False

    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set()
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
        return outputs

    @classmethod
    def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int],
                              outputs_metadata: metadata_base.DataMetadata, hyperparams):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []

        for column_index in input_indices:
            column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)

            column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set()
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        # If outputs has more columns than the input indices, add the attribute type to all remaining columns
        if outputs_length > len(input_indices):
            for column_index in range(len(input_indices), outputs_length):
                column_metadata = OrderedDict()
                semantic_types = set()
                semantic_types.add(hyperparams["return_semantic_type"])
                column_name = "output_{}".format(column_index)
                column_metadata["semantic_types"] = list(semantic_types)
                column_metadata["name"] = str(column_name)
                target_columns_metadata.append(column_metadata)

        return target_columns_metadata
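
# A minimal, hypothetical usage sketch (the input dataframe and its columns
# are assumed, not part of this module):
#
#   hp = Hyperparams.defaults()
#   primitive = Ensemble(hyperparams=hp)
#   primitive.set_training_data(inputs=scored_dataframe)
#   primitive.fit()
#   result = primitive.produce(inputs=scored_dataframe).value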
@@ -76,5 +76,6 @@ tods.detection_algorithm.PCAODetector = tods.detection_algorithm.PCAODetect:PCAO
tods.detection_algorithm.KDiscordODetector = tods.detection_algorithm.KDiscordODetect:KDiscordODetector
tods.detection_algorithm.deeplog = tods.detection_algorithm.DeepLog:DeepLogPrimitive
tods.detection_algorithm.telemanom = tods.detection_algorithm.Telemanom:TelemanomPrimitive
tods.detection_algorithm.Ensemble = tods.detection_algorithm.Ensemble:Ensemble
tods.reinforcement.rule_filter = tods.reinforcement.RuleBasedFilter:RuleBasedFilter