@@ -0,0 +1,71 @@
{
  "about": {
    "datasetID": "yahoo_system_sub_5_dataset_TEST",
    "datasetName": "NULL",
    "description": "Time series anomaly detection data (yahoo_system_sub_5): per-system series with columns 'timestamp', 'value_0' and 'system_id', and a binary 'ground_truth' anomaly label.",
    "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
    "license": " CC Public Domain Mark 1.0 ",
    "source": "OpenML",
    "sourceURI": "http://www.openml.org/d/185",
    "approximateSize": "",
    "datasetSchemaVersion": "4.0.0",
    "redacted": false,
    "datasetVersion": "4.0.0"
  },
  "dataResources": [
    {
      "resID": "learningData",
      "resPath": "tables/learningData.csv",
      "resType": "table",
      "resFormat": {
        "text/csv": [
          "csv"
        ]
      },
      "isCollection": false,
      "columns": [
        {
          "colIndex": 0,
          "colName": "d3mIndex",
          "colType": "integer",
          "role": [
            "index"
          ]
        },
        {
          "colIndex": 1,
          "colName": "timestamp",
          "colType": "integer",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 2,
          "colName": "value_0",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 3,
          "colName": "system_id",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 4,
          "colName": "ground_truth",
          "colType": "integer",
          "role": [
            "suggestedTarget"
          ]
        }
      ],
      "columnsCount": 5
    }
  ]
}
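
The dataset doc above describes tables/learningData.csv. A minimal sketch (the relative path and the pandas check are assumptions, not part of the schema) of loading the table and verifying the declared columns:

# Sketch only: run from the split directory that contains tables/learningData.csv.
import pandas as pd

df = pd.read_csv("tables/learningData.csv")  # resPath from the dataset doc
expected = ["d3mIndex", "timestamp", "value_0", "system_id", "ground_truth"]
assert list(df.columns) == expected, f"unexpected columns: {list(df.columns)}"
print(df.dtypes)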
@@ -0,0 +1,65 @@
{
  "about": {
    "problemID": "yahoo_system_sub_5_problem",
    "problemName": "yahoo_system_sub_5_problem",
    "problemDescription": "Anomaly detection",
    "problemVersion": "4.0.0",
    "problemSchemaVersion": "4.0.0",
    "taskKeywords": [
      "classification",
      "binary",
      "tabular"
    ]
  },
  "inputs": {
    "data": [
      {
        "datasetID": "yahoo_system_sub_5_dataset",
        "targets": [
          {
            "targetIndex": 0,
            "resID": "learningData",
            "colIndex": 4,
            "colName": "ground_truth"
          }
        ]
      }
    ],
    "dataSplits": {
      "method": "holdOut",
      "testSize": 0.2,
      "stratified": true,
      "numRepeats": 0,
      "randomSeed": 42,
      "splitsFile": "dataSplits.csv",
      "datasetViewMaps": {
        "train": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TRAIN"
          }
        ],
        "test": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TEST"
          }
        ],
        "score": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_SCORE"
          }
        ]
      }
    },
    "performanceMetrics": [
      {
        "metric": "f1Macro"
      }
    ]
  },
  "expectedOutputs": {
    "predictionsFile": "predictions.csv"
  }
}
@@ -0,0 +1,71 @@
{
  "about": {
    "datasetID": "yahoo_system_sub_5_dataset_TEST",
    "datasetName": "NULL",
    "description": "Time series anomaly detection data (yahoo_system_sub_5): per-system series with columns 'timestamp', 'value_0' and 'system_id', and a binary 'ground_truth' anomaly label.",
    "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
    "license": " CC Public Domain Mark 1.0 ",
    "source": "OpenML",
    "sourceURI": "http://www.openml.org/d/185",
    "approximateSize": "",
    "datasetSchemaVersion": "4.0.0",
    "redacted": false,
    "datasetVersion": "4.0.0"
  },
  "dataResources": [
    {
      "resID": "learningData",
      "resPath": "tables/learningData.csv",
      "resType": "table",
      "resFormat": {
        "text/csv": [
          "csv"
        ]
      },
      "isCollection": false,
      "columns": [
        {
          "colIndex": 0,
          "colName": "d3mIndex",
          "colType": "integer",
          "role": [
            "index"
          ]
        },
        {
          "colIndex": 1,
          "colName": "timestamp",
          "colType": "integer",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 2,
          "colName": "value_0",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 3,
          "colName": "system_id",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 4,
          "colName": "ground_truth",
          "colType": "integer",
          "role": [
            "suggestedTarget"
          ]
        }
      ],
      "columnsCount": 5
    }
  ]
}
@@ -0,0 +1,65 @@
{
  "about": {
    "problemID": "yahoo_system_sub_5_problem",
    "problemName": "yahoo_system_sub_5_problem",
    "problemDescription": "Anomaly detection",
    "problemVersion": "4.0.0",
    "problemSchemaVersion": "4.0.0",
    "taskKeywords": [
      "classification",
      "binary",
      "tabular"
    ]
  },
  "inputs": {
    "data": [
      {
        "datasetID": "yahoo_system_sub_5_dataset",
        "targets": [
          {
            "targetIndex": 0,
            "resID": "learningData",
            "colIndex": 4,
            "colName": "ground_truth"
          }
        ]
      }
    ],
    "dataSplits": {
      "method": "holdOut",
      "testSize": 0.2,
      "stratified": true,
      "numRepeats": 0,
      "randomSeed": 42,
      "splitsFile": "dataSplits.csv",
      "datasetViewMaps": {
        "train": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TRAIN"
          }
        ],
        "test": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TEST"
          }
        ],
        "score": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_SCORE"
          }
        ]
      }
    },
    "performanceMetrics": [
      {
        "metric": "f1Macro"
      }
    ]
  },
  "expectedOutputs": {
    "predictionsFile": "predictions.csv"
  }
}
@@ -0,0 +1,71 @@
{
  "about": {
    "datasetID": "yahoo_system_sub_5_dataset_TRAIN",
    "datasetName": "NULL",
    "description": "Time series anomaly detection data (yahoo_system_sub_5): per-system series with columns 'timestamp', 'value_0' and 'system_id', and a binary 'ground_truth' anomaly label.",
    "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
    "license": " CC Public Domain Mark 1.0 ",
    "source": "OpenML",
    "sourceURI": "http://www.openml.org/d/185",
    "approximateSize": "",
    "datasetSchemaVersion": "4.0.0",
    "redacted": false,
    "datasetVersion": "4.0.0"
  },
  "dataResources": [
    {
      "resID": "learningData",
      "resPath": "tables/learningData.csv",
      "resType": "table",
      "resFormat": {
        "text/csv": [
          "csv"
        ]
      },
      "isCollection": false,
      "columns": [
        {
          "colIndex": 0,
          "colName": "d3mIndex",
          "colType": "integer",
          "role": [
            "index"
          ]
        },
        {
          "colIndex": 1,
          "colName": "timestamp",
          "colType": "integer",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 2,
          "colName": "value_0",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 3,
          "colName": "system_id",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 4,
          "colName": "ground_truth",
          "colType": "integer",
          "role": [
            "suggestedTarget"
          ]
        }
      ],
      "columnsCount": 5
    }
  ]
}
@@ -0,0 +1,65 @@
{
  "about": {
    "problemID": "yahoo_system_sub_5_problem",
    "problemName": "yahoo_system_sub_5_problem",
    "problemDescription": "Anomaly detection",
    "problemVersion": "4.0.0",
    "problemSchemaVersion": "4.0.0",
    "taskKeywords": [
      "classification",
      "binary",
      "tabular"
    ]
  },
  "inputs": {
    "data": [
      {
        "datasetID": "yahoo_system_sub_5_dataset",
        "targets": [
          {
            "targetIndex": 0,
            "resID": "learningData",
            "colIndex": 4,
            "colName": "ground_truth"
          }
        ]
      }
    ],
    "dataSplits": {
      "method": "holdOut",
      "testSize": 0.2,
      "stratified": true,
      "numRepeats": 0,
      "randomSeed": 42,
      "splitsFile": "dataSplits.csv",
      "datasetViewMaps": {
        "train": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TRAIN"
          }
        ],
        "test": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TEST"
          }
        ],
        "score": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_SCORE"
          }
        ]
      }
    },
    "performanceMetrics": [
      {
        "metric": "f1Macro"
      }
    ]
  },
  "expectedOutputs": {
    "predictionsFile": "predictions.csv"
  }
}
@@ -0,0 +1,71 @@
{
  "about": {
    "datasetID": "yahoo_system_sub_5_dataset",
    "datasetName": "yahoo_system_sub_5",
    "description": "Time series anomaly detection data (yahoo_system_sub_5): per-system series with columns 'timestamp', 'value_0' and 'system_id', and a binary 'ground_truth' anomaly label.",
    "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ",
    "license": " CC Public Domain Mark 1.0 ",
    "source": "OpenML",
    "sourceURI": "http://www.openml.org/d/185",
    "approximateSize": "",
    "datasetSchemaVersion": "4.0.0",
    "redacted": false,
    "datasetVersion": "4.0.0"
  },
  "dataResources": [
    {
      "resID": "learningData",
      "resPath": "tables/learningData.csv",
      "resType": "table",
      "resFormat": {
        "text/csv": [
          "csv"
        ]
      },
      "isCollection": false,
      "columns": [
        {
          "colIndex": 0,
          "colName": "d3mIndex",
          "colType": "integer",
          "role": [
            "index"
          ]
        },
        {
          "colIndex": 1,
          "colName": "timestamp",
          "colType": "integer",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 2,
          "colName": "value_0",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 3,
          "colName": "system_id",
          "colType": "real",
          "role": [
            "attribute"
          ]
        },
        {
          "colIndex": 4,
          "colName": "ground_truth",
          "colType": "integer",
          "role": [
            "suggestedTarget"
          ]
        }
      ],
      "columnsCount": 5
    }
  ]
}
@@ -0,0 +1,65 @@
{
  "about": {
    "problemID": "yahoo_system_sub_5_problem",
    "problemName": "yahoo_system_sub_5_problem",
    "problemDescription": "Anomaly detection",
    "problemVersion": "4.0.0",
    "problemSchemaVersion": "4.0.0",
    "taskKeywords": [
      "classification",
      "binary",
      "tabular"
    ]
  },
  "inputs": {
    "data": [
      {
        "datasetID": "yahoo_system_sub_5_dataset",
        "targets": [
          {
            "targetIndex": 0,
            "resID": "learningData",
            "colIndex": 4,
            "colName": "ground_truth"
          }
        ]
      }
    ],
    "dataSplits": {
      "method": "holdOut",
      "testSize": 0.2,
      "stratified": true,
      "numRepeats": 0,
      "randomSeed": 42,
      "splitsFile": "dataSplits.csv",
      "datasetViewMaps": {
        "train": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TRAIN"
          }
        ],
        "test": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_TEST"
          }
        ],
        "score": [
          {
            "from": "yahoo_system_sub_5_dataset",
            "to": "yahoo_system_sub_5_dataset_SCORE"
          }
        ]
      }
    },
    "performanceMetrics": [
      {
        "metric": "f1Macro"
      }
    ]
  },
  "expectedOutputs": {
    "predictionsFile": "predictions.csv"
  }
}
@@ -0,0 +1,72 @@
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Pipeline structure:
#   dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> pyod_ae -> Ensemble
#   extract_columns_by_semantic_types(targets) is extracted separately from the same dataframe (step 3)

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: extract_columns_by_semantic_types(attributes)
step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
step_2.add_output('produce')
step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                          data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
pipeline_description.add_step(step_2)

# Step 3: extract_columns_by_semantic_types(targets)
step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_3.add_output('produce')
step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                          data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
pipeline_description.add_step(step_3)

attributes = 'steps.2.produce'
targets = 'steps.3.produce'

# Step 4: auto encoder detector (pyod_ae); appends an anomaly score column
step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_ae'))
step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
step_4.add_output('produce_score')
step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=[0, 1, 2])
step_4.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True)
step_4.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append')
pipeline_description.add_step(step_4)

# Step 5: ensemble over the per-point scores
step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.Ensemble'))
step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce_score')
step_5.add_output('produce')
pipeline_description.add_step(step_5)

# Final output
pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce')

# Output to YAML
# yaml = pipeline_description.to_yaml()
# with open('pipeline.yml', 'w') as f:
#     f.write(yaml)
# print(yaml)

# Output to JSON
data = pipeline_description.to_json()
with open('example_pipeline.json', 'w') as f:
    f.write(data)
print(data)
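
A quick round-trip check (a sketch, not part of the script above): the saved example_pipeline.json can be reloaded with tods.load_pipeline, the same helper the run script below uses.

# Sketch: assumes example_pipeline.json was just written by the script above.
from tods import load_pipeline

pipeline = load_pipeline('example_pipeline.json')
print(pipeline.id, len(pipeline.steps))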
@@ -0,0 +1,48 @@
import sys
import argparse
import os

import pandas as pd

from tods import generate_dataset, load_pipeline, evaluate_pipeline

this_path = os.path.dirname(os.path.abspath(__file__))
# table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv'  # The path of the dataset

parser = argparse.ArgumentParser(description='Arguments for running a predefined pipeline.')
# parser.add_argument('--table_path', type=str, default=os.path.join(this_path, '../datasets/yahoo_sub_5.csv'),
#                     help='Input the path of the input data table')
parser.add_argument('--table_path', type=str, default=os.path.join(this_path, '../datasets/anomaly/yahoo_system_sub_5/yahoo_system_sub_5_dataset/tables/learningData.csv'),
                    help='Input the path of the input data table')
parser.add_argument('--target_index', type=int, default=4,
                    help='Index of the ground truth column (for evaluation)')
parser.add_argument('--metric', type=str, default='F1_MACRO',
                    help='Evaluation metric (F1, F1_MACRO)')
# parser.add_argument('--pipeline_path', default=os.path.join(this_path, '../tods/resources/default_pipeline.json'),
#                     help='Input the path of the pre-built pipeline description')
# Use the pipeline that was built and saved by the example pipeline script above
parser.add_argument('--pipeline_path', default=os.path.join(this_path, './example_pipeline.json'),
                    help='Input the path of the pre-built pipeline description')

args = parser.parse_args()

table_path = args.table_path
target_index = args.target_index  # which column is the target
pipeline_path = args.pipeline_path
metric = args.metric  # F1 on both label 0 and 1

# Read data and generate dataset
df = pd.read_csv(table_path)
dataset = generate_dataset(df, target_index)

# Load the pre-built pipeline
pipeline = load_pipeline(pipeline_path)

# Run the pipeline
pipeline_result = evaluate_pipeline(dataset, pipeline, metric)
print(pipeline_result)
@@ -0,0 +1,363 @@
import os
import typing
from collections import OrderedDict
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple

import numpy
from numpy import ndarray
import pandas as pd
import sklearn
from scipy import sparse

# Custom import commands if any
from sklearn.preprocessing import Normalizer
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

from d3m import container, utils as d3m_utils
from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces import base, transformer
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase

Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
    input_column_names: Optional[Any]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]


class Hyperparams(hyperparams.Hyperparams):
    # Added by Mia
    endog = hyperparams.Bounded[int](
        lower=2,
        upper=None,
        default=3,
        description='Array like time series.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    threshold = hyperparams.Bounded[float](
        lower=0,
        upper=1,
        default=0.5,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )

    # keep previous
    norm = hyperparams.Enumeration[str](
        default='l2',
        values=['l1', 'l2', 'max'],
        description='The norm to use to normalize each non zero sample.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    use_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
    )
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe.",
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
    )
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
        default='https://metadata.datadrivendiscovery.org/types/Attribute',
        description='Decides what semantic type to attach to generated attributes',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )


class Ensemble(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Ensemble method.

    Combines the per-point anomaly scores produced by an upstream detection algorithm,
    grouped by system, using maximum / minimum / average aggregation or majority voting
    against a score threshold.
    """

    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "name": "Ensemble",
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ISOLATION_FOREST, ],
        "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION,
        "python_path": "d3m.primitives.tods.detection_algorithm.Ensemble",
        "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/HoltSmoothing.py']},
        "version": "0.0.1",
        "id": "3688b5b4-885c-40bb-9731-fe3969ea81b0",
        "hyperparams_to_tune": ['use_columns'],
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        # The Normalizer appears to be kept from the template this primitive was derived
        # from; it is fitted in fit() but not used by produce().
        self._clf = Normalizer(
            norm=self.hyperparams['norm'],
        )

        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._inputs = inputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if self._training_inputs is None:
            return CallResult(None)

        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        outputs = inputs
        # The input is expected to carry the detector's anomaly score as its last column.
        outputs.columns = ['timestamp', 'value', 'system_id', 'scores']

        # print(outputs)
        # print('max_score')
        # ensemble_max = outputs.groupby('system_id')[outputs.columns[-1]].max()
        # print(ensemble_max)
        #
        # print('min_score')
        # ensemble_min = outputs.groupby('system_id')[outputs.columns[-1]].min()
        # print(ensemble_min)
        #
        # print('mean_score')
        # outputs_mean = outputs.groupby('system_id')[outputs.columns[3]].mean()
        # print(outputs_mean)

        # Binary decision per point; the score cut-off is hard-coded here
        # (the 'threshold' hyperparameter above could be used instead).
        outputs['results'] = numpy.where(outputs['scores'] > 0.05, 1, 0)
        print(outputs)

        outputs_xy = outputs.groupby('system_id')['results'].sum().reset_index()
        print("*****majority_sum_xy*****")
        print(outputs_xy)

        outputs_sum_x = outputs.groupby(['timestamp', 'system_id'])['results'].sum()
        # outputs_sum_x = outputs.groupby(['system_id','timestamp']).size().reset_index().groupby(['timestamp'])['results'].sum()
        outputs_sum_y = outputs.groupby(['system_id', 'value'])['results'].sum()
        print('*****majority_max_x*****')
        print(outputs_sum_x)
        print('*****majority_max_y*****')
        print(outputs_sum_y)

        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs,)

    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        self._fitted = True

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
                                                                                   use_columns=hyperparams['use_columns'],
                                                                                   exclude_columns=hyperparams['exclude_columns'],
                                                                                   can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")

        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False

    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
        return outputs

    @classmethod
    def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int],
                              outputs_metadata: metadata_base.DataMetadata, hyperparams):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in input_indices:
            column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)

            column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        # If outputs has more columns than the input indices, mark all remaining ones as attributes.
        if outputs_length > len(input_indices):
            for column_index in range(len(input_indices), outputs_length):
                column_metadata = OrderedDict()
                semantic_types = set()
                semantic_types.add(hyperparams["return_semantic_type"])
                column_name = "output_{}".format(column_index)
                column_metadata["semantic_types"] = list(semantic_types)
                column_metadata["name"] = str(column_name)
                target_columns_metadata.append(column_metadata)

        return target_columns_metadata
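
The produce() method above appends a 0/1 'results' column and prints per-system sums; the max/min/average and majority-vote aggregations named in the class docstring are only present as commented-out code. A standalone pandas sketch of those aggregations (illustrative only; the function name and the 0.5 voting cut-off are assumptions, not part of the primitive):

import pandas as pd

def aggregate_scores(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    # Expects the columns produce() sets: 'system_id' and 'scores'.
    flags = (df['scores'] > threshold).astype(int)
    grouped = df.assign(flag=flags).groupby('system_id')
    return pd.DataFrame({
        'max_score': grouped['scores'].max(),
        'min_score': grouped['scores'].min(),
        'mean_score': grouped['scores'].mean(),
        # Majority vote: anomalous if more than half of a system's points exceed the threshold.
        'majority_vote': (grouped['flag'].mean() > 0.5).astype(int),
    }).reset_index()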
@@ -76,5 +76,6 @@ tods.detection_algorithm.PCAODetector = tods.detection_algorithm.PCAODetect:PCAO
tods.detection_algorithm.KDiscordODetector = tods.detection_algorithm.KDiscordODetect:KDiscordODetector
tods.detection_algorithm.deeplog = tods.detection_algorithm.DeepLog:DeepLogPrimitive
tods.detection_algorithm.telemanom = tods.detection_algorithm.Telemanom:TelemanomPrimitive
tods.detection_algorithm.Ensemble = tods.detection_algorithm.Ensemble:Ensemble
tods.reinforcement.rule_filter = tods.reinforcement.RuleBasedFilter:RuleBasedFilter
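
Once the package is reinstalled so the new entry point is picked up, the primitive resolves through the d3m index exactly as the pipeline-building script above does. A minimal check (sketch; assumes the reinstall has been done):

from d3m import index

ensemble_cls = index.get_primitive('d3m.primitives.tods.detection_algorithm.Ensemble')
print(ensemble_cls.metadata.query()['name'])  # expected: 'Ensemble'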