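"""Smoke tests for TODS detection-algorithm primitives.

For each primitive listed in ``test()``, build a fixed d3m pipeline around it
and evaluate that pipeline on the sample anomaly datasets with axolotl's
SimpleRunner, raising on the first result that comes back ERRORED.
"""
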
def generate_metrics():
    from d3m.metadata.problem import PerformanceMetric
    # Score with F1, treating label '1' (the anomaly class) as the positive class.
    metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
    return metrics

def generate_data_preparation_params():
    from axolotl.utils import schemas as schemas_utils
    # 'no_split' evaluates on the full input data without a train/test split.
    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
    return data_preparation_params

def generate_scoring_pipeline():
    from axolotl.utils import schemas as schemas_utils
    scoring_pipeline = schemas_utils.get_scoring_pipeline()
    return scoring_pipeline

def generate_data_preparation_pipeline():
    from axolotl.utils import schemas as schemas_utils
    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    return data_preparation_pipeline

def generate_dataset_problems(dataset_infos):
    """
    Args:
        dataset_infos: A list of dataset infos, each with the `path` of the
            data table and the column index of the `target`

    Returns:
        A list of (Dataset, Problem) tuples
    """
    import pandas as pd
    from axolotl.utils import data_problem
    from d3m.metadata.problem import TaskKeyword, PerformanceMetric

    dataset_problems = []
    for dataset_info in dataset_infos:
        table_path = dataset_info['path']
        target = dataset_info['target']

        df = pd.read_csv(table_path)
        dataset, problem_description = data_problem.generate_dataset_problem(
            df,
            target_index=target,
            task_keywords=[TaskKeyword.ANOMALY_DETECTION],
            performance_metrics=[{'metric': PerformanceMetric.F1}],
        )
        dataset_problems.append((dataset, problem_description))

    return dataset_problems

# FIXME: Currently only the detection-algorithm step is varied.
def generate_pipelines(primitive_python_paths):
    """
    Args:
        primitive_python_paths: A list of Python paths of the detection-algorithm
            primitives to test

    Returns:
        A list of pipeline descriptions, one per primitive
    """
    from d3m import index
    from d3m.metadata.base import ArgumentType
    from d3m.metadata.pipeline import Pipeline, PrimitiveStep

    pipelines = []
    for primitive_python_path in primitive_python_paths:
        # Create the pipeline skeleton.
        pipeline_description = Pipeline()
        pipeline_description.add_input(name='inputs')
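        # Fixed layout assembled below; step indices matter because later steps
        # reference them via 'steps.N.produce':
        #   0: dataset_to_dataframe -> 1: column_parser
        #   2: extract attributes (from step 1), 3: extract targets (from step 0)
        #   4: detection algorithm under test (reads step 2)
        #   5: construct_predictions (reads step 4, reference: step 1)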

        # The first four steps are fixed
        # Step 0: dataset_to_dataframe
        step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'))
        step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline_description.add_step(step_0)

        # Step 1: column_parser
        step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
        step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
        step_1.add_output('produce')
        pipeline_description.add_step(step_1)

        # Step 2: extract_columns_by_semantic_types (attributes)
        step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
        step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
        step_2.add_output('produce')
        step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                                  data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline_description.add_step(step_2)

        # Step 3: extract_columns_by_semantic_types (targets)
        step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
        step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                                  data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        pipeline_description.add_step(step_3)

        attributes = 'steps.2.produce'
        targets = 'steps.3.produce'  # not consumed downstream; kept to document step 3's output

        # Step 4: the detection algorithm under test
        test_step = PrimitiveStep(primitive=index.get_primitive(primitive_python_path))
        test_step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
        test_step.add_output('produce')
        pipeline_description.add_step(test_step)

        # Step 5: construct_predictions builds the final predictions frame
        final_step = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common'))
        final_step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
        final_step.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
        final_step.add_output('produce')
        pipeline_description.add_step(final_step)

        pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce')

        pipelines.append(pipeline_description)

    return pipelines
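
# Optional helper: persist a generated pipeline for inspection or reuse.
# A minimal sketch assuming d3m's Pipeline.to_json_structure(); the helper
# itself is ours and not a TODS/axolotl API.
def dump_pipeline_json(pipeline, path):
    import json
    with open(path, 'w') as f:
        json.dump(pipeline.to_json_structure(), f, indent=2)
# Usage: dump_pipeline_json(generate_pipelines([...])[0], 'pipeline.json')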

def test():
    # Datasets to be tested
    dataset_infos = [
        {
            'path': 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv',
            'target': 7
        },
        {
            'path': 'datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv',
            # 'path': 'datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv',
            'target': 3
        },
    ]

    # Algorithms to be tested
    # FIXME: Test more primitives
    primitive_python_paths = [
        'd3m.primitives.tods.detection_algorithm.pyod_ae',
        'd3m.primitives.tods.detection_algorithm.pyod_vae',
        'd3m.primitives.tods.detection_algorithm.pyod_cof',
        'd3m.primitives.tods.detection_algorithm.pyod_sod',
        'd3m.primitives.tods.detection_algorithm.pyod_abod',
        'd3m.primitives.tods.detection_algorithm.pyod_hbos',
        'd3m.primitives.tods.detection_algorithm.pyod_iforest',
        'd3m.primitives.tods.detection_algorithm.pyod_lof',
        'd3m.primitives.tods.detection_algorithm.pyod_knn',
        'd3m.primitives.tods.detection_algorithm.pyod_ocsvm',
        'd3m.primitives.tods.detection_algorithm.pyod_loda',
        # 'd3m.primitives.tods.detection_algorithm.pyod_cblof',
        'd3m.primitives.tods.detection_algorithm.pyod_sogaal',
        'd3m.primitives.tods.detection_algorithm.pyod_mogaal',
    ]

    dataset_problems = generate_dataset_problems(dataset_infos)
    pipelines = generate_pipelines(primitive_python_paths)
    metrics = generate_metrics()
    data_preparation_pipeline = generate_data_preparation_pipeline()
    scoring_pipeline = generate_scoring_pipeline()
    data_preparation_params = generate_data_preparation_params()

    # Run every pipeline on every dataset; fail fast on the first error.
    from axolotl.backend.simple import SimpleRunner
    backend = SimpleRunner(random_seed=0)
    for i, dataset_problem in enumerate(dataset_problems):
        dataset, problem_description = dataset_problem
        for j, pipeline in enumerate(pipelines):
            print('Dataset:', i, 'Pipeline:', j)

            pipeline_result = backend.evaluate_pipeline(problem_description=problem_description,
                                                        pipeline=pipeline,
                                                        input_data=[dataset],
                                                        metrics=metrics,
                                                        data_preparation_pipeline=data_preparation_pipeline,
                                                        scoring_pipeline=scoring_pipeline,
                                                        data_preparation_params=data_preparation_params)
            print('Results')
            print('----------------------------')
            print(pipeline_result)
            print('----------------------------')
            if pipeline_result.status == 'ERRORED':
                print('Scoring pipeline is {}'.format(scoring_pipeline.id))
                print('Data preparation pipeline is {}'.format(data_preparation_pipeline.id))
                raise ValueError('ERRORED for dataset {}, primitive {}'.format(dataset_infos[i], primitive_python_paths[j]))

if __name__ == "__main__":
    test()