From 03efe340c979bd474bd9be89fc169f9549287ced Mon Sep 17 00:00:00 2001
From: Daochen Zha
Date: Tue, 8 Sep 2020 21:24:00 -0500
Subject: [PATCH] Fix bugs in entry point and add brute force search

---
 examples/run_automl.py                 |  39 ++++-
 tods/entry_points.ini                  |   2 -
 tods/tods/search/brute_force_search.py | 281 +++++++++++++++++++++++++++++++--
 3 files changed, 296 insertions(+), 26 deletions(-)

diff --git a/examples/run_automl.py b/examples/run_automl.py
index 4b91b18..77b9747 100644
--- a/examples/run_automl.py
+++ b/examples/run_automl.py
@@ -17,17 +17,42 @@ from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils
 import tods
 from tods.search import BruteForceSearch
 
-table_path = 'datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv'
+table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv'
 df = pd.read_csv(table_path)
 dataset, problem_description = data_problem.generate_dataset_problem(df,
-                                                                     target_index=3,
+                                                                     target_index=7,
                                                                      task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
                                                                      performance_metrics=[{'metric': PerformanceMetric.F1}])
-print(dataset)
-print(problem_description)
-
-
 backend = SimpleRunner(random_seed=0)
 search = BruteForceSearch(problem_description=problem_description,
                           backend=backend)
-print(search)
+
+# Find the best pipeline
+best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=80)
+best_pipeline = best_runtime.pipeline
+best_output = best_pipeline_result.output
+# Evaluate the best pipeline
+best_scores = search.evaluate(best_pipeline).scores
+
+
+print('*' * 52)
+print('Search History:')
+for pipeline_result in search.history:
+    print('-' * 52)
+    print('Pipeline id:', pipeline_result.pipeline.id)
+    print(pipeline_result.scores)
+print('*' * 52)
+
+print('')
+
+print('*' * 52)
+print('Best pipeline:')
+print('-' * 52)
+print('Pipeline id:', best_pipeline.id)
+print('Pipeline json:', best_pipeline.to_json())
+print('Output:')
+print(best_output)
+print('Scores:')
+print(best_scores)
+print('*' * 52)
+
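Before launching a long search run, it can help to confirm the dataset assumptions baked into this script. The sketch below is not part of the patch; it assumes `df` has been loaded from yahoo_sub_5's learningData.csv as above, and that column 7 is the anomaly label.

    # Sketch: sanity-check the target column before running the search.
    print(df.columns[7])                  # expected: the anomaly label column
    print(df.iloc[:, 7].value_counts())   # expected: counts of the 0/1 labels
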
diff --git a/tods/entry_points.ini b/tods/entry_points.ini
index 5c68b0d..6806df3 100644
--- a/tods/entry_points.ini
+++ b/tods/entry_points.ini
@@ -68,8 +68,6 @@ tods.detection_algorithm.pyod_mogaal = detection_algorithm.PyodMoGaal:Mo_GaalPri
 tods.detection_algorithm.matrix_profile = detection_algorithm.MatrixProfile:MatrixProfile
 tods.detection_algorithm.AutoRegODetector = detection_algorithm.AutoRegODetect:AutoRegODetector
-tods.detection_algorithm.KDiscordDetector = detection_algorithm.KDiscordODetect:KDiscordDetector
-tods.detection_algorithm.PCADetector = detection_algorithm.PCAODetect:PCADetector
 tods.detection_algorithm.LSTMODetector = detection_algorithm.LSTMODetect:LSTMODetector
 tods.detection_algorithm.AutoRegODetector = detection_algorithm.AutoRegODetect:AutoRegODetector
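Each key in entry_points.ini is how d3m exposes a primitive under the d3m.primitives namespace, which is how the search code below refers to them. A minimal sketch of resolving one such path (assumes the tods package is installed so its entry points are registered with d3m):

    # Sketch: entry-point keys like tods.detection_algorithm.pyod_iforest
    # resolve to primitives under the d3m.primitives namespace.
    from d3m import index
    primitive = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_iforest')
    print(primitive)
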
diff --git a/tods/tods/search/brute_force_search.py b/tods/tods/search/brute_force_search.py
index 28fb9c3..c6841ca 100644
--- a/tods/tods/search/brute_force_search.py
+++ b/tods/tods/search/brute_force_search.py
@@ -1,36 +1,283 @@
 # A Brute-Force Search
 import uuid
 
 from d3m.metadata.pipeline import Pipeline
 
 from axolotl.algorithms.base import PipelineSearchBase
-from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils
-
-def random_rank(pipeline_result):
-    if pipeline_result.status == 'COMPLETED':
-        pipeline_result.rank = random.uniform(0, 1)
-    return pipeline_result
+from axolotl.utils import schemas as schemas_utils
 
 class BruteForceSearch(PipelineSearchBase):
     def __init__(self, problem_description, backend, *,
                  primitives_blocklist=None, ranking_function=None):
         super().__init__(problem_description=problem_description, backend=backend,
                          primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)
         if self.ranking_function is None:
-            self.ranking_function = random_rank
+            self.ranking_function = _f1_rank
 
-        # Find th candidates
+        # Find the candidates
         self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords'])
-        print('task_description:', self.task_description)
         self.available_pipelines = self._return_pipelines(
             self.task_description['task_type'], self.task_description['task_subtype'], self.task_description['data_types'])
-        print('available_pipelines:', self.available_pipelines)
+
+        self.metrics = _generate_metrics()
+        self.data_preparation_pipeline = _generate_data_preparation_pipeline()
+        self.scoring_pipeline = _generate_scoring_pipeline()
+        self.data_preparation_params = _generate_data_preparation_params()
 
-    def _return_pipelines(self, task_type, task_subtype, data_type):
-        pipeline_candidates = []
-        for pipeline_dict in schemas_utils.get_pipelines_db()['CLASSIFICATION']:
-            pipeline = pipeline_utils.load_pipeline(pipeline_dict)
-            pipeline.id = str(uuid.uuid4())
-            pipeline.created = Pipeline().created
-            pipeline_candidates.append(pipeline)
+        # Cursor into self.available_pipelines; offset pipelines are evaluated per _search call
+        self.current_pipeline_index = 0
+        self.offset = 1
+
+    def evaluate(self, pipeline_to_eval, input_data=None):
+        if input_data is None:
+            input_data = self.input_data
+        pipeline_result = self.backend.evaluate_pipeline(
+            problem_description=self.problem_description,
+            pipeline=pipeline_to_eval,
+            input_data=input_data,
+            metrics=self.metrics,
+            data_preparation_pipeline=self.data_preparation_pipeline,
+            scoring_pipeline=self.scoring_pipeline,
+            data_preparation_params=self.data_preparation_params)
+
+        return pipeline_result
+
+    def _search(self, time_left):
+        # Read the next batch of pipelines to be evaluated
+        pipelines_to_eval = self.available_pipelines[self.current_pipeline_index: self.current_pipeline_index+self.offset]
+        self.current_pipeline_index += self.offset
+
+        pipeline_results = self.backend.evaluate_pipelines(
+            problem_description=self.problem_description,
+            pipelines=pipelines_to_eval,
+            input_data=self.input_data,
+            metrics=self.metrics,
+            data_preparation_pipeline=self.data_preparation_pipeline,
+            scoring_pipeline=self.scoring_pipeline,
+            data_preparation_params=self.data_preparation_params)
+
+        return [self.ranking_function(pipeline_result) for pipeline_result in pipeline_results]
+
+    def _return_pipelines(self, task_type, task_subtype, data_type):
+        pipeline_candidates = _generate_pipelines(primitive_python_paths)
         return pipeline_candidates
+
+
+primitive_python_paths = {
+    'data_processing': [
+        'd3m.primitives.tods.data_processing.dataset_to_dataframe',
+        'd3m.primitives.tods.data_processing.time_interval_transform',
+        'd3m.primitives.tods.data_processing.categorical_to_binary',
+        'd3m.primitives.tods.data_processing.column_filter',
+        'd3m.primitives.tods.data_processing.timestamp_validation',
+        'd3m.primitives.tods.data_processing.duplication_validation',
+        'd3m.primitives.tods.data_processing.continuity_validation',
+    ],
+    'timeseries_processing': [
+        'd3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler',
+        'd3m.primitives.tods.timeseries_processing.transformation.standard_scaler',
+        'd3m.primitives.tods.timeseries_processing.transformation.power_transformer',
+        'd3m.primitives.tods.timeseries_processing.transformation.quantile_transformer',
+        'd3m.primitives.tods.timeseries_processing.transformation.moving_average_transform',
+        'd3m.primitives.tods.timeseries_processing.transformation.simple_exponential_smoothing',
+        'd3m.primitives.tods.timeseries_processing.transformation.holt_smoothing',
+        'd3m.primitives.tods.timeseries_processing.transformation.holt_winters_exponential_smoothing',
+        'd3m.primitives.tods.timeseries_processing.decomposition.time_series_seasonality_trend_decomposition',
+    ],
+    'feature_analysis': [
+        'd3m.primitives.tods.feature_analysis.auto_correlation',
+        'd3m.primitives.tods.feature_analysis.statistical_mean',
+        'd3m.primitives.tods.feature_analysis.statistical_median',
+        'd3m.primitives.tods.feature_analysis.statistical_g_mean',
+        'd3m.primitives.tods.feature_analysis.statistical_abs_energy',
+        'd3m.primitives.tods.feature_analysis.statistical_abs_sum',
+        'd3m.primitives.tods.feature_analysis.statistical_h_mean',
+        'd3m.primitives.tods.feature_analysis.statistical_maximum',
+        'd3m.primitives.tods.feature_analysis.statistical_minimum',
+        'd3m.primitives.tods.feature_analysis.statistical_mean_abs',
+        'd3m.primitives.tods.feature_analysis.statistical_mean_abs_temporal_derivative',
+        'd3m.primitives.tods.feature_analysis.statistical_mean_temporal_derivative',
+        'd3m.primitives.tods.feature_analysis.statistical_median_abs_deviation',
+        'd3m.primitives.tods.feature_analysis.statistical_kurtosis',
+        'd3m.primitives.tods.feature_analysis.statistical_skew',
+        'd3m.primitives.tods.feature_analysis.statistical_std',
+        'd3m.primitives.tods.feature_analysis.statistical_var',
+        'd3m.primitives.tods.feature_analysis.statistical_variation',
+        'd3m.primitives.tods.feature_analysis.statistical_vec_sum',
+        'd3m.primitives.tods.feature_analysis.statistical_willison_amplitude',
+        'd3m.primitives.tods.feature_analysis.statistical_zero_crossing',
+        'd3m.primitives.tods.feature_analysis.spectral_residual_transform',
+        'd3m.primitives.tods.feature_analysis.fast_fourier_transform',
+        'd3m.primitives.tods.feature_analysis.discrete_cosine_transform',
+        'd3m.primitives.tods.feature_analysis.non_negative_matrix_factorization',
+        'd3m.primitives.tods.feature_analysis.bk_filter',
+        'd3m.primitives.tods.feature_analysis.hp_filter',
+        'd3m.primitives.tods.feature_analysis.truncated_svd',
+        'd3m.primitives.tods.feature_analysis.wavelet_transform',
+        'd3m.primitives.tods.feature_analysis.trmf',
+    ],
+    'detection_algorithm': [
+        'd3m.primitives.tods.detection_algorithm.pyod_ae',
+        'd3m.primitives.tods.detection_algorithm.pyod_vae',
+        'd3m.primitives.tods.detection_algorithm.pyod_cof',
+        'd3m.primitives.tods.detection_algorithm.pyod_sod',
+        'd3m.primitives.tods.detection_algorithm.pyod_abod',
+        'd3m.primitives.tods.detection_algorithm.pyod_hbos',
+        'd3m.primitives.tods.detection_algorithm.pyod_iforest',
+        'd3m.primitives.tods.detection_algorithm.pyod_lof',
+        'd3m.primitives.tods.detection_algorithm.pyod_knn',
+        'd3m.primitives.tods.detection_algorithm.pyod_ocsvm',
+        'd3m.primitives.tods.detection_algorithm.pyod_loda',
+        'd3m.primitives.tods.detection_algorithm.pyod_cblof',
+        'd3m.primitives.tods.detection_algorithm.pyod_sogaal',
+        'd3m.primitives.tods.detection_algorithm.pyod_mogaal',
+        'd3m.primitives.tods.detection_algorithm.matrix_profile',
+        'd3m.primitives.tods.detection_algorithm.AutoRegODetector',
+        'd3m.primitives.tods.detection_algorithm.LSTMODetector',
+        'd3m.primitives.tods.detection_algorithm.PCAODetector',
+        'd3m.primitives.tods.detection_algorithm.KDiscordODetector',
+        'd3m.primitives.tods.detection_algorithm.deeplog',
+        'd3m.primitives.tods.detection_algorithm.telemanom',
+    ]
+}
+
+
+def _f1_rank(pipeline_result):
+    # Rank completed pipelines by negated F1, so a lower rank is better
+    if pipeline_result.status == 'COMPLETED':
+        scores = pipeline_result.scores
+        pipeline_result.rank = -scores['value'][0]
+    return pipeline_result
+
+def _generate_metrics():
+    from d3m.metadata.problem import PerformanceMetric
+    metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}},
+              ]
+    return metrics
+
+def _generate_data_preparation_params():
+    from axolotl.utils import schemas as schemas_utils
+    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
+    return data_preparation_params
+
+def _generate_scoring_pipeline():
+    from axolotl.utils import schemas as schemas_utils
+    scoring_pipeline = schemas_utils.get_scoring_pipeline()
+    return scoring_pipeline
+
+def _generate_data_preparation_pipeline():
+    from axolotl.utils import schemas as schemas_utils
+    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
+    return data_preparation_pipeline
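Note that _f1_rank above is only the default; BruteForceSearch accepts any ranking_function with the same contract (set pipeline_result.rank on completed runs, lower is better). A hypothetical custom ranker, assuming scores is the same table _f1_rank reads:

    # Hypothetical ranker: average all metric values, negate so lower rank = better.
    def mean_score_rank(pipeline_result):
        if pipeline_result.status == 'COMPLETED':
            pipeline_result.rank = -pipeline_result.scores['value'].mean()
        return pipeline_result

    search = BruteForceSearch(problem_description=problem_description,
                              backend=backend, ranking_function=mean_score_rank)
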
+
+def _generate_pipeline(combinations):
+    from d3m import index
+    from d3m.metadata.base import ArgumentType
+    from d3m.metadata.pipeline import Pipeline, PrimitiveStep
+
+    pipelines = []
+    for combination in combinations:
+        # Create the pipeline and declare its input
+        pipeline_description = Pipeline()
+        pipeline_description.add_input(name='inputs')
+
+        # The first four steps are fixed
+        # Step 0: dataset_to_dataframe
+        step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'))
+        step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+        step_0.add_output('produce')
+        pipeline_description.add_step(step_0)
+
+        # Step 1: column_parser
+        step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
+        step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+        step_1.add_output('produce')
+        pipeline_description.add_step(step_1)
+
+        # Step 2: extract_columns_by_semantic_types(attributes)
+        step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
+        step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+        step_2.add_output('produce')
+        step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
+                                  data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
+        pipeline_description.add_step(step_2)
+
+        # Step 3: extract_columns_by_semantic_types(targets)
+        step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
+        step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+        step_3.add_output('produce')
+        step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
+                                  data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
+        pipeline_description.add_step(step_3)
+
+        attributes = 'steps.2.produce'
+        targets = 'steps.3.produce'
+
+        # Step 4: data processing
+        tods_step_4 = PrimitiveStep(primitive=index.get_primitive(combination[0]))
+        tods_step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
+        tods_step_4.add_output('produce')
+        pipeline_description.add_step(tods_step_4)
+
+        # Step 5: time-series processing
+        tods_step_5 = PrimitiveStep(primitive=index.get_primitive(combination[1]))
+        tods_step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
+        tods_step_5.add_output('produce')
+        pipeline_description.add_step(tods_step_5)
+
+        # Step 6: feature analysis
+        tods_step_6 = PrimitiveStep(primitive=index.get_primitive(combination[2]))
+        tods_step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
+        tods_step_6.add_output('produce')
+        pipeline_description.add_step(tods_step_6)
+
+        # Step 7: detection algorithm
+        tods_step_7 = PrimitiveStep(primitive=index.get_primitive(combination[3]))
+        tods_step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce')
+        tods_step_7.add_output('produce')
+        pipeline_description.add_step(tods_step_7)
+
+        # Finalize the pipeline
+        final_step = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common'))
+        final_step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.7.produce')
+        final_step.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+        final_step.add_output('produce')
+        pipeline_description.add_step(final_step)
+
+        pipeline_description.add_output(name='output predictions', data_reference='steps.8.produce')
+
+        pipeline_description.id = str(uuid.uuid4())
+        pipeline_description.created = Pipeline().created
+
+        pipelines.append(pipeline_description)
+    return pipelines
+
+def _generate_pipelines(primitive_python_paths, cpu_count=40):
+    """Generate candidate pipelines from all combinations of primitives.
+
+    Args:
+        primitive_python_paths: a dict mapping each module type to a list
+            of primitive Python paths
+        cpu_count: number of worker processes used to build the pipelines
+
+    Returns:
+        a list of pipeline descriptions
+    """
+    import itertools
+    import multiprocessing as mp
+
+    components = ['data_processing', 'timeseries_processing', 'feature_analysis', 'detection_algorithm']
+    combinations = itertools.product(*(primitive_python_paths[k] for k in components))
+
+    # Allocate tasks to the workers in round-robin fashion
+    combination_each_core_list = [[] for i in range(cpu_count)]
+    for idx, combination in enumerate(combinations):
+        core = idx % cpu_count
+        combination_each_core_list[core].append(combination)
+
+    # Obtain all the pipelines
+    pool = mp.Pool(processes=cpu_count)
+    results = [pool.apply_async(_generate_pipeline,
+                                args=(combination_list,))
+               for combination_list in combination_each_core_list]
+    pipelines = []
+    for p in results:
+        pipelines.extend(p.get())
+
+    return pipelines
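The candidate set is the full Cartesian product of the four module lists, which is why pipeline construction is spread across worker processes above. A quick sketch to gauge the size of the search space (the import path is assumed from the file location):

    # Sketch: count the pipelines _generate_pipelines would enumerate.
    from tods.search.brute_force_search import primitive_python_paths  # path assumed

    n_candidates = 1
    for key in ('data_processing', 'timeseries_processing',
                'feature_analysis', 'detection_algorithm'):
        n_candidates *= len(primitive_python_paths[key])
    print(n_candidates)  # product of the four list lengths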