
Fix bugs in entry point and add brute force search

Daochen Zha committed 4 years ago · branch master · commit 03efe340c9
3 changed files with 296 additions and 26 deletions
  1. examples/run_automl.py (+32, -7)
  2. tods/entry_points.ini (+0, -2)
  3. tods/tods/search/brute_force_search.py (+264, -17)

examples/run_automl.py (+32, -7)

@@ -17,17 +17,42 @@ from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils
import tods
from tods.search import BruteForceSearch

-table_path = 'datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv'
+table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv'
df = pd.read_csv(table_path)
dataset, problem_description = data_problem.generate_dataset_problem(df,
-                                                                     target_index=3,
+                                                                     target_index=7,
                                                                      task_keywords=[TaskKeyword.ANOMALY_DETECTION,],
                                                                      performance_metrics=[{'metric': PerformanceMetric.F1}])

print(dataset)
print(problem_description)


backend = SimpleRunner(random_seed=0)
search = BruteForceSearch(problem_description=problem_description, backend=backend)
print(search)

# Find the best pipeline
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=80)
best_pipeline = best_runtime.pipeline
best_output = best_pipeline_result.output
# Evaluate the best pipeline
best_scores = search.evaluate(best_pipeline).scores


print('*' * 52)
print('Search History:')
for pipeline_result in search.history:
    print('-' * 52)
    print('Pipeline id:', pipeline_result.pipeline.id)
    print(pipeline_result.scores)
print('*' * 52)

print('')

print('*' * 52)
print('Best pipeline:')
print('-' * 52)
print('Pipeline id:', best_pipeline.id)
print('Pipeline json:', best_pipeline.to_json())
print('Output:')
print(best_output)
print('Scores:')
print(best_scores)
print('*' * 52)
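
The script prints the best pipeline's JSON via best_pipeline.to_json(); a minimal follow-up sketch (the output path is an arbitrary choice for illustration, not part of this commit) shows how that JSON could be saved for later reuse:

# A sketch, not part of the commit: persist the winning pipeline description.
with open('best_pipeline.json', 'w') as f:
    f.write(best_pipeline.to_json())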


tods/entry_points.ini (+0, -2)

@@ -68,8 +68,6 @@ tods.detection_algorithm.pyod_mogaal = detection_algorithm.PyodMoGaal:Mo_GaalPri

tods.detection_algorithm.matrix_profile = detection_algorithm.MatrixProfile:MatrixProfile
tods.detection_algorithm.AutoRegODetector = detection_algorithm.AutoRegODetect:AutoRegODetector
tods.detection_algorithm.KDiscordDetector = detection_algorithm.KDiscordODetect:KDiscordDetector
tods.detection_algorithm.PCADetector = detection_algorithm.PCAODetect:PCADetector

tods.detection_algorithm.LSTMODetector = detection_algorithm.LSTMODetect:LSTMODetector
tods.detection_algorithm.AutoRegODetector = detection_algorithm.AutoRegODetect:AutoRegODetector
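
Each key in this entry-point group is exposed to d3m as a d3m.primitives.tods.* Python path, which is how brute_force_search.py resolves primitives. A small sketch (assuming TODS and its entry points are installed; the chosen primitive is just an example):

# A sketch assuming a working TODS installation: the entry point above maps
# the key to a Python path that d3m's primitive index can resolve.
from d3m import index

primitive = index.get_primitive('d3m.primitives.tods.detection_algorithm.AutoRegODetector')
print(primitive.metadata.query()['python_path'])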


tods/tods/search/brute_force_search.py (+264, -17)

@@ -1,36 +1,283 @@
# A Brute-Force Search
import uuid
import random

from d3m.metadata.pipeline import Pipeline

from axolotl.algorithms.base import PipelineSearchBase
-from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils
-
-def random_rank(pipeline_result):
-    if pipeline_result.status == 'COMPLETED':
-        pipeline_result.rank = random.uniform(0, 1)
-    return pipeline_result
+from axolotl.utils import schemas as schemas_utils

class BruteForceSearch(PipelineSearchBase):
    def __init__(self, problem_description, backend, *, primitives_blocklist=None, ranking_function=None):
        super().__init__(problem_description=problem_description, backend=backend,
                         primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)
        if self.ranking_function is None:
-            self.ranking_function = random_rank
+            self.ranking_function = _f1_rank

-        # Find th candidates
+        # Find the candidates
        self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords'])
-        print('task_description:', self.task_description)
+        #print('task_description:', self.task_description)
        self.available_pipelines = self._return_pipelines(
            self.task_description['task_type'], self.task_description['task_subtype'], self.task_description['data_types'])
-        print('available_pipelines:', self.available_pipelines)
+        #print('available_pipelines:', self.available_pipelines)
        self.metrics = _generate_metrics()
        self.data_preparation_pipeline = _generate_data_preparation_pipeline()
        self.scoring_pipeline = _generate_scoring_pipeline()
        self.data_preparation_params = _generate_data_preparation_params()

-    def _return_pipelines(self, task_type, task_subtype, data_type):
-        pipeline_candidates = []
-        for pipeline_dict in schemas_utils.get_pipelines_db()['CLASSIFICATION']:
-            pipeline = pipeline_utils.load_pipeline(pipeline_dict)
-            pipeline.id = str(uuid.uuid4())
-            pipeline.created = Pipeline().created
-            pipeline_candidates.append(pipeline)
+        self.current_pipeline_index = 0
+        self.offset = 1

    def evaluate(self, pipeline_to_eval, input_data=None):
        if input_data is None:
            input_data = self.input_data
        pipeline_result = self.backend.evaluate_pipeline(
            problem_description=self.problem_description,
            pipeline=pipeline_to_eval,
            input_data=input_data,
            metrics=self.metrics,
            data_preparation_pipeline=self.data_preparation_pipeline,
            scoring_pipeline=self.scoring_pipeline,
            data_preparation_params=self.data_preparation_params)
        return pipeline_result

    def _search(self, time_left):
        # Read the next batch of pipelines to be evaluated
        pipelines_to_eval = self.available_pipelines[self.current_pipeline_index: self.current_pipeline_index+self.offset]
        self.current_pipeline_index += 1
        pipeline_results = self.backend.evaluate_pipelines(
            problem_description=self.problem_description,
            pipelines=pipelines_to_eval,
            input_data=self.input_data,
            metrics=self.metrics,
            data_preparation_pipeline=self.data_preparation_pipeline,
            scoring_pipeline=self.scoring_pipeline,
            data_preparation_params=self.data_preparation_params)

        return [self.ranking_function(pipeline_result) for pipeline_result in pipeline_results]

    def _return_pipelines(self, task_type, task_subtype, data_type):
        # Enumerate every combination of TODS primitives as candidate pipelines
        pipeline_candidates = _generate_pipelines(primitive_python_paths)
        return pipeline_candidates

primitive_python_paths = {
    'data_processing': [
        'd3m.primitives.tods.data_processing.dataset_to_dataframe',
        'd3m.primitives.tods.data_processing.time_interval_transform',
        'd3m.primitives.tods.data_processing.categorical_to_binary',
        'd3m.primitives.tods.data_processing.column_filter',
        'd3m.primitives.tods.data_processing.timestamp_validation',
        'd3m.primitives.tods.data_processing.duplication_validation',
        'd3m.primitives.tods.data_processing.continuity_validation',
    ],
    'timeseries_processing': [
        'd3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler',
        'd3m.primitives.tods.timeseries_processing.transformation.standard_scaler',
        'd3m.primitives.tods.timeseries_processing.transformation.power_transformer',
        'd3m.primitives.tods.timeseries_processing.transformation.quantile_transformer',
        'd3m.primitives.tods.timeseries_processing.transformation.moving_average_transform',
        'd3m.primitives.tods.timeseries_processing.transformation.simple_exponential_smoothing',
        'd3m.primitives.tods.timeseries_processing.transformation.holt_smoothing',
        'd3m.primitives.tods.timeseries_processing.transformation.holt_winters_exponential_smoothing',
        'd3m.primitives.tods.timeseries_processing.decomposition.time_series_seasonality_trend_decomposition',
    ],
    'feature_analysis': [
        'd3m.primitives.tods.feature_analysis.auto_correlation',
        'd3m.primitives.tods.feature_analysis.statistical_mean',
        'd3m.primitives.tods.feature_analysis.statistical_median',
        'd3m.primitives.tods.feature_analysis.statistical_g_mean',
        'd3m.primitives.tods.feature_analysis.statistical_abs_energy',
        'd3m.primitives.tods.feature_analysis.statistical_abs_sum',
        'd3m.primitives.tods.feature_analysis.statistical_h_mean',
        'd3m.primitives.tods.feature_analysis.statistical_maximum',
        'd3m.primitives.tods.feature_analysis.statistical_minimum',
        'd3m.primitives.tods.feature_analysis.statistical_mean_abs',
        'd3m.primitives.tods.feature_analysis.statistical_mean_abs_temporal_derivative',
        'd3m.primitives.tods.feature_analysis.statistical_mean_temporal_derivative',
        'd3m.primitives.tods.feature_analysis.statistical_median_abs_deviation',
        'd3m.primitives.tods.feature_analysis.statistical_kurtosis',
        'd3m.primitives.tods.feature_analysis.statistical_skew',
        'd3m.primitives.tods.feature_analysis.statistical_std',
        'd3m.primitives.tods.feature_analysis.statistical_var',
        'd3m.primitives.tods.feature_analysis.statistical_variation',
        'd3m.primitives.tods.feature_analysis.statistical_vec_sum',
        'd3m.primitives.tods.feature_analysis.statistical_willison_amplitude',
        'd3m.primitives.tods.feature_analysis.statistical_zero_crossing',
        'd3m.primitives.tods.feature_analysis.spectral_residual_transform',
        'd3m.primitives.tods.feature_analysis.fast_fourier_transform',
        'd3m.primitives.tods.feature_analysis.discrete_cosine_transform',
        'd3m.primitives.tods.feature_analysis.non_negative_matrix_factorization',
        'd3m.primitives.tods.feature_analysis.bk_filter',
        'd3m.primitives.tods.feature_analysis.hp_filter',
        'd3m.primitives.tods.feature_analysis.truncated_svd',
        'd3m.primitives.tods.feature_analysis.wavelet_transform',
        'd3m.primitives.tods.feature_analysis.trmf',
    ],
    'detection_algorithm': [
        'd3m.primitives.tods.detection_algorithm.pyod_ae',
        'd3m.primitives.tods.detection_algorithm.pyod_vae',
        'd3m.primitives.tods.detection_algorithm.pyod_cof',
        'd3m.primitives.tods.detection_algorithm.pyod_sod',
        'd3m.primitives.tods.detection_algorithm.pyod_abod',
        'd3m.primitives.tods.detection_algorithm.pyod_hbos',
        'd3m.primitives.tods.detection_algorithm.pyod_iforest',
        'd3m.primitives.tods.detection_algorithm.pyod_lof',
        'd3m.primitives.tods.detection_algorithm.pyod_knn',
        'd3m.primitives.tods.detection_algorithm.pyod_ocsvm',
        'd3m.primitives.tods.detection_algorithm.pyod_loda',
        'd3m.primitives.tods.detection_algorithm.pyod_cblof',
        'd3m.primitives.tods.detection_algorithm.pyod_sogaal',
        'd3m.primitives.tods.detection_algorithm.pyod_mogaal',
        'd3m.primitives.tods.detection_algorithm.matrix_profile',
        'd3m.primitives.tods.detection_algorithm.AutoRegODetector',
        'd3m.primitives.tods.detection_algorithm.LSTMODetector',
        'd3m.primitives.tods.detection_algorithm.PCAODetector',
        'd3m.primitives.tods.detection_algorithm.KDiscordODetector',
        'd3m.primitives.tods.detection_algorithm.deeplog',
        'd3m.primitives.tods.detection_algorithm.telemanom',
    ]
}


def _f1_rank(pipeline_result):
    if pipeline_result.status == 'COMPLETED':
        scores = pipeline_result.scores
        pipeline_result.rank = -scores['value'][0]
    return pipeline_result

def _generate_metrics():
    from d3m.metadata.problem import PerformanceMetric
    metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}]
    return metrics

def _generate_data_preparation_params():
    from axolotl.utils import schemas as schemas_utils
    data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']
    return data_preparation_params

def _generate_scoring_pipeline():
    from axolotl.utils import schemas as schemas_utils
    scoring_pipeline = schemas_utils.get_scoring_pipeline()
    return scoring_pipeline

def _generate_data_preparation_pipeline():
    from axolotl.utils import schemas as schemas_utils
    data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
    return data_preparation_pipeline

def _generate_pipeline(combinations):
    from d3m import index
    from d3m.metadata.base import ArgumentType
    from d3m.metadata.pipeline import Pipeline, PrimitiveStep

    pipelines = []
    for combination in combinations:
        # Creating pipeline
        pipeline_description = Pipeline()
        pipeline_description.add_input(name='inputs')

        # The first four steps are fixed
        # Step 0: dataset_to_dataframe
        step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'))
        step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
        step_0.add_output('produce')
        pipeline_description.add_step(step_0)

        # Step 1: column_parser
        step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
        step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
        step_1.add_output('produce')
        pipeline_description.add_step(step_1)

        # Step 2: extract_columns_by_semantic_types(attributes)
        step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
        step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
        step_2.add_output('produce')
        step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                                  data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
        pipeline_description.add_step(step_2)

        # Step 3: extract_columns_by_semantic_types(targets)
        step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
        step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
        step_3.add_output('produce')
        step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                                  data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        pipeline_description.add_step(step_3)

        attributes = 'steps.2.produce'
        targets = 'steps.3.produce'

        # Step 4: data processing primitive
        tods_step_4 = PrimitiveStep(primitive=index.get_primitive(combination[0]))
        tods_step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
        tods_step_4.add_output('produce')
        pipeline_description.add_step(tods_step_4)

        # Step 5: time series processing primitive
        tods_step_5 = PrimitiveStep(primitive=index.get_primitive(combination[1]))
        tods_step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
        tods_step_5.add_output('produce')
        pipeline_description.add_step(tods_step_5)

        # Step 6: feature analysis primitive
        tods_step_6 = PrimitiveStep(primitive=index.get_primitive(combination[2]))
        tods_step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
        tods_step_6.add_output('produce')
        pipeline_description.add_step(tods_step_6)

        # Step 7: detection algorithm primitive
        tods_step_7 = PrimitiveStep(primitive=index.get_primitive(combination[3]))
        tods_step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce')
        tods_step_7.add_output('produce')
        pipeline_description.add_step(tods_step_7)

        # Step 8: finalize the pipeline with construct_predictions
        final_step = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common'))
        final_step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.7.produce')
        final_step.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
        final_step.add_output('produce')
        pipeline_description.add_step(final_step)

        # The pipeline output is the final step's predictions
        pipeline_description.add_output(name='output predictions', data_reference='steps.8.produce')
        pipeline_description.id = str(uuid.uuid4())
        pipeline_description.created = Pipeline().created

        pipelines.append(pipeline_description)
    return pipelines

def _generate_pipelines(primitive_python_paths, cpu_count=40):
    """
    Args:
        primitive_python_paths: a dict mapping each pipeline stage to a list
            of primitive Python paths
    Returns:
        a list of candidate pipeline descriptions
    """
    import itertools
    import multiprocessing as mp

    components = ['data_processing', 'timeseries_processing', 'feature_analysis', 'detection_algorithm']
    combinations = itertools.product(*(primitive_python_paths[k] for k in components))

    # Allocate the combinations round-robin across the worker processes
    combination_each_core_list = [[] for _ in range(cpu_count)]
    for idx, combination in enumerate(combinations):
        core = idx % cpu_count
        combination_each_core_list[core].append(combination)

    # Build the pipelines in parallel and gather the results
    pool = mp.Pool(processes=cpu_count)
    results = [pool.apply_async(_generate_pipeline, args=(chunk,))
               for chunk in combination_each_core_list]
    pool.close()
    pipelines = []
    for p in results:
        pipelines.extend(p.get())
    pool.join()

    return pipelines
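
With the lists above (7 data-processing, 9 time-series-processing, 30 feature-analysis, and 21 distinct detection primitives), the full product enumerates 7 * 9 * 30 * 21 = 39,690 candidate pipelines. A quick sketch (hypothetical, not part of the commit) sanity-checks the generator on a one-primitive-per-stage space before committing to the full 40-process enumeration:

# A sketch, not part of the commit: exercise _generate_pipelines on a tiny
# search space so the full parallel enumeration is not needed.
small_space = {
    'data_processing': ['d3m.primitives.tods.data_processing.dataset_to_dataframe'],
    'timeseries_processing': ['d3m.primitives.tods.timeseries_processing.transformation.standard_scaler'],
    'feature_analysis': ['d3m.primitives.tods.feature_analysis.statistical_mean'],
    'detection_algorithm': ['d3m.primitives.tods.detection_algorithm.pyod_iforest'],
}
pipelines = _generate_pipelines(small_space, cpu_count=1)
print(len(pipelines))  # expect 1: one combination per stage yields one pipeline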
