Browse Source

update Automl

Former-commit-id: f329413c23 [formerly fbc9364a04] [formerly 1de9149f55 [formerly 70ce578e28]] [formerly ba7486e129 [formerly 4d0ad761b6] [formerly e0a8c9ef9a [formerly b071749536]]] [formerly 96226aa3fa [formerly 33f1fb955e] [formerly 76a0a7d068 [formerly 7ddf23f692]] [formerly c087662f91 [formerly c48bb57447] [formerly 3a19d1c697 [formerly aff8923318]]]]
Former-commit-id: 505ba89edf [formerly 6f1595ef97] [formerly 68902619de [formerly 7d427327cb]] [formerly 4949f72ed6 [formerly 99289aaf57] [formerly 3a19d1c697]]
Former-commit-id: 7d86671a3b [formerly 3756e16cc2] [formerly 3ef8be421a [formerly a7849b254d]]
Former-commit-id: cfd80b6f0d [formerly 0a4f55a634]
Former-commit-id: b245688262
master
Daochen Zha 4 years ago
parent
commit
43ff02cf86
2 changed files with 1417 additions and 20 deletions
  1. +1401
    -0
      datasets/yahoo_sub_5.csv
  2. +16
    -20
      examples/run_automl.py

+ 1401
- 0
datasets/yahoo_sub_5.csv
File diff suppressed because it is too large
View File


+ 16
- 20
examples/run_automl.py View File

@@ -1,36 +1,32 @@
"""Example: run TODS AutoML (brute-force pipeline search) on the yahoo_sub_5 dataset.

Reads a CSV time series, builds a D3M dataset/problem pair for anomaly
detection, then searches for the best pipeline within a fixed time budget.
"""
import uuid
import random
import pandas as pd
from pprint import pprint
from sklearn.datasets import make_classification

from d3m import container
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.problem import TaskKeyword, PerformanceMetric

from axolotl.utils import data_problem
from axolotl.backend.simple import SimpleRunner
from axolotl.backend.ray import RayRunner
from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils

import tods
from tods.utils import generate_dataset_problem
from tods.search import BruteForceSearch

# ---- Search configuration ----
table_path = 'datasets/yahoo_sub_5.csv'  # Path of the dataset CSV
target_index = 6                         # Index of the target (label) column
time_limit = 30                          # Search budget, in seconds
#metric = 'F1'                           # F1 on label 1 only
metric = 'F1_MACRO'                      # F1 averaged over both label 0 and 1

# Read data and generate the D3M dataset and problem description.
# NOTE: a single call with the configured target_index/metric — the earlier
# hard-coded data_problem.generate_dataset_problem(...) call was redundant
# (its result was immediately overwritten) and has been removed.
df = pd.read_csv(table_path)
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric)

# Start the (single-process) execution backend.
backend = SimpleRunner(random_seed=0)

# Start the search algorithm.
search = BruteForceSearch(problem_description=problem_description, backend=backend)

# Find the best pipeline within the configured time budget.
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit)
best_pipeline = best_runtime.pipeline
best_output = best_pipeline_result.output

# Evaluate the best pipeline found by the search.
best_scores = search.evaluate(best_pipeline).scores



Loading…
Cancel
Save