|
@@ -24,24 +24,22 @@ Examples are available in [/examples](examples/). For basic usage, you can evalu |
|
|
import pandas as pd |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
from tods import schemas as schemas_utils |
|
|
from tods import schemas as schemas_utils |
|
|
from tods.utils import generate_dataset_problem, evaluate_pipeline |
|
|
|
|
|
|
|
|
from tods import generate_dataset, evaluate_pipeline |
|
|
|
|
|
|
|
|
table_path = 'datasets/yahoo_sub_5.csv' |
|
|
table_path = 'datasets/yahoo_sub_5.csv' |
|
|
target_index = 6 # what column is the target |
|
|
target_index = 6 # what column is the target |
|
|
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset |
|
|
|
|
|
time_limit = 30 # How many seconds you want to search
|
|
|
|
|
#metric = 'F1' # F1 on label 1 |
|
|
|
|
|
metric = 'F1_MACRO' # F1 on both label 0 and 1 |
|
|
metric = 'F1_MACRO' # F1 on both label 0 and 1 |
|
|
|
|
|
|
|
|
# Read data and generate dataset and problem |
|
|
|
|
|
|
|
|
# Read data and generate dataset |
|
|
df = pd.read_csv(table_path) |
|
|
df = pd.read_csv(table_path) |
|
|
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric) |
|
|
|
|
|
|
|
|
dataset = generate_dataset(df, target_index) |
|
|
|
|
|
|
|
|
# Load the default pipeline |
|
|
# Load the default pipeline |
|
|
pipeline = schemas_utils.load_default_pipeline() |
|
|
pipeline = schemas_utils.load_default_pipeline() |
|
|
|
|
|
|
|
|
# Run the pipeline |
|
|
# Run the pipeline |
|
|
pipeline_result = evaluate_pipeline(problem_description, dataset, pipeline) |
|
|
|
|
|
|
|
|
pipeline_result = evaluate_pipeline(dataset, pipeline, metric) |
|
|
|
|
|
print(pipeline_result) |
|
|
``` |
|
|
``` |
|
|
We also provide AutoML support to help you automatically find a good pipeline for your data.
|
|
We also provide AutoML support to help you automatically find a good pipeline for your data.
|
|
```python |
|
|
```python |
|
@@ -49,29 +47,26 @@ import pandas as pd |
|
|
|
|
|
|
|
|
from axolotl.backend.simple import SimpleRunner |
|
|
from axolotl.backend.simple import SimpleRunner |
|
|
|
|
|
|
|
|
from tods.utils import generate_dataset_problem |
|
|
|
|
|
from tods.search import BruteForceSearch |
|
|
|
|
|
|
|
|
from tods import generate_dataset, generate_problem |
|
|
|
|
|
from tods.searcher import BruteForceSearch |
|
|
|
|
|
|
|
|
# Some information |
|
|
# Some information |
|
|
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_GOOG.csv' # The path of the dataset |
|
|
|
|
|
#target_index = 2 # what column is the target |
|
|
|
|
|
|
|
|
|
|
|
table_path = 'datasets/yahoo_sub_5.csv' |
|
|
table_path = 'datasets/yahoo_sub_5.csv' |
|
|
target_index = 6 # what column is the target |
|
|
target_index = 6 # what column is the target |
|
|
#table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset |
|
|
|
|
|
time_limit = 30 # How many seconds you want to search
|
|
time_limit = 30 # How many seconds you want to search
|
|
#metric = 'F1' # F1 on label 1 |
|
|
|
|
|
metric = 'F1_MACRO' # F1 on both label 0 and 1 |
|
|
metric = 'F1_MACRO' # F1 on both label 0 and 1 |
|
|
|
|
|
|
|
|
# Read data and generate dataset and problem |
|
|
# Read data and generate dataset and problem |
|
|
df = pd.read_csv(table_path) |
|
|
df = pd.read_csv(table_path) |
|
|
dataset, problem_description = generate_dataset_problem(df, target_index=target_index, metric=metric) |
|
|
|
|
|
|
|
|
dataset = generate_dataset(df, target_index=target_index) |
|
|
|
|
|
problem_description = generate_problem(dataset, metric) |
|
|
|
|
|
|
|
|
# Start backend |
|
|
# Start backend |
|
|
backend = SimpleRunner(random_seed=0) |
|
|
backend = SimpleRunner(random_seed=0) |
|
|
|
|
|
|
|
|
# Start search algorithm |
|
|
# Start search algorithm |
|
|
search = BruteForceSearch(problem_description=problem_description, backend=backend) |
|
|
|
|
|
|
|
|
search = BruteForceSearch(problem_description=problem_description, |
|
|
|
|
|
backend=backend) |
|
|
|
|
|
|
|
|
# Find the best pipeline |
|
|
# Find the best pipeline |
|
|
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit) |
|
|
best_runtime, best_pipeline_result = search.search_fit(input_data=[dataset], time_limit=time_limit) |
|
|