import warnings
import os
import os.path
import numpy as np
import codecs
import string
import gzip
import lzma
from typing import Any, Callable, Dict, IO, List, Optional, Tuple, Union
from dataset_utils import download_url, download_and_extract_archive, extract_archive, verify_str_arg

# tqdm >= 4.31.1

from tods import generate_dataset
from sklearn import preprocessing
import pandas as pd


class TODS_dataset:
    """Base class for TODS time-series anomaly-detection datasets.

    Concrete datasets override the class attributes below and implement
    process() to build training_set_dataframe and testing_set_dataframe.
    """

    resources = []              # (url, md5) pairs fetched by download()
    training_file = None        # file name of the processed training split
    testing_file = None         # file name of the processed testing split
    ground_truth_index = None   # column index of the anomaly label
    _repr_indent = 4

    @property
    def raw_folder(self) -> str:
        return os.path.join(self.root, self.__class__.__name__, 'raw')

    @property
    def processed_folder(self) -> str:
        return os.path.join(self.root, self.__class__.__name__, 'processed')

    def __init__(self, root, train, transform=None, download=True):
        """
        Args:
            root: directory under which raw and processed files are stored.
            train: if True, expose the training split, otherwise the testing split.
            transform: optional name of an sklearn preprocessing step
                ('standardscale', 'normalize', 'minmaxscale', 'maxabsscale' or 'binarize').
            download: if True, fetch the raw archives when they are missing.
        """
        self.root = root
        self.train = train
        self.transform = self.transform_init(transform)

        if download:
            self.download()

        self.process()

    def _check_exists(self) -> bool:
        return (os.path.exists(os.path.join(self.processed_folder,
                                            self.training_file)) and
                os.path.exists(os.path.join(self.processed_folder,
                                            self.testing_file)))

    def download(self) -> None:
        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)

        # download files
        for url, md5 in self.resources:
            filename = url.rpartition('/')[2]
            download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5)

    def process(self) -> None:
        # No-op in the base class: subclasses build training_set_dataframe
        # and testing_set_dataframe here from the downloaded raw files.
        pass

    def process_dataframe(self) -> None:
        # Fit the optional sklearn transform on the training split and apply
        # it to both splits.
        if self.transform is not None:
            self.transform.fit(self.training_set_dataframe)
            self.training_set_array = self.transform.transform(self.training_set_dataframe.values)
            self.testing_set_array = self.transform.transform(self.testing_set_dataframe.values)
            self.training_set_dataframe = pd.DataFrame(self.training_set_array)
            self.testing_set_dataframe = pd.DataFrame(self.testing_set_array)

    def transform_init(self, transform_str):
        if transform_str is None:
            return None
        elif transform_str == 'standardscale':
            return preprocessing.StandardScaler()
        elif transform_str == 'normalize':
            return preprocessing.Normalizer()
        elif transform_str == 'minmaxscale':
            return preprocessing.MinMaxScaler()
        elif transform_str == 'maxabsscale':
            return preprocessing.MaxAbsScaler()
        elif transform_str == 'binarize':
            return preprocessing.Binarizer()
        else:
            raise ValueError("transform must be one of 'standardscale', 'normalize', "
                             "'minmaxscale', 'maxabsscale' or 'binarize'.")

    def to_axolotl_dataset(self):
        # Wrap the selected split in an Axolotl/D3M dataset, marking the
        # ground-truth column as the prediction target.
        if self.train:
            return generate_dataset(self.training_set_dataframe, self.ground_truth_index)
        else:
            return generate_dataset(self.testing_set_dataframe, self.ground_truth_index)

    def __repr__(self) -> str:
        head = "Dataset " + self.__class__.__name__
        body = ["Number of datapoints: {}".format(self.__len__())]
        if self.root is not None:
            body.append("Root location: {}".format(self.root))
        body += self.extra_repr().splitlines()
        if self.transform is not None:
            body += [repr(self.transform)]
        lines = [head] + [" " * self._repr_indent + line for line in body]
        return '\n'.join(lines)

    def __len__(self) -> int:
        if self.train:
            return len(self.training_set_dataframe)
        return len(self.testing_set_dataframe)

    def extra_repr(self) -> str:
        return ""


# kpi(root='./datasets', train=True)

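# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library): how a concrete dataset could
# subclass TODS_dataset. The class name, file names and column layout below
# are hypothetical; a real dataset would point `resources` at its actual
# archive URL/MD5 pairs and read the downloaded files in process().
# ---------------------------------------------------------------------------
class ExampleCSVDataset(TODS_dataset):
    resources = []                 # (url, md5) pairs; left empty in this sketch
    training_file = 'train.csv'    # hypothetical processed file names
    testing_file = 'test.csv'
    ground_truth_index = 2         # column index holding the anomaly label
    _repr_indent = 4

    def process(self) -> None:
        # A real implementation would read the raw files, e.g.
        # pd.read_csv(os.path.join(self.raw_folder, self.training_file));
        # here a tiny in-memory frame stands in for the downloaded data.
        self.training_set_dataframe = pd.DataFrame(
            {'timestamp': [0, 1, 2, 3], 'value': [1.0, 2.0, 8.0, 2.5], 'anomaly': [0, 0, 1, 0]})
        self.testing_set_dataframe = self.training_set_dataframe.copy()
        self.process_dataframe()   # apply the optional sklearn transform
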
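# Usage sketch for the hypothetical ExampleCSVDataset above; a real run would
# instantiate a concrete dataset such as kpi instead.
if __name__ == '__main__':
    dataset = ExampleCSVDataset(root='./datasets', train=True, download=False)
    print(dataset)
    # passing transform='standardscale' would additionally scale every column
    axolotl_dataset = dataset.to_axolotl_dataset()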