import os
from shutil import copyfile

import pandas as pd

from tods_dataset_base import TODS_dataset


class kpi_dataset(TODS_dataset):
    resources = [
        # ("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz", "f68b3c2dcbeaaa9fbdd348bbdeb94873"),
        # ("https://github.com/datamllab/tods/blob/master/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv", None),
        # ("https://github.com/NetManAIOps/KPI-Anomaly-Detection/blob/master/Preliminary_dataset/train.csv", None),
        ("https://hegsns.github.io/tods_datasets/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv", None),  # needs an md5 checksum to verify the local learningData.csv against the online copy
        ("https://hegsns.github.io/tods_datasets/kpi/TRAIN/dataset_TRAIN/datasetDoc.json", None),  # needs a server to store the dataset
        # ("https://raw.githubusercontent.com/datamllab/tods/master/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv", None),
    ]

    training_file = 'learningData.csv'
    testing_file = 'testingData.csv'
    ground_truth_index = 3
    _repr_indent = 4

    # def __init__(self, root, train, transform=None, target_transform=None, download=True):
    #     super().__init__(root, train, transform=None, target_transform=None, download=True)

    def process(self) -> None:
        print('Processing...')

        os.makedirs(self.processed_folder, exist_ok=True)
        os.makedirs(os.path.join(self.processed_folder, 'tables'), exist_ok=True)

        training_set_fname = os.path.join(self.raw_folder, 'learningData.csv')
        self.training_set_dataframe = pd.read_csv(training_set_fname)
        testing_set_fname = os.path.join(self.raw_folder, 'learningData.csv')  # temporarily the same as the training set
        self.testing_set_dataframe = pd.read_csv(testing_set_fname)

        self.process_dataframe()

        self.training_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.training_file))
        self.testing_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.testing_file))
        copyfile(os.path.join(self.raw_folder, 'datasetDoc.json'),
                 os.path.join(self.processed_folder, 'datasetDoc.json'))

        print('Done!')
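# The resource tuples above pair each URL with an optional md5 checksum, which is
# currently None. A minimal sketch of how such a checksum could be verified after
# download; hashlib is in the standard library, but this helper is illustrative
# and not part of the TODS_dataset API.
import hashlib


def _check_md5(fpath: str, md5: str, chunk_size: int = 1024 * 1024) -> bool:
    # Stream the file in chunks so large CSVs need not fit in memory at once.
    digest = hashlib.md5()
    with open(fpath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest() == md5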
class yahoo_dataset(TODS_dataset):
    resources = [
        ("https://hegsns.github.io/tods_datasets/yahoo_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv", None),  # needs an md5 checksum to verify the local learningData.csv against the online copy
        ("https://hegsns.github.io/tods_datasets/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json", None),  # needs a server to store the dataset
    ]

    training_file = 'learningData.csv'
    testing_file = 'testingData.csv'
    ground_truth_index = 7
    _repr_indent = 4

    def process(self) -> None:
        print('Processing...')

        os.makedirs(self.processed_folder, exist_ok=True)
        os.makedirs(os.path.join(self.processed_folder, 'tables'), exist_ok=True)

        training_set_fname = os.path.join(self.raw_folder, 'learningData.csv')
        self.training_set_dataframe = pd.read_csv(training_set_fname)
        testing_set_fname = os.path.join(self.raw_folder, 'learningData.csv')  # temporarily the same as the training set
        self.testing_set_dataframe = pd.read_csv(testing_set_fname)

        self.process_dataframe()

        self.training_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.training_file))
        self.testing_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.testing_file))
        copyfile(os.path.join(self.raw_folder, 'datasetDoc.json'),
                 os.path.join(self.processed_folder, 'datasetDoc.json'))

        print('Done!')


class NAB_dataset(TODS_dataset):
    resources = [
        ("https://hegsns.github.io/tods_datasets/NAB/realTweets/labeled_Twitter_volume_AMZN.csv", None),  # needs an md5 checksum to verify the local CSV against the online copy
        ("https://hegsns.github.io/tods_datasets/NAB/realTweets/labeled_Twitter_volume_AMZN.json", None),  # needs a server to store the dataset
    ]

    training_file = 'learningData.csv'
    testing_file = 'testingData.csv'
    ground_truth_index = 2
    _repr_indent = 4

    def process(self) -> None:
        print('Processing...')

        os.makedirs(self.processed_folder, exist_ok=True)
        os.makedirs(os.path.join(self.processed_folder, 'tables'), exist_ok=True)

        training_set_fname = os.path.join(self.raw_folder, 'labeled_Twitter_volume_AMZN.csv')
        self.training_set_dataframe = pd.read_csv(training_set_fname)
        testing_set_fname = os.path.join(self.raw_folder, 'labeled_Twitter_volume_AMZN.csv')  # temporarily the same as the training set
        self.testing_set_dataframe = pd.read_csv(testing_set_fname)

        self.process_dataframe()

        self.training_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.training_file))
        self.testing_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.testing_file))
        copyfile(os.path.join(self.raw_folder, 'labeled_Twitter_volume_AMZN.json'),
                 os.path.join(self.processed_folder, 'datasetDoc.json'))

        print('Done!')


# kpi_dataset(root='./datasets', train=True, transform='binarize')
# yahoo_dataset(root='./datasets', train=True, transform='binarize')
# NAB_dataset(root='./datasets', train=True, transform='binarize')
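# The three process() implementations above are identical apart from the raw CSV
# and JSON filenames. A minimal deduplication sketch, assuming the raw_folder /
# processed_folder / process_dataframe contract used above; _raw_csv and _raw_doc
# are hypothetical attributes, not part of the existing TODS_dataset API.
class _csv_backed_dataset(TODS_dataset):
    _raw_csv = 'learningData.csv'
    _raw_doc = 'datasetDoc.json'
    training_file = 'learningData.csv'
    testing_file = 'testingData.csv'

    def process(self) -> None:
        print('Processing...')
        # exist_ok=True also creates the intermediate processed_folder if needed.
        os.makedirs(os.path.join(self.processed_folder, 'tables'), exist_ok=True)
        csv_path = os.path.join(self.raw_folder, self._raw_csv)
        self.training_set_dataframe = pd.read_csv(csv_path)
        self.testing_set_dataframe = pd.read_csv(csv_path)  # temporarily the same as the training set
        self.process_dataframe()
        self.training_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.training_file))
        self.testing_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.testing_file))
        copyfile(os.path.join(self.raw_folder, self._raw_doc),
                 os.path.join(self.processed_folder, 'datasetDoc.json'))
        print('Done!')

# With such a base class, each concrete dataset would only override _raw_csv,
# _raw_doc, and ground_truth_index (e.g. 'labeled_Twitter_volume_AMZN.csv' for NAB).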