
tods_datasets.py (6.1 kB)

resolve construct prediction bug
4 years ago
import os
from shutil import copyfile

import pandas as pd

from tods_dataset_base import TODS_dataset


class kpi_dataset(TODS_dataset):
    resources = [
        # ("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz", "f68b3c2dcbeaaa9fbdd348bbdeb94873"),
        # ("https://github.com/datamllab/tods/blob/master/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv", None),
        # ("https://github.com/NetManAIOps/KPI-Anomaly-Detection/blob/master/Preliminary_dataset/train.csv", None),
        # TODO: add MD5 checksums so a local learningData.csv can be verified against the online copy.
        ("https://hegsns.github.io/tods_datasets/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv", None),
        ("https://hegsns.github.io/tods_datasets/kpi/TRAIN/dataset_TRAIN/datasetDoc.json", None),
        # Needs a server to host the dataset:
        # ("https://raw.githubusercontent.com/datamllab/tods/master/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv", None),
    ]
    training_file = 'learningData.csv'
    testing_file = 'testingData.csv'
    ground_truth_index = 3
    _repr_indent = 4

    # def __init__(self, root, train, transform=None, target_transform=None, download=True):
    #     super().__init__(root, train, transform=None, target_transform=None, download=True)

    def process(self) -> None:
        print('Processing...')
        os.makedirs(self.processed_folder, exist_ok=True)
        os.makedirs(os.path.join(self.processed_folder, 'tables'), exist_ok=True)

        training_set_fname = os.path.join(self.raw_folder, 'learningData.csv')
        self.training_set_dataframe = pd.read_csv(training_set_fname)
        # Temporarily reuse the training set as the testing set.
        testing_set_fname = os.path.join(self.raw_folder, 'learningData.csv')
        self.testing_set_dataframe = pd.read_csv(testing_set_fname)

        self.process_dataframe()

        self.training_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.training_file))
        self.testing_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.testing_file))
        copyfile(os.path.join(self.raw_folder, 'datasetDoc.json'),
                 os.path.join(self.processed_folder, 'datasetDoc.json'))
        print('Done!')


class yahoo_dataset(TODS_dataset):
    resources = [
        # Same commented-out alternatives as in kpi_dataset above.
        # TODO: add MD5 checksums so a local learningData.csv can be verified against the online copy.
        ("https://hegsns.github.io/tods_datasets/yahoo_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv", None),
        ("https://hegsns.github.io/tods_datasets/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json", None),
    ]
    training_file = 'learningData.csv'
    testing_file = 'testingData.csv'
    ground_truth_index = 7
    _repr_indent = 4

    def process(self) -> None:
        print('Processing...')
        os.makedirs(self.processed_folder, exist_ok=True)
        os.makedirs(os.path.join(self.processed_folder, 'tables'), exist_ok=True)

        training_set_fname = os.path.join(self.raw_folder, 'learningData.csv')
        self.training_set_dataframe = pd.read_csv(training_set_fname)
        # Temporarily reuse the training set as the testing set.
        testing_set_fname = os.path.join(self.raw_folder, 'learningData.csv')
        self.testing_set_dataframe = pd.read_csv(testing_set_fname)

        self.process_dataframe()

        self.training_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.training_file))
        self.testing_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.testing_file))
        copyfile(os.path.join(self.raw_folder, 'datasetDoc.json'),
                 os.path.join(self.processed_folder, 'datasetDoc.json'))
        print('Done!')


class NAB_dataset(TODS_dataset):
    resources = [
        # TODO: add MD5 checksums; also needs a server to host the dataset.
        ("https://hegsns.github.io/tods_datasets/NAB/realTweets/labeled_Twitter_volume_AMZN.csv", None),
        ("https://hegsns.github.io/tods_datasets/NAB/realTweets/labeled_Twitter_volume_AMZN.json", None),
    ]
    training_file = 'learningData.csv'
    testing_file = 'testingData.csv'
    ground_truth_index = 2
    _repr_indent = 4

    def process(self) -> None:
        print('Processing...')
        os.makedirs(self.processed_folder, exist_ok=True)
        os.makedirs(os.path.join(self.processed_folder, 'tables'), exist_ok=True)

        training_set_fname = os.path.join(self.raw_folder, 'labeled_Twitter_volume_AMZN.csv')
        self.training_set_dataframe = pd.read_csv(training_set_fname)
        # Temporarily reuse the training set as the testing set.
        testing_set_fname = os.path.join(self.raw_folder, 'labeled_Twitter_volume_AMZN.csv')
        self.testing_set_dataframe = pd.read_csv(testing_set_fname)

        self.process_dataframe()

        self.training_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.training_file))
        self.testing_set_dataframe.to_csv(os.path.join(self.processed_folder, 'tables', self.testing_file))
        copyfile(os.path.join(self.raw_folder, 'labeled_Twitter_volume_AMZN.json'),
                 os.path.join(self.processed_folder, 'datasetDoc.json'))
        print('Done!')


# Example usage (assumes the TODS_dataset base class downloads the raw files and calls process()):
# kpi_dataset(root='./datasets', train=True, transform='binarize')
# yahoo_dataset(root='./datasets', train=True, transform='binarize')
# NAB_dataset(root='./datasets', train=True, transform='binarize')
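
The second element of each resources tuple is reserved for an MD5 checksum but is still None everywhere; the TODO comments note that a checksum is needed to verify a local learningData.csv against its online copy. Below is a minimal sketch of such a check using only the standard library; the helper name verify_md5 and its wiring into the download step are hypothetical, not part of TODS_dataset:

import hashlib

def verify_md5(path: str, expected_md5: str, chunk_size: int = 1 << 20) -> bool:
    """Return True if the file at `path` hashes to `expected_md5`."""
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        # Read in chunks so large CSVs do not have to fit in memory.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest() == expected_md5

Once real checksums replace the None placeholders, a download routine could call verify_md5(local_path, checksum) and re-download on mismatch.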

TODS is a full-stack automated machine learning system aimed primarily at anomaly detection on multivariate time-series data. It provides comprehensive modules for building machine-learning-based anomaly detection systems: data processing, time series processing, feature analysis, detection algorithms, and a reinforcement module. Together these modules cover common data preprocessing, smoothing and transformation of time series, feature extraction in the time and frequency domains, and a wide variety of detection algorithms.
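
For context on how the tables prepared above are consumed, the sketch below follows the quick-start pattern from the TODS README. generate_dataset, evaluate_pipeline, and schemas_utils.load_default_pipeline come from the tods package, but the CSV path and target index here are placeholders standing in for the KPI table above, and exact signatures may vary across TODS versions:

import pandas as pd
from tods import generate_dataset, evaluate_pipeline
from tods import schemas as schemas_utils

# Placeholder path: one of the tables produced by kpi_dataset.process().
df = pd.read_csv('datasets/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv')

# Wrap the dataframe as a dataset; the target index is the anomaly-label
# column (ground_truth_index = 3 for the KPI dataset above).
dataset = generate_dataset(df, 3)

# Load the default detection pipeline and evaluate it, scoring macro F1.
pipeline = schemas_utils.load_default_pipeline()
pipeline_result = evaluate_pipeline(dataset, pipeline, 'F1_MACRO')
print(pipeline_result)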