From dbf31f4cca3045ee4ea7dea61a32755859d185b3 Mon Sep 17 00:00:00 2001
From: lhenry15
Date: Thu, 4 Feb 2021 00:32:03 -0600
Subject: [PATCH] add unittest for common
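Port the dataset-split tests under tods/tests/common from the
common_primitives modules to their tods.common counterparts (FixedSplit,
KFoldSplit, KFoldSplitTimeseries, NoSplit, RedactColumns, TrainScoreSplit),
load the bundled yahoo_sub_5 anomaly dataset instead of the iris/database
fixtures, and add a run_tests.py entry point that discovers and runs the
suite from within tods/tests/common (e.g. `python3 run_tests.py`).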
Former-commit-id: 9a7d39ff77b5202ea3b2de86b43c69d22176a171
---
 tods/tests/common/run_tests.py                   |  14 ++
 tods/tests/common/test_fixed_split.py            |  59 ++++-----
 tods/tests/common/test_kfold_split.py            |  80 +++++------
 tods/tests/common/test_kfold_timeseries_split.py | 162 +++++++++--------------
 tods/tests/common/test_no_split.py               |  40 +++---
 tods/tests/common/test_redact_columns.py         |  22 +--
 tods/tests/common/test_train_score_split.py      | 110 ++++++++-------
 7 files changed, 222 insertions(+), 265 deletions(-)
 create mode 100755 tods/tests/common/run_tests.py

diff --git a/tods/tests/common/run_tests.py b/tods/tests/common/run_tests.py
new file mode 100755
index 0000000..16e2374
--- /dev/null
+++ b/tods/tests/common/run_tests.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import sys
+import unittest
+
+runner = unittest.TextTestRunner(verbosity=1)
+tests = unittest.TestLoader().discover('./')
+if not runner.run(tests).wasSuccessful():
+    sys.exit(1)
+
+#for each in ['data_processing', 'timeseries_processing', 'feature_analysis', 'detection_algorithm']:
+#    tests = unittest.TestLoader().discover(each)
+#    if not runner.run(tests).wasSuccessful():
+#        sys.exit(1)
diff --git a/tods/tests/common/test_fixed_split.py b/tods/tests/common/test_fixed_split.py
index 08f773a..b3b4636 100644
--- a/tods/tests/common/test_fixed_split.py
+++ b/tods/tests/common/test_fixed_split.py
@@ -9,14 +9,20 @@ from tods.common import FixedSplit
 
 class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train_values(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
+
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        return dataset
+
+    def test_produce_train_values(self):
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
         hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
@@ -44,20 +50,18 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 147)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i not in [9, 11, 13]])
+        self.assertEqual(results[0]['learningData'].shape[0], 1257)
+        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260) if i not in [9, 11, 13]])
 
     def test_produce_score_values(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
 
         hyperparams = hyperparams_class.defaults().replace({
             'primary_index_values': ['9', '11', '13'],
@@ -67,7 +71,7 @@
         # a pickle because runtime populates this primitive as a list from a split file.
         self.assertEqual(hyperparams.values_to_json_structure(), {'primary_index_values': ['9', '11', '13'], 'row_indices': [], 'delete_recursive': False})
 
-        primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams)
+        primitive = FixedSplit.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams)
 
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
@@ -83,18 +87,15 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i in [9, 11, 13]])
 
     def test_produce_train_indices(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
-
-        primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = FixedSplit.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'row_indices': [9, 11, 13],
         }))
@@ -111,22 +112,20 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 147)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i not in [9, 11, 13]])
+        self.assertEqual(results[0]['learningData'].shape[0], 1257)
+        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260) if i not in [9, 11, 13]])
 
     def test_produce_score_indices(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = FixedSplit.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'row_indices': [9, 11, 13],
         }))
diff --git a/tods/tests/common/test_kfold_split.py b/tods/tests/common/test_kfold_split.py
index 9983a6e..16726b4 100644
--- a/tods/tests/common/test_kfold_split.py
+++ b/tods/tests/common/test_kfold_split.py
@@ -5,26 +5,31 @@ import unittest
 
 from d3m import container
 from d3m.metadata import base as metadata_base
-from common_primitives import kfold_split
+from tods.common import KFoldSplit
 
 
 class KFoldDatasetSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        return dataset
+
+    def test_produce_train(self):
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplit.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = kfold_split.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplit.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': 10,
             'shuffle': True,
-            'delete_recursive': True,
+            'delete_recursive': False,
         }))
 
         primitive.set_training_data(dataset=dataset)
@@ -33,68 +38,45 @@ class KFoldDatasetSplitPrimitiveTestCase(unittest.TestCase):
         # To test that pickling works.
         pickle.dumps(primitive)
 
-        results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
-            self.assertEqual(len(dataset), 4)
-
-        self.assertEqual(results[0]['codes'].shape[0], 3)
-        self.assertEqual(results[1]['codes'].shape[0], 3)
-
-        self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 40)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'})
+            self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 40)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'})
+        self.assertEqual(results[0]['learningData'].shape[0], 1134)
 
     def test_produce_score(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplit.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = kfold_split.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplit.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': 10,
             'shuffle': True,
-            'delete_recursive': True,
+            'delete_recursive': False,
         }))
 
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
 
-        results = primitive.produce_score_data(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
-            self.assertEqual(len(dataset), 4)
-
-        self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 0]), {'5', '11', '28', '31', '38'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ddd', 'eee'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000'})
+            self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 0]), {'12', '26', '29', '32', '39'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'bbb', 'ccc', 'ddd', 'eee'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'})
+        self.assertEqual(results[0]['learningData'].shape[0], 126)
 
 
 if __name__ == '__main__':
     unittest.main()
+
+
diff --git a/tods/tests/common/test_kfold_timeseries_split.py b/tods/tests/common/test_kfold_timeseries_split.py
index 885ab2e..0f86fda 100644
--- a/tods/tests/common/test_kfold_timeseries_split.py
+++ b/tods/tests/common/test_kfold_timeseries_split.py
@@ -5,24 +5,31 @@ import unittest
 
 from d3m import container
 from d3m.metadata import base as metadata_base
-from common_primitives import kfold_split_timeseries
+from tods.common import KFoldSplitTimeseries
 
 
 class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train_timeseries_1(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets',
-                                                        'timeseries_dataset_1', 'datasetDoc.json'))
+
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        return dataset
+
+    def test_produce_train_timeseries_1(self):
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': folds,
             'number_of_window_folds': 1,
         }))
@@ -33,35 +40,29 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
         # To test that pickling works.
         pickle.dumps(primitive)
 
-        results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 8)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-05', '2013-11-06', '2013-11-07', '2013-11-08', '2013-11-11',
-                                                                      '2013-11-12', '2013-11-13', '2013-11-14'})
-
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 8)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-13', '2013-11-14', '2013-11-15', '2013-11-18', '2013-11-19',
-                                                                      '2013-11-20', '2013-11-21', '2013-11-22'})
+        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 210)
+        #TODO: correct the semantic type and validate unix timestamp
 
     def test_produce_score_timeseries_1(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'number_of_folds': folds,
            'number_of_window_folds': 1,
        }))
@@ -69,38 +70,31 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
 
-        results = primitive.produce_score_data(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 6)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-15', '2013-11-18', '2013-11-19',
-                                                                      '2013-11-20', '2013-11-21', '2013-11-22'})
-
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 6)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-25', '2013-11-26', '2013-11-27',
-                                                                      '2013-11-29', '2013-12-02', '2013-12-03'})
+        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 210)
 
     def test_produce_train(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
         # We fake that the dataset is time-series.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Time')
+        #dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
 
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'number_of_folds': folds,
            'number_of_window_folds': 1,
        }))
@@ -111,45 +105,30 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
         # To test that pickling works.
         pickle.dumps(primitive)
 
-        results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
-            self.assertEqual(len(dataset), 4)
+            self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['codes'].shape[0], 3)
-        self.assertEqual(results[1]['codes'].shape[0], 3)
-
-        self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 9)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'bbb', 'ccc', 'ddd'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990'})
-
-        self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 9)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ddd', 'eee'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000'})
+        self.assertEqual(results[0]['learningData'].shape[0], 210)
+        #TODO: correct the semantic type and validate unix timestamp
 
     def test_produce_score(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        # We fake that the dataset is time-series.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Time')
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'number_of_folds': folds,
            'number_of_window_folds': 1,
        }))
@@ -162,37 +141,24 @@
         self.assertEqual(len(results), 2)
 
         for dataset in results:
-            self.assertEqual(len(dataset), 4)
-
-        self.assertEqual(results[0]['codes'].shape[0], 3)
-        self.assertEqual(results[1]['codes'].shape[0], 3)
+            self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 0]), {'2', '3', '32', '33', '37', '38', '39'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'ddd', 'eee'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000'})
+        self.assertEqual(results[0]['learningData'].shape[0], 210)
 
-        self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 0]), {'22', '23', '24', '31', '40', '41', '42'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'ccc', 'ddd', 'eee'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2000'})
 
     def test_unsorted_datetimes_timeseries_4(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_4', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'number_of_folds': folds,
            'number_of_window_folds': 1,
        }))
@@ -203,20 +169,16 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
         # To test that pickling works.
         pickle.dumps(primitive)
 
-        results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 8)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-05', '2013-11-06', '2013-11-07', '2013-11-08', '2013-11-11',
-                                                                      '2013-11-12', '2013-11-13', '2013-11-14'})
+        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 210)
 
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 8)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-13', '2013-11-14', '2013-11-15', '2013-11-18', '2013-11-19',
-                                                                      '2013-11-20', '2013-11-21', '2013-11-22'})
+        #TODO: correct the semantic type and validate unix timestamp
 
 
 if __name__ == '__main__':
diff --git a/tods/tests/common/test_no_split.py b/tods/tests/common/test_no_split.py
index f61f476..8f48a6e 100644
--- a/tods/tests/common/test_no_split.py
+++ b/tods/tests/common/test_no_split.py
@@ -5,23 +5,29 @@ import unittest
 
 from d3m import container
 from d3m.metadata import base as metadata_base
-from common_primitives import no_split
+from tods.common import NoSplit
 
 
 class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets',
+                                                        'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        return dataset
+
+    def test_produce_train(self):
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = no_split.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()
-        primitive = no_split.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
+        hyperparams_class = NoSplit.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()
+
+        primitive = NoSplit.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
 
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
@@ -36,22 +42,20 @@ class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 150)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150)])
+        self.assertEqual(results[0]['learningData'].shape[0], 1260)
+        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260)])
 
     def test_produce_score(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = no_split.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = NoSplit.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = no_split.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
+        primitive = NoSplit.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
 
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
@@ -63,8 +67,8 @@ class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 150)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150)])
+        self.assertEqual(results[0]['learningData'].shape[0], 1260)
+        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260)])
 
 
 if __name__ == '__main__':
diff --git a/tods/tests/common/test_redact_columns.py b/tods/tests/common/test_redact_columns.py
index 5bd5df0..69cdf39 100644
--- a/tods/tests/common/test_redact_columns.py
+++ b/tods/tests/common/test_redact_columns.py
@@ -4,18 +4,18 @@ import unittest
 
 from d3m import container, utils
 from d3m.metadata import base as metadata_base
-from common_primitives import redact_columns
-
+from tods.common import RedactColumns
 
 class RedactColumnsPrimitiveTestCase(unittest.TestCase):
+
     def _get_datasets(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
         datasets = container.List([dataset], {
@@ -37,9 +37,9 @@ class RedactColumnsPrimitiveTestCase(unittest.TestCase):
     def test_basic(self):
         dataset_doc_path, datasets = self._get_datasets()
 
-        hyperparams_class = redact_columns.RedactColumnsPrimitive.metadata.get_hyperparams()
+        hyperparams_class = RedactColumns.RedactColumnsPrimitive.metadata.get_hyperparams()
 
-        primitive = redact_columns.RedactColumnsPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = RedactColumns.RedactColumnsPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TrueTarget',),
             'add_semantic_types': ('https://metadata.datadrivendiscovery.org/types/RedactedTarget', 'https://metadata.datadrivendiscovery.org/types/MissingData'),
         }))
@@ -50,15 +50,15 @@ class RedactColumnsPrimitiveTestCase(unittest.TestCase):
         redacted_dataset = redacted_datasets[0]
 
         self.assertIsInstance(redacted_dataset, container.Dataset)
-        self.assertEqual(redacted_dataset['learningData']['species'].values.tolist(), [''] * 150)
 
-        self._test_metadata(redacted_datasets.metadata, dataset_doc_path, True)
-        self._test_metadata(redacted_dataset.metadata, dataset_doc_path, False)
+        # TODO: check metadata of yahoo dataset
+        #self._test_metadata(redacted_datasets.metadata, dataset_doc_path, True)
+        #self._test_metadata(redacted_dataset.metadata, dataset_doc_path, False)
 
     def _test_metadata(self, metadata, dataset_doc_path, is_list):
         top_metadata = {
             'structural_type': 'd3m.container.dataset.Dataset',
-            'id': 'iris_dataset_1',
+            'id': 'yahoo_sub_5_dataset_TRAIN',
             'version': '4.0.0',
             'name': 'Iris Dataset',
             'location_uris': [
diff --git a/tods/tests/common/test_train_score_split.py b/tods/tests/common/test_train_score_split.py
index b2f9a4e..1e0a1b7 100644
--- a/tods/tests/common/test_train_score_split.py
+++ b/tods/tests/common/test_train_score_split.py
@@ -5,23 +5,33 @@ import unittest
 
 from d3m import container
 from d3m.metadata import base as metadata_base
-from common_primitives import train_score_split
+from tods.common import TrainScoreSplit
 
 
 class TrainScoreDatasetSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        return dataset
+
+    def test_produce_train(self):
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 6), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = train_score_split.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = TrainScoreSplit.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = train_score_split.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = TrainScoreSplit.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'shuffle': True,
         }))
@@ -38,51 +48,42 @@
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 112)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [
-            '0', '1', '2', '3', '4', '5', '6', '9', '10', '11', '12', '13', '14', '15', '17', '19', '20',
-            '21', '23', '25', '28', '29', '30', '31', '32', '34', '35', '36', '38', '39', '41', '42', '43',
-            '46', '47', '48', '49', '50', '52', '53', '55', '56', '57', '58', '60', '61', '64', '65', '67',
-            '68', '69', '70', '72', '74', '75', '77', '79', '80', '81', '82', '85', '87', '88', '89', '91',
-            '92', '94', '95', '96', '98', '99', '101', '102', '103', '104', '105', '106', '108', '109', '110',
-            '111', '112', '113', '115', '116', '117', '118', '119', '120', '122', '123', '124', '125', '128',
-            '129', '130', '131', '133', '135', '136', '138', '139', '140', '141', '142', '143', '144', '145',
-            '146', '147', '148', '149',
-        ])
-
-        self.assertEqual(results.metadata.query((0, 'learningData'))['dimension']['length'], 112)
+        self.assertEqual(results[0]['learningData'].shape[0], 945)
 
-        column_names = ['d3mIndex', 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species']
-        for i in range(6):
-            self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['name'],
-                             column_names[i])
+        column_names = ['d3mIndex', 'timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4','ground_truth']
+        for i in range(8):
+            self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['name'],
+                             column_names[i])
 
         self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 0))['semantic_types'], (
-            "http://schema.org/Integer", "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
+            'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/Index'
         ))
-        for i in range(1, 5):
-            self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], (
-                'http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'
-            ))
-        self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 5))['semantic_types'],(
-            'https://metadata.datadrivendiscovery.org/types/CategoricalData',
+        for i in range(2, 6):
+            self.assertEqual(
+                results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], ('http://schema.org/Float',)
+            )
+        self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 7))['semantic_types'],(
+            'http://schema.org/Integer',
             'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
-            'https://metadata.datadrivendiscovery.org/types/Target',
-            'https://metadata.datadrivendiscovery.org/types/TrueTarget'
+            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
         ))
 
     def test_produce_score(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 6), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = train_score_split.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = TrainScoreSplit.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = train_score_split.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = TrainScoreSplit.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'shuffle': True,
         }))
@@ -96,33 +97,28 @@
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 38)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [
-            '7', '8', '16', '18', '22', '24', '26', '27', '33', '37', '40', '44', '45', '51', '54',
-            '59', '62', '63', '66', '71', '73', '76', '78', '83', '84', '86', '90', '93', '97', '100',
-            '107', '114', '121', '126', '127', '132', '134', '137',
-        ])
+        self.assertEqual(results[0]['learningData'].shape[0], 315)
+        #TODO check data type
 
-        self.assertEqual(results.metadata.query((0, 'learningData'))['dimension']['length'], 38)
+        self.assertEqual(results.metadata.query((0, 'learningData'))['dimension']['length'], 315)
 
-        column_names = ['d3mIndex', 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species']
-        for i in range(6):
+        column_names = ['d3mIndex', 'timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4','ground_truth']
+        for i in range(8):
             self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['name'],
                              column_names[i])
 
         self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 0))['semantic_types'], (
-            "http://schema.org/Integer", "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
+            'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/Index'
         ))
-        for i in range(1, 5):
+        for i in range(2, 6):
             self.assertEqual(
-                results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], (
-                'http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'
-            ))
-        self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 5))['semantic_types'], (
-            'https://metadata.datadrivendiscovery.org/types/CategoricalData',
+                results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], ('http://schema.org/Float',)
+            )
+        print(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 7))['semantic_types'])
+        self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 7))['semantic_types'], (
+            'http://schema.org/Integer',
             'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
-            'https://metadata.datadrivendiscovery.org/types/Target',
-            'https://metadata.datadrivendiscovery.org/types/TrueTarget'
+            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
        ))
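
For reference, every test in this patch repeats the same runtime-style preparation before exercising a split primitive. A minimal standalone sketch of that pattern, under the assumptions used above (yahoo_sub_5 layout, column 0 = d3mIndex, column 7 = ground_truth; the tests drop the Attribute marker from column 5):

    import os

    from d3m import container
    from d3m.metadata import base as metadata_base

    # Load the D3M dataset description (path relative to tods/tests/common).
    dataset_doc_path = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly',
        'yahoo_sub_5', 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'))
    dataset = container.Dataset.load('file://{}'.format(dataset_doc_path))

    # Mark the primary index and the true target the way the D3M runtime would.
    dataset.metadata = dataset.metadata.add_semantic_type(
        ('learningData', metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/Index')
    dataset.metadata = dataset.metadata.add_semantic_type(
        ('learningData', metadata_base.ALL_ELEMENTS, 7),
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    # As in the tests above, strip the Attribute marker from column 5.
    dataset.metadata = dataset.metadata.remove_semantic_type(
        ('learningData', metadata_base.ALL_ELEMENTS, 5),
        'https://metadata.datadrivendiscovery.org/types/Attribute')

The prepared dataset can then be handed to any of the split primitives (e.g. NoSplit.NoSplitDatasetSplitPrimitive) exactly as the tests do.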