diff --git a/tods/tests/common/run_tests.py b/tods/tests/common/run_tests.py
new file mode 100755
index 0000000..16e2374
--- /dev/null
+++ b/tods/tests/common/run_tests.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import sys
+import unittest
+
+runner = unittest.TextTestRunner(verbosity=1)
+tests = unittest.TestLoader().discover('./')
+if not runner.run(tests).wasSuccessful():
+    sys.exit(1)
+
+#for each in ['data_processing', 'timeseries_processing', 'feature_analysis', 'detection_algorithm']:
+#    tests = unittest.TestLoader().discover(each)
+#    if not runner.run(tests).wasSuccessful():
+#        sys.exit(1)
diff --git a/tods/tests/common/test_fixed_split.py b/tods/tests/common/test_fixed_split.py
index 08f773a..b3b4636 100644
--- a/tods/tests/common/test_fixed_split.py
+++ b/tods/tests/common/test_fixed_split.py
@@ -9,14 +9,20 @@ from tods.common import FixedSplit
 
 
 class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train_values(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
+
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
 
+        return dataset
+
+    def test_produce_train_values(self):
+        dataset = self._get_yahoo_dataset()
+
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
         hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
@@ -44,20 +50,18 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
            self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 147)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i not in [9, 11, 13]])
+        self.assertEqual(results[0]['learningData'].shape[0], 1257)
+        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260) if i not in [9, 11, 13]])
 
     def test_produce_score_values(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
 
         hyperparams = hyperparams_class.defaults().replace({
             'primary_index_values': ['9', '11', '13'],
@@ -67,7 +71,7 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         # a pickle because runtime populates this primitive as a list from a split file.
         self.assertEqual(hyperparams.values_to_json_structure(), {'primary_index_values': ['9', '11', '13'], 'row_indices': [], 'delete_recursive': False})
 
-        primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams)
+        primitive = FixedSplit.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams)
 
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
@@ -83,18 +87,15 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i in [9, 11, 13]])
 
     def test_produce_train_indices(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
-
-        primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = FixedSplit.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'row_indices': [9, 11, 13],
         }))
@@ -111,22 +112,20 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 147)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i not in [9, 11, 13]])
+        self.assertEqual(results[0]['learningData'].shape[0], 1257)
+        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260) if i not in [9, 11, 13]])
 
     def test_produce_score_indices(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = FixedSplit.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'row_indices': [9, 11, 13],
         }))
diff --git a/tods/tests/common/test_kfold_split.py b/tods/tests/common/test_kfold_split.py
index 9983a6e..16726b4 100644
--- a/tods/tests/common/test_kfold_split.py
+++ b/tods/tests/common/test_kfold_split.py
@@ -5,26 +5,31 @@ import unittest
 
 from d3m import container
 from d3m.metadata import base as metadata_base
-from common_primitives import kfold_split
+from tods.common import KFoldSplit
 
 
 class KFoldDatasetSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
 
+        return dataset
+
+    def test_produce_train(self):
+        dataset = self._get_yahoo_dataset()
+
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplit.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = kfold_split.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplit.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': 10,
             'shuffle': True,
-            'delete_recursive': True,
+            'delete_recursive': False,
         }))
 
         primitive.set_training_data(dataset=dataset)
@@ -33,68 +38,45 @@ class KFoldDatasetSplitPrimitiveTestCase(unittest.TestCase):
         # To test that pickling works.
         pickle.dumps(primitive)
 
-        results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
-            self.assertEqual(len(dataset), 4)
-
-        self.assertEqual(results[0]['codes'].shape[0], 3)
-        self.assertEqual(results[1]['codes'].shape[0], 3)
-
-        self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 40)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'})
+            self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 40)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'})
+        self.assertEqual(results[0]['learningData'].shape[0], 1134)
 
     def test_produce_score(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplit.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = kfold_split.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplit.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': 10,
             'shuffle': True,
-            'delete_recursive': True,
+            'delete_recursive': False,
         }))
 
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
 
-        results = primitive.produce_score_data(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
-            self.assertEqual(len(dataset), 4)
-
-        self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 0]), {'5', '11', '28', '31', '38'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ddd', 'eee'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000'})
+            self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 0]), {'12', '26', '29', '32', '39'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'bbb', 'ccc', 'ddd', 'eee'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'})
+        self.assertEqual(results[0]['learningData'].shape[0], 126)
 
 
 if __name__ == '__main__':
     unittest.main()
+
+
diff --git a/tods/tests/common/test_kfold_timeseries_split.py b/tods/tests/common/test_kfold_timeseries_split.py
index 885ab2e..0f86fda 100644
--- a/tods/tests/common/test_kfold_timeseries_split.py
+++ b/tods/tests/common/test_kfold_timeseries_split.py
@@ -5,24 +5,31 @@ import unittest
 
 from d3m import container
 from d3m.metadata import base as metadata_base
-from common_primitives import kfold_split_timeseries
+from tods.common import KFoldSplitTimeseries
 
 
 class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train_timeseries_1(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json'))
+
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
 
+        return dataset
+
+    def test_produce_train_timeseries_1(self):
+        dataset = self._get_yahoo_dataset()
+
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': folds,
             'number_of_window_folds': 1,
         }))
@@ -33,35 +40,29 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
         # To test that pickling works.
         pickle.dumps(primitive)
 
-        results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 8)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-05', '2013-11-06', '2013-11-07', '2013-11-08', '2013-11-11',
-                                                                      '2013-11-12', '2013-11-13', '2013-11-14'})
-
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 8)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-13', '2013-11-14', '2013-11-15', '2013-11-18', '2013-11-19',
-                                                                      '2013-11-20', '2013-11-21', '2013-11-22'})
+        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 210)
+        # TODO: correct the semantic type and validate Unix timestamps.
 
     def test_produce_score_timeseries_1(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': folds,
             'number_of_window_folds': 1,
         }))
@@ -69,38 +70,31 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
 
-        results = primitive.produce_score_data(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 6)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-15', '2013-11-18', '2013-11-19',
-                                                                      '2013-11-20', '2013-11-21', '2013-11-22'})
-
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 6)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-25', '2013-11-26', '2013-11-27',
-                                                                      '2013-11-29', '2013-12-02', '2013-12-03'})
+        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 210)
 
     def test_produce_train(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
         # We fake that the dataset is time-series.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Time')
+        #dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
 
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': folds,
             'number_of_window_folds': 1,
         }))
@@ -111,45 +105,30 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
         # To test that pickling works.
         pickle.dumps(primitive)
 
-        results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
-            self.assertEqual(len(dataset), 4)
+            self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['codes'].shape[0], 3)
-        self.assertEqual(results[1]['codes'].shape[0], 3)
-
-        self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 9)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'bbb', 'ccc', 'ddd'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990'})
-
-        self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 9)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ddd', 'eee'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000'})
+        self.assertEqual(results[0]['learningData'].shape[0], 210)
+        # TODO: correct the semantic type and validate Unix timestamps.
 
     def test_produce_score(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        # We fake that the dataset is time-series.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Time')
 
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': folds,
             'number_of_window_folds': 1,
         }))
@@ -162,37 +141,24 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
         self.assertEqual(len(results), 2)
 
         for dataset in results:
-            self.assertEqual(len(dataset), 4)
-
-        self.assertEqual(results[0]['codes'].shape[0], 3)
-        self.assertEqual(results[1]['codes'].shape[0], 3)
+            self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 0]), {'2', '3', '32', '33', '37', '38', '39'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'ddd', 'eee'})
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000'})
+        self.assertEqual(results[0]['learningData'].shape[0], 210)
 
-        self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 0]), {'22', '23', '24', '31', '40', '41', '42'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'ccc', 'ddd', 'eee'})
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2000'})
 
     def test_unsorted_datetimes_timeseries_4(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_4', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
-        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
 
         folds = 5
-        primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'number_of_folds': folds,
             'number_of_window_folds': 1,
         }))
@@ -203,20 +169,16 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
         # To test that pickling works.
         pickle.dumps(primitive)
 
-        results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
+        results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value
 
-        self.assertEqual(len(results), 2)
+        self.assertEqual(len(results), 1)
 
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 8)
-        self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-05', '2013-11-06', '2013-11-07', '2013-11-08', '2013-11-11',
-                                                                      '2013-11-12', '2013-11-13', '2013-11-14'})
+        self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 210)
 
-        self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 8)
-        self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-13', '2013-11-14', '2013-11-15', '2013-11-18', '2013-11-19',
-                                                                      '2013-11-20', '2013-11-21', '2013-11-22'})
+        # TODO: correct the semantic type and validate Unix timestamps.
 
 
 if __name__ == '__main__':
diff --git a/tods/tests/common/test_no_split.py b/tods/tests/common/test_no_split.py
index f61f476..8f48a6e 100644
--- a/tods/tests/common/test_no_split.py
+++ b/tods/tests/common/test_no_split.py
@@ -5,23 +5,29 @@ import unittest
 
 from d3m import container
 from d3m.metadata import base as metadata_base
-from common_primitives import no_split
+from tods.common import NoSplit
 
 
 class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
 
+        return dataset
+
+    def test_produce_train(self):
+        dataset = self._get_yahoo_dataset()
+
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = no_split.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()
-        primitive = no_split.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
+        hyperparams_class = NoSplit.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()
+
+        primitive = NoSplit.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
 
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
@@ -36,22 +42,20 @@ class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 150)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150)])
+        self.assertEqual(results[0]['learningData'].shape[0], 1260)
+        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260)])
 
     def test_produce_score(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = no_split.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = NoSplit.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = no_split.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
+        primitive = NoSplit.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
 
         primitive.set_training_data(dataset=dataset)
         primitive.fit()
@@ -63,8 +67,8 @@ class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 150)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150)])
+        self.assertEqual(results[0]['learningData'].shape[0], 1260)
+        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260)])
 
 
 if __name__ == '__main__':
diff --git a/tods/tests/common/test_redact_columns.py b/tods/tests/common/test_redact_columns.py
index 5bd5df0..69cdf39 100644
--- a/tods/tests/common/test_redact_columns.py
+++ b/tods/tests/common/test_redact_columns.py
@@ -4,18 +4,18 @@ import unittest
 
 from d3m import container, utils
 from d3m.metadata import base as metadata_base
-from common_primitives import redact_columns
-
+from tods.common import RedactColumns
 
 class RedactColumnsPrimitiveTestCase(unittest.TestCase):
+
     def _get_datasets(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
         datasets = container.List([dataset], {
@@ -37,9 +37,9 @@ class RedactColumnsPrimitiveTestCase(unittest.TestCase):
     def test_basic(self):
         dataset_doc_path, datasets = self._get_datasets()
 
-        hyperparams_class = redact_columns.RedactColumnsPrimitive.metadata.get_hyperparams()
+        hyperparams_class = RedactColumns.RedactColumnsPrimitive.metadata.get_hyperparams()
 
-        primitive = redact_columns.RedactColumnsPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = RedactColumns.RedactColumnsPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TrueTarget',),
             'add_semantic_types': ('https://metadata.datadrivendiscovery.org/types/RedactedTarget', 'https://metadata.datadrivendiscovery.org/types/MissingData'),
         }))
@@ -50,15 +50,15 @@ class RedactColumnsPrimitiveTestCase(unittest.TestCase):
         redacted_dataset = redacted_datasets[0]
 
         self.assertIsInstance(redacted_dataset, container.Dataset)
 
-        self.assertEqual(redacted_dataset['learningData']['species'].values.tolist(), [''] * 150)
-        self._test_metadata(redacted_datasets.metadata, dataset_doc_path, True)
-        self._test_metadata(redacted_dataset.metadata, dataset_doc_path, False)
+        # TODO: check metadata of the Yahoo dataset.
+        #self._test_metadata(redacted_datasets.metadata, dataset_doc_path, True)
+        #self._test_metadata(redacted_dataset.metadata, dataset_doc_path, False)
 
     def _test_metadata(self, metadata, dataset_doc_path, is_list):
         top_metadata = {
             'structural_type': 'd3m.container.dataset.Dataset',
-            'id': 'iris_dataset_1',
+            'id': 'yahoo_sub_5_dataset_TRAIN',
             'version': '4.0.0',
             'name': 'Iris Dataset',
             'location_uris': [
diff --git a/tods/tests/common/test_train_score_split.py b/tods/tests/common/test_train_score_split.py
index b2f9a4e..1e0a1b7 100644
--- a/tods/tests/common/test_train_score_split.py
+++ b/tods/tests/common/test_train_score_split.py
@@ -5,23 +5,33 @@ import unittest
 
 from d3m import container
 from d3m.metadata import base as metadata_base
-from common_primitives import train_score_split
+from tods.common import TrainScoreSplit
 
 
 class TrainScoreDatasetSplitPrimitiveTestCase(unittest.TestCase):
-    def test_produce_train(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
+    def _get_yahoo_dataset(self):
+        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'))
 
         dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
 
+        return dataset
+
+    def test_produce_train(self):
+        dataset = self._get_yahoo_dataset()
+
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 6), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = train_score_split.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = TrainScoreSplit.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = train_score_split.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = TrainScoreSplit.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'shuffle': True,
         }))
@@ -38,51 +48,42 @@ class TrainScoreDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 112)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [
-            '0', '1', '2', '3', '4', '5', '6', '9', '10', '11', '12', '13', '14', '15', '17', '19', '20',
-            '21', '23', '25', '28', '29', '30', '31', '32', '34', '35', '36', '38', '39', '41', '42', '43',
-            '46', '47', '48', '49', '50', '52', '53', '55', '56', '57', '58', '60', '61', '64', '65', '67',
-            '68', '69', '70', '72', '74', '75', '77', '79', '80', '81', '82', '85', '87', '88', '89', '91',
-            '92', '94', '95', '96', '98', '99', '101', '102', '103', '104', '105', '106', '108', '109', '110',
-            '111', '112', '113', '115', '116', '117', '118', '119', '120', '122', '123', '124', '125', '128',
-            '129', '130', '131', '133', '135', '136', '138', '139', '140', '141', '142', '143', '144', '145',
-            '146', '147', '148', '149',
-        ])
-
-        self.assertEqual(results.metadata.query((0, 'learningData'))['dimension']['length'], 112)
+        self.assertEqual(results[0]['learningData'].shape[0], 945)
 
-        column_names = ['d3mIndex', 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species']
-        for i in range(6):
-            self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['name'], column_names[i])
+        column_names = ['d3mIndex', 'timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4', 'ground_truth']
+        for i in range(8):
+            self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['name'],
+                             column_names[i])
 
         self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 0))['semantic_types'], (
-            "http://schema.org/Integer", "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
+            'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/Index'
        ))
-        for i in range(1, 5):
-            self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], (
-                'http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'
-            ))
-        self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 5))['semantic_types'],(
-            'https://metadata.datadrivendiscovery.org/types/CategoricalData',
+        for i in range(2, 6):
+            self.assertEqual(
+                results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], ('http://schema.org/Float',)
+            )
+        self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 7))['semantic_types'], (
+            'http://schema.org/Integer',
             'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
-            'https://metadata.datadrivendiscovery.org/types/Target',
-            'https://metadata.datadrivendiscovery.org/types/TrueTarget'
+            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
         ))
 
     def test_produce_score(self):
-        dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
-
-        dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+        dataset = self._get_yahoo_dataset()
 
         # We set semantic types like runtime would.
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
-        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
+        dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
         dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 6), 'https://metadata.datadrivendiscovery.org/types/Attribute')
+        dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/Attribute')
 
-        hyperparams_class = train_score_split.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
+        hyperparams_class = TrainScoreSplit.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
 
-        primitive = train_score_split.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
+        primitive = TrainScoreSplit.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
             'shuffle': True,
         }))
@@ -96,33 +97,27 @@ class TrainScoreDatasetSplitPrimitiveTestCase(unittest.TestCase):
         for dataset in results:
             self.assertEqual(len(dataset), 1)
 
-        self.assertEqual(results[0]['learningData'].shape[0], 38)
-        self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [
-            '7', '8', '16', '18', '22', '24', '26', '27', '33', '37', '40', '44', '45', '51', '54',
-            '59', '62', '63', '66', '71', '73', '76', '78', '83', '84', '86', '90', '93', '97', '100',
-            '107', '114', '121', '126', '127', '132', '134', '137',
-        ])
+        self.assertEqual(results[0]['learningData'].shape[0], 315)
+        # TODO: check data type
 
-        self.assertEqual(results.metadata.query((0, 'learningData'))['dimension']['length'], 38)
+        self.assertEqual(results.metadata.query((0, 'learningData'))['dimension']['length'], 315)
 
-        column_names = ['d3mIndex', 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species']
-        for i in range(6):
+        column_names = ['d3mIndex', 'timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4', 'ground_truth']
+        for i in range(8):
             self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['name'], column_names[i])
 
         self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 0))['semantic_types'], (
-            "http://schema.org/Integer", "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
+            'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/Index'
         ))
-        for i in range(1, 5):
+        for i in range(2, 6):
             self.assertEqual(
-                results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], (
-                    'http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'
-                ))
-        self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 5))['semantic_types'], (
-            'https://metadata.datadrivendiscovery.org/types/CategoricalData',
+                results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], ('http://schema.org/Float',)
+            )
+        self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 7))['semantic_types'], (
+            'http://schema.org/Integer',
             'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
-            'https://metadata.datadrivendiscovery.org/types/Target',
-            'https://metadata.datadrivendiscovery.org/types/TrueTarget'
+            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
        ))