
Add unit tests for common

master · lhenry15 · 4 years ago · commit dbf31f4cca
7 changed files with 222 additions and 265 deletions
1. tods/tests/common/run_tests.py (+14 -0)
2. tods/tests/common/test_fixed_split.py (+29 -30)
3. tods/tests/common/test_kfold_split.py (+31 -49)
4. tods/tests/common/test_kfold_timeseries_split.py (+62 -100)
5. tods/tests/common/test_no_split.py (+22 -18)
6. tods/tests/common/test_redact_columns.py (+11 -11)
7. tods/tests/common/test_train_score_split.py (+53 -57)

tods/tests/common/run_tests.py (+14 -0)

@@ -0,0 +1,14 @@
#!/usr/bin/env python3

import sys
import unittest

runner = unittest.TextTestRunner(verbosity=1)
tests = unittest.TestLoader().discover('./')
if not runner.run(tests).wasSuccessful():
    sys.exit(1)

#for each in ['data_processing', 'timeseries_processing', 'feature_analysis', 'detection_algorithm']:
#    tests = unittest.TestLoader().discover(each)
#    if not runner.run(tests).wasSuccessful():
#        sys.exit(1)
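For reference, a minimal way to exercise this runner (a sketch, assuming the file lives at tods/tests/common/run_tests.py as listed above):

# From the repository root:
#   cd tods/tests/common && python3 run_tests.py
# Or let unittest do the discovery itself:
#   python3 -m unittest discover -s tods/tests/common -v
# The runner exits with status 1 on any failure, so it can gate CI jobs.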

tods/tests/common/test_fixed_split.py (+29 -30)

@@ -9,14 +9,20 @@ from tods.common import FixedSplit


class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
def test_produce_train_values(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))

def _get_yahoo_dataset(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))

return dataset

def test_produce_train_values(self):
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
@@ -44,20 +50,18 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
for dataset in results:
self.assertEqual(len(dataset), 1)

self.assertEqual(results[0]['learningData'].shape[0], 147)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i not in [9, 11, 13]])
self.assertEqual(results[0]['learningData'].shape[0], 1257)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260) if i not in [9, 11, 13]])

def test_produce_score_values(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()

hyperparams = hyperparams_class.defaults().replace({
'primary_index_values': ['9', '11', '13'],
@@ -67,7 +71,7 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
# a pickle because runtime populates this primitive as a list from a split file.
self.assertEqual(hyperparams.values_to_json_structure(), {'primary_index_values': ['9', '11', '13'], 'row_indices': [], 'delete_recursive': False})

primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams)
primitive = FixedSplit.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams)

primitive.set_training_data(dataset=dataset)
primitive.fit()
@@ -83,18 +87,15 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i in [9, 11, 13]])

def test_produce_train_indices(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()

hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()

primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = FixedSplit.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'row_indices': [9, 11, 13],
}))

@@ -111,22 +112,20 @@ class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
for dataset in results:
self.assertEqual(len(dataset), 1)

self.assertEqual(results[0]['learningData'].shape[0], 147)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i not in [9, 11, 13]])
self.assertEqual(results[0]['learningData'].shape[0], 1257)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260) if i not in [9, 11, 13]])

def test_produce_score_indices(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = FixedSplit.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams()

primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = FixedSplit.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'row_indices': [9, 11, 13],
}))



tods/tests/common/test_kfold_split.py (+31 -49)

@@ -5,26 +5,31 @@ import unittest
from d3m import container
from d3m.metadata import base as metadata_base

from common_primitives import kfold_split
from tods.common import KFoldSplit


class KFoldDatasetSplitPrimitiveTestCase(unittest.TestCase):
def test_produce_train(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))
def _get_yahoo_dataset(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))

return dataset

def test_produce_train(self):
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = kfold_split.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = KFoldSplit.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()

primitive = kfold_split.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = KFoldSplit.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'number_of_folds': 10,
'shuffle': True,
'delete_recursive': True,
'delete_recursive': False,
}))

primitive.set_training_data(dataset=dataset)
@@ -33,68 +38,45 @@ class KFoldDatasetSplitPrimitiveTestCase(unittest.TestCase):
# To test that pickling works.
pickle.dumps(primitive)

results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value

self.assertEqual(len(results), 2)
self.assertEqual(len(results), 1)

for dataset in results:
self.assertEqual(len(dataset), 4)

self.assertEqual(results[0]['codes'].shape[0], 3)
self.assertEqual(results[1]['codes'].shape[0], 3)

self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 40)
self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'})
self.assertEqual(len(dataset), 1)

self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 40)
self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'})
self.assertEqual(results[0]['learningData'].shape[0], 1134)

def test_produce_score(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = kfold_split.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = KFoldSplit.KFoldDatasetSplitPrimitive.metadata.get_hyperparams()

primitive = kfold_split.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = KFoldSplit.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'number_of_folds': 10,
'shuffle': True,
'delete_recursive': True,
'delete_recursive': False,
}))

primitive.set_training_data(dataset=dataset)
primitive.fit()

results = primitive.produce_score_data(inputs=container.List([0, 1], generate_metadata=True)).value
results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value

self.assertEqual(len(results), 2)
self.assertEqual(len(results), 1)

for dataset in results:
self.assertEqual(len(dataset), 4)

self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 0]), {'5', '11', '28', '31', '38'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ddd', 'eee'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000'})
self.assertEqual(len(dataset), 1)

self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'BBB', 'CCC'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 0]), {'12', '26', '29', '32', '39'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'BBB', 'CCC'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'bbb', 'ccc', 'ddd', 'eee'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'})
self.assertEqual(results[0]['learningData'].shape[0], 126)


if __name__ == '__main__':
    unittest.main()
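The expected sizes follow directly from the fold count: with 1260 rows and number_of_folds = 10, each training split keeps 1260 × 9/10 = 1134 rows and each score split holds 1260 / 10 = 126, which is exactly what the assertions above check.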



tods/tests/common/test_kfold_timeseries_split.py (+62 -100)

@@ -5,24 +5,31 @@ import unittest
from d3m import container
from d3m.metadata import base as metadata_base

from common_primitives import kfold_split_timeseries
from tods.common import KFoldSplitTimeseries


class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
def test_produce_train_timeseries_1(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json'))

def _get_yahoo_dataset(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))

return dataset

def test_produce_train_timeseries_1(self):
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()

folds = 5
primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'number_of_folds': folds,
'number_of_window_folds': 1,
}))
@@ -33,35 +40,29 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
# To test that pickling works.
pickle.dumps(primitive)

results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value

self.assertEqual(len(results), 2)
self.assertEqual(len(results), 1)

for dataset in results:
self.assertEqual(len(dataset), 1)

self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 8)
self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-05', '2013-11-06', '2013-11-07', '2013-11-08', '2013-11-11',
'2013-11-12', '2013-11-13', '2013-11-14'})

self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 8)
self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-13', '2013-11-14', '2013-11-15', '2013-11-18', '2013-11-19',
'2013-11-20', '2013-11-21', '2013-11-22'})
self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 210)
#TODO: correct the semantic type and validate unix timestamp

def test_produce_score_timeseries_1(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()

folds = 5
primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'number_of_folds': folds,
'number_of_window_folds': 1,
}))
@@ -69,38 +70,31 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
primitive.set_training_data(dataset=dataset)
primitive.fit()

results = primitive.produce_score_data(inputs=container.List([0, 1], generate_metadata=True)).value
results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value

self.assertEqual(len(results), 2)
self.assertEqual(len(results), 1)

for dataset in results:
self.assertEqual(len(dataset), 1)

self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 6)
self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-15', '2013-11-18', '2013-11-19',
'2013-11-20', '2013-11-21', '2013-11-22'})

self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 6)
self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-25', '2013-11-26', '2013-11-27',
'2013-11-29', '2013-12-02', '2013-12-03'})
self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 210)

def test_produce_train(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

# We fake that the dataset is time-series.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Time')
#dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')

hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()

folds = 5
primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'number_of_folds': folds,
'number_of_window_folds': 1,
}))
@@ -111,45 +105,30 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
# To test that pickling works.
pickle.dumps(primitive)

results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value

self.assertEqual(len(results), 2)
self.assertEqual(len(results), 1)

for dataset in results:
self.assertEqual(len(dataset), 4)
self.assertEqual(len(dataset), 1)

self.assertEqual(results[0]['codes'].shape[0], 3)
self.assertEqual(results[1]['codes'].shape[0], 3)

self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 9)
self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'bbb', 'ccc', 'ddd'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990'})

self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 9)
self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ddd', 'eee'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000'})
self.assertEqual(results[0]['learningData'].shape[0], 210)
#TODO: correct the semantic type and validate unix timestamp

def test_produce_score(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

# We fake that the dataset is time-series.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Time')

hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()

folds = 5
primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'number_of_folds': folds,
'number_of_window_folds': 1,
}))
@@ -162,37 +141,24 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
self.assertEqual(len(results), 2)

for dataset in results:
self.assertEqual(len(dataset), 4)

self.assertEqual(results[0]['codes'].shape[0], 3)
self.assertEqual(results[1]['codes'].shape[0], 3)
self.assertEqual(len(dataset), 1)

self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 0]), {'2', '3', '32', '33', '37', '38', '39'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'ddd', 'eee'})
self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000'})
self.assertEqual(results[0]['learningData'].shape[0], 210)

self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 0]), {'22', '23', '24', '31', '40', '41', '42'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'ccc', 'ddd', 'eee'})
self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2000'})

def test_unsorted_datetimes_timeseries_4(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_4', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Time')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams()

folds = 5
primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = KFoldSplitTimeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'number_of_folds': folds,
'number_of_window_folds': 1,
}))
@@ -203,20 +169,16 @@ class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase):
# To test that pickling works.
pickle.dumps(primitive)

results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value
results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value

self.assertEqual(len(results), 2)
self.assertEqual(len(results), 1)

for dataset in results:
self.assertEqual(len(dataset), 1)

self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 8)
self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-05', '2013-11-06', '2013-11-07', '2013-11-08', '2013-11-11',
'2013-11-12', '2013-11-13', '2013-11-14'})
self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 210)

self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 8)
self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-13', '2013-11-14', '2013-11-15', '2013-11-18', '2013-11-19',
'2013-11-20', '2013-11-21', '2013-11-22'})
#TODO: correct the semantic type and validate unix timestamp


if __name__ == '__main__':
    unittest.main()
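The repeated 210 above has a simple origin, assuming the primitive delegates to scikit-learn's TimeSeriesSplit as the common_primitives version does: with number_of_folds = 5 each window holds 1260 // (5 + 1) = 210 rows, and number_of_window_folds = 1 keeps a single such window per fold for both the train and the score splits.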


tods/tests/common/test_no_split.py (+22 -18)

@@ -5,23 +5,29 @@ import unittest
from d3m import container
from d3m.metadata import base as metadata_base

from common_primitives import no_split
from tods.common import NoSplit


class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
def test_produce_train(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
def _get_yahoo_dataset(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))

return dataset

def test_produce_train(self):
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = no_split.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()

primitive = no_split.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
hyperparams_class = NoSplit.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()

primitive = NoSplit.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())

primitive.set_training_data(dataset=dataset)
primitive.fit()
@@ -36,22 +42,20 @@ class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
for dataset in results:
self.assertEqual(len(dataset), 1)

self.assertEqual(results[0]['learningData'].shape[0], 150)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150)])
self.assertEqual(results[0]['learningData'].shape[0], 1260)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260)])

def test_produce_score(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = no_split.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = NoSplit.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams()

primitive = no_split.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())
primitive = NoSplit.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults())

primitive.set_training_data(dataset=dataset)
primitive.fit()
@@ -63,8 +67,8 @@ class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase):
for dataset in results:
self.assertEqual(len(dataset), 1)

self.assertEqual(results[0]['learningData'].shape[0], 150)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150)])
self.assertEqual(results[0]['learningData'].shape[0], 1260)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(1260)])


if __name__ == '__main__':
    unittest.main()


tods/tests/common/test_redact_columns.py (+11 -11)

@@ -4,18 +4,18 @@ import unittest
from d3m import container, utils
from d3m.metadata import base as metadata_base

from common_primitives import redact_columns

from tods.common import RedactColumns

class RedactColumnsPrimitiveTestCase(unittest.TestCase):

def _get_datasets(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')

datasets = container.List([dataset], {
@@ -37,9 +37,9 @@ class RedactColumnsPrimitiveTestCase(unittest.TestCase):
def test_basic(self):
dataset_doc_path, datasets = self._get_datasets()

hyperparams_class = redact_columns.RedactColumnsPrimitive.metadata.get_hyperparams()
hyperparams_class = RedactColumns.RedactColumnsPrimitive.metadata.get_hyperparams()

primitive = redact_columns.RedactColumnsPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = RedactColumns.RedactColumnsPrimitive(hyperparams=hyperparams_class.defaults().replace({
'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TrueTarget',),
'add_semantic_types': ('https://metadata.datadrivendiscovery.org/types/RedactedTarget', 'https://metadata.datadrivendiscovery.org/types/MissingData'),
}))
@@ -50,15 +50,15 @@ class RedactColumnsPrimitiveTestCase(unittest.TestCase):
redacted_dataset = redacted_datasets[0]

self.assertIsInstance(redacted_dataset, container.Dataset)
self.assertEqual(redacted_dataset['learningData']['species'].values.tolist(), [''] * 150)

self._test_metadata(redacted_datasets.metadata, dataset_doc_path, True)
self._test_metadata(redacted_dataset.metadata, dataset_doc_path, False)
# TODO: check metadata of yahoo dataset
#self._test_metadata(redacted_datasets.metadata, dataset_doc_path, True)
#self._test_metadata(redacted_dataset.metadata, dataset_doc_path, False)

def _test_metadata(self, metadata, dataset_doc_path, is_list):
top_metadata = {
'structural_type': 'd3m.container.dataset.Dataset',
'id': 'iris_dataset_1',
'id': 'yahoo_sub_5_dataset_TRAIN',
'version': '4.0.0',
'name': 'Iris Dataset',
'location_uris': [


tods/tests/common/test_train_score_split.py (+53 -57)

@@ -5,23 +5,33 @@ import unittest
from d3m import container
from d3m.metadata import base as metadata_base

from common_primitives import train_score_split
from tods.common import TrainScoreSplit


class TrainScoreDatasetSplitPrimitiveTestCase(unittest.TestCase):
def test_produce_train(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))
def _get_yahoo_dataset(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))

return dataset

def test_produce_train(self):
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 6), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = train_score_split.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = TrainScoreSplit.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()

primitive = train_score_split.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = TrainScoreSplit.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'shuffle': True,
}))

@@ -38,51 +48,42 @@ class TrainScoreDatasetSplitPrimitiveTestCase(unittest.TestCase):
for dataset in results:
self.assertEqual(len(dataset), 1)

self.assertEqual(results[0]['learningData'].shape[0], 112)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [
'0', '1', '2', '3', '4', '5', '6', '9', '10', '11', '12', '13', '14', '15', '17', '19', '20',
'21', '23', '25', '28', '29', '30', '31', '32', '34', '35', '36', '38', '39', '41', '42', '43',
'46', '47', '48', '49', '50', '52', '53', '55', '56', '57', '58', '60', '61', '64', '65', '67',
'68', '69', '70', '72', '74', '75', '77', '79', '80', '81', '82', '85', '87', '88', '89', '91',
'92', '94', '95', '96', '98', '99', '101', '102', '103', '104', '105', '106', '108', '109', '110',
'111', '112', '113', '115', '116', '117', '118', '119', '120', '122', '123', '124', '125', '128',
'129', '130', '131', '133', '135', '136', '138', '139', '140', '141', '142', '143', '144', '145',
'146', '147', '148', '149',
])

self.assertEqual(results.metadata.query((0, 'learningData'))['dimension']['length'], 112)
self.assertEqual(results[0]['learningData'].shape[0], 945)

column_names = ['d3mIndex', 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species']
for i in range(6):
self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['name'], column_names[i])
column_names = ['d3mIndex', 'timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4','ground_truth']
for i in range(8):
self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['name'],
column_names[i])

self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 0))['semantic_types'], (
"http://schema.org/Integer", "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/Index'
))
for i in range(1, 5):
self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], (
'http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'
))
self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 5))['semantic_types'],(
'https://metadata.datadrivendiscovery.org/types/CategoricalData',
for i in range(2, 6):
self.assertEqual(
results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], ('http://schema.org/Float',)
)
self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 7))['semantic_types'],(
'http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
'https://metadata.datadrivendiscovery.org/types/Target',
'https://metadata.datadrivendiscovery.org/types/TrueTarget'
'https://metadata.datadrivendiscovery.org/types/TrueTarget',
))

def test_produce_score(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
dataset = self._get_yahoo_dataset()

# We set semantic types like runtime would.
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Index')
dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 6), 'https://metadata.datadrivendiscovery.org/types/Attribute')
dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 7), 'https://metadata.datadrivendiscovery.org/types/Attribute')

hyperparams_class = train_score_split.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()
hyperparams_class = TrainScoreSplit.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams()

primitive = train_score_split.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
primitive = TrainScoreSplit.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({
'shuffle': True,
}))

@@ -96,33 +97,28 @@ class TrainScoreDatasetSplitPrimitiveTestCase(unittest.TestCase):
for dataset in results:
self.assertEqual(len(dataset), 1)

self.assertEqual(results[0]['learningData'].shape[0], 38)
self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [
'7', '8', '16', '18', '22', '24', '26', '27', '33', '37', '40', '44', '45', '51', '54',
'59', '62', '63', '66', '71', '73', '76', '78', '83', '84', '86', '90', '93', '97', '100',
'107', '114', '121', '126', '127', '132', '134', '137',
])
self.assertEqual(results[0]['learningData'].shape[0], 315)
#TODO check data type

self.assertEqual(results.metadata.query((0, 'learningData'))['dimension']['length'], 38)
self.assertEqual(results.metadata.query((0, 'learningData'))['dimension']['length'], 315)

column_names = ['d3mIndex', 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species']
for i in range(6):
column_names = ['d3mIndex', 'timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4','ground_truth']
for i in range(8):
self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['name'],
column_names[i])

self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 0))['semantic_types'], (
"http://schema.org/Integer", "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/Index'
))
for i in range(1, 5):
for i in range(2, 6):
self.assertEqual(
results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], (
'http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'
))
self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 5))['semantic_types'], (
'https://metadata.datadrivendiscovery.org/types/CategoricalData',
results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, i))['semantic_types'], ('http://schema.org/Float',)
)
print(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 7))['semantic_types'])
self.assertEqual(results.metadata.query((0, 'learningData', metadata_base.ALL_ELEMENTS, 7))['semantic_types'], (
'http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
'https://metadata.datadrivendiscovery.org/types/Target',
'https://metadata.datadrivendiscovery.org/types/TrueTarget'
'https://metadata.datadrivendiscovery.org/types/TrueTarget',
))
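The row counts encode the split ratio: 945 = 0.75 × 1260 training rows and 315 = 0.25 × 1260 score rows, i.e. the primitive's default 75/25 train/score split applied to the 1260-row yahoo_sub_5 dataset.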



