From cbb0fd3493cac75b000dfa7bc177d21730006a08 Mon Sep 17 00:00:00 2001 From: Devesh Kumar Date: Wed, 11 Nov 2020 06:40:36 -0600 Subject: [PATCH] coverage increased Former-commit-id: c1a8fba17f9913adce98fe50dc59591adfe995e7 [formerly 1d35c822aac3d7a64e3b9ffa2609f070be7e74c8] [formerly 8709aad1c08635eb81e135edfc93016413863c22 [formerly e974eac168737d9ccae3c90e31186014a54c3c37]] [formerly 6806a629533c2ee79f411937411e075476da8aa7 [formerly 03bae6573ccd69399b5d7afccb8d6776741754a9] [formerly 7ba6fdaf792bc63fc04728bfe885d2a92f56594f [formerly 0d29eaab2560ef1c927f75c89eefe486f0da5366]]] [formerly 7dd42166bca6074d39dd7ae99e8d1adf191f45b6 [formerly 2da3f0610bc6a7529ad7f4a1df05f385f546780c] [formerly 025058208746dd7f7603d921fdd7e01a25860f04 [formerly ef3fd32d6dcc2fb8dec4b691290e10732ca49635]] [formerly a767db31d85df997746ce6687bb1098b4e3062b8 [formerly 7f909eb9cfd5ad5387179fba5f0a4d73cd0ff8cf] [formerly a698f1b270fbed8e6a47871dc0a52d3d699790f1 [formerly b841fc53db5a4f941e78da7156969f1533833d27]]]] [formerly 10c3eae512ab8fd81cbe10fdc72020c8e962c3c1 [formerly 210eebb342986eac2b93e87744af20486ca10118] [formerly 70bf2bc6c58671b7a3fbcd5ebd930b87f6976fbd [formerly 305ab36b11be3dc299d8f3787e5e9a58745d9969]] [formerly f70e4d2a27d79081d5d35250275ee497f6e0f28d [formerly 8bd51e3ddc350e4cda8cb0fceb0ba499b128cccb] [formerly bef7cf1b908901cd7b4d7b9dcf9cb2f5baec60c5 [formerly a8968a2fe2d899020a595235223b78db1eeef43d]]] [formerly 39e63b5cbd7d8772279f3215d4fbf9de2f4ecb4f [formerly 4978e2788675df5c3ea16928e7bd5d6490147220] [formerly 1acffe2e1504080e77a375330ffc752ae1df9f6e [formerly 2165c692a604dae123273bb959322ad675d819de]] [formerly d2d8058c7216aa2455acab9db155c5d4c143a09f [formerly 20356322aad9a4fb5e9198bef5c407b6efb81ef3] [formerly ef4e103f18863bb9d60f84f5755ed68e5df31c25 [formerly ecdec2584facae6d591f4a1d72ff15dd140e24dd]]]]] [formerly f0a509598734e778f4feee36283218194e693b70 [formerly 2a02694321f15c9832af862b07b395e6a0dff7f4] [formerly a2e7e078c22c057c9036415f7620ccef0cf4c8ac [formerly c67ddb0319cf378f7cd1f77207c15aec95be1500]] [formerly 748fe2cc3e5ccbdad1be47c7562970abad3f74f1 [formerly 75656de7a34f91d3c43a44c32f44b8d671bce9d1] [formerly 877ef24a85e0ba4e31bc288e61d55245156fd6f8 [formerly 4bf1058d4ae718198983015bb028783ce96cda64]]] [formerly 5a3bc14f0d095a4c30a72800d397d0accb9b6fcb [formerly c27e28104bbda76c8dc0cb48684e435c81c52f90] [formerly 75dcc89db65604a72152c87cde51951b0888059e [formerly 5fc3bf21b1511a682dcfa6f11c9d3343148fe2f5]] [formerly 338154b3306b24461df582be2875ad046f3983e3 [formerly 42605af666f5d7fa2b403da39f2b0fe7fe86d92e] [formerly a5ab7108bc373a71bcb46c3b04572c1ab66caba1 [formerly 200858728c6173edfe3635a59dc9f9a4462a3165]]]] [formerly a6736b478fb22d773eccaf1d58cd5053c2085ecf [formerly 1bf1228edd3776ff8e8c5b53f4940cb932b066d6] [formerly 5c320ae2f13d316bc4deef8f278c5f803a19b78d [formerly d1ddae6dc911263c92fef421d1d5d0ebceaf019c]] [formerly 35a94cd5d28e1971dbd33e9fa826fab6965e247b [formerly 3367b5869237a8840a3b43277353942b8cc1f8f7] [formerly 04caf5aafd4539fa2403ee872987423e58df2e5c [formerly 52170cc4bf601a0d9a2566f4fc3898a6a71701f9]]] [formerly eca61767edf34ab6b1afe16ca2234abb1104177e [formerly 8d345978335a494373fc790cdf297a2aad83c1e4] [formerly f0334f21db37d43a0ca6973d4ebef9bc45751328 [formerly 7500bed33eac90ca60717bb4903ec1314b409a20]] [formerly 97ab9f700a9a85fe84f559dc6a7329df766435a6 [formerly c4749edc844307af0e4e7f7d78c5775606667d37] [formerly 2c1e840e4190270bf981af69e30ee3ba53368b1c [formerly a6b1005961756a574a7f158933b49ddffa2090db]]]]]] Former-commit-id: 77d50b81729ab72b418d727ca28ebe8e792c5403 [formerly 11a04290929b0145d3d3faa4a7f41b5410905718] [formerly f8805ebea5f9674b8ddea46095cdff5a48d35e9a [formerly df91e9b1feb90aba9b67a35f8a4b3b78e82f6942]] [formerly acb940a2c93e80813b957c0c30a7a209eee7d861 [formerly 3019dbd1a3a703221485dda9f8859a5b62a01237] [formerly 9a0ccf5707bd2a534801c0e4d60e6227d4ebf167 [formerly aadb4dd3ab38e04e801e744b3e8cad7f35d1026c]]] [formerly 2795c08e07356d193f13c914b8d70832b987d802 [formerly 77f9eae36ac3c4d2b155d67bf50b13df9055aafd] [formerly a860676d9896b7f04bb9bdfdd41a50841d3ecbb5 [formerly afb35e164c4c89a65da9ff95d71d60a7d394b5c5]] [formerly bef580955f1c84b93897b61a11717557596655d6 [formerly 5d69f17cc1897710e24d272137d3dc43562d319a] [formerly f8894ef015a8d93dcbd550e490b00c632ba46021 [formerly c790e7470e77e200dd5e42b6807e0f2fb9ca6b79]]]] [formerly 9dca24517e2847edd74cfc0da0d92b16751be794 [formerly bee1e7e4c9d897060362064b8d643d081b881814] [formerly b85c484071a427581dfb95b7d8cd1f76a9c32130 [formerly 602164b940096e2d69668550dfe031b0efd1183a]] [formerly a042c80e14eaf6996bb6e7c9c39a12c9a8809939 [formerly 7a62010014d3a655edd4fbc8c018838f41cec28f] [formerly e9620d149f74c20691860735976bec4cbd34e9cc [formerly c3a4f216244a18ee16c19bc4528c83a4919276d6]]] [formerly 52f92b69f84ebb3104f2dab28291177954ff5b0a [formerly aa2d4cb4dde0ba476490c4851213f40744c8812f] [formerly c004732012abf3da08cfa84ec96aa98e2014429a [formerly 28969acdda98c09e7ebb9ec5ee09a70b72e6daea]] [formerly 16bbff62b2aa1464d5138290b22ac783df5b820b [formerly b2ba0bfd19adc951e8027562f1da745432ad0d0f] [formerly 2c1e840e4190270bf981af69e30ee3ba53368b1c]]]] Former-commit-id: 3424e2f965a4bac421759351f680a82e0efe4d9c [formerly 23fe3f85c12d15f0cb675376a91aaea1cbecf6b9] [formerly 65779f7438dc5df80cca0696f7773c007f399d4d [formerly 0f86ecbc8ca931ecf06f63ad9dc2601b6ed50ea8]] [formerly 7f3cf2763b09e8cf86c9b772289c396e97ec89ea [formerly f66725b1145419214ecc1fb6b8270d7c91ec1ef3] [formerly e1e28402ac44995d96efd7f5f1c21da22f8bc801 [formerly 73fbf76d8d016eba868442e816068e7823e7102d]]] [formerly c2a74c973153ffd68566d689996fc4954f2954e7 [formerly b66a399979314db2d1bd76b06fff9d932e39bc95] [formerly 0f8e2fc43374fc56f4712df902d6411f5ef92721 [formerly 912dea8e466acbb70b919e24cec62f70ff9eb34a]] [formerly 157a449ff816ff461d099e378d44b3b4670df1ff [formerly a504e3eee6928aaec1876b71821fd37cb8acaecd] [formerly cd47afc74b901c1f455607c49db8b341485d9b9b [formerly 8cd50e467e84b437d972a6d2761eaa2d8a1b9e8e]]]] Former-commit-id: f189a2aca85cce9ed7dd9e1f5e10e5cf0b89e905 [formerly dff37ee52fe284135336b9537f7de58d49970dc7] [formerly 55046945bdfa6f06b2d834747b22c1fdf1eb76c6 [formerly 5c54f5f40cf2f9501ce01a5c64f0df7ec252dff4]] [formerly 5f0e07788a385a3d372b32dc62aaf611a5e43120 [formerly d529848d63609d30b303650ca84bc0c696196f2d] [formerly 3aa2c17573cedde3cc8e2f53c77f4768430b298c [formerly 5cad4065992a07408416b9161c853433b4749110]]] Former-commit-id: c590724c342e7373d5020d458394ca8f7bbefb83 [formerly 23f3c9eb22beb784d0ebc5903418507f92ff975b] [formerly 9862f8fbfaf98b02a83ad210219d9b03563bca1c [formerly df37361b8b01baf76ea3334c1450e436e3cea177]] Former-commit-id: e10532879fa90157e06c49554edf704e1c616ac6 [formerly 2be2f09937846faadcffd909534e4edfde7b8b01] Former-commit-id: f9dc1b0c1523ec82fcbff7024f2a683deded505b --- tods/data_processing/ColumnParser.py | 28 +- .../ExtractColumnsBySemanticTypes.py | 6 +- tods/tests/data_processing/test_ColumnParser.py | 361 ++++++++++++++++++++- 3 files changed, 376 insertions(+), 19 deletions(-) diff --git a/tods/data_processing/ColumnParser.py b/tods/data_processing/ColumnParser.py index 37088a4..de02b36 100644 --- a/tods/data_processing/ColumnParser.py +++ b/tods/data_processing/ColumnParser.py @@ -120,7 +120,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: columns_to_use, output_columns = self._produce_columns(inputs) - if self.hyperparams['replace_index_columns'] and self.hyperparams['return_result'] == 'append': + if self.hyperparams['replace_index_columns'] and self.hyperparams['return_result'] == 'append': # pragma: no cover assert len(columns_to_use) == len(output_columns) index_columns = inputs.metadata.get_index_columns() @@ -215,7 +215,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return columns_to_use, output_columns - def _produce_columns_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Tuple[typing.List[int], typing.List[metadata_base.DataMetadata]]: + def _produce_columns_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Tuple[typing.List[int], typing.List[metadata_base.DataMetadata]]: # pragma: no cover columns_to_use = self._get_columns(inputs_metadata) # We check against this list again, because there might be multiple matching semantic types @@ -242,10 +242,10 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs elif 'http://schema.org/Float' in parse_semantic_types and 'http://schema.org/Float' in semantic_types: output_columns.append(self._parse_float_metadata(inputs_metadata, column_index)) - elif 'https://metadata.datadrivendiscovery.org/types/FloatVector' in parse_semantic_types and 'https://metadata.datadrivendiscovery.org/types/FloatVector' in semantic_types: + elif 'https://metadata.datadrivendiscovery.org/types/FloatVector' in parse_semantic_types and 'https://metadata.datadrivendiscovery.org/types/FloatVector' in semantic_types: # pragma: no cover output_columns.append(self._parse_float_vector_metadata(inputs_metadata, column_index)) - elif 'http://schema.org/DateTime' in parse_semantic_types and 'http://schema.org/DateTime' in semantic_types: + elif 'http://schema.org/DateTime' in parse_semantic_types and 'http://schema.org/DateTime' in semantic_types: # pragma: no cover output_columns.append(self._parse_time_metadata(inputs_metadata, column_index)) else: @@ -273,14 +273,14 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs @classmethod def _parse_boolean_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: - return cls._parse_categorical_data(inputs, column_index) + return cls._parse_categorical_data(inputs, column_index) # pragma: no cover @classmethod def _parse_boolean_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: - return cls._parse_categorical_metadata(inputs_metadata, column_index) + return cls._parse_categorical_metadata(inputs_metadata, column_index) # pragma: no cover @classmethod - def _parse_categorical_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: + def _parse_categorical_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: # pragma: no cover values_map: typing.Dict[str, int] = {} for value in inputs.iloc[:, column_index]: value = value.strip() @@ -295,7 +295,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs @classmethod - def _parse_categorical_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + def _parse_categorical_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover outputs_metadata = inputs_metadata.select_columns([column_index]) return outputs_metadata.update_column(0, {'structural_type': int}) @@ -333,7 +333,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs @classmethod - def _parse_integer_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + def _parse_integer_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover outputs_metadata = inputs_metadata.select_columns([column_index]) # Without data we assume we can parse everything into integers. This might not be true and # we might end up parsing into floats if we have to represent missing (or invalid) values. @@ -343,7 +343,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs def _str_to_float(cls, value: str) -> float: try: return float(value.strip()) - except ValueError: + except ValueError: # pragma: no cover return float('nan') @classmethod @@ -359,7 +359,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs_metadata.update_column(0, {'structural_type': float}) @classmethod - def _parse_float_vector_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: + def _parse_float_vector_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: # pragma: no cover # We are pretty strict here because we are assuming this was generated programmatically. outputs = container.DataFrame( { @@ -377,7 +377,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs @classmethod - def _parse_float_vector_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + def _parse_float_vector_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover outputs_metadata = inputs_metadata.select_columns([column_index]) # We cannot know the dimension of the ndarray without data. outputs_metadata = outputs_metadata.update_column(0, {'structural_type': container.ndarray}) @@ -385,13 +385,13 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs_metadata @classmethod - def _parse_time_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment, fuzzy: bool) -> Outputs: + def _parse_time_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment, fuzzy: bool) -> Outputs: # pragma: no cover outputs = container.DataFrame({inputs.columns[column_index]: [utils.parse_datetime_to_float(value, fuzzy=fuzzy) for value in inputs.iloc[:, column_index]]}, generate_metadata=False) outputs.metadata = cls._parse_time_metadata(inputs.metadata, column_index) return outputs @classmethod - def _parse_time_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + def _parse_time_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover outputs_metadata = inputs_metadata.select_columns([column_index]) return outputs_metadata.update_column(0, {'structural_type': float}) diff --git a/tods/data_processing/ExtractColumnsBySemanticTypes.py b/tods/data_processing/ExtractColumnsBySemanticTypes.py index 76d9676..bbd2e6c 100644 --- a/tods/data_processing/ExtractColumnsBySemanticTypes.py +++ b/tods/data_processing/ExtractColumnsBySemanticTypes.py @@ -106,16 +106,16 @@ class ExtractColumnsBySemanticTypesPrimitive(transformer.TransformerPrimitiveBas semantic_types = column_metadata.get('semantic_types', []) - if self.hyperparams['match_logic'] == 'all': + if self.hyperparams['match_logic'] == 'all': # pragma: no cover match = all(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types']) elif self.hyperparams['match_logic'] == 'any': match = any(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types']) - elif self.hyperparams["match_logic"] == "equal": + elif self.hyperparams["match_logic"] == "equal": # pragma: no cover match = set(semantic_types) == set(self.hyperparams["semantic_types"]) else: raise exceptions.UnexpectedValueError("Unknown value of hyper-parameter \"match_logic\": {value}".format(value=self.hyperparams['match_logic'])) - if self.hyperparams['negate']: + if self.hyperparams['negate']: # pragma: no cover return not match else: return match diff --git a/tods/tests/data_processing/test_ColumnParser.py b/tods/tests/data_processing/test_ColumnParser.py index 1b75e6e..f10f322 100644 --- a/tods/tests/data_processing/test_ColumnParser.py +++ b/tods/tests/data_processing/test_ColumnParser.py @@ -1,7 +1,7 @@ - +import numpy import os.path import unittest - +import math from d3m import container, utils @@ -9,6 +9,7 @@ from d3m.metadata import base as metadata_base from tods.data_processing import DatasetToDataframe, ColumnParser +from common_primitives import utils as common_utils import utils as test_utils @@ -92,6 +93,362 @@ class ColumnParserPrimitiveTestCase(unittest.TestCase): self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 7))), {'name': 'ground_truth', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + def test_new(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', + 'dataset_TRAIN', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = DatasetToDataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = DatasetToDataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = ColumnParser.ColumnParserPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'use_columns': [2]})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, ('0', 12183.0)) + + self.assertEqual([type(o) for o in first_row], [str, float]) + + self._test_new_metadata(dataframe.metadata) + + def _test_new_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 1260, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), { + 'name': 'value_0', + 'structural_type': 'float', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + def test_append(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = DatasetToDataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = DatasetToDataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = ColumnParser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults().replace( + {'return_result': 'append', 'replace_index_columns': False, + 'parse_semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer']})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, ('0', '1', '12183', '0.0', '3.7166666666667', '5', '2109', '0', 0, 1, 0)) + + self.assertEqual([type(o) for o in first_row], [str, str, str, str, str, str, str, str,int , int , int]) + + self._test_append_metadata(dataframe.metadata, False) + + def test_append_replace_index_columns(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', + 'dataset_TRAIN', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = DatasetToDataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = DatasetToDataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = ColumnParser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults().replace( + {'return_result': 'append', + 'parse_semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer']})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, (0, '1', '12183', '0.0', '3.7166666666667', '5', '2109', '0', 1, 0)) + + self.assertEqual([type(o) for o in first_row], [int, str, str, str, str, str,str,str, int , int]) + + self._test_append_replace_metadata(dataframe.metadata, True) + + def _test_append_replace_metadata(self, metadata, replace_index_columns): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 1260, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 10, + } + }) + + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), + {'name': 'd3mIndex', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), + {'name': 'timestamp', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 2))), + {'name': 'value_0', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 3))), + {'name': 'value_1', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 4))), + {'name': 'value_2', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))), + {'name': 'value_3', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 6))), + {'name': 'value_4', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 7))), + {'name': 'ground_truth', 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 8))),{'name': 'timestamp', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 9))),{'name': 'ground_truth', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'int'}) + + + def _test_append_metadata(self, metadata, replace_index_columns): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 1260, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 11, + } + }) + + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), + {'name': 'd3mIndex', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), + {'name': 'timestamp', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 2))), + {'name': 'value_0', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 3))), + {'name': 'value_1', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 4))), + {'name': 'value_2', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))), + {'name': 'value_3', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 6))), + {'name': 'value_4', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 7))), + {'name': 'ground_truth', 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 8))),{'name': 'd3mIndex', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 9))),{'name': 'timestamp', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 10))),{'name': 'ground_truth', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + + def test_integer(self): + hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = ColumnParser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = container.DataFrame({'a': ['1.0', '2.0', '3.0']}, generate_metadata=True) + + dataframe.metadata = dataframe.metadata.update((metadata_base.ALL_ELEMENTS, 0), { + 'name': 'test', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + call_metadata = primitive.produce(inputs=dataframe) + + parsed_dataframe = call_metadata.value + + self.assertEqual( + test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'test', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(list(parsed_dataframe.iloc[:, 0]), [1, 2, 3]) + + dataframe.iloc[2, 0] = '3.1' + + call_metadata = primitive.produce(inputs=dataframe) + + parsed_dataframe = call_metadata.value + + self.assertEqual( + test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'test', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(list(parsed_dataframe.iloc[:, 0]), [1, 2, 3]) + + dataframe.iloc[2, 0] = 'aaa' + + with self.assertRaisesRegex(ValueError, + 'Not all values in a column can be parsed into integers, but only integers were expected'): + primitive.produce(inputs=dataframe) + + dataframe.metadata = dataframe.metadata.update((metadata_base.ALL_ELEMENTS, 0), { + 'name': 'test', + 'structural_type': str, + 'semantic_types': [ + 'http://schema.org/Integer', + ], + }) + + call_metadata = primitive.produce(inputs=dataframe) + + parsed_dataframe = call_metadata.value + + self.assertEqual( + test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'test', + 'structural_type': 'float', + 'semantic_types': [ + 'http://schema.org/Integer', + ], + }) + + self.assertEqual(list(parsed_dataframe.iloc[0:2, 0]), [1.0, 2.0]) + self.assertTrue(math.isnan(parsed_dataframe.iloc[2, 0])) + + + + def test_ugly_time_values(self): + for value in [ + 'Original chained constant price data are rescaled.', + '1986/87', + ]: + self.assertTrue(numpy.isnan(common_utils.parse_datetime_to_float(value)), value) + + if __name__ == '__main__':