diff --git a/tods/data_processing/ColumnParser.py b/tods/data_processing/ColumnParser.py index 37088a4..de02b36 100644 --- a/tods/data_processing/ColumnParser.py +++ b/tods/data_processing/ColumnParser.py @@ -120,7 +120,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: columns_to_use, output_columns = self._produce_columns(inputs) - if self.hyperparams['replace_index_columns'] and self.hyperparams['return_result'] == 'append': + if self.hyperparams['replace_index_columns'] and self.hyperparams['return_result'] == 'append': # pragma: no cover assert len(columns_to_use) == len(output_columns) index_columns = inputs.metadata.get_index_columns() @@ -215,7 +215,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return columns_to_use, output_columns - def _produce_columns_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Tuple[typing.List[int], typing.List[metadata_base.DataMetadata]]: + def _produce_columns_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Tuple[typing.List[int], typing.List[metadata_base.DataMetadata]]: # pragma: no cover columns_to_use = self._get_columns(inputs_metadata) # We check against this list again, because there might be multiple matching semantic types @@ -242,10 +242,10 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs elif 'http://schema.org/Float' in parse_semantic_types and 'http://schema.org/Float' in semantic_types: output_columns.append(self._parse_float_metadata(inputs_metadata, column_index)) - elif 'https://metadata.datadrivendiscovery.org/types/FloatVector' in parse_semantic_types and 'https://metadata.datadrivendiscovery.org/types/FloatVector' in semantic_types: + elif 'https://metadata.datadrivendiscovery.org/types/FloatVector' in parse_semantic_types and 'https://metadata.datadrivendiscovery.org/types/FloatVector' in semantic_types: # pragma: no cover output_columns.append(self._parse_float_vector_metadata(inputs_metadata, column_index)) - elif 'http://schema.org/DateTime' in parse_semantic_types and 'http://schema.org/DateTime' in semantic_types: + elif 'http://schema.org/DateTime' in parse_semantic_types and 'http://schema.org/DateTime' in semantic_types: # pragma: no cover output_columns.append(self._parse_time_metadata(inputs_metadata, column_index)) else: @@ -273,14 +273,14 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs @classmethod def _parse_boolean_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: - return cls._parse_categorical_data(inputs, column_index) + return cls._parse_categorical_data(inputs, column_index) # pragma: no cover @classmethod def _parse_boolean_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: - return cls._parse_categorical_metadata(inputs_metadata, column_index) + return cls._parse_categorical_metadata(inputs_metadata, column_index) # pragma: no cover @classmethod - def _parse_categorical_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: + def _parse_categorical_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: # pragma: no cover values_map: typing.Dict[str, int] = {} for value in inputs.iloc[:, column_index]: value = value.strip() @@ -295,7 +295,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs @classmethod - def _parse_categorical_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + def _parse_categorical_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover outputs_metadata = inputs_metadata.select_columns([column_index]) return outputs_metadata.update_column(0, {'structural_type': int}) @@ -333,7 +333,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs @classmethod - def _parse_integer_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + def _parse_integer_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover outputs_metadata = inputs_metadata.select_columns([column_index]) # Without data we assume we can parse everything into integers. This might not be true and # we might end up parsing into floats if we have to represent missing (or invalid) values. @@ -343,7 +343,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs def _str_to_float(cls, value: str) -> float: try: return float(value.strip()) - except ValueError: + except ValueError: # pragma: no cover return float('nan') @classmethod @@ -359,7 +359,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs_metadata.update_column(0, {'structural_type': float}) @classmethod - def _parse_float_vector_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: + def _parse_float_vector_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: # pragma: no cover # We are pretty strict here because we are assuming this was generated programmatically. outputs = container.DataFrame( { @@ -377,7 +377,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs @classmethod - def _parse_float_vector_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + def _parse_float_vector_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover outputs_metadata = inputs_metadata.select_columns([column_index]) # We cannot know the dimension of the ndarray without data. outputs_metadata = outputs_metadata.update_column(0, {'structural_type': container.ndarray}) @@ -385,13 +385,13 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs return outputs_metadata @classmethod - def _parse_time_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment, fuzzy: bool) -> Outputs: + def _parse_time_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment, fuzzy: bool) -> Outputs: # pragma: no cover outputs = container.DataFrame({inputs.columns[column_index]: [utils.parse_datetime_to_float(value, fuzzy=fuzzy) for value in inputs.iloc[:, column_index]]}, generate_metadata=False) outputs.metadata = cls._parse_time_metadata(inputs.metadata, column_index) return outputs @classmethod - def _parse_time_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + def _parse_time_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover outputs_metadata = inputs_metadata.select_columns([column_index]) return outputs_metadata.update_column(0, {'structural_type': float}) diff --git a/tods/data_processing/ExtractColumnsBySemanticTypes.py b/tods/data_processing/ExtractColumnsBySemanticTypes.py index 76d9676..bbd2e6c 100644 --- a/tods/data_processing/ExtractColumnsBySemanticTypes.py +++ b/tods/data_processing/ExtractColumnsBySemanticTypes.py @@ -106,16 +106,16 @@ class ExtractColumnsBySemanticTypesPrimitive(transformer.TransformerPrimitiveBas semantic_types = column_metadata.get('semantic_types', []) - if self.hyperparams['match_logic'] == 'all': + if self.hyperparams['match_logic'] == 'all': # pragma: no cover match = all(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types']) elif self.hyperparams['match_logic'] == 'any': match = any(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types']) - elif self.hyperparams["match_logic"] == "equal": + elif self.hyperparams["match_logic"] == "equal": # pragma: no cover match = set(semantic_types) == set(self.hyperparams["semantic_types"]) else: raise exceptions.UnexpectedValueError("Unknown value of hyper-parameter \"match_logic\": {value}".format(value=self.hyperparams['match_logic'])) - if self.hyperparams['negate']: + if self.hyperparams['negate']: # pragma: no cover return not match else: return match diff --git a/tods/tests/data_processing/test_ColumnParser.py b/tods/tests/data_processing/test_ColumnParser.py index 1b75e6e..f10f322 100644 --- a/tods/tests/data_processing/test_ColumnParser.py +++ b/tods/tests/data_processing/test_ColumnParser.py @@ -1,7 +1,7 @@ - +import numpy import os.path import unittest - +import math from d3m import container, utils @@ -9,6 +9,7 @@ from d3m.metadata import base as metadata_base from tods.data_processing import DatasetToDataframe, ColumnParser +from common_primitives import utils as common_utils import utils as test_utils @@ -92,6 +93,362 @@ class ColumnParserPrimitiveTestCase(unittest.TestCase): self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 7))), {'name': 'ground_truth', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + def test_new(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', + 'dataset_TRAIN', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = DatasetToDataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = DatasetToDataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = ColumnParser.ColumnParserPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'use_columns': [2]})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, ('0', 12183.0)) + + self.assertEqual([type(o) for o in first_row], [str, float]) + + self._test_new_metadata(dataframe.metadata) + + def _test_new_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 1260, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), { + 'name': 'value_0', + 'structural_type': 'float', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + def test_append(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = DatasetToDataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = DatasetToDataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = ColumnParser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults().replace( + {'return_result': 'append', 'replace_index_columns': False, + 'parse_semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer']})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, ('0', '1', '12183', '0.0', '3.7166666666667', '5', '2109', '0', 0, 1, 0)) + + self.assertEqual([type(o) for o in first_row], [str, str, str, str, str, str, str, str,int , int , int]) + + self._test_append_metadata(dataframe.metadata, False) + + def test_append_replace_index_columns(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN', + 'dataset_TRAIN', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = DatasetToDataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = DatasetToDataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = ColumnParser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults().replace( + {'return_result': 'append', + 'parse_semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer']})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, (0, '1', '12183', '0.0', '3.7166666666667', '5', '2109', '0', 1, 0)) + + self.assertEqual([type(o) for o in first_row], [int, str, str, str, str, str,str,str, int , int]) + + self._test_append_replace_metadata(dataframe.metadata, True) + + def _test_append_replace_metadata(self, metadata, replace_index_columns): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 1260, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 10, + } + }) + + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), + {'name': 'd3mIndex', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), + {'name': 'timestamp', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 2))), + {'name': 'value_0', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 3))), + {'name': 'value_1', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 4))), + {'name': 'value_2', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))), + {'name': 'value_3', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 6))), + {'name': 'value_4', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 7))), + {'name': 'ground_truth', 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 8))),{'name': 'timestamp', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 9))),{'name': 'ground_truth', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'int'}) + + + def _test_append_metadata(self, metadata, replace_index_columns): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 1260, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 11, + } + }) + + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), + {'name': 'd3mIndex', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), + {'name': 'timestamp', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 2))), + {'name': 'value_0', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 3))), + {'name': 'value_1', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 4))), + {'name': 'value_2', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))), + {'name': 'value_3', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 6))), + {'name': 'value_4', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 7))), + {'name': 'ground_truth', 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 8))),{'name': 'd3mIndex', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 9))),{'name': 'timestamp', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 10))),{'name': 'ground_truth', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Attribute']}) + + + def test_integer(self): + hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = ColumnParser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = container.DataFrame({'a': ['1.0', '2.0', '3.0']}, generate_metadata=True) + + dataframe.metadata = dataframe.metadata.update((metadata_base.ALL_ELEMENTS, 0), { + 'name': 'test', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + call_metadata = primitive.produce(inputs=dataframe) + + parsed_dataframe = call_metadata.value + + self.assertEqual( + test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'test', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(list(parsed_dataframe.iloc[:, 0]), [1, 2, 3]) + + dataframe.iloc[2, 0] = '3.1' + + call_metadata = primitive.produce(inputs=dataframe) + + parsed_dataframe = call_metadata.value + + self.assertEqual( + test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'test', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(list(parsed_dataframe.iloc[:, 0]), [1, 2, 3]) + + dataframe.iloc[2, 0] = 'aaa' + + with self.assertRaisesRegex(ValueError, + 'Not all values in a column can be parsed into integers, but only integers were expected'): + primitive.produce(inputs=dataframe) + + dataframe.metadata = dataframe.metadata.update((metadata_base.ALL_ELEMENTS, 0), { + 'name': 'test', + 'structural_type': str, + 'semantic_types': [ + 'http://schema.org/Integer', + ], + }) + + call_metadata = primitive.produce(inputs=dataframe) + + parsed_dataframe = call_metadata.value + + self.assertEqual( + test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'test', + 'structural_type': 'float', + 'semantic_types': [ + 'http://schema.org/Integer', + ], + }) + + self.assertEqual(list(parsed_dataframe.iloc[0:2, 0]), [1.0, 2.0]) + self.assertTrue(math.isnan(parsed_dataframe.iloc[2, 0])) + + + + def test_ugly_time_values(self): + for value in [ + 'Original chained constant price data are rescaled.', + '1986/87', + ]: + self.assertTrue(numpy.isnan(common_utils.parse_datetime_to_float(value)), value) + + if __name__ == '__main__':