Browse Source

coverage increased

Former-commit-id: c1a8fba17f [formerly 1d35c822aa] [formerly 8709aad1c0 [formerly e974eac168]] [formerly 6806a62953 [formerly 03bae6573c] [formerly 7ba6fdaf79 [formerly 0d29eaab25]]] [formerly 7dd42166bc [formerly 2da3f0610b] [formerly 0250582087 [formerly ef3fd32d6d]] [formerly a767db31d8 [formerly 7f909eb9cf] [formerly a698f1b270 [formerly b841fc53db]]]] [formerly 10c3eae512 [formerly 210eebb342] [formerly 70bf2bc6c5 [formerly 305ab36b11]] [formerly f70e4d2a27 [formerly 8bd51e3ddc] [formerly bef7cf1b90 [formerly a8968a2fe2]]] [formerly 39e63b5cbd [formerly 4978e27886] [formerly 1acffe2e15 [formerly 2165c692a6]] [formerly d2d8058c72 [formerly 20356322aa] [formerly ef4e103f18 [formerly ecdec2584f]]]]] [formerly f0a5095987 [formerly 2a02694321] [formerly a2e7e078c2 [formerly c67ddb0319]] [formerly 748fe2cc3e [formerly 75656de7a3] [formerly 877ef24a85 [formerly 4bf1058d4a]]] [formerly 5a3bc14f0d [formerly c27e28104b] [formerly 75dcc89db6 [formerly 5fc3bf21b1]] [formerly 338154b330 [formerly 42605af666] [formerly a5ab7108bc [formerly 200858728c]]]] [formerly a6736b478f [formerly 1bf1228edd] [formerly 5c320ae2f1 [formerly d1ddae6dc9]] [formerly 35a94cd5d2 [formerly 3367b58692] [formerly 04caf5aafd [formerly 52170cc4bf]]] [formerly eca61767ed [formerly 8d34597833] [formerly f0334f21db [formerly 7500bed33e]] [formerly 97ab9f700a [formerly c4749edc84] [formerly 2c1e840e41 [formerly a6b1005961]]]]]]
Former-commit-id: 77d50b8172 [formerly 11a0429092] [formerly f8805ebea5 [formerly df91e9b1fe]] [formerly acb940a2c9 [formerly 3019dbd1a3] [formerly 9a0ccf5707 [formerly aadb4dd3ab]]] [formerly 2795c08e07 [formerly 77f9eae36a] [formerly a860676d98 [formerly afb35e164c]] [formerly bef580955f [formerly 5d69f17cc1] [formerly f8894ef015 [formerly c790e7470e]]]] [formerly 9dca24517e [formerly bee1e7e4c9] [formerly b85c484071 [formerly 602164b940]] [formerly a042c80e14 [formerly 7a62010014] [formerly e9620d149f [formerly c3a4f21624]]] [formerly 52f92b69f8 [formerly aa2d4cb4dd] [formerly c004732012 [formerly 28969acdda]] [formerly 16bbff62b2 [formerly b2ba0bfd19] [formerly 2c1e840e41]]]]
Former-commit-id: 3424e2f965 [formerly 23fe3f85c1] [formerly 65779f7438 [formerly 0f86ecbc8c]] [formerly 7f3cf2763b [formerly f66725b114] [formerly e1e28402ac [formerly 73fbf76d8d]]] [formerly c2a74c9731 [formerly b66a399979] [formerly 0f8e2fc433 [formerly 912dea8e46]] [formerly 157a449ff8 [formerly a504e3eee6] [formerly cd47afc74b [formerly 8cd50e467e]]]]
Former-commit-id: f189a2aca8 [formerly dff37ee52f] [formerly 55046945bd [formerly 5c54f5f40c]] [formerly 5f0e07788a [formerly d529848d63] [formerly 3aa2c17573 [formerly 5cad406599]]]
Former-commit-id: c590724c34 [formerly 23f3c9eb22] [formerly 9862f8fbfa [formerly df37361b8b]]
Former-commit-id: e10532879f [formerly 2be2f09937]
Former-commit-id: f9dc1b0c15
master
Devesh Kumar 4 years ago
parent
commit
cbb0fd3493
3 changed files with 376 additions and 19 deletions
  1. +14
    -14
      tods/data_processing/ColumnParser.py
  2. +3
    -3
      tods/data_processing/ExtractColumnsBySemanticTypes.py
  3. +359
    -2
      tods/tests/data_processing/test_ColumnParser.py

+ 14
- 14
tods/data_processing/ColumnParser.py View File

@@ -120,7 +120,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
columns_to_use, output_columns = self._produce_columns(inputs)

if self.hyperparams['replace_index_columns'] and self.hyperparams['return_result'] == 'append':
if self.hyperparams['replace_index_columns'] and self.hyperparams['return_result'] == 'append': # pragma: no cover
assert len(columns_to_use) == len(output_columns)

index_columns = inputs.metadata.get_index_columns()
@@ -215,7 +215,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs

return columns_to_use, output_columns

def _produce_columns_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Tuple[typing.List[int], typing.List[metadata_base.DataMetadata]]:
def _produce_columns_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Tuple[typing.List[int], typing.List[metadata_base.DataMetadata]]: # pragma: no cover
columns_to_use = self._get_columns(inputs_metadata)

# We check against this list again, because there might be multiple matching semantic types
@@ -242,10 +242,10 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs
elif 'http://schema.org/Float' in parse_semantic_types and 'http://schema.org/Float' in semantic_types:
output_columns.append(self._parse_float_metadata(inputs_metadata, column_index))

elif 'https://metadata.datadrivendiscovery.org/types/FloatVector' in parse_semantic_types and 'https://metadata.datadrivendiscovery.org/types/FloatVector' in semantic_types:
elif 'https://metadata.datadrivendiscovery.org/types/FloatVector' in parse_semantic_types and 'https://metadata.datadrivendiscovery.org/types/FloatVector' in semantic_types: # pragma: no cover
output_columns.append(self._parse_float_vector_metadata(inputs_metadata, column_index))

elif 'http://schema.org/DateTime' in parse_semantic_types and 'http://schema.org/DateTime' in semantic_types:
elif 'http://schema.org/DateTime' in parse_semantic_types and 'http://schema.org/DateTime' in semantic_types: # pragma: no cover
output_columns.append(self._parse_time_metadata(inputs_metadata, column_index))

else:
@@ -273,14 +273,14 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs

@classmethod
def _parse_boolean_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs:
return cls._parse_categorical_data(inputs, column_index)
return cls._parse_categorical_data(inputs, column_index) # pragma: no cover

@classmethod
def _parse_boolean_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata:
return cls._parse_categorical_metadata(inputs_metadata, column_index)
return cls._parse_categorical_metadata(inputs_metadata, column_index) # pragma: no cover

@classmethod
def _parse_categorical_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs:
def _parse_categorical_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: # pragma: no cover
values_map: typing.Dict[str, int] = {}
for value in inputs.iloc[:, column_index]:
value = value.strip()
@@ -295,7 +295,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs
return outputs

@classmethod
def _parse_categorical_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata:
def _parse_categorical_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover
outputs_metadata = inputs_metadata.select_columns([column_index])
return outputs_metadata.update_column(0, {'structural_type': int})

@@ -333,7 +333,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs
return outputs

@classmethod
def _parse_integer_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata:
def _parse_integer_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover
outputs_metadata = inputs_metadata.select_columns([column_index])
# Without data we assume we can parse everything into integers. This might not be true and
# we might end up parsing into floats if we have to represent missing (or invalid) values.
@@ -343,7 +343,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs
def _str_to_float(cls, value: str) -> float:
try:
return float(value.strip())
except ValueError:
except ValueError: # pragma: no cover
return float('nan')

@classmethod
@@ -359,7 +359,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs
return outputs_metadata.update_column(0, {'structural_type': float})

@classmethod
def _parse_float_vector_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs:
def _parse_float_vector_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: # pragma: no cover
# We are pretty strict here because we are assuming this was generated programmatically.
outputs = container.DataFrame(
{
@@ -377,7 +377,7 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs
return outputs

@classmethod
def _parse_float_vector_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata:
def _parse_float_vector_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover
outputs_metadata = inputs_metadata.select_columns([column_index])
# We cannot know the dimension of the ndarray without data.
outputs_metadata = outputs_metadata.update_column(0, {'structural_type': container.ndarray})
@@ -385,13 +385,13 @@ class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs
return outputs_metadata

@classmethod
def _parse_time_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment, fuzzy: bool) -> Outputs:
def _parse_time_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment, fuzzy: bool) -> Outputs: # pragma: no cover
outputs = container.DataFrame({inputs.columns[column_index]: [utils.parse_datetime_to_float(value, fuzzy=fuzzy) for value in inputs.iloc[:, column_index]]}, generate_metadata=False)
outputs.metadata = cls._parse_time_metadata(inputs.metadata, column_index)

return outputs

@classmethod
def _parse_time_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata:
def _parse_time_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: # pragma: no cover
outputs_metadata = inputs_metadata.select_columns([column_index])
return outputs_metadata.update_column(0, {'structural_type': float})

+ 3
- 3
tods/data_processing/ExtractColumnsBySemanticTypes.py View File

@@ -106,16 +106,16 @@ class ExtractColumnsBySemanticTypesPrimitive(transformer.TransformerPrimitiveBas

semantic_types = column_metadata.get('semantic_types', [])

if self.hyperparams['match_logic'] == 'all':
if self.hyperparams['match_logic'] == 'all': # pragma: no cover
match = all(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types'])
elif self.hyperparams['match_logic'] == 'any':
match = any(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types'])
elif self.hyperparams["match_logic"] == "equal":
elif self.hyperparams["match_logic"] == "equal": # pragma: no cover
match = set(semantic_types) == set(self.hyperparams["semantic_types"])
else:
raise exceptions.UnexpectedValueError("Unknown value of hyper-parameter \"match_logic\": {value}".format(value=self.hyperparams['match_logic']))

if self.hyperparams['negate']:
if self.hyperparams['negate']: # pragma: no cover
return not match
else:
return match


+ 359
- 2
tods/tests/data_processing/test_ColumnParser.py View File

@@ -1,7 +1,7 @@
import numpy
import os.path
import unittest
import math


from d3m import container, utils
@@ -9,6 +9,7 @@ from d3m.metadata import base as metadata_base

from tods.data_processing import DatasetToDataframe, ColumnParser

from common_primitives import utils as common_utils
import utils as test_utils


@@ -92,6 +93,362 @@ class ColumnParserPrimitiveTestCase(unittest.TestCase):

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 7))), {'name': 'ground_truth', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Attribute']})

def test_new(self):
dataset_doc_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN',
'dataset_TRAIN', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))

hyperparams_class = DatasetToDataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams()

primitive = DatasetToDataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults())

call_metadata = primitive.produce(inputs=dataset)

dataframe = call_metadata.value

hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams()

primitive = ColumnParser.ColumnParserPrimitive(
hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'use_columns': [2]}))

call_metadata = primitive.produce(inputs=dataframe)

dataframe = call_metadata.value

first_row = list(dataframe.itertuples(index=False, name=None))[0]

self.assertEqual(first_row, ('0', 12183.0))

self.assertEqual([type(o) for o in first_row], [str, float])

self._test_new_metadata(dataframe.metadata)

def _test_new_metadata(self, metadata):
self.maxDiff = None

self.assertEqual(test_utils.convert_through_json(metadata.query(())), {
'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
'structural_type': 'd3m.container.pandas.DataFrame',
'semantic_types': [
'https://metadata.datadrivendiscovery.org/types/Table',
],
'dimension': {
'name': 'rows',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
'length': 1260,
}
})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), {
'dimension': {
'name': 'columns',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
'length': 2,
}
})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), {
'name': 'd3mIndex',
'structural_type': 'str',
'semantic_types': [
'http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
],
})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), {
'name': 'value_0',
'structural_type': 'float',
'semantic_types': [
'http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute',
],
})

def test_append(self):
dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__),'..', '..', '..', 'datasets', 'anomaly','yahoo_sub_5','TRAIN','dataset_TRAIN', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))

hyperparams_class = DatasetToDataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams()

primitive = DatasetToDataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults())

call_metadata = primitive.produce(inputs=dataset)

dataframe = call_metadata.value

hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams()

primitive = ColumnParser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults().replace(
{'return_result': 'append', 'replace_index_columns': False,
'parse_semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData',
'http://schema.org/Integer']}))

call_metadata = primitive.produce(inputs=dataframe)

dataframe = call_metadata.value

first_row = list(dataframe.itertuples(index=False, name=None))[0]

self.assertEqual(first_row, ('0', '1', '12183', '0.0', '3.7166666666667', '5', '2109', '0', 0, 1, 0))

self.assertEqual([type(o) for o in first_row], [str, str, str, str, str, str, str, str,int , int , int])

self._test_append_metadata(dataframe.metadata, False)

def test_append_replace_index_columns(self):
dataset_doc_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), '..', '..', '..', 'datasets', 'anomaly', 'yahoo_sub_5', 'TRAIN',
'dataset_TRAIN', 'datasetDoc.json'))

dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))

hyperparams_class = DatasetToDataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams()

primitive = DatasetToDataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults())

call_metadata = primitive.produce(inputs=dataset)

dataframe = call_metadata.value

hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams()

primitive = ColumnParser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults().replace(
{'return_result': 'append',
'parse_semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData',
'http://schema.org/Integer']}))

call_metadata = primitive.produce(inputs=dataframe)

dataframe = call_metadata.value

first_row = list(dataframe.itertuples(index=False, name=None))[0]

self.assertEqual(first_row, (0, '1', '12183', '0.0', '3.7166666666667', '5', '2109', '0', 1, 0))

self.assertEqual([type(o) for o in first_row], [int, str, str, str, str, str,str,str, int , int])

self._test_append_replace_metadata(dataframe.metadata, True)

def _test_append_replace_metadata(self, metadata, replace_index_columns):
self.maxDiff = None

self.assertEqual(test_utils.convert_through_json(metadata.query(())), {
'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
'structural_type': 'd3m.container.pandas.DataFrame',
'semantic_types': [
'https://metadata.datadrivendiscovery.org/types/Table',
],
'dimension': {
'name': 'rows',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
'length': 1260,
}
})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), {
'dimension': {
'name': 'columns',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
'length': 10,
}
})


self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))),
{'name': 'd3mIndex', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/PrimaryKey']})
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))),
{'name': 'timestamp', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/Attribute']})
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 2))),
{'name': 'value_0', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 3))),
{'name': 'value_1', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 4))),
{'name': 'value_2', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))),
{'name': 'value_3', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 6))),
{'name': 'value_4', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 7))),
{'name': 'ground_truth', 'structural_type': 'str',
'semantic_types': ['http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 8))),{'name': 'timestamp', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute']})
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 9))),{'name': 'ground_truth',
'semantic_types': ['http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
'https://metadata.datadrivendiscovery.org/types/Attribute'],
'structural_type': 'int'})


def _test_append_metadata(self, metadata, replace_index_columns):
self.maxDiff = None

self.assertEqual(test_utils.convert_through_json(metadata.query(())), {
'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
'structural_type': 'd3m.container.pandas.DataFrame',
'semantic_types': [
'https://metadata.datadrivendiscovery.org/types/Table',
],
'dimension': {
'name': 'rows',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
'length': 1260,
}
})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), {
'dimension': {
'name': 'columns',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
'length': 11,
}
})


self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))),
{'name': 'd3mIndex', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/PrimaryKey']})
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))),
{'name': 'timestamp', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/Attribute']})
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 2))),
{'name': 'value_0', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 3))),
{'name': 'value_1', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 4))),
{'name': 'value_2', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))),
{'name': 'value_3', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 6))),
{'name': 'value_4', 'structural_type': 'str', 'semantic_types': ['http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 7))),
{'name': 'ground_truth', 'structural_type': 'str',
'semantic_types': ['http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
'https://metadata.datadrivendiscovery.org/types/Attribute']})

self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 8))),{'name': 'd3mIndex', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']})
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 9))),{'name': 'timestamp', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute']})
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 10))),{'name': 'ground_truth', 'structural_type': 'int', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Attribute']})


def test_integer(self):
hyperparams_class = ColumnParser.ColumnParserPrimitive.metadata.get_hyperparams()

primitive = ColumnParser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults())

dataframe = container.DataFrame({'a': ['1.0', '2.0', '3.0']}, generate_metadata=True)

dataframe.metadata = dataframe.metadata.update((metadata_base.ALL_ELEMENTS, 0), {
'name': 'test',
'semantic_types': [
'http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
],
})

call_metadata = primitive.produce(inputs=dataframe)

parsed_dataframe = call_metadata.value

self.assertEqual(
test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), {
'name': 'test',
'structural_type': 'int',
'semantic_types': [
'http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
],
})

self.assertEqual(list(parsed_dataframe.iloc[:, 0]), [1, 2, 3])

dataframe.iloc[2, 0] = '3.1'

call_metadata = primitive.produce(inputs=dataframe)

parsed_dataframe = call_metadata.value

self.assertEqual(
test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), {
'name': 'test',
'structural_type': 'int',
'semantic_types': [
'http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
],
})

self.assertEqual(list(parsed_dataframe.iloc[:, 0]), [1, 2, 3])

dataframe.iloc[2, 0] = 'aaa'

with self.assertRaisesRegex(ValueError,
'Not all values in a column can be parsed into integers, but only integers were expected'):
primitive.produce(inputs=dataframe)

dataframe.metadata = dataframe.metadata.update((metadata_base.ALL_ELEMENTS, 0), {
'name': 'test',
'structural_type': str,
'semantic_types': [
'http://schema.org/Integer',
],
})

call_metadata = primitive.produce(inputs=dataframe)

parsed_dataframe = call_metadata.value

self.assertEqual(
test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), {
'name': 'test',
'structural_type': 'float',
'semantic_types': [
'http://schema.org/Integer',
],
})

self.assertEqual(list(parsed_dataframe.iloc[0:2, 0]), [1.0, 2.0])
self.assertTrue(math.isnan(parsed_dataframe.iloc[2, 0]))



def test_ugly_time_values(self):
for value in [
'Original chained constant price data are rescaled.',
'1986/87',
]:
self.assertTrue(numpy.isnan(common_utils.parse_datetime_to_float(value)), value)




if __name__ == '__main__':


Loading…
Cancel
Save