diff --git a/src/axolotl b/src/axolotl new file mode 160000 index 0000000..af54e69 --- /dev/null +++ b/src/axolotl @@ -0,0 +1 @@ +Subproject commit af54e6970476a081bf0cd65990c9f56a1200d8a2 diff --git a/src/common-primitives b/src/common-primitives new file mode 160000 index 0000000..046b20d --- /dev/null +++ b/src/common-primitives @@ -0,0 +1 @@ +Subproject commit 046b20d2f6d4543dcbe18f0a1d4bcbb1f61cf518 diff --git a/src/d3m b/src/d3m new file mode 160000 index 0000000..70aeefe --- /dev/null +++ b/src/d3m @@ -0,0 +1 @@ +Subproject commit 70aeefed6b7307941581357c4b7858bb3f88e1da diff --git a/tods/data_processing/ContinuityValidation.py b/tods/data_processing/ContinuityValidation.py index 7786552..9a86560 100644 --- a/tods/data_processing/ContinuityValidation.py +++ b/tods/data_processing/ContinuityValidation.py @@ -170,9 +170,3 @@ class ContinuityValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs, inputs['d3mIndex'] = list(range(inputs.shape[0])) return inputs - - def _write(self, inputs:Inputs): - """ - write inputs to current directory, only for test - """ - inputs.to_csv(str(time.time())+'.csv') diff --git a/tods/data_processing/DuplicationValidation.py b/tods/data_processing/DuplicationValidation.py index b9cdb36..2d3a85e 100644 --- a/tods/data_processing/DuplicationValidation.py +++ b/tods/data_processing/DuplicationValidation.py @@ -91,8 +91,3 @@ class DuplicationValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs return inputs - def _write(self, inputs:Inputs): - """ - write inputs to current directory, only for test - """ - inputs.to_csv(str(time.time())+'.csv') diff --git a/tods/detection_algorithm/core/LSTMOD.py b/tods/detection_algorithm/core/LSTMOD.py index 636b7c1..28ced1d 100755 --- a/tods/detection_algorithm/core/LSTMOD.py +++ b/tods/detection_algorithm/core/LSTMOD.py @@ -169,7 +169,7 @@ class LSTMOutlierDetector(CollectiveBaseDetector): # print(danger_coefficient, averaged_relative_error) - else: + else: # pragma: no cover danger_coefficient = np.zeros(relative_error.shape) averaged_relative_error = np.zeros(relative_error.shape) @@ -210,7 +210,7 @@ class LSTMOutlierDetector(CollectiveBaseDetector): -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover X_train = np.asarray( [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]).reshape(-1, 1) diff --git a/tods/feature_analysis/BKFilter.py b/tods/feature_analysis/BKFilter.py new file mode 100644 index 0000000..c35d12c --- /dev/null +++ b/tods/feature_analysis/BKFilter.py @@ -0,0 +1,376 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer +from d3m.metadata import base as metadata_base, hyperparams + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + + +import os.path + +import time +import statsmodels.api as sm + +__all__ = ('BKFilter',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + # Tuning + low = hyperparams.UniformInt( + lower=0, + upper=100000000, + default=6, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Minimum period for oscillations, ie., Baxter and King suggest that the Burns-Mitchell U.S. business cycle has 6 for quarterly data and 1.5 for annual data.", + ) + high = hyperparams.UniformInt( + lower=0, + upper=100000000, + default=32, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Maximum period for oscillations BK suggest that the U.S. business cycle has 32 for quarterly data and 8 for annual data.", + ) + K = hyperparams.UniformInt( + lower=0, + upper=100000000, + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Lead-lag length of the filter. Baxter and King propose a truncation length of 12 for quarterly data and 3 for annual data.", + ) + + # Control + columns_using_method= hyperparams.Enumeration( + values=['name', 'index'], + default='index', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Choose to use columns by names or indecies. If 'name', \"use_columns\" or \"exclude_columns\" is used. If 'index', \"use_columns_name\" or \"exclude_columns_name\" is used." + ) + use_columns_name = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column names to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns_name = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column names to not operate on. Applicable only if \"use_columns_name\" is not provided.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + +class BKFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Filter a time series using the Baxter-King bandpass filter. + + Parameters + ---------- + low: int + Minimum period for oscillations, ie., Baxter and King suggest that the Burns-Mitchell U.S. business cycle has 6 for quarterly data and 1.5 for annual data. + + high: int + Maximum period for oscillations BK suggest that the U.S. business cycle has 32 for quarterly data and 8 for annual data. + + K: int + Lead-lag length of the filter. Baxter and King propose a truncation length of 12 for quarterly data and 3 for annual data. + + use_columns: Set + A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped. + + exclude_columns: Set + A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided. + + return_result: Enumeration + Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false. + + use_semantic_types: Bool + Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe. + + add_index_columns: Bool + Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\". + + error_on_no_input: Bool( + Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False. + + return_semantic_type: Enumeration[str]( + Decides what semantic type to attach to generated attributes' + """ + + __author__: "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "name": "Baxter-King Filter Primitive", + "python_path": "d3m.primitives.tods.feature_analysis.bk_filter", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.BK_FILTER,], + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + "id": "b2bfadc5-dbca-482c-b188-8585e5f245c4", + "hyperparams_to_tune": ['low', 'high', 'K'], + "version": "0.0.1", + }) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. + + Returns: + Container DataFrame after BKFilter. + """ + # Get cols to fit. + self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._bkfilter(sk_inputs, low=self.hyperparams['low'], high=self.hyperparams['high'], K=self.hyperparams['K']) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + # self._write(outputs) + # self.logger.warning('produce was called3') + return CallResult(outputs) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = [] + exclude_columns = [] + + # if hyperparams['columns_using_method'] == 'name': + # inputs_cols = inputs.columns.values.tolist() + # for i in range(len(inputs_cols)): + # if inputs_cols[i] in hyperparams['use_columns_name']: + # use_columns.append(i) + # elif inputs_cols[i] in hyperparams['exclude_columns_name']: + # exclude_columns.append(i) + # else: + use_columns=hyperparams['use_columns'] + exclude_columns=hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, use_columns=use_columns, exclude_columns=exclude_columns, can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs:Inputs): + inputs.to_csv(str(time.time())+'.csv') + + def _bkfilter(self, X, low, high, K): + """ + Perform BKFilter + Args: + X: slected rows to be performed + K, low, high: Parameters of BKFilter + + Returns: + Dataframe, results of BKFilter + """ + transformed_X = utils.pandas.DataFrame() + for col in X.columns: + cycle = sm.tsa.filters.bkfilter(X[col], low=low, high=high, K=K) + cycle_df = utils.pandas.DataFrame(cycle) + transformed_X = utils.pandas.concat([transformed_X,cycle_df], axis=1) + + return transformed_X diff --git a/tods/feature_analysis/HPFilter.py b/tods/feature_analysis/HPFilter.py index ba26d5e..f2a5c5e 100644 --- a/tods/feature_analysis/HPFilter.py +++ b/tods/feature_analysis/HPFilter.py @@ -163,14 +163,14 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams if len(self._training_indices) > 0: # self._clf.fit(self._training_inputs) self._fitted = True - else: + else: # pragma: no cover if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") - if not self._fitted: + if not self._fitted: # pragma: no cover raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: @@ -186,7 +186,7 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams outputs.columns = self._input_column_names output_columns = [outputs] - else: + else: # pragma: no cover if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") @@ -194,14 +194,11 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) - - # self._write(outputs) - # self.logger.warning('produce was called3') return CallResult(outputs) @classmethod - def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover """ Select columns to fit. Args: @@ -238,7 +235,7 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams # return columns_to_produce @classmethod - def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: # pragma: no cover """ Output whether a column can be processed. Args: @@ -331,9 +328,6 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams return target_columns_metadata - def _write(self, inputs:Inputs): - inputs.to_csv(str(time.time())+'.csv') - def _hpfilter(self, X, lamb): """ Perform HPFilter diff --git a/tods/feature_analysis/SKTruncatedSVD.py b/tods/feature_analysis/SKTruncatedSVD.py index 2b6f038..01c6950 100644 --- a/tods/feature_analysis/SKTruncatedSVD.py +++ b/tods/feature_analysis/SKTruncatedSVD.py @@ -224,7 +224,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H Returns: None """ - if self._fitted: + if self._fitted: # pragma: no cover return CallResult(None) # Get cols to fit. @@ -239,7 +239,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H if len(self._training_indices) > 0: self._clf.fit(self._training_inputs) self._fitted = True - else: + else: # pragma: no cover if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") @@ -257,7 +257,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H # self.logger.warning(str(self.metadata.query()['name'])) - if not self._fitted: + if not self._fitted: # pragma: no cover raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: @@ -272,7 +272,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] - else: + else: # pragma: no cover if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") @@ -286,7 +286,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H return CallResult(outputs) - def get_params(self) -> Params: + def get_params(self) -> Params: # pragma: no cover """ Return parameters. Args: @@ -320,7 +320,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H target_columns_metadata_=self._target_columns_metadata ) - def set_params(self, *, params: Params) -> None: + def set_params(self, *, params: Params) -> None: # pragma: no cover """ Set parameters for SKTruncatedSVD. Args: @@ -351,7 +351,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H @classmethod - def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover """ Select columns to fit. Args: @@ -377,7 +377,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H # return columns_to_produce @classmethod - def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: # pragma: no cover """ Output whether a column can be processed. Args: @@ -408,35 +408,35 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H return False - @classmethod - def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: - """ - Output metadata of selected columns. - Args: - outputs_metadata: metadata_base.DataMetadata - hyperparams: d3m.metadata.hyperparams.Hyperparams - - Returns: - d3m.metadata.base.DataMetadata - """ - outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] - - target_columns_metadata: List[OrderedDict] = [] - for column_index in range(outputs_length): - column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) - - # Update semantic types and prepare it for predicted targets. - semantic_types = set(column_metadata.get('semantic_types', [])) - semantic_types_to_remove = set([]) - add_semantic_types = [] - add_semantic_types.add(hyperparams["return_semantic_type"]) - semantic_types = semantic_types - semantic_types_to_remove - semantic_types = semantic_types.union(add_semantic_types) - column_metadata['semantic_types'] = list(semantic_types) - - target_columns_metadata.append(column_metadata) - - return target_columns_metadata + # @classmethod + # def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + # """ + # Output metadata of selected columns. + # Args: + # outputs_metadata: metadata_base.DataMetadata + # hyperparams: d3m.metadata.hyperparams.Hyperparams + + # Returns: + # d3m.metadata.base.DataMetadata + # """ + # outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + # target_columns_metadata: List[OrderedDict] = [] + # for column_index in range(outputs_length): + # column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # # Update semantic types and prepare it for predicted targets. + # semantic_types = set(column_metadata.get('semantic_types', [])) + # semantic_types_to_remove = set([]) + # add_semantic_types = [] + # add_semantic_types.add(hyperparams["return_semantic_type"]) + # semantic_types = semantic_types - semantic_types_to_remove + # semantic_types = semantic_types.union(add_semantic_types) + # column_metadata['semantic_types'] = list(semantic_types) + + # target_columns_metadata.append(column_metadata) + + # return target_columns_metadata @classmethod def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], @@ -500,11 +500,3 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H return target_columns_metadata - def _write(self, inputs:Inputs): - """ - write inputs to current directory, only for test - """ - inputs.to_csv(str(time.time())+'.csv') - - -# SKTruncatedSVD.__doc__ = TruncatedSVD.__doc__ diff --git a/tods/feature_analysis/TRMF.py b/tods/feature_analysis/TRMF.py index 3b21fa1..fcbdab9 100644 --- a/tods/feature_analysis/TRMF.py +++ b/tods/feature_analysis/TRMF.py @@ -276,14 +276,14 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): if len(self._training_indices) > 0: self._clf.fit(self._training_inputs) self._fitted = True - else: + else: # pragma: no cover if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") - if not self._fitted: + if not self._fitted: # pragma: no cover raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs @@ -301,7 +301,7 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] - else: + else: # pragma: no cover if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") @@ -316,7 +316,7 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): @classmethod - def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover """ Select columns to fit. Args: @@ -342,7 +342,7 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): # return columns_to_produce @classmethod - def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: # pragma: no cover """ Output whether a column can be processed. Args: @@ -373,35 +373,35 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): return False - @classmethod - def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: - """ - Output metadata of selected columns. - Args: - outputs_metadata: metadata_base.DataMetadata - hyperparams: d3m.metadata.hyperparams.Hyperparams - - Returns: - d3m.metadata.base.DataMetadata - """ - outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] - - target_columns_metadata: List[OrderedDict] = [] - for column_index in range(outputs_length): - column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) - - # Update semantic types and prepare it for predicted targets. - semantic_types = set(column_metadata.get('semantic_types', [])) - semantic_types_to_remove = set([]) - add_semantic_types = [] - add_semantic_types.add(hyperparams["return_semantic_type"]) - semantic_types = semantic_types - semantic_types_to_remove - semantic_types = semantic_types.union(add_semantic_types) - column_metadata['semantic_types'] = list(semantic_types) - - target_columns_metadata.append(column_metadata) - - return target_columns_metadata + # @classmethod + # def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + # """ + # Output metadata of selected columns. + # Args: + # outputs_metadata: metadata_base.DataMetadata + # hyperparams: d3m.metadata.hyperparams.Hyperparams + + # Returns: + # d3m.metadata.base.DataMetadata + # """ + # outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + # target_columns_metadata: List[OrderedDict] = [] + # for column_index in range(outputs_length): + # column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # # Update semantic types and prepare it for predicted targets. + # semantic_types = set(column_metadata.get('semantic_types', [])) + # semantic_types_to_remove = set([]) + # add_semantic_types = [] + # add_semantic_types.add(hyperparams["return_semantic_type"]) + # semantic_types = semantic_types - semantic_types_to_remove + # semantic_types = semantic_types.union(add_semantic_types) + # column_metadata['semantic_types'] = list(semantic_types) + + # target_columns_metadata.append(column_metadata) + + # return target_columns_metadata @classmethod def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], @@ -465,12 +465,6 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): return target_columns_metadata - def _write(self, inputs:Inputs): - """ - write inputs to current directory, only for test - """ - inputs.to_csv(str(time.time())+'.csv') - """ Temporal Regularized Matrix Factorization @@ -564,7 +558,7 @@ class trmf: return np.dot(self.F, X_preds) - def _predict_X(self, h): + def _predict_X(self, h): # pragma: no cover """Predict X h timepoints ahead. Evaluates matrix X with the help of matrix W. diff --git a/tods/tests/test_ContinuityValidation.py b/tods/tests/test_ContinuityValidation.py index 185bd4c..9c69f8e 100644 --- a/tods/tests/test_ContinuityValidation.py +++ b/tods/tests/test_ContinuityValidation.py @@ -59,7 +59,7 @@ class ContinuityValidationTest(unittest.TestCase): hyperparams_class = ContinuityValidation.ContinuityValidation.metadata.get_hyperparams() primitive = ContinuityValidation.ContinuityValidation(hyperparams=hyperparams_class.defaults()) new_main = primitive.produce(inputs=main).value - # print(new_main) + expected_output = container.DataFrame({'d3mIndex': [0, 1, 2, 3], 'timestamp': [1., 2., 3., 4.], @@ -124,6 +124,67 @@ class ContinuityValidationTest(unittest.TestCase): self._test_continuity(new_main) + hyperparams = hyperparams_class.defaults() + hyperparams = hyperparams.replace({'continuity_option': 'ablation'}) + primitive2 = ContinuityValidation.ContinuityValidation(hyperparams=hyperparams) + new_main2 = primitive2.produce(inputs=main).value + print(new_main2) + + self.assertEqual(utils.to_json_structure(new_main2.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'timestamp', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'a', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'b', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'ground_truth', + 'structural_type': 'numpy.int64', + }, + }]) + + def _test_continuity(self, data_value): tmp_col = data_value['timestamp'] diff --git a/tods/tests/test_DuplicationValidation.py b/tods/tests/test_DuplicationValidation.py index 6c85cfb..6e34d1f 100644 --- a/tods/tests/test_DuplicationValidation.py +++ b/tods/tests/test_DuplicationValidation.py @@ -102,6 +102,54 @@ class DuplicationValidationTest(unittest.TestCase): self._test_drop_duplication(new_main) + hyperparams = hyperparams_class.defaults() + hyperparams = hyperparams.replace({'keep_option': 'average'}) + primitive2 = DuplicationValidation.DuplicationValidation(hyperparams=hyperparams) + new_main2 = primitive2.produce(inputs=main).value + print(new_main2) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'timestamp', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'a', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'b', + 'structural_type': 'numpy.float64', + }, + }]) + def _test_drop_duplication(self, data_value): self.assertEqual(True in list(data_value.duplicated('timestamp')), False)