
Merge branch 'dev' of https://github.com/datamllab/tods into dev

fixed merge conflict on BKFilter


master
YileAllenChen1 committed 4 years ago
parent commit 03d4c57a65
12 changed files with 568 additions and 111 deletions

  1. src/axolotl                                     +1    -0
  2. src/common-primitives                           +1    -0
  3. src/d3m                                         +1    -0
  4. tods/data_processing/ContinuityValidation.py    +0    -6
  5. tods/data_processing/DuplicationValidation.py   +0    -5
  6. tods/detection_algorithm/core/LSTMOD.py         +2    -2
  7. tods/feature_analysis/BKFilter.py               +376  -0
  8. tods/feature_analysis/HPFilter.py               +5    -11
  9. tods/feature_analysis/SKTruncatedSVD.py         +37   -45
 10. tods/feature_analysis/TRMF.py                   +35   -41
 11. tods/tests/test_ContinuityValidation.py         +62   -1
 12. tods/tests/test_DuplicationValidation.py        +48   -0

src/axolotl (+1, -0)

@@ -0,0 +1 @@
Subproject commit af54e6970476a081bf0cd65990c9f56a1200d8a2

src/common-primitives (+1, -0)

@@ -0,0 +1 @@
Subproject commit 046b20d2f6d4543dcbe18f0a1d4bcbb1f61cf518

src/d3m (+1, -0)

@@ -0,0 +1 @@
Subproject commit 70aeefed6b7307941581357c4b7858bb3f88e1da

tods/data_processing/ContinuityValidation.py (+0, -6)

@@ -170,9 +170,3 @@ class ContinuityValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs,
inputs['d3mIndex'] = list(range(inputs.shape[0]))
return inputs


def _write(self, inputs:Inputs):
"""
write inputs to current directory, only for test
"""
inputs.to_csv(str(time.time())+'.csv')

tods/data_processing/DuplicationValidation.py (+0, -5)

@@ -91,8 +91,3 @@ class DuplicationValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs

return inputs

def _write(self, inputs:Inputs):
"""
write inputs to current directory, only for test
"""
inputs.to_csv(str(time.time())+'.csv')

tods/detection_algorithm/core/LSTMOD.py (+2, -2)

@@ -169,7 +169,7 @@ class LSTMOutlierDetector(CollectiveBaseDetector):
# print(danger_coefficient, averaged_relative_error)

- else:
+ else: # pragma: no cover
danger_coefficient = np.zeros(relative_error.shape)
averaged_relative_error = np.zeros(relative_error.shape)

@@ -210,7 +210,7 @@ class LSTMOutlierDetector(CollectiveBaseDetector):



- if __name__ == "__main__":
+ if __name__ == "__main__": # pragma: no cover
X_train = np.asarray(
[3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]).reshape(-1, 1)
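
A note on the recurring "# pragma: no cover" markers added throughout this commit: coverage.py excludes from its report any line, or any block whose header line, carries that comment, so hard-to-exercise branches (error paths, __main__ guards) stop counting against coverage. A minimal sketch of the effect, not taken from this repository:

    # Lines or blocks marked "# pragma: no cover" are skipped by coverage.py.
    def read_config(path):
        try:
            with open(path) as f:
                return f.read()
        except OSError:  # pragma: no cover - only reached on I/O failure
            return None

    if __name__ == "__main__":  # pragma: no cover
        print(read_config("config.txt"))

Running the module under "coverage run" and then "coverage report" counts the try branch but leaves the marked lines out of the totals.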



tods/feature_analysis/BKFilter.py (+376, -0)

@@ -0,0 +1,376 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing
import time

from d3m import container
from d3m.primitive_interfaces import base, transformer
from d3m.metadata import base as metadata_base, hyperparams

from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer


import os.path

import time
import statsmodels.api as sm

__all__ = ('BKFilter',)

Inputs = container.DataFrame
Outputs = container.DataFrame


class Hyperparams(hyperparams.Hyperparams):
# Tuning
low = hyperparams.UniformInt(
lower=0,
upper=100000000,
default=6,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
description="Minimum period for oscillations, ie., Baxter and King suggest that the Burns-Mitchell U.S. business cycle has 6 for quarterly data and 1.5 for annual data.",
)
high = hyperparams.UniformInt(
lower=0,
upper=100000000,
default=32,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
description="Maximum period for oscillations BK suggest that the U.S. business cycle has 32 for quarterly data and 8 for annual data.",
)
K = hyperparams.UniformInt(
lower=0,
upper=100000000,
default=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
description="Lead-lag length of the filter. Baxter and King propose a truncation length of 12 for quarterly data and 3 for annual data.",
)

# Control
columns_using_method= hyperparams.Enumeration(
values=['name', 'index'],
default='index',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Choose to use columns by names or indecies. If 'name', \"use_columns\" or \"exclude_columns\" is used. If 'index', \"use_columns_name\" or \"exclude_columns_name\" is used."
)
use_columns_name = hyperparams.Set(
elements=hyperparams.Hyperparameter[str](''),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column names to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
)
exclude_columns_name = hyperparams.Set(
elements=hyperparams.Hyperparameter[str](''),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column names to not operate on. Applicable only if \"use_columns_name\" is not provided.",
)
use_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
)
exclude_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='append',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
default='https://metadata.datadrivendiscovery.org/types/Attribute',
description='Decides what semantic type to attach to generated attributes',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class BKFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
"""
Filter a time series using the Baxter-King bandpass filter.

Parameters
----------
low: int
Minimum period for oscillations. Baxter and King suggest that the Burns-Mitchell U.S. business cycle has a period of 6 for quarterly data and 1.5 for annual data.
high: int
Maximum period for oscillations. Baxter and King suggest that the U.S. business cycle has a period of 32 for quarterly data and 8 for annual data.

K: int
Lead-lag length of the filter. Baxter and King propose a truncation length of 12 for quarterly data and 3 for annual data.

use_columns: Set
A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.
exclude_columns: Set
A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.
return_result: Enumeration
Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.
use_semantic_types: Bool
Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe.
add_index_columns: Bool
Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".
error_on_no_input: Bool
Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.
return_semantic_type: Enumeration[str]
Decides what semantic type to attach to generated attributes.
"""

__author__ = "DATA Lab at Texas A&M University"
metadata = metadata_base.PrimitiveMetadata({
"name": "Baxter-King Filter Primitive",
"python_path": "d3m.primitives.tods.feature_analysis.bk_filter",
"source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu',
'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py']},
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.BK_FILTER,],
"primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
"id": "b2bfadc5-dbca-482c-b188-8585e5f245c4",
"hyperparams_to_tune": ['low', 'high', 'K'],
"version": "0.0.1",
})


def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
"""
Process the testing data.
Args:
inputs: Container DataFrame.

Returns:
Container DataFrame after BKFilter.
"""
# Get cols to fit.
self._fitted = False
self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns


if len(self._training_indices) > 0:
# self._clf.fit(self._training_inputs)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")



if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
sk_inputs = inputs
if self.hyperparams['use_semantic_types']:
sk_inputs = inputs.iloc[:, self._training_indices]
output_columns = []
if len(self._training_indices) > 0:
sk_output = self._bkfilter(sk_inputs, low=self.hyperparams['low'], high=self.hyperparams['high'], K=self.hyperparams['K'])
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
outputs = self._wrap_predictions(inputs, sk_output)

if len(outputs.columns) == len(self._input_column_names):
outputs.columns = self._input_column_names
output_columns = [outputs]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._training_indices,
columns_list=output_columns)

# self._write(outputs)
# self.logger.warning('produce was called3')
return CallResult(outputs)
@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
"""
Select columns to fit.
Args:
inputs: Container DataFrame
hyperparams: d3m.metadata.hyperparams.Hyperparams

Returns:
list
"""
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

use_columns = []
exclude_columns = []

# if hyperparams['columns_using_method'] == 'name':
# inputs_cols = inputs.columns.values.tolist()
# for i in range(len(inputs_cols)):
# if inputs_cols[i] in hyperparams['use_columns_name']:
# use_columns.append(i)
# elif inputs_cols[i] in hyperparams['exclude_columns_name']:
# exclude_columns.append(i)
# else:
use_columns=hyperparams['use_columns']
exclude_columns=hyperparams['exclude_columns']
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, use_columns=use_columns, exclude_columns=exclude_columns, can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
"""
Output whether a column can be processed.
Args:
inputs_metadata: d3m.metadata.base.DataMetadata
column_index: int

Returns:
bool
"""
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
"""
Update metadata for selected columns.
Args:
inputs_metadata: metadata_base.DataMetadata
outputs: Container Dataframe
target_columns_metadata: list

Returns:
d3m.metadata.base.DataMetadata
"""
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
"""
Wrap predictions into dataframe
Args:
inputs: Container Dataframe
predictions: array-like data (n_samples, n_features)

Returns:
Dataframe
"""
outputs = d3m_dataframe(predictions, generate_metadata=True)
target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
"""
Add target columns metadata
Args:
outputs_metadata: metadata.base.DataMetadata
hyperparams: d3m.metadata.hyperparams.Hyperparams

Returns:
List[OrderedDict]
"""
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_name = "output_{}".format(column_index)
column_metadata = OrderedDict()
semantic_types = set()
semantic_types.add(hyperparams["return_semantic_type"])
column_metadata['semantic_types'] = list(semantic_types)

column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata

def _write(self, inputs:Inputs):
inputs.to_csv(str(time.time())+'.csv')

def _bkfilter(self, X, low, high, K):
"""
Perform BKFilter
Args:
X: selected columns on which to perform the filter
K, low, high: Parameters of BKFilter

Returns:
Dataframe, results of BKFilter
"""
transformed_X = utils.pandas.DataFrame()
for col in X.columns:
cycle = sm.tsa.filters.bkfilter(X[col], low=low, high=high, K=K)
cycle_df = utils.pandas.DataFrame(cycle)
transformed_X = utils.pandas.concat([transformed_X,cycle_df], axis=1)

return transformed_X
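
Since BKFilter.py is a new primitive, a short usage sketch may help; it mirrors the construction pattern used by the tests elsewhere in this repository. The import path and sample values below are illustrative assumptions, not taken from this commit; under the hood the primitive applies statsmodels' sm.tsa.filters.bkfilter to each selected column, trimming K observations from both ends of each series.

    # Minimal usage sketch (assumed import path; values are illustrative).
    from d3m import container
    from tods.feature_analysis import BKFilter

    # A toy numeric DataFrame; with default hyperparams every column is processed.
    main = container.DataFrame(
        {'a': [3., 4., 8., 16., 18., 13., 22., 36., 59., 128.],
         'b': [2., 4., 6., 8., 10., 12., 14., 16., 18., 20.]},
        generate_metadata=True,
    )

    hyperparams_class = BKFilter.BKFilter.metadata.get_hyperparams()
    hyperparams = hyperparams_class.defaults().replace({'low': 6, 'high': 32, 'K': 1})
    primitive = BKFilter.BKFilter(hyperparams=hyperparams)

    # Each output column holds the cyclical component of the matching input column;
    # with K=1 the filtered series is two observations shorter than the input.
    output = primitive.produce(inputs=main).value
    print(output)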

tods/feature_analysis/HPFilter.py (+5, -11)

@@ -163,14 +163,14 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams
if len(self._training_indices) > 0:
# self._clf.fit(self._training_inputs)
self._fitted = True
- else:
+ else: # pragma: no cover
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")



- if not self._fitted:
+ if not self._fitted: # pragma: no cover
raise PrimitiveNotFittedError("Primitive not fitted.")
sk_inputs = inputs
if self.hyperparams['use_semantic_types']:
@@ -186,7 +186,7 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams
outputs.columns = self._input_column_names
output_columns = [outputs]
- else:
+ else: # pragma: no cover
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")
@@ -194,14 +194,11 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._training_indices,
columns_list=output_columns)

# self._write(outputs)
# self.logger.warning('produce was called3')
return CallResult(outputs)
@classmethod
- def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
+ def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover
"""
Select columns to fit.
Args:
@@ -238,7 +235,7 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams
# return columns_to_produce

@classmethod
- def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
+ def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: # pragma: no cover
"""
Output whether a column can be processed.
Args:
@@ -331,9 +328,6 @@ class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams

return target_columns_metadata

def _write(self, inputs:Inputs):
inputs.to_csv(str(time.time())+'.csv')

def _hpfilter(self, X, lamb):
"""
Perform HPFilter


tods/feature_analysis/SKTruncatedSVD.py (+37, -45)

@@ -224,7 +224,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H
Returns:
None
"""
- if self._fitted:
+ if self._fitted: # pragma: no cover
return CallResult(None)

# Get cols to fit.
@@ -239,7 +239,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H
if len(self._training_indices) > 0:
self._clf.fit(self._training_inputs)
self._fitted = True
- else:
+ else: # pragma: no cover
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")
@@ -257,7 +257,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H
# self.logger.warning(str(self.metadata.query()['name']))


- if not self._fitted:
+ if not self._fitted: # pragma: no cover
raise PrimitiveNotFittedError("Primitive not fitted.")
sk_inputs = inputs
if self.hyperparams['use_semantic_types']:
@@ -272,7 +272,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H
if len(outputs.columns) == len(self._input_column_names):
outputs.columns = self._input_column_names
output_columns = [outputs]
- else:
+ else: # pragma: no cover
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")
@@ -286,7 +286,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H
return CallResult(outputs)

- def get_params(self) -> Params:
+ def get_params(self) -> Params: # pragma: no cover
"""
Return parameters.
Args:
@@ -320,7 +320,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H
target_columns_metadata_=self._target_columns_metadata
)

- def set_params(self, *, params: Params) -> None:
+ def set_params(self, *, params: Params) -> None: # pragma: no cover
"""
Set parameters for SKTruncatedSVD.
Args:
@@ -351,7 +351,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H
@classmethod
- def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
+ def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover
"""
Select columns to fit.
Args:
@@ -377,7 +377,7 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H
# return columns_to_produce

@classmethod
- def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
+ def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: # pragma: no cover
"""
Output whether a column can be processed.
Args:
@@ -408,35 +408,35 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H
return False

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
"""
Output metadata of selected columns.
Args:
outputs_metadata: metadata_base.DataMetadata
hyperparams: d3m.metadata.hyperparams.Hyperparams
Returns:
d3m.metadata.base.DataMetadata
"""
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))
# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set([])
add_semantic_types = []
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)
target_columns_metadata.append(column_metadata)
return target_columns_metadata
# @classmethod
# def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
# """
# Output metadata of selected columns.
# Args:
# outputs_metadata: metadata_base.DataMetadata
# hyperparams: d3m.metadata.hyperparams.Hyperparams
# Returns:
# d3m.metadata.base.DataMetadata
# """
# outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
# target_columns_metadata: List[OrderedDict] = []
# for column_index in range(outputs_length):
# column_metadata = OrderedDict(outputs_metadata.query_column(column_index))
# # Update semantic types and prepare it for predicted targets.
# semantic_types = set(column_metadata.get('semantic_types', []))
# semantic_types_to_remove = set([])
# add_semantic_types = []
# add_semantic_types.add(hyperparams["return_semantic_type"])
# semantic_types = semantic_types - semantic_types_to_remove
# semantic_types = semantic_types.union(add_semantic_types)
# column_metadata['semantic_types'] = list(semantic_types)
# target_columns_metadata.append(column_metadata)
# return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
@@ -500,11 +500,3 @@ class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, H

return target_columns_metadata

def _write(self, inputs:Inputs):
"""
write inputs to current directory, only for test
"""
inputs.to_csv(str(time.time())+'.csv')


# SKTruncatedSVD.__doc__ = TruncatedSVD.__doc__

tods/feature_analysis/TRMF.py (+35, -41)

@@ -276,14 +276,14 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
if len(self._training_indices) > 0:
self._clf.fit(self._training_inputs)
self._fitted = True
- else:
+ else: # pragma: no cover
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")



- if not self._fitted:
+ if not self._fitted: # pragma: no cover
raise PrimitiveNotFittedError("Primitive not fitted.")

sk_inputs = inputs
@@ -301,7 +301,7 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
if len(outputs.columns) == len(self._input_column_names):
outputs.columns = self._input_column_names
output_columns = [outputs]
- else:
+ else: # pragma: no cover
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")
@@ -316,7 +316,7 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):

@classmethod
- def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
+ def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover
"""
Select columns to fit.
Args:
@@ -342,7 +342,7 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
# return columns_to_produce

@classmethod
- def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
+ def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: # pragma: no cover
"""
Output whether a column can be processed.
Args:
@@ -373,35 +373,35 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
return False

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
"""
Output metadata of selected columns.
Args:
outputs_metadata: metadata_base.DataMetadata
hyperparams: d3m.metadata.hyperparams.Hyperparams
Returns:
d3m.metadata.base.DataMetadata
"""
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))
# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set([])
add_semantic_types = []
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)
target_columns_metadata.append(column_metadata)
return target_columns_metadata
# @classmethod
# def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
# """
# Output metadata of selected columns.
# Args:
# outputs_metadata: metadata_base.DataMetadata
# hyperparams: d3m.metadata.hyperparams.Hyperparams
# Returns:
# d3m.metadata.base.DataMetadata
# """
# outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
# target_columns_metadata: List[OrderedDict] = []
# for column_index in range(outputs_length):
# column_metadata = OrderedDict(outputs_metadata.query_column(column_index))
# # Update semantic types and prepare it for predicted targets.
# semantic_types = set(column_metadata.get('semantic_types', []))
# semantic_types_to_remove = set([])
# add_semantic_types = []
# add_semantic_types.add(hyperparams["return_semantic_type"])
# semantic_types = semantic_types - semantic_types_to_remove
# semantic_types = semantic_types.union(add_semantic_types)
# column_metadata['semantic_types'] = list(semantic_types)
# target_columns_metadata.append(column_metadata)
# return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
@@ -465,12 +465,6 @@ class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):

return target_columns_metadata

def _write(self, inputs:Inputs):
"""
write inputs to current directory, only for test
"""
inputs.to_csv(str(time.time())+'.csv')


"""
Temporal Regularized Matrix Factorization
@@ -564,7 +558,7 @@ class trmf:
return np.dot(self.F, X_preds)


- def _predict_X(self, h):
+ def _predict_X(self, h): # pragma: no cover
"""Predict X h timepoints ahead.

Evaluates matrix X with the help of matrix W.


tods/tests/test_ContinuityValidation.py (+62, -1)

@@ -59,7 +59,7 @@ class ContinuityValidationTest(unittest.TestCase):
hyperparams_class = ContinuityValidation.ContinuityValidation.metadata.get_hyperparams()
primitive = ContinuityValidation.ContinuityValidation(hyperparams=hyperparams_class.defaults())
new_main = primitive.produce(inputs=main).value
# print(new_main)

expected_output = container.DataFrame({'d3mIndex': [0, 1, 2, 3],
'timestamp': [1., 2., 3., 4.],
@@ -124,6 +124,67 @@ class ContinuityValidationTest(unittest.TestCase):

self._test_continuity(new_main)

hyperparams = hyperparams_class.defaults()
hyperparams = hyperparams.replace({'continuity_option': 'ablation'})
primitive2 = ContinuityValidation.ContinuityValidation(hyperparams=hyperparams)
new_main2 = primitive2.produce(inputs=main).value
print(new_main2)

self.assertEqual(utils.to_json_structure(new_main2.metadata.to_internal_simple_structure()), [{
'selector': [],
'metadata': {
# 'top_level': 'main',
'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
'structural_type': 'd3m.container.pandas.DataFrame',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
'dimension': {
'name': 'rows',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
'length': 2,
},
},
}, {
'selector': ['__ALL_ELEMENTS__'],
'metadata': {
'dimension': {
'name': 'columns',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
'length': 5,
},
},
}, {
'selector': ['__ALL_ELEMENTS__', 0],
'metadata': {
'name': 'd3mIndex',
'structural_type': 'numpy.int64',
},
}, {
'selector': ['__ALL_ELEMENTS__', 1],
'metadata': {
'name': 'timestamp',
'structural_type': 'numpy.float64',
},
}, {
'selector': ['__ALL_ELEMENTS__', 2],
'metadata': {
'name': 'a',
'structural_type': 'numpy.float64',
},
}, {
'selector': ['__ALL_ELEMENTS__', 3],
'metadata': {
'name': 'b',
'structural_type': 'numpy.float64',
},
}, {
'selector': ['__ALL_ELEMENTS__', 4],
'metadata': {
'name': 'ground_truth',
'structural_type': 'numpy.int64',
},
}])



def _test_continuity(self, data_value):
tmp_col = data_value['timestamp']


tods/tests/test_DuplicationValidation.py (+48, -0)

@@ -102,6 +102,54 @@ class DuplicationValidationTest(unittest.TestCase):

self._test_drop_duplication(new_main)

hyperparams = hyperparams_class.defaults()
hyperparams = hyperparams.replace({'keep_option': 'average'})
primitive2 = DuplicationValidation.DuplicationValidation(hyperparams=hyperparams)
new_main2 = primitive2.produce(inputs=main).value
print(new_main2)

self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{
'selector': [],
'metadata': {
# 'top_level': 'main',
'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
'structural_type': 'd3m.container.pandas.DataFrame',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
'dimension': {
'name': 'rows',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
'length': 2,
},
},
}, {
'selector': ['__ALL_ELEMENTS__'],
'metadata': {
'dimension': {
'name': 'columns',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
'length': 3,
},
},
}, {
'selector': ['__ALL_ELEMENTS__', 0],
'metadata': {
'name': 'timestamp',
'structural_type': 'numpy.float64',
},
}, {
'selector': ['__ALL_ELEMENTS__', 1],
'metadata': {
'name': 'a',
'structural_type': 'numpy.float64',
},
}, {
'selector': ['__ALL_ELEMENTS__', 2],
'metadata': {
'name': 'b',
'structural_type': 'numpy.float64',
},
}])


def _test_drop_duplication(self, data_value):
self.assertEqual(True in list(data_value.duplicated('timestamp')), False)

