Former-commit-id:master219f70b1a2
[formerly99f9e1eb69
] [formerlyf5f16ce532
[formerly50352b3ff3
]] [formerly2e5704ad73
[formerly5b29820537
] [formerly431b0ddb22
[formerlyd798a8e4e3
]]] [formerlyb0d5d4d534
[formerly44f4678454
] [formerlyee808e7e44
[formerly9859d0b436
]] [formerly50c9d7cea1
[formerly601849d7bd
] [formerly5345efd370
[formerlye1d4a59360
]]]] [formerly7b34cd8f6c
[formerly9e168f96d0
] [formerlyf2571a9a02
[formerlyf7858302f9
]] [formerlyb1528fd27d
[formerlyc2a264017d
] [formerly1a2af1e1fd
[formerly62d7e9b2ed
]]] [formerlya953785cfa
[formerly27aa0afbf7
] [formerlyeee3141c0a
[formerly74cc095a2a
]] [formerly7a838661c9
[formerly826360a453
] [formerly19b2607cc6
[formerly9497930570
]]]]] [formerly601c051e8f
[formerly13bb1bb849
] [formerlybb89c1bded
[formerly28b683f6e7
]] [formerly68265b09c5
[formerly4ec648ef37
] [formerlyd76b272cc0
[formerly2ee66794ec
]]] [formerly47aab2a490
[formerly659e827756
] [formerly9dc3a20fd5
[formerlyc4d0adb03f
]] [formerlybf4c2ef7ff
[formerlybdec0bac94
] [formerly0cae18b065
[formerly4c3c172e47
]]]] [formerly2972f27f73
[formerly84acd0a1b6
] [formerly232b8924f8
[formerlyd925d6f4d3
]] [formerly5217d26dd9
[formerly658eb7e5c8
] [formerlyc5c3d4f714
[formerly2a0f1ff8c3
]]] [formerly1f89ca49a9
[formerly7ae1036fb3
] [formerly10e5070850
[formerly12ab0b78cd
]] [formerlydb6a58bc98
[formerlydb92fd3772
] [formerly37fb024b8f
[formerlyc23effffc6
]]]]]] Former-commit-id:ef4232d8cb
[formerly77b26e468d
] [formerly3aaaca10c1
[formerlybea844e99d
]] [formerlyde3fd235c3
[formerlyd920adf559
] [formerly13d3b270da
[formerlyd698f233e5
]]] [formerly6d767a08aa
[formerly46e3c38fde
] [formerly75d1c6fb5c
[formerly01103f969d
]] [formerly54a08fcd21
[formerly41a1571a38
] [formerly20dcf58162
[formerly343002311a
]]]] [formerly4feb51d60b
[formerly97764bb9f3
] [formerly57a357434f
[formerlyf61d9c7dba
]] [formerly24e3fc024d
[formerly0d3c84a67f
] [formerly5d363f96a1
[formerly7910b082c3
]]] [formerly741cfa0b09
[formerly6fcfe42121
] [formerly86df6392c0
[formerly82d4124b2b
]] [formerly3120bc060c
[formerly839f6e0c8a
] [formerly37fb024b8f
]]]] Former-commit-id:a817081230
[formerly2cbd88db2f
] [formerly11b1b009d2
[formerlycbc5bd4988
]] [formerly594c594b0c
[formerlyf7b92eb97e
] [formerlyc54cb5cb71
[formerlyb88b1fa18c
]]] [formerly044a2fbb3f
[formerly1efe53be7f
] [formerly99c091f7db
[formerly734e2740f6
]] [formerly9dc8407f7e
[formerly4545ca14d8
] [formerlye1bfd4d1da
[formerly6898b4a6a4
]]]] Former-commit-id:3eaf18de9d
[formerlydd877a066e
] [formerlya4a8ae76c4
[formerlyce174d3cce
]] [formerlya57fb2e9e1
[formerlydf4f539b32
] [formerly48464db993
[formerlyb37110284f
]]] Former-commit-id:07096c4efb
[formerlyef0477b822
] [formerly7944fb187e
[formerly56e573e9ef
]] Former-commit-id:fe6a976ab6
[formerly6ce5c42aa8
] Former-commit-id:da363bbcc3
@@ -1,99 +1,57 @@ | |||||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||||
from numpy import ndarray | |||||
from collections import OrderedDict | |||||
from scipy import sparse | |||||
import os | import os | ||||
import sklearn | import sklearn | ||||
import numpy | import numpy | ||||
import typing | import typing | ||||
import time | |||||
from scipy import sparse | |||||
from numpy import ndarray | |||||
from collections import OrderedDict | |||||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||||
import pandas as pd | |||||
# Custom import commands if any | |||||
import warnings | |||||
import numpy as np | import numpy as np | ||||
import pandas as pd | |||||
import logging, uuid | |||||
from scipy import sparse | |||||
from numpy import ndarray | |||||
from collections import OrderedDict | |||||
from common_primitives import dataframe_utils, utils | |||||
from sklearn.utils import check_array | |||||
from sklearn.exceptions import NotFittedError | |||||
# from numba import njit | |||||
from pyod.utils.utility import argmaxn | |||||
from d3m.container.numpy import ndarray as d3m_ndarray | |||||
from d3m.container import DataFrame as d3m_dataframe | |||||
from d3m.metadata import hyperparams, params, base as metadata_base | |||||
from d3m import utils | from d3m import utils | ||||
from d3m import container | |||||
from d3m.base import utils as base_utils | from d3m.base import utils as base_utils | ||||
from d3m.exceptions import PrimitiveNotFittedError | from d3m.exceptions import PrimitiveNotFittedError | ||||
from d3m.container import DataFrame as d3m_dataframe | |||||
from d3m.container.numpy import ndarray as d3m_ndarray | |||||
from d3m.primitive_interfaces import base, transformer | |||||
from d3m.metadata import base as metadata_base, hyperparams | |||||
from d3m.metadata import hyperparams, params, base as metadata_base | |||||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | from d3m.primitive_interfaces.base import CallResult, DockerContainer | ||||
# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||||
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase | |||||
from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase | |||||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||||
from d3m import exceptions | |||||
import pandas | |||||
import uuid | |||||
from d3m import container, utils as d3m_utils | |||||
from .UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase | |||||
import stumpy | import stumpy | ||||
# from typing import Union | |||||
__all__ = ('MatrixProfile',) | |||||
Inputs = d3m_dataframe | |||||
Outputs = d3m_dataframe | |||||
Inputs = container.DataFrame | |||||
Outputs = container.DataFrame | |||||
class PrimitiveCount: | |||||
primitive_no = 0 | |||||
class Params(Params_ODBase): | |||||
######## Add more Attributes ####### | |||||
pass | |||||
class Hyperparams(hyperparams.Hyperparams): | |||||
window_size = hyperparams.UniformInt( | |||||
lower = 0, | |||||
upper = 100, #TODO: Define the correct the upper bound | |||||
default=50, | |||||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||||
description="window size to calculate" | |||||
) | |||||
# Keep previous | |||||
dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( | |||||
default=None, | |||||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||||
description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", | |||||
) | |||||
use_columns = hyperparams.Set( | |||||
elements=hyperparams.Hyperparameter[int](-1), | |||||
default=(2,), | |||||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||||
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", | |||||
) | |||||
exclude_columns = hyperparams.Set( | |||||
elements=hyperparams.Hyperparameter[int](-1), | |||||
default=(0,1,3,), | |||||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||||
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", | |||||
) | |||||
return_result = hyperparams.Enumeration( | |||||
values=['append', 'replace', 'new'], | |||||
default='new', | |||||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||||
) | |||||
use_semantic_types = hyperparams.UniformBool( | |||||
default=False, | |||||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||||
) | |||||
add_index_columns = hyperparams.UniformBool( | |||||
default=False, | |||||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||||
) | |||||
error_on_no_input = hyperparams.UniformBool( | |||||
default=True, | |||||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||||
) | |||||
return_semantic_type = hyperparams.Enumeration[str]( | |||||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', | |||||
'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], | |||||
default='https://metadata.datadrivendiscovery.org/types/Attribute', | |||||
description='Decides what semantic type to attach to generated attributes', | |||||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||||
) | |||||
class Hyperparams(Hyperparams_ODBase): | |||||
######## Add more Attributes ####### | |||||
pass | |||||
class MP: | class MP: | ||||
""" | """ | ||||
@@ -103,6 +61,28 @@ class MP: | |||||
self._window_size = window_size | self._window_size = window_size | ||||
return | return | ||||
def fit(self, X, y=None): | |||||
"""Fit detector. y is ignored in unsupervised methods. | |||||
Parameters | |||||
---------- | |||||
X : numpy array of shape (n_samples, n_features) | |||||
The input samples. | |||||
y : Ignored | |||||
Not used, present for API consistency by convention. | |||||
Returns | |||||
------- | |||||
self : object | |||||
Fitted estimator. | |||||
""" | |||||
# validate inputs X and y (optional) | |||||
# X = check_array(X) | |||||
# self._set_n_classes(y) | |||||
# self.decision_scores_ = self.decision_function(X) | |||||
# self._process_decision_scores() | |||||
return self | |||||
def produce(self, data): | def produce(self, data): | ||||
""" | """ | ||||
@@ -113,15 +93,47 @@ class MP: | |||||
nparray | nparray | ||||
""" | """ | ||||
""" | |||||
#print(data.shape[0s]) | |||||
rows = data.shape[0] | |||||
columns = data.shape[1] | |||||
convert_data = np.reshape(data, (columns, rows)) | |||||
T_data = data.transpose() | |||||
#print(T_data) | |||||
transformed_columns=utils.pandas.DataFrame() | transformed_columns=utils.pandas.DataFrame() | ||||
for col in data.columns: | |||||
transformed_columns=d3m_dataframe | |||||
print(len(data)) | |||||
for col in range(len(data)): | |||||
output = stumpy.stump(data[col], m = self._window_size) | output = stumpy.stump(data[col], m = self._window_size) | ||||
output = pd.DataFrame(output) | output = pd.DataFrame(output) | ||||
#print("output", output) | |||||
transformed_columns=pd.concat([transformed_columns,output],axis=1) | transformed_columns=pd.concat([transformed_columns,output],axis=1) | ||||
#transformed_columns[col]=output | |||||
#print(transformed_columns) | |||||
return transformed_columns | |||||
# transformed_data = [] | |||||
# for row in T_data: | |||||
# print(row) | |||||
# output = stumpy.stump(row, m = self._window_size) | |||||
# print(output) | |||||
""" | |||||
#input from UODBasePrimitive is np.ndarray not dataframe | |||||
print("data ",type(data)) | |||||
transformed_columns=utils.pandas.DataFrame() | |||||
for col in data: | |||||
print(col) | |||||
output = stumpy.stump(col, m = self._window_size) | |||||
output = pd.DataFrame(output) | |||||
transformed_columns=pd.concat([transformed_columns,output]) | |||||
#print(transformed_columns) | |||||
return transformed_columns | return transformed_columns | ||||
class MatrixProfilePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): | |||||
def predict(self, data): | |||||
return self.produce(data) | |||||
class MatrixProfile(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): | |||||
""" | """ | ||||
A primitive that performs matrix profile on a DataFrame using Stumpy package | A primitive that performs matrix profile on a DataFrame using Stumpy package | ||||
Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html | Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html | ||||
@@ -137,7 +149,7 @@ class MatrixProfilePrimitive(transformer.TransformerPrimitiveBase[Inputs, Output | |||||
ignore_trivial : bool | ignore_trivial : bool | ||||
Set to `True` if this is a self-join. Otherwise, for AB-join, set this | Set to `True` if this is a self-join. Otherwise, for AB-join, set this | ||||
to `False`. Default is `True`. | to `False`. Default is `True`. | ||||
Returns | |||||
Returnsfdsf | |||||
------- | ------- | ||||
out : ndarray | out : ndarray | ||||
The first column consists of the matrix profile, the second column | The first column consists of the matrix profile, the second column | ||||
@@ -147,7 +159,6 @@ class MatrixProfilePrimitive(transformer.TransformerPrimitiveBase[Inputs, Output | |||||
""" | """ | ||||
metadata = metadata_base.PrimitiveMetadata({ | metadata = metadata_base.PrimitiveMetadata({ | ||||
'__author__': "DATA Lab @Texas A&M University", | '__author__': "DATA Lab @Texas A&M University", | ||||
'name': "Matrix Profile", | 'name': "Matrix Profile", | ||||
@@ -163,219 +174,67 @@ class MatrixProfilePrimitive(transformer.TransformerPrimitiveBase[Inputs, Output | |||||
}) | }) | ||||
def __init__(self, *, hyperparams: Hyperparams) -> None: | |||||
super().__init__(hyperparams=hyperparams) | |||||
self._clf = MP(window_size = hyperparams['window_size']) | |||||
self.primitiveNo = PrimitiveCount.primitive_no | |||||
PrimitiveCount.primitive_no+=1 | |||||
def __init__(self, *, | |||||
hyperparams: Hyperparams, # | |||||
random_seed: int = 0, | |||||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: | |||||
self._clf = MP(window_size=hyperparams['window_size']) | |||||
def set_training_data(self, *, inputs: Inputs) -> None: | |||||
""" | """ | ||||
Set training data for outlier detection. | |||||
Args: | Args: | ||||
inputs: Container DataFrame | inputs: Container DataFrame | ||||
timeout: Default | |||||
iterations: Default | |||||
Returns: | Returns: | ||||
Container DataFrame containing Matrix Profile of selected columns | |||||
None | |||||
""" | """ | ||||
super().set_training_data(inputs=inputs) | |||||
# Get cols to fit. | |||||
self._fitted = False | |||||
self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) | |||||
self._input_column_names = self._training_inputs.columns | |||||
if len(self._training_indices) > 0: | |||||
self._fitted = True | |||||
else: # pragma: no cover | |||||
if self.hyperparams['error_on_no_input']: | |||||
raise RuntimeError("No input columns were selected") | |||||
self.logger.warn("No input columns were selected") | |||||
if not self._fitted: # pragma: no cover | |||||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||||
sk_inputs = inputs | |||||
if self.hyperparams['use_semantic_types']: # pragma: no cover | |||||
sk_inputs = inputs.iloc[:, self._training_indices] | |||||
output_columns = [] | |||||
if len(self._training_indices) > 0: | |||||
sk_output = self._clf.produce(sk_inputs) | |||||
if sparse.issparse(sk_output): # pragma: no cover | |||||
sk_output = sk_output.toarray() | |||||
outputs = self._wrap_predictions(inputs, sk_output) | |||||
if len(outputs.columns) == len(self._input_column_names): # pragma: no cover | |||||
outputs.columns = self._input_column_names | |||||
output_columns = [outputs] | |||||
else: # pragma: no cover | |||||
if self.hyperparams['error_on_no_input']: | |||||
raise RuntimeError("No input columns were selected") | |||||
self.logger.warn("No input columns were selected") | |||||
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||||
add_index_columns=self.hyperparams['add_index_columns'], | |||||
inputs=inputs, column_indices=self._training_indices, | |||||
columns_list=output_columns) | |||||
#print(outputs.columns) | |||||
#outputs.columns = [str(x) for x in outputs.columns] | |||||
return CallResult(outputs) | |||||
def _update_metadata(self, outputs): # pragma: no cover | |||||
outputs.metadata = outputs.metadata.generate(outputs) | |||||
@classmethod | |||||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover | |||||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||||
""" | """ | ||||
Fit model with training data. | |||||
Args: | |||||
*: Container DataFrame. Time series data up to fit. | |||||
Select columns to fit. | |||||
Args: | |||||
inputs: Container DataFrame | |||||
hyperparams: d3m.metadata.hyperparams.Hyperparams | |||||
Returns: | |||||
list | |||||
""" | |||||
if not hyperparams['use_semantic_types']: | |||||
return inputs, list(range(len(inputs.columns))) | |||||
inputs_metadata = inputs.metadata | |||||
def can_produce_column(column_index: int) -> bool: | |||||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||||
use_columns=hyperparams['use_columns'], | |||||
exclude_columns=hyperparams['exclude_columns'], | |||||
can_use_column=can_produce_column) | |||||
""" | |||||
Encountered error: when hyperparams['use_columns'] = (2,3) and hyperparams['exclude_columns'] is (1,2) | |||||
columns_to_produce is still [2] | |||||
""" | |||||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||||
@classmethod | |||||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: # pragma: no cover | |||||
Returns: | |||||
None | |||||
""" | """ | ||||
return super().fit() | |||||
Output whether a column can be processed. | |||||
Args: | |||||
inputs_metadata: d3m.metadata.base.DataMetadata | |||||
column_index: int | |||||
Returns: | |||||
bool | |||||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||||
""" | """ | ||||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||||
accepted_structural_types = (int, float, np.integer, np.float64) #changed numpy to np | |||||
accepted_semantic_types = set() | |||||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||||
return False | |||||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||||
if len(semantic_types) == 0: | |||||
cls.logger.warning("No semantic types found in column metadata") | |||||
return False | |||||
# Making sure all accepted_semantic_types are available in semantic_types | |||||
if len(accepted_semantic_types - semantic_types) == 0: | |||||
return True | |||||
return False | |||||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||||
""" | |||||
Wrap predictions into dataframe | |||||
Process the testing data. | |||||
Args: | Args: | ||||
inputs: Container Dataframe | |||||
predictions: array-like data (n_samples, n_features) | |||||
inputs: Container DataFrame. Time series data up to outlier detection. | |||||
Returns: | Returns: | ||||
Dataframe | |||||
Container DataFrame | |||||
1 marks Outliers, 0 marks normal. | |||||
""" | """ | ||||
print("inputs ",type(inputs)) | |||||
return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) | |||||
outputs = d3m_dataframe(predictions, generate_metadata=True) | |||||
target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams, self.primitiveNo) | |||||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) | |||||
return outputs | |||||
@classmethod | |||||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||||
def get_params(self) -> Params: | |||||
""" | """ | ||||
Return parameters. | |||||
Args: | |||||
None | |||||
Updata metadata for selected columns. | |||||
Args: | |||||
inputs_metadata: metadata_base.DataMetadata | |||||
outputs: Container Dataframe | |||||
target_columns_metadata: list | |||||
Returns: | |||||
d3m.metadata.base.DataMetadata | |||||
Returns: | |||||
class Params | |||||
""" | """ | ||||
return super().get_params() | |||||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||||
column_metadata.pop("structural_type", None) | |||||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||||
return outputs_metadata | |||||
@classmethod | |||||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo): | |||||
def set_params(self, *, params: Params) -> None: | |||||
""" | """ | ||||
Add target columns metadata | |||||
Set parameters for outlier detection. | |||||
Args: | Args: | ||||
outputs_metadata: metadata.base.DataMetadata | |||||
hyperparams: d3m.metadata.hyperparams.Hyperparams | |||||
params: class Params | |||||
Returns: | Returns: | ||||
List[OrderedDict] | |||||
None | |||||
""" | """ | ||||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||||
target_columns_metadata: List[OrderedDict] = [] | |||||
for column_index in range(outputs_length): | |||||
column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index) | |||||
column_metadata = OrderedDict() | |||||
semantic_types = set() | |||||
semantic_types.add(hyperparams["return_semantic_type"]) | |||||
column_metadata['semantic_types'] = list(semantic_types) | |||||
column_metadata["name"] = str(column_name) | |||||
target_columns_metadata.append(column_metadata) | |||||
return target_columns_metadata | |||||
super().set_params(params=params) |
@@ -0,0 +1,381 @@ | |||||
import os | |||||
import sklearn | |||||
import numpy | |||||
import typing | |||||
import time | |||||
from scipy import sparse | |||||
from numpy import ndarray | |||||
from collections import OrderedDict | |||||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||||
import numpy as np | |||||
import pandas as pd | |||||
import logging, uuid | |||||
from scipy import sparse | |||||
from numpy import ndarray | |||||
from collections import OrderedDict | |||||
from common_primitives import dataframe_utils, utils | |||||
from d3m import utils | |||||
from d3m import container | |||||
from d3m.base import utils as base_utils | |||||
from d3m.exceptions import PrimitiveNotFittedError | |||||
from d3m.container import DataFrame as d3m_dataframe | |||||
from d3m.container.numpy import ndarray as d3m_ndarray | |||||
from d3m.primitive_interfaces import base, transformer | |||||
from d3m.metadata import base as metadata_base, hyperparams | |||||
from d3m.metadata import hyperparams, params, base as metadata_base | |||||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||||
import stumpy | |||||
# Public API of this module: the MatrixProfile primitive class.
__all__ = ('MatrixProfile',)

# Primitive input/output types are d3m container DataFrames.
Inputs = container.DataFrame
Outputs = container.DataFrame
class PrimitiveCount:
    # Module-global counter shared across all MatrixProfile instances.
    # Each instance records the current value as its primitiveNo in
    # __init__ and then increments it, presumably so generated output
    # column names are unique per instance — see __init__.
    primitive_no = 0
class Hyperparams(hyperparams.Hyperparams):
    """Hyperparameters for the MatrixProfile primitive.

    ``window_size`` is the stumpy subsequence length; the remaining
    hyperparams are the standard column-selection / output-combination
    controls used by common-primitives-style wrappers.
    """
    # Subsequence window length `m` handed to stumpy.stump.
    window_size = hyperparams.UniformInt(
        lower = 0,
        upper = 100, #TODO: Define the correct the upper bound
        default=50,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="window size to calculate"
    )

    # Keep previous
    dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]](
        default=None,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.",
    )
    # Explicit column whitelist; takes precedence over exclude_columns.
    use_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(2,),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
    )
    # Column blacklist; honored only when use_columns is empty.
    exclude_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(0,1,3,),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
    )
    # How computed columns are merged back into the input frame.
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
    )
    # Semantic type attached to every generated output column.
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute',
                'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
        default='https://metadata.datadrivendiscovery.org/types/Attribute',
        description='Decides what semantic type to attach to generated attributes',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )
class MP:
    """
    Thin wrapper around stumpy's matrix profile computation.

    For every column of the input frame it runs ``stumpy.stump`` with the
    configured window size and concatenates the per-column results
    column-wise into one DataFrame.
    """
    def __init__(self, window_size):
        # Subsequence length `m` passed to stumpy.stump for every column.
        self._window_size = window_size
        return

    def produce(self, data):
        """
        Compute the matrix profile for each column of ``data``.

        Args:
            data: DataFrame whose columns are numeric time series.
                  (assumes each column is long enough for the configured
                  window size — TODO confirm upstream validation)

        Returns:
            pandas DataFrame: per-column stump outputs (matrix profile,
            profile index, left/right indices) concatenated along axis 1.
        """
        # Use pandas directly. The original wrote `utils.pandas.DataFrame()`,
        # reaching pandas through the `utils` name — which is shadowed at
        # file scope by multiple imports (common_primitives.utils, d3m.utils),
        # so the attribute access was fragile and order-dependent.
        transformed_columns = pd.DataFrame()
        for col in data.columns:
            output = stumpy.stump(data[col], m=self._window_size)
            output = pd.DataFrame(output)
            transformed_columns = pd.concat([transformed_columns, output], axis=1)
        return transformed_columns
class MatrixProfile(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): | |||||
""" | |||||
A primitive that performs matrix profile on a DataFrame using Stumpy package | |||||
Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html | |||||
Parameters | |||||
---------- | |||||
T_A : ndarray | |||||
The time series or sequence for which to compute the matrix profile | |||||
m : int | |||||
Window size | |||||
T_B : ndarray | |||||
The time series or sequence that contain your query subsequences | |||||
of interest. Default is `None` which corresponds to a self-join. | |||||
ignore_trivial : bool | |||||
Set to `True` if this is a self-join. Otherwise, for AB-join, set this | |||||
to `False`. Default is `True`. | |||||
Returns | |||||
------- | |||||
out : ndarray | |||||
The first column consists of the matrix profile, the second column | |||||
consists of the matrix profile indices, the third column consists of | |||||
the left matrix profile indices, and the fourth column consists of | |||||
the right matrix profile indices. | |||||
""" | |||||
    # D3M primitive metadata: registration path, provenance, and tuning hints.
    metadata = metadata_base.PrimitiveMetadata({
        '__author__': "DATA Lab @Texas A&M University",
        'name': "Matrix Profile",
        #'python_path': 'd3m.primitives.tods.feature_analysis.matrix_profile',
        'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile',
        # NOTE(review): "Taxes" below looks like a typo for "Texas" — confirm
        # before changing, since this string may be matched elsewhere.
        'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu',
                   'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/MatrixProfile.py']},
        'algorithm_types': [metadata_base.PrimitiveAlgorithmType.MATRIX_PROFILE,],
        'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
        # Deterministic id derived from the primitive name.
        'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')),
        'hyperparams_to_tune': ['window_size'],
        'version': '0.0.2',
    })
def __init__(self, *, hyperparams: Hyperparams) -> None: | |||||
super().__init__(hyperparams=hyperparams) | |||||
self._clf = MP(window_size = hyperparams['window_size']) | |||||
self.primitiveNo = PrimitiveCount.primitive_no | |||||
PrimitiveCount.primitive_no+=1 | |||||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    """
    Compute the Matrix Profile of the selected columns of the input frame.

    Args:
        inputs: Container DataFrame.
        timeout: Unused; accepted for D3M interface compatibility.
        iterations: Unused; accepted for D3M interface compatibility.

    Returns:
        CallResult wrapping a container DataFrame that contains the Matrix
        Profile of the selected columns, combined with the input columns
        according to the 'return_result' hyperparameter.

    Raises:
        RuntimeError: if no input columns were selected and
            'error_on_no_input' is enabled.
        PrimitiveNotFittedError: if column selection yielded no columns.
    """
    # Column selection happens here, so produce (re)establishes the fitted
    # state on every call.
    self._fitted = False
    self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
    self._input_column_names = self._training_inputs.columns
    if len(self._training_indices) > 0:
        self._fitted = True
    else:  # pragma: no cover
        if self.hyperparams['error_on_no_input']:
            raise RuntimeError("No input columns were selected")
        # logger.warn is a deprecated alias of logger.warning; use warning
        # (consistent with _can_produce_column).
        self.logger.warning("No input columns were selected")
    if not self._fitted:  # pragma: no cover
        raise PrimitiveNotFittedError("Primitive not fitted.")
    sk_inputs = inputs
    if self.hyperparams['use_semantic_types']:  # pragma: no cover
        sk_inputs = inputs.iloc[:, self._training_indices]
    output_columns = []
    if len(self._training_indices) > 0:
        sk_output = self._clf.produce(sk_inputs)
        # Densify a sparse result before wrapping it in a DataFrame.
        if sparse.issparse(sk_output):  # pragma: no cover
            sk_output = sk_output.toarray()
        outputs = self._wrap_predictions(inputs, sk_output)
        if len(outputs.columns) == len(self._input_column_names):  # pragma: no cover
            outputs.columns = self._input_column_names
        output_columns = [outputs]
    else:  # pragma: no cover
        if self.hyperparams['error_on_no_input']:
            raise RuntimeError("No input columns were selected")
        self.logger.warning("No input columns were selected")
    outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                         add_index_columns=self.hyperparams['add_index_columns'],
                                         inputs=inputs, column_indices=self._training_indices,
                                         columns_list=output_columns)
    return CallResult(outputs)
def _update_metadata(self, outputs): # pragma: no cover | |||||
outputs.metadata = outputs.metadata.generate(outputs) | |||||
@classmethod | |||||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover | |||||
""" | |||||
Select columns to fit. | |||||
Args: | |||||
inputs: Container DataFrame | |||||
hyperparams: d3m.metadata.hyperparams.Hyperparams | |||||
Returns: | |||||
list | |||||
""" | |||||
if not hyperparams['use_semantic_types']: | |||||
return inputs, list(range(len(inputs.columns))) | |||||
inputs_metadata = inputs.metadata | |||||
def can_produce_column(column_index: int) -> bool: | |||||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||||
use_columns=hyperparams['use_columns'], | |||||
exclude_columns=hyperparams['exclude_columns'], | |||||
can_use_column=can_produce_column) | |||||
""" | |||||
Encountered error: when hyperparams['use_columns'] = (2,3) and hyperparams['exclude_columns'] is (1,2) | |||||
columns_to_produce is still [2] | |||||
""" | |||||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||||
@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:  # pragma: no cover
    """
    Decide whether a single column is eligible for processing.

    A column qualifies when its structural type is numeric and its metadata
    carries every required semantic type.

    Args:
        inputs_metadata: d3m.metadata.base.DataMetadata.
        column_index: int.

    Returns:
        bool: True when the column can be processed.
    """
    column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

    numeric_types = (int, float, np.integer, np.float64)
    if not issubclass(column_metadata['structural_type'], numeric_types):
        return False

    semantic_types = set(column_metadata.get('semantic_types', []))
    if not semantic_types:
        cls.logger.warning("No semantic types found in column metadata")
        return False

    # Every required semantic type must be present on the column.
    required_semantic_types = {"https://metadata.datadrivendiscovery.org/types/Attribute"}
    return required_semantic_types.issubset(semantic_types)
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
    """
    Wrap raw predictions into a container DataFrame with proper metadata.

    Args:
        inputs: Container DataFrame.
        predictions: array-like data of shape (n_samples, n_features).

    Returns:
        Container DataFrame with generated and updated metadata.
    """
    wrapped = d3m_dataframe(predictions, generate_metadata=True)
    columns_metadata = self._add_target_columns_metadata(wrapped.metadata, self.hyperparams, self.primitiveNo)
    wrapped.metadata = self._update_predictions_metadata(inputs.metadata, wrapped, columns_metadata)
    return wrapped
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                 target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
    """
    Update metadata for the selected output columns.

    Args:
        inputs_metadata: metadata_base.DataMetadata.
        outputs: Container DataFrame.
        target_columns_metadata: list of per-column metadata dicts.

    Returns:
        d3m.metadata.base.DataMetadata.
    """
    outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)
    for index, column_metadata in enumerate(target_columns_metadata):
        # structural_type is removed (if present) before the column update.
        column_metadata.pop("structural_type", None)
        outputs_metadata = outputs_metadata.update_column(index, column_metadata)
    return outputs_metadata
@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo):
    """
    Build per-column metadata for the produced output columns.

    Args:
        outputs_metadata: metadata.base.DataMetadata.
        hyperparams: d3m.metadata.hyperparams.Hyperparams.
        primitiveNo: identifier folded into each generated column name.

    Returns:
        List[OrderedDict]: one metadata dict per output column.
    """
    n_columns = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
    # The primitive's registered name is invariant; look it up once.
    primitive_name = cls.metadata.query()['name']
    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(n_columns):
        column_metadata = OrderedDict()
        column_metadata['semantic_types'] = [hyperparams["return_semantic_type"]]
        column_metadata["name"] = "{0}{1}_{2}".format(primitive_name, primitiveNo, column_index)
        target_columns_metadata.append(column_metadata)
    return target_columns_metadata
hyperparams_class = MatrixProfilePrimitive.metadata.get_hyperparams()
hyperparams = hyperparams_class.defaults()
hyperparams = hyperparams.replace({'window_size': 3})
primitive = MatrixProfilePrimitive(hyperparams=hyperparams)
new_main = primitive.produce(inputs=main).value
print(new_main)