From 46fb9aa7728cefb2e645ae78e83f6963429012c5 Mon Sep 17 00:00:00 2001
From: YileAllenChen1
Date: Sun, 8 Nov 2020 12:24:09 -0600
Subject: [PATCH] minor update Matrix Profile

---
 tods/detection_algorithm/MatrixProfile.py     | 397 +++++++--------------
 tods/detection_algorithm/MatrixProfile2.py    | 381 ++++++++++++++++++++
 .../detection_algorithm/test_MatrixProfile.py |   7 +
 3 files changed, 516 insertions(+), 269 deletions(-)
 create mode 100644 tods/detection_algorithm/MatrixProfile2.py

diff --git a/tods/detection_algorithm/MatrixProfile.py b/tods/detection_algorithm/MatrixProfile.py
index f306012..53c3e2f 100644
--- a/tods/detection_algorithm/MatrixProfile.py
+++ b/tods/detection_algorithm/MatrixProfile.py
@@ -1,99 +1,57 @@
+from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
+from numpy import ndarray
+from collections import OrderedDict
+from scipy import sparse
 import os
 import sklearn
 import numpy
 import typing
-import time
-from scipy import sparse
-from numpy import ndarray
-from collections import OrderedDict
-from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
+import pandas as pd
+# Custom import commands if any
+import warnings
 import numpy as np
-import pandas as pd
-import logging, uuid
-from scipy import sparse
-from numpy import ndarray
-from collections import OrderedDict
-from common_primitives import dataframe_utils, utils
+from sklearn.utils import check_array
+from sklearn.exceptions import NotFittedError
+# from numba import njit
+from pyod.utils.utility import argmaxn
+from d3m.container.numpy import ndarray as d3m_ndarray
+from d3m.container import DataFrame as d3m_dataframe
+from d3m.metadata import hyperparams, params, base as metadata_base
 from d3m import utils
-from d3m import container
 from d3m.base import utils as base_utils
 from d3m.exceptions import PrimitiveNotFittedError
-from d3m.container import DataFrame as d3m_dataframe
-from d3m.container.numpy import ndarray as d3m_ndarray
-from d3m.primitive_interfaces import base, transformer
-from d3m.metadata import base as metadata_base, hyperparams
-from d3m.metadata import hyperparams, params, base as metadata_base
 from d3m.primitive_interfaces.base import CallResult, DockerContainer
+# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
+from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase
+from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase
+
+from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
+from d3m import exceptions
+import pandas
+import uuid
+
+from d3m import container, utils as d3m_utils
+
+from .UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase
 import stumpy
+# from typing import Union
 
-__all__ = ('MatrixProfile',)
+Inputs = d3m_dataframe
+Outputs = d3m_dataframe
 
-Inputs = container.DataFrame
-Outputs = container.DataFrame
 
-class PrimitiveCount:
-    primitive_no = 0
+class Params(Params_ODBase):
+    ######## Add more Attributes #######
+    pass
 
-class Hyperparams(hyperparams.Hyperparams):
-    window_size = hyperparams.UniformInt(
-        lower = 0,
-        upper = 100, #TODO: Define the correct the upper bound
-        default=50,
-        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
-        description="window size to calculate"
-    )
-
-    # Keep previous
-    dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]](
-        default=None,
-        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
-        description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.",
-    )
-    use_columns = hyperparams.Set(
-        elements=hyperparams.Hyperparameter[int](-1),
-        default=(2,),
-        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
-        description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
-    )
-    exclude_columns = hyperparams.Set(
-        elements=hyperparams.Hyperparameter[int](-1),
-        default=(0,1,3,),
-        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
-        description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
-    )
-    return_result = hyperparams.Enumeration(
-        values=['append', 'replace', 'new'],
-        default='new',
-        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
-        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
-    )
-    use_semantic_types = hyperparams.UniformBool(
-        default=False,
-        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
-        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
-    )
-    add_index_columns = hyperparams.UniformBool(
-        default=False,
-        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
-        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
-    )
-    error_on_no_input = hyperparams.UniformBool(
-        default=True,
-        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
-        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
-    )
-    return_semantic_type = hyperparams.Enumeration[str](
-        values=['https://metadata.datadrivendiscovery.org/types/Attribute',
-                'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
-        default='https://metadata.datadrivendiscovery.org/types/Attribute',
-        description='Decides what semantic type to attach to generated attributes',
-        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
-    )
+class Hyperparams(Hyperparams_ODBase):
+    ######## Add more Attributes #######
+    pass
 
 class MP:
     """
@@ -103,6 +61,28 @@ class MP:
         self._window_size = window_size
         return
 
+    def fit(self, X, y=None):
+        """Fit detector. y is ignored in unsupervised methods.
+        Parameters
+        ----------
+        X : numpy array of shape (n_samples, n_features)
+            The input samples.
+        y : Ignored
+            Not used, present for API consistency by convention.
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+
+        # validate inputs X and y (optional)
+        # X = check_array(X)
+        # self._set_n_classes(y)
+        # self.decision_scores_ = self.decision_function(X)
+        # self._process_decision_scores()
+
+        return self
+
     def produce(self, data):
 
         """
@@ -113,15 +93,47 @@ class MP:
             nparray
 
        """
-        transformed_columns=utils.pandas.DataFrame()
-        for col in data.columns:
-            output = stumpy.stump(data[col], m = self._window_size)
+        # Input arrives from UODBasePrimitive as a np.ndarray, not a DataFrame:
+        # compute the matrix profile of each 1-D slice and stack the results.
+        transformed_columns = pd.DataFrame()
+        for col in data:
+            output = stumpy.stump(col, m = self._window_size)
             output = pd.DataFrame(output)
-            transformed_columns=pd.concat([transformed_columns,output],axis=1)
+            transformed_columns = pd.concat([transformed_columns, output])
         return transformed_columns
 
+    def predict(self, data):
+        return self.produce(data)
+
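+    # Usage sketch (illustrative only, not part of the primitive API; the toy
+    # `series` below is hypothetical). stumpy.stump returns len(series) - m + 1
+    # rows with four columns: profile distances, profile indices, left indices,
+    # and right indices.
+    #
+    #   import numpy as np
+    #   mp = MP(window_size=3)
+    #   series = np.array([1., 2., 3., 4., 3., 2., 1., 2., 3., 4.])
+    #   profile = mp.produce(np.array([series]))  # one slice -> one stacked block
+    #   # profile has (10 - 3 + 1) = 8 rows and 4 columns
+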
-class MatrixProfilePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+class MatrixProfile(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]):
     """
+    A primitive that computes the matrix profile on a DataFrame using the Stumpy package.
     Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html
@@ -137,7 +149,7 @@ class MatrixProfilePrimitive(transformer.TransformerPrimitiveBase[Inputs, Output
     ignore_trivial : bool
         Set to `True` if this is a self-join. Otherwise, for AB-join, set this
         to `False`. Default is `True`.
     Returns
     -------
     out : ndarray
         The first column consists of the matrix profile, the second column
@@ -147,7 +159,6 @@ class MatrixProfilePrimitive(transformer.TransformerPrimitiveBase[Inputs, Output
     """
-
     metadata = metadata_base.PrimitiveMetadata({
         '__author__': "DATA Lab @Texas A&M University",
         'name': "Matrix Profile",
@@ -163,219 +174,67 @@ class MatrixProfilePrimitive(transformer.TransformerPrimitiveBase[Inputs, Output
     })
 
-    def __init__(self, *, hyperparams: Hyperparams) -> None:
-        super().__init__(hyperparams=hyperparams)
-        self._clf = MP(window_size = hyperparams['window_size'])
-        self.primitiveNo = PrimitiveCount.primitive_no
-        PrimitiveCount.primitive_no+=1
+    def __init__(self, *,
+                 hyperparams: Hyperparams, #
+                 random_seed: int = 0,
+                 docker_containers: Dict[str, DockerContainer] = None) -> None:
+        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
 
-    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
+        self._clf = MP(window_size=hyperparams['window_size'])
 
+    def set_training_data(self, *, inputs: Inputs) -> None:
         """
-
+        Set training data for outlier detection.
         Args:
-            inputs: Container DataFrame
-            timeout: Default
-
-            iterations: Default
-
+            inputs: Container DataFrame
         Returns:
-
-            Container DataFrame containing Matrix Profile of selected columns
-
+            None
         """
+        super().set_training_data(inputs=inputs)
 
-        # Get cols to fit.
-        self._fitted = False
-        self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
-        self._input_column_names = self._training_inputs.columns
-
-        if len(self._training_indices) > 0:
-            self._fitted = True
-        else: # pragma: no cover
-            if self.hyperparams['error_on_no_input']:
-                raise RuntimeError("No input columns were selected")
-            self.logger.warn("No input columns were selected")
-
-        if not self._fitted: # pragma: no cover
-            raise PrimitiveNotFittedError("Primitive not fitted.")
-
-        sk_inputs = inputs
-        if self.hyperparams['use_semantic_types']: # pragma: no cover
-            sk_inputs = inputs.iloc[:, self._training_indices]
-        output_columns = []
-        if len(self._training_indices) > 0:
-            sk_output = self._clf.produce(sk_inputs)
-            if sparse.issparse(sk_output): # pragma: no cover
-                sk_output = sk_output.toarray()
-            outputs = self._wrap_predictions(inputs, sk_output)
-
-            if len(outputs.columns) == len(self._input_column_names): # pragma: no cover
-                outputs.columns = self._input_column_names
-            output_columns = [outputs]
-
-        else: # pragma: no cover
-            if self.hyperparams['error_on_no_input']:
-                raise RuntimeError("No input columns were selected")
-            self.logger.warn("No input columns were selected")
-
-        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
-                                             add_index_columns=self.hyperparams['add_index_columns'],
-                                             inputs=inputs, column_indices=self._training_indices,
-                                             columns_list=output_columns)
-
-        #print(outputs.columns)
-        #outputs.columns = [str(x) for x in outputs.columns]
-
-        return CallResult(outputs)
-
-    def _update_metadata(self, outputs): # pragma: no cover
-        outputs.metadata = outputs.metadata.generate(outputs)
-
-    @classmethod
-    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover
-
-        """
-
-        Select columns to fit.
-        Args:
-            inputs: Container DataFrame
-            hyperparams: d3m.metadata.hyperparams.Hyperparams
-
-        Returns:
-            list
-
-        """
-
-        if not hyperparams['use_semantic_types']:
-            return inputs, list(range(len(inputs.columns)))
-
-        inputs_metadata = inputs.metadata
-
-        def can_produce_column(column_index: int) -> bool:
-            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)
-
-        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
-                                                                                   use_columns=hyperparams['use_columns'],
-                                                                                   exclude_columns=hyperparams['exclude_columns'],
-                                                                                   can_use_column=can_produce_column)
-
-        """
-        Encountered error: when hyperparams['use_columns'] = (2,3) and hyperparams['exclude_columns'] is (1,2)
-        columns_to_produce is still [2]
-        """
-        return inputs.iloc[:, columns_to_produce], columns_to_produce
-
-    @classmethod
-    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: # pragma: no cover
-
+    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
         """
+        Fit model with training data.
+        Args:
+            *: Container DataFrame. Time series data up to fit.
 
-        Output whether a column can be processed.
-        Args:
-            inputs_metadata: d3m.metadata.base.DataMetadata
-            column_index: int
-
-        Returns:
-            bool
-
+        Returns:
+            None
         """
+        return super().fit()
 
-        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
-
-        accepted_structural_types = (int, float, np.integer, np.float64) #changed numpy to np
-        accepted_semantic_types = set()
-        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
-
-        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
-            return False
-
-        semantic_types = set(column_metadata.get('semantic_types', []))
-
-        if len(semantic_types) == 0:
-            cls.logger.warning("No semantic types found in column metadata")
-            return False
-
-        # Making sure all accepted_semantic_types are available in semantic_types
-        if len(accepted_semantic_types - semantic_types) == 0:
-            return True
-
-        return False
-
-    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
-
-        """
-
-        Wrap predictions into dataframe
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
+        """
+        Process the testing data.
         Args:
-            inputs: Container Dataframe
-            predictions: array-like data (n_samples, n_features)
+            inputs: Container DataFrame. Time series data up to outlier detection.
 
         Returns:
-            Dataframe
-
+            Container DataFrame
+            1 marks Outliers, 0 marks normal.
         """
-
+        return super().produce(inputs=inputs, timeout=timeout, iterations=iterations)
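+    # Hypothetical pipeline-level call (a sketch, not code this patch adds to
+    # any pipeline; `df` is an assumed d3m container DataFrame):
+    #   result = primitive.produce(inputs=df).value
+    #   # result carries one flag per row: 1 marks an outlier, 0 marks normal.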
-        outputs = d3m_dataframe(predictions, generate_metadata=True)
-        target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams, self.primitiveNo)
-        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
-        return outputs
-
-    @classmethod
-    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
-                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
-
+    def get_params(self) -> Params:
         """
+        Return parameters.
+        Args:
+            None
 
-        Updata metadata for selected columns.
-        Args:
-            inputs_metadata: metadata_base.DataMetadata
-            outputs: Container Dataframe
-            target_columns_metadata: list
-
-        Returns:
-            d3m.metadata.base.DataMetadata
-
+        Returns:
+            class Params
         """
+        return super().get_params()
 
-        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)
-
-        for column_index, column_metadata in enumerate(target_columns_metadata):
-            column_metadata.pop("structural_type", None)
-            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)
-
-        return outputs_metadata
-
-    @classmethod
-    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo):
+    def set_params(self, *, params: Params) -> None:
         """
-        Add target columns metadata
+        Set parameters for outlier detection.
         Args:
-            outputs_metadata: metadata.base.DataMetadata
-            hyperparams: d3m.metadata.hyperparams.Hyperparams
+            params: class Params
 
         Returns:
-            List[OrderedDict]
+            None
         """
-        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
-        target_columns_metadata: List[OrderedDict] = []
-        for column_index in range(outputs_length):
-            column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index)
-            column_metadata = OrderedDict()
-            semantic_types = set()
-            semantic_types.add(hyperparams["return_semantic_type"])
-            column_metadata['semantic_types'] = list(semantic_types)
-
-            column_metadata["name"] = str(column_name)
-            target_columns_metadata.append(column_metadata)
-        return target_columns_metadata
+        super().set_params(params=params)
diff --git a/tods/detection_algorithm/MatrixProfile2.py b/tods/detection_algorithm/MatrixProfile2.py
new file mode 100644
index 0000000..7d2d244
--- /dev/null
+++ b/tods/detection_algorithm/MatrixProfile2.py
@@ -0,0 +1,381 @@
+import os
+import sklearn
+import numpy
+import typing
+import time
+from scipy import sparse
+from numpy import ndarray
+from collections import OrderedDict
+from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
+
+import numpy as np
+import pandas as pd
+import logging, uuid
+from scipy import sparse
+from numpy import ndarray
+from collections import OrderedDict
+from common_primitives import dataframe_utils, utils
+
+from d3m import utils
+from d3m import container
+from d3m.base import utils as base_utils
+from d3m.exceptions import PrimitiveNotFittedError
+from d3m.container import DataFrame as d3m_dataframe
+from d3m.container.numpy import ndarray as d3m_ndarray
+from d3m.primitive_interfaces import base, transformer
+from d3m.metadata import base as metadata_base, hyperparams
+from d3m.metadata import hyperparams, params, base as metadata_base
+from d3m.primitive_interfaces.base import CallResult, DockerContainer
+
+import stumpy
+
+__all__ = ('MatrixProfile',)
+
+Inputs = container.DataFrame
+Outputs = container.DataFrame
+
+class PrimitiveCount:
+    primitive_no = 0
+
+
+class Hyperparams(hyperparams.Hyperparams):
+    window_size = hyperparams.UniformInt(
+        lower = 0,
+        upper = 100, # TODO: define the correct upper bound
+        default=50,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Window size used to compute the matrix profile"
+    )
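+    # Sizing sketch (assumed stumpy semantics): for a series of length n and a
+    # window of size m, the profile has n - m + 1 entries; e.g. n=100 with the
+    # default m=50 yields 51 profile rows.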
+
+    # Keep previous
+    dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]](
+        default=None,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.",
+    )
+    use_columns = hyperparams.Set(
+        elements=hyperparams.Hyperparameter[int](-1),
+        default=(2,),
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
+    )
+    exclude_columns = hyperparams.Set(
+        elements=hyperparams.Hyperparameter[int](-1),
+        default=(0,1,3,),
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
+    )
+    return_result = hyperparams.Enumeration(
+        values=['append', 'replace', 'new'],
+        default='new',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
+    )
+    use_semantic_types = hyperparams.UniformBool(
+        default=False,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
+    )
+    add_index_columns = hyperparams.UniformBool(
+        default=False,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
+    )
+    error_on_no_input = hyperparams.UniformBool(
+        default=True,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
+    )
+    return_semantic_type = hyperparams.Enumeration[str](
+        values=['https://metadata.datadrivendiscovery.org/types/Attribute',
+                'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
+        default='https://metadata.datadrivendiscovery.org/types/Attribute',
+        description='Decides what semantic type to attach to generated attributes',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
+    )
+
+
+class MP:
+    """
+    This is the class for matrix profile function
+    """
+    def __init__(self, window_size):
+        self._window_size = window_size
+        return
+
+    def produce(self, data):
+
+        """
+
+        Args:
+            data: dataframe column
+        Returns:
+            nparray
+
+        """
+        transformed_columns=utils.pandas.DataFrame()
+        for col in data.columns:
+            output = stumpy.stump(data[col], m = self._window_size)
+            output = pd.DataFrame(output)
+            transformed_columns=pd.concat([transformed_columns,output],axis=1)
+        return transformed_columns
+
+class MatrixProfile(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+    """
+    A primitive that computes the matrix profile on a DataFrame using the Stumpy package.
+    Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html
+
+    Parameters
+    ----------
+    T_A : ndarray
+        The time series or sequence for which to compute the matrix profile
+    m : int
+        Window size
+    T_B : ndarray
+        The time series or sequence that contain your query subsequences
+        of interest. Default is `None` which corresponds to a self-join.
+    ignore_trivial : bool
+        Set to `True` if this is a self-join. Otherwise, for AB-join, set this
+        to `False`. Default is `True`.
+    Returns
+    -------
+    out : ndarray
+        The first column consists of the matrix profile, the second column
+        consists of the matrix profile indices, the third column consists of
+        the left matrix profile indices, and the fourth column consists of
+        the right matrix profile indices.
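+
+    Examples
+    --------
+    Illustrative sketch only (the toy `series` below is hypothetical)::
+
+        >>> import numpy as np
+        >>> import stumpy
+        >>> series = np.array([1., 2., 3., 4., 3., 2., 1., 2., 3., 4.])
+        >>> stumpy.stump(series, m=3).shape
+        (8, 4)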
+
+    """
+
+
+    metadata = metadata_base.PrimitiveMetadata({
+        '__author__': "DATA Lab @Texas A&M University",
+        'name': "Matrix Profile",
+        #'python_path': 'd3m.primitives.tods.feature_analysis.matrix_profile',
+        'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile',
+        'source': {'name': "DATA Lab @Texas A&M University", 'contact': 'mailto:khlai037@tamu.edu',
+                   'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/MatrixProfile.py']},
+        'algorithm_types': [metadata_base.PrimitiveAlgorithmType.MATRIX_PROFILE,],
+        'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
+        'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')),
+        'hyperparams_to_tune': ['window_size'],
+        'version': '0.0.2',
+    })
+
+
+    def __init__(self, *, hyperparams: Hyperparams) -> None:
+        super().__init__(hyperparams=hyperparams)
+        self._clf = MP(window_size = hyperparams['window_size'])
+        self.primitiveNo = PrimitiveCount.primitive_no
+        PrimitiveCount.primitive_no+=1
+
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
+
+        """
+
+        Args:
+
+            inputs: Container DataFrame
+
+            timeout: Default
+
+            iterations: Default
+
+        Returns:
+
+            Container DataFrame containing Matrix Profile of selected columns
+
+        """
+
+        # Get cols to fit.
+        self._fitted = False
+        self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
+        self._input_column_names = self._training_inputs.columns
+
+        if len(self._training_indices) > 0:
+            self._fitted = True
+        else: # pragma: no cover
+            if self.hyperparams['error_on_no_input']:
+                raise RuntimeError("No input columns were selected")
+            self.logger.warn("No input columns were selected")
+
+        if not self._fitted: # pragma: no cover
+            raise PrimitiveNotFittedError("Primitive not fitted.")
+
+        sk_inputs = inputs
+        if self.hyperparams['use_semantic_types']: # pragma: no cover
+            sk_inputs = inputs.iloc[:, self._training_indices]
+        output_columns = []
+        if len(self._training_indices) > 0:
+            sk_output = self._clf.produce(sk_inputs)
+            if sparse.issparse(sk_output): # pragma: no cover
+                sk_output = sk_output.toarray()
+            outputs = self._wrap_predictions(inputs, sk_output)
+
+            if len(outputs.columns) == len(self._input_column_names): # pragma: no cover
+                outputs.columns = self._input_column_names
+            output_columns = [outputs]
+
+        else: # pragma: no cover
+            if self.hyperparams['error_on_no_input']:
+                raise RuntimeError("No input columns were selected")
+            self.logger.warn("No input columns were selected")
+
+        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
+                                             add_index_columns=self.hyperparams['add_index_columns'],
+                                             inputs=inputs, column_indices=self._training_indices,
+                                             columns_list=output_columns)
+
+        #print(outputs.columns)
+        #outputs.columns = [str(x) for x in outputs.columns]
+
+        return CallResult(outputs)
+
+    def _update_metadata(self, outputs): # pragma: no cover
+        outputs.metadata = outputs.metadata.generate(outputs)
+
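+    # Column-selection sketch, grounded in the "Encountered error" note inside
+    # _get_columns_to_fit below: `use_columns` takes precedence and
+    # `exclude_columns` applies only when `use_columns` is empty, so e.g.
+    # use_columns=(2, 3) with exclude_columns=(1, 2) still yields column 2.
+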
+    @classmethod
+    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): # pragma: no cover
+
+        """
+
+        Select columns to fit.
+        Args:
+            inputs: Container DataFrame
+            hyperparams: d3m.metadata.hyperparams.Hyperparams
+
+        Returns:
+            list
+
+        """
+
+        if not hyperparams['use_semantic_types']:
+            return inputs, list(range(len(inputs.columns)))
+
+        inputs_metadata = inputs.metadata
+
+
+        def can_produce_column(column_index: int) -> bool:
+            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)
+
+        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
+                                                                                   use_columns=hyperparams['use_columns'],
+                                                                                   exclude_columns=hyperparams['exclude_columns'],
+                                                                                   can_use_column=can_produce_column)
+
+
+        """
+        Encountered error: when hyperparams['use_columns'] = (2,3) and hyperparams['exclude_columns'] is (1,2)
+        columns_to_produce is still [2]
+        """
+        return inputs.iloc[:, columns_to_produce], columns_to_produce
+
+
+    @classmethod
+    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: # pragma: no cover
+
+        """
+
+        Output whether a column can be processed.
+        Args:
+            inputs_metadata: d3m.metadata.base.DataMetadata
+            column_index: int
+
+        Returns:
+            bool
+
+        """
+
+        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
+
+        accepted_structural_types = (int, float, np.integer, np.float64) #changed numpy to np
+        accepted_semantic_types = set()
+        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
+
+        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
+            return False
+
+        semantic_types = set(column_metadata.get('semantic_types', []))
+
+        if len(semantic_types) == 0:
+            cls.logger.warning("No semantic types found in column metadata")
+            return False
+
+        # Making sure all accepted_semantic_types are available in semantic_types
+        if len(accepted_semantic_types - semantic_types) == 0:
+            return True
+
+        return False
+
+    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
+
+        """
+
+        Wrap predictions into dataframe
+        Args:
+            inputs: Container Dataframe
+            predictions: array-like data (n_samples, n_features)
+
+        Returns:
+            Dataframe
+
+        """
+
+        outputs = d3m_dataframe(predictions, generate_metadata=True)
+        target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams, self.primitiveNo)
+        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
+        return outputs
+
+
+
+    @classmethod
+    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
+                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
+
+        """
+
+        Update metadata for selected columns.
+        Args:
+            inputs_metadata: metadata_base.DataMetadata
+            outputs: Container Dataframe
+            target_columns_metadata: list
+
+        Returns:
+            d3m.metadata.base.DataMetadata
+
+        """
+
+        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)
+
+        for column_index, column_metadata in enumerate(target_columns_metadata):
+            column_metadata.pop("structural_type", None)
+            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)
+
+        return outputs_metadata
+
+
+    @classmethod
+    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo):
+        """
+        Add target columns metadata
+        Args:
+            outputs_metadata: metadata.base.DataMetadata
+            hyperparams: d3m.metadata.hyperparams.Hyperparams
+
+        Returns:
+            List[OrderedDict]
+        """
+        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
+        target_columns_metadata: List[OrderedDict] = []
+        for column_index in range(outputs_length):
+            column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index)
+            column_metadata = OrderedDict()
+            semantic_types = set()
+            semantic_types.add(hyperparams["return_semantic_type"])
+            column_metadata['semantic_types'] = list(semantic_types)
+
+            column_metadata["name"] = str(column_name)
+            target_columns_metadata.append(column_metadata)
+        return target_columns_metadata
diff --git a/tods/tests/detection_algorithm/test_MatrixProfile.py b/tods/tests/detection_algorithm/test_MatrixProfile.py
index 6477d2a..01d1c46 100644
--- a/tods/tests/detection_algorithm/test_MatrixProfile.py
+++ b/tods/tests/detection_algorithm/test_MatrixProfile.py
@@ -58,10 +58,17 @@ class MatrixProfileTest(unittest.TestCase):
         hyperparams_class = MatrixProfilePrimitive.metadata.get_hyperparams()
         hyperparams = hyperparams_class.defaults()
         hyperparams = hyperparams.replace({'window_size': 3})
-        primitive = MatrixProfilePrimitive(hyperparams=hyperparams)
-        #primitive.set_training_data(inputs=main)
-        #primitive.fit()
+        primitive = MatrixProfile(hyperparams=hyperparams)
+        primitive.set_training_data(inputs=main)
+        primitive.fit()
         new_main = primitive.produce(inputs=main).value
         print(new_main)
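
A minimal call sequence mirroring the updated test above (a sketch, assuming a
d3m container DataFrame `main` with at least one numeric column):

    hyperparams = MatrixProfile.metadata.get_hyperparams().defaults().replace({'window_size': 3})
    primitive = MatrixProfile(hyperparams=hyperparams)
    primitive.set_training_data(inputs=main)
    primitive.fit()
    flags = primitive.produce(inputs=main).value  # 0/1 per row; 1 marks an outlier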