From e478dd06dc42cce1dfa1bee0b51483ad146f0576 Mon Sep 17 00:00:00 2001 From: lhenry15 Date: Sun, 30 May 2021 12:44:33 -0500 Subject: [PATCH] fix sk_interface examples and Matrix profile --- examples/sk_examples/DeepLog_test.py | 2 - examples/sk_examples/IsolationForest_test.py | 9 +- examples/sk_examples/MatrixProfile_test.py | 9 +- examples/sk_examples/Telemanom_test.py | 1 - tods/detection_algorithm/MatrixProfile.py | 410 ++++++++++++++------------- tods/detection_algorithm/UODBasePrimitive.py | 6 - 6 files changed, 213 insertions(+), 224 deletions(-) diff --git a/examples/sk_examples/DeepLog_test.py b/examples/sk_examples/DeepLog_test.py index d521d52..7cb9896 100644 --- a/examples/sk_examples/DeepLog_test.py +++ b/examples/sk_examples/DeepLog_test.py @@ -1,6 +1,5 @@ import numpy as np from tods.sk_interface.detection_algorithm.DeepLog_skinterface import DeepLogSKI -#from tods.tods_skinterface.primitiveSKI.detection_algorithm.DeepLog_skinterface import DeepLogSKI from sklearn.metrics import precision_recall_curve from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix @@ -22,7 +21,6 @@ prediction_labels_train = transformer.predict(X_train) prediction_labels = transformer.predict(X_test) prediction_score = transformer.predict_score(X_test) -print("Primitive: ", transformer.primitive) print("Prediction Labels\n", prediction_labels) print("Prediction Score\n", prediction_score) diff --git a/examples/sk_examples/IsolationForest_test.py b/examples/sk_examples/IsolationForest_test.py index 4c507fe..0ac3254 100644 --- a/examples/sk_examples/IsolationForest_test.py +++ b/examples/sk_examples/IsolationForest_test.py @@ -1,5 +1,5 @@ import numpy as np -from tods.tods_skinterface.primitiveSKI.detection_algorithm.IsolationForest_skinterface import IsolationForestSKI +from tods.sk_interface.detection_algorithm.IsolationForest_skinterface import IsolationForestSKI from sklearn.metrics import precision_recall_curve from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix @@ -24,7 +24,6 @@ prediction_labels_train = transformer.predict(X_train) prediction_labels = transformer.predict(X_test) prediction_score = transformer.predict_score(X_test) -print("Primitive: ", transformer.primitive) print("Prediction Labels\n", prediction_labels) print("Prediction Score\n", prediction_score) @@ -36,9 +35,3 @@ print('Accuracy Score: ', accuracy_score(y_true, y_pred)) confusion_matrix(y_true, y_pred) print(classification_report(y_true, y_pred)) - -precision, recall, thresholds = precision_recall_curve(y_true, y_pred) -f1_scores = 2*recall*precision/(recall+precision) - -print('Best threshold: ', thresholds[np.argmax(f1_scores)]) -print('Best F1-Score: ', np.max(f1_scores)) \ No newline at end of file diff --git a/examples/sk_examples/MatrixProfile_test.py b/examples/sk_examples/MatrixProfile_test.py index 2cb8108..b2a69b6 100644 --- a/examples/sk_examples/MatrixProfile_test.py +++ b/examples/sk_examples/MatrixProfile_test.py @@ -1,5 +1,5 @@ import numpy as np -from tods.tods_skinterface.primitiveSKI.detection_algorithm.MatrixProfile_skinterface import MatrixProfileSKI +from tods.sk_interface.detection_algorithm.MatrixProfile_skinterface import MatrixProfileSKI from sklearn.metrics import precision_recall_curve from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix @@ -17,7 +17,6 @@ prediction_labels_train = transformer.predict(X_train) prediction_labels = transformer.predict(X_test) prediction_score = transformer.predict_score(X_test) -print("Primitive: ", transformer.primitive) print("Prediction Labels\n", prediction_labels) print("Prediction Score\n", prediction_score) @@ -29,9 +28,3 @@ print('Accuracy Score: ', accuracy_score(y_true, y_pred)) confusion_matrix(y_true, y_pred) print(classification_report(y_true, y_pred)) - -precision, recall, thresholds = precision_recall_curve(y_true, y_pred) -f1_scores = 2*recall*precision/(recall+precision) - -print('Best threshold: ', thresholds[np.argmax(f1_scores)]) -print('Best F1-Score: ', np.max(f1_scores)) \ No newline at end of file diff --git a/examples/sk_examples/Telemanom_test.py b/examples/sk_examples/Telemanom_test.py index 92bd90f..d5f1ae0 100644 --- a/examples/sk_examples/Telemanom_test.py +++ b/examples/sk_examples/Telemanom_test.py @@ -25,7 +25,6 @@ prediction_labels_train = transformer.predict(X_train) prediction_labels = transformer.predict(X_test) prediction_score = transformer.predict_score(X_test) -print("Primitive: ", transformer.primitive) print("Prediction Labels\n", prediction_labels) print("Prediction Score\n", prediction_score) y_true = prediction_labels_train diff --git a/tods/detection_algorithm/MatrixProfile.py b/tods/detection_algorithm/MatrixProfile.py index 669e3e4..3952860 100644 --- a/tods/detection_algorithm/MatrixProfile.py +++ b/tods/detection_algorithm/MatrixProfile.py @@ -34,6 +34,8 @@ import pandas import uuid from d3m import container, utils as d3m_utils +from .core.CollectiveBase import CollectiveBaseDetector +from .core.utility import get_sub_matrices from .UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase import stumpy @@ -47,216 +49,226 @@ Outputs = d3m_dataframe class Params(Params_ODBase): - ######## Add more Attributes ####### - pass + ######## Add more Attributes ####### + pass class Hyperparams(Hyperparams_ODBase): - ######## Add more Attributes ####### - #pass - window_size = hyperparams.Hyperparameter[int]( + ######## Add more Attributes ####### + #pass + window_size = hyperparams.Hyperparameter[int]( default=3, description='The moving window size.', semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] ) -class MP: - """ - This is the class for matrix profile function - """ - def __init__(self, window_size, step_size): - self._window_size = window_size - self._step_size = step_size - return - - def fit(self, X, y=None): - """Fit detector. y is ignored in unsupervised methods. - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - y : Ignored - Not used, present for API consistency by convention. - Returns - ------- - self : object - Fitted estimator. - """ - - # validate inputs X and y (optional) - # X = check_array(X) - # self._set_n_classes(y) - # self.decision_scores_ = self.decision_function(X) - # self._process_decision_scores() - - return self - - def _get_right_inds(self, data): - right_inds = [] - for row in data[1]: - right_inds.append(row+self._window_size-1) - right_inds = pd.DataFrame(right_inds) - data = pd.concat([data,right_inds], axis=1) - data.columns = range(0,len(data.columns)) - return data - - def produce(self, data): - - """ - - Args: - data: dataframe column - Returns: - nparray - - """ - """ - #only keep first two columns of MP results, the second column is left index, use windowsize to get right index - transformed_columns=utils.pandas.DataFrame() - for col in data.transpose(): #data.reshape(1,len(data)): - output = stumpy.stump(col, m = self._window_size) - output = pd.DataFrame(output) - output=output.drop(columns=[2,3]) - output = self._get_right_inds(output) - transformed_columns=pd.concat([transformed_columns,output], axis=1) - return transformed_columns - """ - #data = np.random.rand(3, 1000) - #data = np.array([[1., 2., 3., 4.], [5., 6., 7., 8.], [9., 10., 11., 12.]]) - #data = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.]) - #data = np.array([[3., 4., 8.6, 13., 22.5, 17, 19.2, 36.1, 127, -23, 59.2, -10],[3., 4., 8.6, 13., 22.5, 17, 19.2, 36.1, 127, -23, 59.2, -10],[3., 4., 8.6, 13., 22.5, 17, 19.2, 36.1, 127, -23, 59.2, -10]]) - - matrix_profile, matrix_profile_indices = stumpy.mstump(data.transpose(), m = self._window_size) - #matrix_profile, matrix_profile_indices = stumpy.mstump(data, m = self._window_size) - - left_inds_ = numpy.arange(0, len(matrix_profile), self._step_size) - right_inds_ = left_inds_ + self._window_size - right_inds_[right_inds_ > len(matrix_profile)] = len(matrix_profile) - left_inds_ = np.array([left_inds_]).transpose() - right_inds_ = np.array([right_inds_]).transpose() - - # apply min-max scaling - scaler = MinMaxScaler() - scaler = scaler.fit(matrix_profile) - matrix_profile = scaler.transform(matrix_profile) - output = [] - for timestamp in matrix_profile: - timestamp = sum(timestamp) - output.append([timestamp]) - - output = np.concatenate((output, left_inds_, right_inds_),axis=1) - - return output - - def predict(self, data): - return self.produce(data) - +class MP(CollectiveBaseDetector): + """ + This is the class for matrix profile function + """ + def __init__(self, window_size, step_size, contamination): + self._window_size = window_size + self._step_size = step_size + self.contamination = contamination + return + + def _get_right_inds(self, data): + right_inds = [] + for row in data[1]: + right_inds.append(row+self._window_size-1) + right_inds = pd.DataFrame(right_inds) + data = pd.concat([data,right_inds], axis=1) + data.columns = range(0,len(data.columns)) + return data + + def fit(self, X): + """Fit detector. y is ignored in unsupervised methods. + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + y : Ignored + Not used, present for API consistency by convention. + Returns + ------- + self : object + Fitted estimator. + """ + sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( + X, + window_size=self._window_size, + step=self._step_size, + return_numpy=True, + flatten=True) + sub_matrices = sub_matrices[:-1, :] + self.left_inds_ = self.left_inds_[:-1] + self.right_inds_ = self.right_inds_[:-1] + matrix_profile, matrix_profile_indices = stumpy.mstump(X.transpose(), m = self._window_size) + #matrix_profile, matrix_profile_indices = stumpy.mstump(data, m = self._window_size) + + #left_inds_ = numpy.arange(0, len(matrix_profile), self._step_size) + #right_inds_ = left_inds_ + self._window_size + #right_inds_[right_inds_ > len(matrix_profile)] = len(matrix_profile) + #left_inds_ = np.array([left_inds_]).transpose() + #right_inds_ = np.array([right_inds_]).transpose() + + # apply min-max scaling + scaler = MinMaxScaler() + scaler = scaler.fit(matrix_profile) + matrix_profile = scaler.transform(matrix_profile) + self.decision_scores_ = matrix_profile + self._process_decision_scores() + return self + + def decision_function(self, data): + + """ + + Args: + data: dataframe column + Returns: + nparray + + """ + """ + #only keep first two columns of MP results, the second column is left index, use windowsize to get right index + transformed_columns=utils.pandas.DataFrame() + for col in data.transpose(): #data.reshape(1,len(data)): + output = stumpy.stump(col, m = self._window_size) + output = pd.DataFrame(output) + output=output.drop(columns=[2,3]) + output = self._get_right_inds(output) + transformed_columns=pd.concat([transformed_columns,output], axis=1) + return transformed_columns + """ + matrix_profile, matrix_profile_indices = stumpy.mstump(data.transpose(), m = self._window_size) + #matrix_profile, matrix_profile_indices = stumpy.mstump(data, m = self._window_size) + + left_inds_ = numpy.arange(0, len(matrix_profile), self._step_size) + right_inds_ = left_inds_ + self._window_size + right_inds_[right_inds_ > len(matrix_profile)] = len(matrix_profile) + left_inds_ = np.array([left_inds_]).transpose() + right_inds_ = np.array([right_inds_]).transpose() + + # apply min-max scaling + scaler = MinMaxScaler() + scaler = scaler.fit(matrix_profile) + matrix_profile = scaler.transform(matrix_profile) + #output = [] + #for timestamp in matrix_profile: + # timestamp = sum(timestamp) + # output.append([timestamp]) + + #output = np.concatenate((output, left_inds_, right_inds_),axis=1) + + return matrix_profile, left_inds_, right_inds_ + class MatrixProfilePrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): - """ - - A primitive that performs matrix profile on a DataFrame using Stumpy package - Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html - - Parameters - ---------- - T_A : ndarray - The time series or sequence for which to compute the matrix profile - m : int - Window size - T_B : ndarray - The time series or sequence that contain your query subsequences - of interest. Default is `None` which corresponds to a self-join. - ignore_trivial : bool - Set to `True` if this is a self-join. Otherwise, for AB-join, set this - to `False`. Default is `True`. - Returnsfdsf - ------- - out : ndarray - The first column consists of the matrix profile, the second column - consists of the matrix profile indices, the third column consists of - the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. - - """ - - metadata = metadata_base.PrimitiveMetadata({ - '__author__': "DATA Lab @Texas A&M University", - 'name': "Matrix Profile", - 'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile', - 'source': { + """ + + A primitive that performs matrix profile on a DataFrame using Stumpy package + Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html + + Parameters + ---------- + T_A : ndarray + The time series or sequence for which to compute the matrix profile + m : int + Window size + T_B : ndarray + The time series or sequence that contain your query subsequences + of interest. Default is `None` which corresponds to a self-join. + ignore_trivial : bool + Set to `True` if this is a self-join. Otherwise, for AB-join, set this + to `False`. Default is `True`. + Returnsfdsf + ------- + out : ndarray + The first column consists of the matrix profile, the second column + consists of the matrix profile indices, the third column consists of + the left matrix profile indices, and the fourth column consists of + the right matrix profile indices. + + """ + + metadata = metadata_base.PrimitiveMetadata({ + '__author__': "DATA Lab @Texas A&M University", + 'name': "Matrix Profile", + 'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile', + 'source': { 'name': "DATA Lab @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', }, - 'hyperparams_to_tune': ['window_size'], - 'version': '0.0.2', - 'algorithm_types': [ + 'hyperparams_to_tune': ['window_size'], + 'version': '0.0.2', + 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.TODS_PRIMITIVE, ], - 'primitive_family': metadata_base.PrimitiveFamily.ANOMALY_DETECTION, - 'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')), - }) - - - def __init__(self, *, - hyperparams: Hyperparams, # - random_seed: int = 0, - docker_containers: Dict[str, DockerContainer] = None) -> None: - super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) - - self._clf = MP(window_size=hyperparams['window_size'], step_size=hyperparams['step_size']) - - def set_training_data(self, *, inputs: Inputs) -> None: - """ - Set training data for outlier detection. - Args: - inputs: Container DataFrame - - Returns: - None - """ - super().set_training_data(inputs=inputs) - - def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: - """ - Fit model with training data. - Args: - *: Container DataFrame. Time series data up to fit. - - Returns: - None - """ - return super().fit() - - def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: - """ - Process the testing data. - Args: - inputs: Container DataFrame. Time series data up to outlier detection. - - Returns: - Container DataFrame - 1 marks Outliers, 0 marks normal. - """ - return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) - - def get_params(self) -> Params: # pragma: no cover - """ - Return parameters. - Args: - None - - Returns: - class Params - """ - return super().get_params() - - def set_params(self, *, params: Params) -> None: # pragma: no cover - """ - Set parameters for outlier detection. - Args: - params: class Params - - Returns: - None - """ - super().set_params(params=params) + 'primitive_family': metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + 'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')), + }) + + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = MP(window_size=hyperparams['window_size'], step_size=hyperparams['step_size'], contamination=hyperparams['contamination']) + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: # pragma: no cover + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: # pragma: no cover + """ + Set parameters for outlier detection. + Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) diff --git a/tods/detection_algorithm/UODBasePrimitive.py b/tods/detection_algorithm/UODBasePrimitive.py index 89b1e23..eee177d 100755 --- a/tods/detection_algorithm/UODBasePrimitive.py +++ b/tods/detection_algorithm/UODBasePrimitive.py @@ -293,9 +293,7 @@ class UnsupervisedOutlierDetectorBase(TODSUnsupervisedLearnerPrimitiveBase[Input if self.hyperparams['use_semantic_types']: sk_inputs = inputs.iloc[:, self._training_indices] output_columns = [] - #print("skinputs ", sk_inputs.values) if len(self._training_indices) > 0: - if self.hyperparams['return_subseq_inds']: if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None: # point OD @@ -306,9 +304,6 @@ class UnsupervisedOutlierDetectorBase(TODSUnsupervisedLearnerPrimitiveBase[Input else: pred_label, left_inds_, right_inds_ = self._clf.predict(sk_inputs.values) - # print(pred_label.shape, left_inds_.shape, right_inds_.shape) - # print(pred_label, left_inds_, right_inds_) - sk_output = numpy.concatenate((numpy.expand_dims(pred_label, axis=1), numpy.expand_dims(left_inds_, axis=1), numpy.expand_dims(right_inds_, axis=1)), axis=1) @@ -321,7 +316,6 @@ class UnsupervisedOutlierDetectorBase(TODSUnsupervisedLearnerPrimitiveBase[Input else: sk_output, _, _ = self._clf.predict(sk_inputs.values) - #print("sk output ", sk_output) if sparse.issparse(sk_output): # pragma: no cover sk_output = sk_output.toarray()