From e478dd06dc42cce1dfa1bee0b51483ad146f0576 Mon Sep 17 00:00:00 2001
From: lhenry15 <khlai037@gmail.com>
Date: Sun, 30 May 2021 12:44:33 -0500
Subject: [PATCH] fix sk_interface examples and Matrix profile

---
 examples/sk_examples/DeepLog_test.py         |   2 -
 examples/sk_examples/IsolationForest_test.py |   9 +-
 examples/sk_examples/MatrixProfile_test.py   |   9 +-
 examples/sk_examples/Telemanom_test.py       |   1 -
 tods/detection_algorithm/MatrixProfile.py    | 410 ++++++++++++++-------------
 tods/detection_algorithm/UODBasePrimitive.py |   6 -
 6 files changed, 213 insertions(+), 224 deletions(-)

diff --git a/examples/sk_examples/DeepLog_test.py b/examples/sk_examples/DeepLog_test.py
index d521d52..7cb9896 100644
--- a/examples/sk_examples/DeepLog_test.py
+++ b/examples/sk_examples/DeepLog_test.py
@@ -1,6 +1,5 @@
 import numpy as np
 from tods.sk_interface.detection_algorithm.DeepLog_skinterface import DeepLogSKI
-#from tods.tods_skinterface.primitiveSKI.detection_algorithm.DeepLog_skinterface import DeepLogSKI
 from sklearn.metrics import precision_recall_curve
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import confusion_matrix
@@ -22,7 +21,6 @@ prediction_labels_train = transformer.predict(X_train)
 prediction_labels = transformer.predict(X_test)
 prediction_score = transformer.predict_score(X_test)
 
-print("Primitive: ", transformer.primitive)
 print("Prediction Labels\n", prediction_labels)
 print("Prediction Score\n", prediction_score)
 
diff --git a/examples/sk_examples/IsolationForest_test.py b/examples/sk_examples/IsolationForest_test.py
index 4c507fe..0ac3254 100644
--- a/examples/sk_examples/IsolationForest_test.py
+++ b/examples/sk_examples/IsolationForest_test.py
@@ -1,5 +1,5 @@
 import numpy as np
-from tods.tods_skinterface.primitiveSKI.detection_algorithm.IsolationForest_skinterface import IsolationForestSKI
+from tods.sk_interface.detection_algorithm.IsolationForest_skinterface import IsolationForestSKI
 from sklearn.metrics import precision_recall_curve
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import confusion_matrix
@@ -24,7 +24,6 @@ prediction_labels_train = transformer.predict(X_train)
 prediction_labels = transformer.predict(X_test)
 prediction_score = transformer.predict_score(X_test)
 
-print("Primitive: ", transformer.primitive)
 print("Prediction Labels\n", prediction_labels)
 print("Prediction Score\n", prediction_score)
 
@@ -36,9 +35,3 @@ print('Accuracy Score: ', accuracy_score(y_true, y_pred))
 confusion_matrix(y_true, y_pred)
 
 print(classification_report(y_true, y_pred))
-
-precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
-f1_scores = 2*recall*precision/(recall+precision)
-
-print('Best threshold: ', thresholds[np.argmax(f1_scores)])
-print('Best F1-Score: ', np.max(f1_scores))
\ No newline at end of file
diff --git a/examples/sk_examples/MatrixProfile_test.py b/examples/sk_examples/MatrixProfile_test.py
index 2cb8108..b2a69b6 100644
--- a/examples/sk_examples/MatrixProfile_test.py
+++ b/examples/sk_examples/MatrixProfile_test.py
@@ -1,5 +1,5 @@
 import numpy as np
-from tods.tods_skinterface.primitiveSKI.detection_algorithm.MatrixProfile_skinterface import MatrixProfileSKI
+from tods.sk_interface.detection_algorithm.MatrixProfile_skinterface import MatrixProfileSKI
 from sklearn.metrics import precision_recall_curve
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import confusion_matrix
@@ -17,7 +17,6 @@ prediction_labels_train = transformer.predict(X_train)
 prediction_labels = transformer.predict(X_test)
 prediction_score = transformer.predict_score(X_test)
 
-print("Primitive: ", transformer.primitive)
 print("Prediction Labels\n", prediction_labels)
 print("Prediction Score\n", prediction_score)
 
@@ -29,9 +28,3 @@ print('Accuracy Score: ', accuracy_score(y_true, y_pred))
 confusion_matrix(y_true, y_pred)
 
 print(classification_report(y_true, y_pred))
-
-precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
-f1_scores = 2*recall*precision/(recall+precision)
-
-print('Best threshold: ', thresholds[np.argmax(f1_scores)])
-print('Best F1-Score: ', np.max(f1_scores))
\ No newline at end of file
diff --git a/examples/sk_examples/Telemanom_test.py b/examples/sk_examples/Telemanom_test.py
index 92bd90f..d5f1ae0 100644
--- a/examples/sk_examples/Telemanom_test.py
+++ b/examples/sk_examples/Telemanom_test.py
@@ -25,7 +25,6 @@ prediction_labels_train = transformer.predict(X_train)
 prediction_labels = transformer.predict(X_test)
 prediction_score = transformer.predict_score(X_test)
 
-print("Primitive: ", transformer.primitive)
 print("Prediction Labels\n", prediction_labels)
 print("Prediction Score\n", prediction_score)
 y_true = prediction_labels_train
diff --git a/tods/detection_algorithm/MatrixProfile.py b/tods/detection_algorithm/MatrixProfile.py
index 669e3e4..3952860 100644
--- a/tods/detection_algorithm/MatrixProfile.py
+++ b/tods/detection_algorithm/MatrixProfile.py
@@ -34,6 +34,8 @@ import pandas
 import uuid
 
 from d3m import container, utils as d3m_utils
+from .core.CollectiveBase import CollectiveBaseDetector
+from .core.utility import get_sub_matrices
 
 from .UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase
 import stumpy
@@ -47,216 +49,226 @@ Outputs = d3m_dataframe
 
 
 class Params(Params_ODBase):
-	######## Add more Attributes #######
-	pass
+    ######## Add more Attributes #######
+    pass
 
 
 class Hyperparams(Hyperparams_ODBase):
-	######## Add more Attributes #######
-	#pass
-	window_size = hyperparams.Hyperparameter[int](
+    ######## Add more Attributes #######
+    #pass
+    window_size = hyperparams.Hyperparameter[int](
         default=3,
         description='The moving window size.',
         semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
     )
 
-class MP:
-	"""
-	This is the class for matrix profile function
-	"""
-	def __init__(self, window_size, step_size):
-		self._window_size = window_size
-		self._step_size = step_size
-		return
-
-	def fit(self, X, y=None):
-		"""Fit detector. y is ignored in unsupervised methods.
-		Parameters
-		----------
-		X : numpy array of shape (n_samples, n_features)
-		    The input samples.
-		y : Ignored
-		    Not used, present for API consistency by convention.
-		Returns
-		-------
-		self : object
-		    Fitted estimator.
-		"""
-
-		# validate inputs X and y (optional)
-		# X = check_array(X)
-		# self._set_n_classes(y)
-		# self.decision_scores_ = self.decision_function(X)
-		# self._process_decision_scores()
-		
-		return self
-
-	def _get_right_inds(self, data):
-		right_inds = []
-		for row in data[1]:
-			right_inds.append(row+self._window_size-1)
-		right_inds = pd.DataFrame(right_inds)
-		data = pd.concat([data,right_inds], axis=1)
-		data.columns = range(0,len(data.columns))
-		return data
-
-	def produce(self, data):
-
-		"""
-
-		Args:
-			data: dataframe column
-		Returns:
-			nparray
-
-		"""
-		"""
-		#only keep first two columns of MP results, the second column is left index, use windowsize to get right index
-		transformed_columns=utils.pandas.DataFrame()
-		for col in data.transpose(): #data.reshape(1,len(data)):
-			output = stumpy.stump(col, m = self._window_size)
-			output = pd.DataFrame(output)
-			output=output.drop(columns=[2,3])
-			output = self._get_right_inds(output)
-			transformed_columns=pd.concat([transformed_columns,output], axis=1)
-		return transformed_columns
-		"""
-		#data = np.random.rand(3, 1000) 
-		#data = np.array([[1., 2., 3., 4.], [5., 6., 7., 8.], [9., 10., 11., 12.]])
-		#data = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.])
-		#data = np.array([[3., 4., 8.6, 13., 22.5, 17, 19.2, 36.1, 127, -23, 59.2, -10],[3., 4., 8.6, 13., 22.5, 17, 19.2, 36.1, 127, -23, 59.2, -10],[3., 4., 8.6, 13., 22.5, 17, 19.2, 36.1, 127, -23, 59.2, -10]])
-
-		matrix_profile, matrix_profile_indices = stumpy.mstump(data.transpose(), m = self._window_size)
-		#matrix_profile, matrix_profile_indices = stumpy.mstump(data, m = self._window_size)
-
-		left_inds_ = numpy.arange(0, len(matrix_profile), self._step_size)
-		right_inds_ = left_inds_ + self._window_size
-		right_inds_[right_inds_ > len(matrix_profile)] = len(matrix_profile)
-		left_inds_ = np.array([left_inds_]).transpose()
-		right_inds_ = np.array([right_inds_]).transpose()
-		
-		# apply min-max scaling
-		scaler = MinMaxScaler()
-		scaler = scaler.fit(matrix_profile)
-		matrix_profile = scaler.transform(matrix_profile)
-		output = []
-		for timestamp in matrix_profile:
-			timestamp = sum(timestamp)
-			output.append([timestamp])
-
-		output = np.concatenate((output, left_inds_, right_inds_),axis=1)
-		
-		return output
-
-	def predict(self, data):
-		return self.produce(data)
-		
+class MP(CollectiveBaseDetector):
+    """
+    Matrix profile detector: scores data with a min-max scaled stumpy.mstump profile.
+    """
+    def __init__(self, window_size, step_size, contamination):
+        """Store the window size, the step size and the contamination rate."""
+        self._window_size = window_size
+        self._step_size = step_size
+        self.contamination = contamination
+
+    def _get_right_inds(self, data):
+        """Append a column holding ``data[1] + window_size - 1`` to ``data``."""
+        right_inds = pd.DataFrame(
+            [left + self._window_size - 1 for left in data[1]]
+        )
+        data = pd.concat([data, right_inds], axis=1)
+        data.columns = range(0, len(data.columns))
+        return data
+
+    def fit(self, X, y=None):
+        """Fit detector. y is ignored in unsupervised methods.
+        Parameters
+        ----------
+        X : numpy array of shape (n_samples, n_features)
+            The input samples.
+        y : Ignored
+            Not used, present for API consistency by convention.
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        # Only the window boundaries are needed here; the sub-matrices
+        # returned by get_sub_matrices are discarded.
+        _, self.left_inds_, self.right_inds_ = get_sub_matrices(
+            X,
+            window_size=self._window_size,
+            step=self._step_size,
+            return_numpy=True,
+            flatten=True)
+        # Drop the last window boundary.
+        # NOTE(review): presumably to line the indices up with the scores -- confirm.
+        self.left_inds_ = self.left_inds_[:-1]
+        self.right_inds_ = self.right_inds_[:-1]
+
+        # Matrix profile of the (transposed) training data; the indices that
+        # mstump returns alongside the profile are not used.
+        matrix_profile, _ = stumpy.mstump(X.transpose(), m=self._window_size)
+
+        # Min-max scale the profile, store it as the training scores and hand
+        # over to _process_decision_scores(), which is not defined in this class.
+        scaler = MinMaxScaler()
+        scaler = scaler.fit(matrix_profile)
+        matrix_profile = scaler.transform(matrix_profile)
+        self.decision_scores_ = matrix_profile
+        self._process_decision_scores()
+        return self
+
+    def decision_function(self, data):
+        """Score ``data`` with its min-max scaled matrix profile.
+
+        The profile is computed by ``stumpy.mstump`` on the transposed
+        input with ``m = window_size`` and then rescaled with a
+        ``MinMaxScaler`` fitted on that same profile, i.e. the scaling is
+        relative to ``data`` itself and not to the data seen in ``fit``.
+
+        Args:
+            data: array exposing ``transpose()``.
+                NOTE(review): presumably (n_samples, n_features) -- confirm.
+
+        Returns:
+            tuple ``(matrix_profile, left_inds_, right_inds_)`` where
+            ``matrix_profile`` is the scaled profile and the two index
+            arrays are column vectors (shape ``(k, 1)``) holding the start
+            and end of each window; ends are clipped to
+            ``len(matrix_profile)``.
+        """
+        # The indices returned alongside the profile are not needed.
+        matrix_profile, _ = stumpy.mstump(data.transpose(), m=self._window_size)
+        n_profile = len(matrix_profile)
+
+        # Window starts every step_size positions; ends are clipped so they
+        # never exceed len(matrix_profile).
+        left_inds_ = numpy.arange(0, n_profile, self._step_size)
+        right_inds_ = left_inds_ + self._window_size
+        right_inds_[right_inds_ > n_profile] = n_profile
+
+        # Reshape both index arrays from (k,) to (k, 1).
+        left_inds_ = np.array([left_inds_]).transpose()
+        right_inds_ = np.array([right_inds_]).transpose()
+
+        # apply min-max scaling
+        scaler = MinMaxScaler()
+        scaler = scaler.fit(matrix_profile)
+        matrix_profile = scaler.transform(matrix_profile)
+
+        # The scaled profile is returned as-is; it is not summed per row.
+        # NOTE(review): confirm the callers expect a 2-D score array here.
+
+        return matrix_profile, left_inds_, right_inds_
+
 class MatrixProfilePrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]):
-	"""
-
-	A primitive that performs matrix profile on a DataFrame using Stumpy package
-	Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html
-
-	 Parameters
-    	----------
-    	T_A : ndarray
-    	    The time series or sequence for which to compute the matrix profile
-    	m : int
-    	    Window size
-    	T_B : ndarray
-    	    The time series or sequence that contain your query subsequences
-    	    of interest. Default is `None` which corresponds to a self-join.
-    	ignore_trivial : bool
-    	    Set to `True` if this is a self-join. Otherwise, for AB-join, set this
-    	    to `False`. Default is `True`.
-    	Returnsfdsf
-    	-------
-    	out : ndarray
-    	    The first column consists of the matrix profile, the second column
-    	    consists of the matrix profile indices, the third column consists of
-    	    the left matrix profile indices, and the fourth column consists of
-    	    the right matrix profile indices.
-	
-	"""
-
-	metadata = metadata_base.PrimitiveMetadata({
-	    '__author__': "DATA Lab @Texas A&M University",
-	    'name': "Matrix Profile",
-	    'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile',
-	    'source': {
+    """
+
+    A primitive that performs matrix profile on a DataFrame using Stumpy package
+    Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html
+
+     Parameters
+        ----------
+        T_A : ndarray
+            The time series or sequence for which to compute the matrix profile
+        m : int
+            Window size
+        T_B : ndarray
+            The time series or sequence that contain your query subsequences
+            of interest. Default is `None` which corresponds to a self-join.
+        ignore_trivial : bool
+            Set to `True` if this is a self-join. Otherwise, for AB-join, set this
+            to `False`. Default is `True`.
+        Returns
+        -------
+        out : ndarray
+            The first column consists of the matrix profile, the second column
+            consists of the matrix profile indices, the third column consists of
+            the left matrix profile indices, and the fourth column consists of
+            the right matrix profile indices.
+    
+    """
+
+    metadata = metadata_base.PrimitiveMetadata({
+        '__author__': "DATA Lab @Texas A&M University",
+        'name': "Matrix Profile",
+        'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile',
+        'source': {
                 'name': "DATA Lab @Taxes A&M University", 
                 'contact': 'mailto:khlai037@tamu.edu',
             },
-	    'hyperparams_to_tune': ['window_size'],
-	    'version': '0.0.2',		
-	    'algorithm_types': [
+        'hyperparams_to_tune': ['window_size'],
+        'version': '0.0.2',        
+        'algorithm_types': [
                 metadata_base.PrimitiveAlgorithmType.TODS_PRIMITIVE,
             ], 
-	    'primitive_family': metadata_base.PrimitiveFamily.ANOMALY_DETECTION,
-	    'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')),
-	})
-
-
-	def __init__(self, *,
-				 hyperparams: Hyperparams, #
-				 random_seed: int = 0,
-				 docker_containers: Dict[str, DockerContainer] = None) -> None:
-		super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
-
-		self._clf = MP(window_size=hyperparams['window_size'], step_size=hyperparams['step_size'])
-
-	def set_training_data(self, *, inputs: Inputs) -> None:
-		"""
-		Set training data for outlier detection.
-		Args:
-			inputs: Container DataFrame
-
-		Returns:
-			None
-		"""
-		super().set_training_data(inputs=inputs)
-
-	def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
-		"""
-		Fit model with training data.
-		Args:
-			*: Container DataFrame. Time series data up to fit.
-
-		Returns:
-			None
-		"""
-		return super().fit()
-
-	def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
-		"""
-		Process the testing data.
-		Args:
-			inputs: Container DataFrame. Time series data up to outlier detection.
-
-		Returns:
-			Container DataFrame
-			1 marks Outliers, 0 marks normal.
-		"""
-		return super().produce(inputs=inputs, timeout=timeout, iterations=iterations)
-
-	def get_params(self) -> Params:		# pragma: no cover
-		"""
-		Return parameters.
-		Args:
-			None
-
-		Returns:
-			class Params
-		"""
-		return super().get_params()
-
-	def set_params(self, *, params: Params) -> None:	# pragma: no cover
-		"""
-		Set parameters for outlier detection.
-		Args:
-			params: class Params
-
-		Returns:
-			None
-		"""
-		super().set_params(params=params)
+        'primitive_family': metadata_base.PrimitiveFamily.ANOMALY_DETECTION,
+        'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')),
+    })
+
+
+    def __init__(self, *,
+                 hyperparams: Hyperparams, #
+                 random_seed: int = 0,
+                 docker_containers: Dict[str, DockerContainer] = None) -> None:
+        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
+
+        self._clf = MP(window_size=hyperparams['window_size'], step_size=hyperparams['step_size'], contamination=hyperparams['contamination'])
+
+    def set_training_data(self, *, inputs: Inputs) -> None:
+        """
+        Set training data for outlier detection.
+        Args:
+            inputs: Container DataFrame
+
+        Returns:
+            None
+        """
+        super().set_training_data(inputs=inputs)
+
+    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
+        """
+        Fit model with training data.
+        Args:
+            *: Container DataFrame. Time series data up to fit.
+
+        Returns:
+            None
+        """
+        return super().fit()
+
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
+        """
+        Process the testing data.
+        Args:
+            inputs: Container DataFrame. Time series data up to outlier detection.
+
+        Returns:
+            Container DataFrame
+            1 marks Outliers, 0 marks normal.
+        """
+        return super().produce(inputs=inputs, timeout=timeout, iterations=iterations)
+
+    def get_params(self) -> Params:        # pragma: no cover
+        """
+        Return parameters.
+        Args:
+            None
+
+        Returns:
+            class Params
+        """
+        return super().get_params()
+
+    def set_params(self, *, params: Params) -> None:    # pragma: no cover
+        """
+        Set parameters for outlier detection.
+        Args:
+            params: class Params
+
+        Returns:
+            None
+        """
+        super().set_params(params=params)
diff --git a/tods/detection_algorithm/UODBasePrimitive.py b/tods/detection_algorithm/UODBasePrimitive.py
index 89b1e23..eee177d 100755
--- a/tods/detection_algorithm/UODBasePrimitive.py
+++ b/tods/detection_algorithm/UODBasePrimitive.py
@@ -293,9 +293,7 @@ class UnsupervisedOutlierDetectorBase(TODSUnsupervisedLearnerPrimitiveBase[Input
         if self.hyperparams['use_semantic_types']:
             sk_inputs = inputs.iloc[:, self._training_indices]
         output_columns = []
-        #print("skinputs ", sk_inputs.values)
         if len(self._training_indices) > 0:
-
             if self.hyperparams['return_subseq_inds']:
 
                 if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None: # point OD
@@ -306,9 +304,6 @@ class UnsupervisedOutlierDetectorBase(TODSUnsupervisedLearnerPrimitiveBase[Input
                 else:
                     pred_label, left_inds_, right_inds_ = self._clf.predict(sk_inputs.values)
 
-                # print(pred_label.shape, left_inds_.shape, right_inds_.shape)
-                # print(pred_label, left_inds_, right_inds_)
-
                 sk_output = numpy.concatenate((numpy.expand_dims(pred_label, axis=1),
                                                numpy.expand_dims(left_inds_, axis=1),
                                                numpy.expand_dims(right_inds_, axis=1)), axis=1)
@@ -321,7 +316,6 @@ class UnsupervisedOutlierDetectorBase(TODSUnsupervisedLearnerPrimitiveBase[Input
                 else:
                     sk_output, _, _ = self._clf.predict(sk_inputs.values)
 
-            #print("sk output ", sk_output)
             if sparse.issparse(sk_output): # pragma: no cover
                 sk_output = sk_output.toarray()