
fix sk_interface examples and Matrix profile

master
lhenry15 committed 4 years ago
commit e478dd06dc
6 changed files with 213 additions and 224 deletions
1. examples/sk_examples/DeepLog_test.py (+0, -2)
2. examples/sk_examples/IsolationForest_test.py (+1, -8)
3. examples/sk_examples/MatrixProfile_test.py (+1, -8)
4. examples/sk_examples/Telemanom_test.py (+0, -1)
5. tods/detection_algorithm/MatrixProfile.py (+211, -199)
6. tods/detection_algorithm/UODBasePrimitive.py (+0, -6)

examples/sk_examples/DeepLog_test.py (+0, -2)

@@ -1,6 +1,5 @@
 import numpy as np
 from tods.sk_interface.detection_algorithm.DeepLog_skinterface import DeepLogSKI
-#from tods.tods_skinterface.primitiveSKI.detection_algorithm.DeepLog_skinterface import DeepLogSKI
 from sklearn.metrics import precision_recall_curve
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import confusion_matrix
@@ -22,7 +21,6 @@ prediction_labels_train = transformer.predict(X_train)
 prediction_labels = transformer.predict(X_test)
 prediction_score = transformer.predict_score(X_test)
 
-print("Primitive: ", transformer.primitive)
 print("Prediction Labels\n", prediction_labels)
 print("Prediction Score\n", prediction_score)
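
These example scripts share one fit/predict pattern against the corrected tods.sk_interface import path. A minimal sketch of that pattern, assuming synthetic data and default hyperparameters (the constructor call, the fit call, and the array shapes are illustrative, not taken from this diff):

import numpy as np
from tods.sk_interface.detection_algorithm.DeepLog_skinterface import DeepLogSKI

# Illustrative univariate series; shapes assumed to be (n_samples, n_features).
X_train = np.array([[3.0], [4.0], [8.6], [13.0], [22.5], [17.0], [19.2], [36.1]])
X_test = np.array([[3.0], [4.0], [8.6], [13.0], [127.0], [-23.0], [59.2], [-10.0]])

transformer = DeepLogSKI()                             # default hyperparameters assumed
transformer.fit(X_train)                               # unsupervised fit on the training series
prediction_labels = transformer.predict(X_test)        # 1 marks outliers, 0 marks normal
prediction_score = transformer.predict_score(X_test)   # continuous anomaly scores

print("Prediction Labels\n", prediction_labels)
print("Prediction Score\n", prediction_score)
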
examples/sk_examples/IsolationForest_test.py (+1, -8)

@@ -1,5 +1,5 @@
 import numpy as np
-from tods.tods_skinterface.primitiveSKI.detection_algorithm.IsolationForest_skinterface import IsolationForestSKI
+from tods.sk_interface.detection_algorithm.IsolationForest_skinterface import IsolationForestSKI
 from sklearn.metrics import precision_recall_curve
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import confusion_matrix
@@ -24,7 +24,6 @@ prediction_labels_train = transformer.predict(X_train)
 prediction_labels = transformer.predict(X_test)
 prediction_score = transformer.predict_score(X_test)
 
-print("Primitive: ", transformer.primitive)
 print("Prediction Labels\n", prediction_labels)
 print("Prediction Score\n", prediction_score)

@@ -36,9 +35,3 @@ print('Accuracy Score: ', accuracy_score(y_true, y_pred))
 confusion_matrix(y_true, y_pred)
 
 print(classification_report(y_true, y_pred))
-
-precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
-f1_scores = 2*recall*precision/(recall+precision)
-
-print('Best threshold: ', thresholds[np.argmax(f1_scores)])
-print('Best F1-Score: ', np.max(f1_scores))

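The six lines deleted at the end of this file computed an F1-maximizing threshold from the precision-recall curve. For reference, a standalone sketch of that computation; the toy labels and scores and the division-by-zero guard are additions, not part of the original script:

import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = np.array([0, 0, 1, 0, 1, 0, 0, 1])                   # toy ground truth
y_score = np.array([0.1, 0.2, 0.9, 0.3, 0.7, 0.2, 0.1, 0.8])  # toy anomaly scores

precision, recall, thresholds = precision_recall_curve(y_true, y_score)
f1_scores = 2 * recall * precision / (recall + precision + 1e-12)  # epsilon guard added

# precision_recall_curve returns one more precision/recall entry than thresholds,
# so restrict the argmax to indices that have a matching threshold.
best = np.argmax(f1_scores[:-1])
print('Best threshold: ', thresholds[best])
print('Best F1-Score: ', f1_scores[best])
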
examples/sk_examples/MatrixProfile_test.py (+1, -8)

@@ -1,5 +1,5 @@
 import numpy as np
-from tods.tods_skinterface.primitiveSKI.detection_algorithm.MatrixProfile_skinterface import MatrixProfileSKI
+from tods.sk_interface.detection_algorithm.MatrixProfile_skinterface import MatrixProfileSKI
 from sklearn.metrics import precision_recall_curve
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import confusion_matrix
@@ -17,7 +17,6 @@ prediction_labels_train = transformer.predict(X_train)
 prediction_labels = transformer.predict(X_test)
 prediction_score = transformer.predict_score(X_test)
 
-print("Primitive: ", transformer.primitive)
 print("Prediction Labels\n", prediction_labels)
 print("Prediction Score\n", prediction_score)

@@ -29,9 +28,3 @@ print('Accuracy Score: ', accuracy_score(y_true, y_pred))
 confusion_matrix(y_true, y_pred)
 
 print(classification_report(y_true, y_pred))
-
-precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
-f1_scores = 2*recall*precision/(recall+precision)
-
-print('Best threshold: ', thresholds[np.argmax(f1_scores)])
-print('Best F1-Score: ', np.max(f1_scores))

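Beyond the shared pattern, the matrix profile wrapper exposes the window size tuned below in MatrixProfile.py. A sketch assuming the SKI constructor forwards window_size to the primitive's hyperparameter of the same name; that pass-through is an assumption, not shown in this diff:

import numpy as np
from tods.sk_interface.detection_algorithm.MatrixProfile_skinterface import MatrixProfileSKI

# Univariate series with one obvious discord near the end.
X = np.array([[1.0], [2.0], [3.0], [2.0], [1.0], [2.0],
              [3.0], [2.0], [50.0], [2.0], [3.0], [2.0]])

transformer = MatrixProfileSKI(window_size=3)  # assumed kwarg mirroring the hyperparameter
transformer.fit(X)
print("Prediction Labels\n", transformer.predict(X))
print("Prediction Score\n", transformer.predict_score(X))
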
examples/sk_examples/Telemanom_test.py (+0, -1)

@@ -25,7 +25,6 @@ prediction_labels_train = transformer.predict(X_train)
 prediction_labels = transformer.predict(X_test)
 prediction_score = transformer.predict_score(X_test)
 
-print("Primitive: ", transformer.primitive)
 print("Prediction Labels\n", prediction_labels)
 print("Prediction Score\n", prediction_score)
 y_true = prediction_labels_train


tods/detection_algorithm/MatrixProfile.py (+211, -199)

@@ -34,6 +34,8 @@ import pandas
 import uuid
 
 from d3m import container, utils as d3m_utils
+from .core.CollectiveBase import CollectiveBaseDetector
+from .core.utility import get_sub_matrices
 
 from .UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase
 import stumpy
@@ -47,216 +49,226 @@ Outputs = d3m_dataframe
 
 class Params(Params_ODBase):
     ######## Add more Attributes #######
     pass
 
 
 class Hyperparams(Hyperparams_ODBase):
     ######## Add more Attributes #######
     #pass
     window_size = hyperparams.Hyperparameter[int](
         default=3,
         description='The moving window size.',
         semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
     )
 
-class MP:
-    """
-    This is the class for matrix profile function
-    """
-    def __init__(self, window_size, step_size):
-        self._window_size = window_size
-        self._step_size = step_size
-        return
-
-    def fit(self, X, y=None):
-        """Fit detector. y is ignored in unsupervised methods.
-        Parameters
-        ----------
-        X : numpy array of shape (n_samples, n_features)
-            The input samples.
-        y : Ignored
-            Not used, present for API consistency by convention.
-        Returns
-        -------
-        self : object
-            Fitted estimator.
-        """
-        # validate inputs X and y (optional)
-        # X = check_array(X)
-        # self._set_n_classes(y)
-        # self.decision_scores_ = self.decision_function(X)
-        # self._process_decision_scores()
-        return self
-
-    def _get_right_inds(self, data):
-        right_inds = []
-        for row in data[1]:
-            right_inds.append(row+self._window_size-1)
-        right_inds = pd.DataFrame(right_inds)
-        data = pd.concat([data,right_inds], axis=1)
-        data.columns = range(0,len(data.columns))
-        return data
-
-    def produce(self, data):
-        """
-        Args:
-            data: dataframe column
-        Returns:
-            nparray
-        """
-        """
-        #only keep first two columns of MP results, the second column is left index, use windowsize to get right index
-        transformed_columns=utils.pandas.DataFrame()
-        for col in data.transpose(): #data.reshape(1,len(data)):
-            output = stumpy.stump(col, m = self._window_size)
-            output = pd.DataFrame(output)
-            output=output.drop(columns=[2,3])
-            output = self._get_right_inds(output)
-            transformed_columns=pd.concat([transformed_columns,output], axis=1)
-        return transformed_columns
-        """
-        #data = np.random.rand(3, 1000)
-        #data = np.array([[1., 2., 3., 4.], [5., 6., 7., 8.], [9., 10., 11., 12.]])
-        #data = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.])
-        #data = np.array([[3., 4., 8.6, 13., 22.5, 17, 19.2, 36.1, 127, -23, 59.2, -10],[3., 4., 8.6, 13., 22.5, 17, 19.2, 36.1, 127, -23, 59.2, -10],[3., 4., 8.6, 13., 22.5, 17, 19.2, 36.1, 127, -23, 59.2, -10]])
-
-        matrix_profile, matrix_profile_indices = stumpy.mstump(data.transpose(), m = self._window_size)
-        #matrix_profile, matrix_profile_indices = stumpy.mstump(data, m = self._window_size)
-
-        left_inds_ = numpy.arange(0, len(matrix_profile), self._step_size)
-        right_inds_ = left_inds_ + self._window_size
-        right_inds_[right_inds_ > len(matrix_profile)] = len(matrix_profile)
-        left_inds_ = np.array([left_inds_]).transpose()
-        right_inds_ = np.array([right_inds_]).transpose()
-        # apply min-max scaling
-        scaler = MinMaxScaler()
-        scaler = scaler.fit(matrix_profile)
-        matrix_profile = scaler.transform(matrix_profile)
-        output = []
-        for timestamp in matrix_profile:
-            timestamp = sum(timestamp)
-            output.append([timestamp])
-
-        output = np.concatenate((output, left_inds_, right_inds_),axis=1)
-        return output
-
-    def predict(self, data):
-        return self.produce(data)
+class MP(CollectiveBaseDetector):
+    """
+    This is the class for matrix profile function
+    """
+    def __init__(self, window_size, step_size, contamination):
+        self._window_size = window_size
+        self._step_size = step_size
+        self.contamination = contamination
+        return
+
+    def _get_right_inds(self, data):
+        right_inds = []
+        for row in data[1]:
+            right_inds.append(row+self._window_size-1)
+        right_inds = pd.DataFrame(right_inds)
+        data = pd.concat([data,right_inds], axis=1)
+        data.columns = range(0,len(data.columns))
+        return data
+
+    def fit(self, X):
+        """Fit detector. y is ignored in unsupervised methods.
+        Parameters
+        ----------
+        X : numpy array of shape (n_samples, n_features)
+            The input samples.
+        y : Ignored
+            Not used, present for API consistency by convention.
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices(
+            X,
+            window_size=self._window_size,
+            step=self._step_size,
+            return_numpy=True,
+            flatten=True)
+        sub_matrices = sub_matrices[:-1, :]
+        self.left_inds_ = self.left_inds_[:-1]
+        self.right_inds_ = self.right_inds_[:-1]
+        matrix_profile, matrix_profile_indices = stumpy.mstump(X.transpose(), m = self._window_size)
+        #matrix_profile, matrix_profile_indices = stumpy.mstump(data, m = self._window_size)
+
+        #left_inds_ = numpy.arange(0, len(matrix_profile), self._step_size)
+        #right_inds_ = left_inds_ + self._window_size
+        #right_inds_[right_inds_ > len(matrix_profile)] = len(matrix_profile)
+        #left_inds_ = np.array([left_inds_]).transpose()
+        #right_inds_ = np.array([right_inds_]).transpose()
+        # apply min-max scaling
+        scaler = MinMaxScaler()
+        scaler = scaler.fit(matrix_profile)
+        matrix_profile = scaler.transform(matrix_profile)
+        self.decision_scores_ = matrix_profile
+        self._process_decision_scores()
+        return self
+
+    def decision_function(self, data):
+        """
+        Args:
+            data: dataframe column
+        Returns:
+            nparray
+        """
+        """
+        #only keep first two columns of MP results, the second column is left index, use windowsize to get right index
+        transformed_columns=utils.pandas.DataFrame()
+        for col in data.transpose(): #data.reshape(1,len(data)):
+            output = stumpy.stump(col, m = self._window_size)
+            output = pd.DataFrame(output)
+            output=output.drop(columns=[2,3])
+            output = self._get_right_inds(output)
+            transformed_columns=pd.concat([transformed_columns,output], axis=1)
+        return transformed_columns
+        """
+        matrix_profile, matrix_profile_indices = stumpy.mstump(data.transpose(), m = self._window_size)
+        #matrix_profile, matrix_profile_indices = stumpy.mstump(data, m = self._window_size)
+
+        left_inds_ = numpy.arange(0, len(matrix_profile), self._step_size)
+        right_inds_ = left_inds_ + self._window_size
+        right_inds_[right_inds_ > len(matrix_profile)] = len(matrix_profile)
+        left_inds_ = np.array([left_inds_]).transpose()
+        right_inds_ = np.array([right_inds_]).transpose()
+        # apply min-max scaling
+        scaler = MinMaxScaler()
+        scaler = scaler.fit(matrix_profile)
+        matrix_profile = scaler.transform(matrix_profile)
+        #output = []
+        #for timestamp in matrix_profile:
+        #    timestamp = sum(timestamp)
+        #    output.append([timestamp])
+
+        #output = np.concatenate((output, left_inds_, right_inds_),axis=1)
+        return matrix_profile, left_inds_, right_inds_
 
 class MatrixProfilePrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]):
     """
     A primitive that performs matrix profile on a DataFrame using the Stumpy package
     Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html
     Parameters
     ----------
     T_A : ndarray
         The time series or sequence for which to compute the matrix profile
     m : int
         Window size
     T_B : ndarray
         The time series or sequence that contain your query subsequences
         of interest. Default is `None` which corresponds to a self-join.
     ignore_trivial : bool
         Set to `True` if this is a self-join. Otherwise, for AB-join, set this
         to `False`. Default is `True`.
     Returns
     -------
     out : ndarray
         The first column consists of the matrix profile, the second column
         consists of the matrix profile indices, the third column consists of
         the left matrix profile indices, and the fourth column consists of
         the right matrix profile indices.
     """
     metadata = metadata_base.PrimitiveMetadata({
         '__author__': "DATA Lab @Texas A&M University",
         'name': "Matrix Profile",
         'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile',
         'source': {
             'name': "DATA Lab @Texas A&M University",
             'contact': 'mailto:khlai037@tamu.edu',
         },
         'hyperparams_to_tune': ['window_size'],
         'version': '0.0.2',
         'algorithm_types': [
             metadata_base.PrimitiveAlgorithmType.TODS_PRIMITIVE,
         ],
         'primitive_family': metadata_base.PrimitiveFamily.ANOMALY_DETECTION,
         'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')),
     })
 
     def __init__(self, *,
                  hyperparams: Hyperparams, #
                  random_seed: int = 0,
                  docker_containers: Dict[str, DockerContainer] = None) -> None:
         super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
-        self._clf = MP(window_size=hyperparams['window_size'], step_size=hyperparams['step_size'])
+        self._clf = MP(window_size=hyperparams['window_size'], step_size=hyperparams['step_size'], contamination=hyperparams['contamination'])
 
     def set_training_data(self, *, inputs: Inputs) -> None:
         """
         Set training data for outlier detection.
         Args:
             inputs: Container DataFrame
         Returns:
             None
         """
         super().set_training_data(inputs=inputs)
 
     def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
         """
         Fit model with training data.
         Args:
             *: Container DataFrame. Time series data up to fit.
         Returns:
             None
         """
         return super().fit()
 
     def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
         """
         Process the testing data.
         Args:
             inputs: Container DataFrame. Time series data up to outlier detection.
         Returns:
             Container DataFrame
             1 marks Outliers, 0 marks normal.
         """
         return super().produce(inputs=inputs, timeout=timeout, iterations=iterations)
 
     def get_params(self) -> Params: # pragma: no cover
         """
         Return parameters.
         Args:
             None
         Returns:
             class Params
         """
         return super().get_params()
 
     def set_params(self, *, params: Params) -> None: # pragma: no cover
         """
         Set parameters for outlier detection.
         Args:
             params: class Params
         Returns:
             None
         """
         super().set_params(params=params)

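The rewritten MP class now subclasses CollectiveBaseDetector: fit() slices subsequences with get_sub_matrices, scores the series with stumpy's multidimensional matrix profile, stores decision_scores_, and lets _process_decision_scores() derive binary labels from the contamination rate. A small sketch of just the stumpy call at the center of that flow, on random data, with shapes per the Stumpy documentation linked above:

import numpy as np
import stumpy

# stumpy.mstump expects shape (d, n); the primitive gets there via
# X.transpose() on an (n_samples, n_features) input.
data = np.random.rand(3, 100)
m = 3  # window size, i.e. the window_size hyperparameter

matrix_profile, matrix_profile_indices = stumpy.mstump(data, m=m)

# One row per dimension, one column per length-m subsequence:
print(matrix_profile.shape)  # (3, 100 - 3 + 1) == (3, 98)
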
tods/detection_algorithm/UODBasePrimitive.py (+0, -6)

@@ -293,9 +293,7 @@ class UnsupervisedOutlierDetectorBase(TODSUnsupervisedLearnerPrimitiveBase[Input
         if self.hyperparams['use_semantic_types']:
             sk_inputs = inputs.iloc[:, self._training_indices]
         output_columns = []
-        #print("skinputs ", sk_inputs.values)
         if len(self._training_indices) > 0:
-
             if self.hyperparams['return_subseq_inds']:
 
                 if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None: # point OD
@@ -306,9 +304,6 @@ class UnsupervisedOutlierDetectorBase(TODSUnsupervisedLearnerPrimitiveBase[Input
                 else:
                     pred_label, left_inds_, right_inds_ = self._clf.predict(sk_inputs.values)
 
-                # print(pred_label.shape, left_inds_.shape, right_inds_.shape)
-                # print(pred_label, left_inds_, right_inds_)
-
                 sk_output = numpy.concatenate((numpy.expand_dims(pred_label, axis=1),
                                                numpy.expand_dims(left_inds_, axis=1),
                                                numpy.expand_dims(right_inds_, axis=1)), axis=1)
@@ -321,7 +316,6 @@ class UnsupervisedOutlierDetectorBase(TODSUnsupervisedLearnerPrimitiveBase[Input
             else:
                 sk_output, _, _ = self._clf.predict(sk_inputs.values)
 
-        #print("sk output ", sk_output)
         if sparse.issparse(sk_output): # pragma: no cover
             sk_output = sk_output.toarray()

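With the debug prints gone, the remaining concatenate call is the whole story here: it stacks the predicted label and the subsequence index bounds into one (n, 3) output. A toy illustration of that shaping, using the file's own numpy spelling:

import numpy

pred_label = numpy.array([0, 1, 0])    # 1 marks outliers, 0 marks normal
left_inds_ = numpy.array([0, 1, 2])    # subsequence start indices
right_inds_ = numpy.array([3, 4, 5])   # subsequence end indices

# expand_dims turns each (n,) vector into an (n, 1) column so the three
# columns concatenate side by side into an (n, 3) array.
sk_output = numpy.concatenate((numpy.expand_dims(pred_label, axis=1),
                               numpy.expand_dims(left_inds_, axis=1),
                               numpy.expand_dims(right_inds_, axis=1)), axis=1)
print(sk_output)  # [[0 0 3], [1 1 4], [0 2 5]]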