Source code for patsemb.semantic_segmentation.LogisticRegressionSegmentor


import inspect
import multiprocessing
import numpy as np
from typing import Union, List

from sklearn.exceptions import NotFittedError
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression

from patsemb.semantic_segmentation.ProbabilisticSemanticSegmentor import ProbabilisticSemanticSegmentor



[docs]
class LogisticRegressionSegmentor(ProbabilisticSemanticSegmentor):
    """
    Segments the pattern-based embedding using Logistic Regression [carpentier2024pattern]_.

    First, a KMeans clustering model is fitted on the embedding, which will
    provide a discrete clustering (i.e., every observation in the time series
    will be assigned a discrete cluster label). The number of clusters `K` is
    decided based on the silhouette method. The discrete clustering gives an
    initial indication of when the semantic segments occur.

    Second, the discrete clustering is fed to a logistic regression model. This
    model learns to which segment each time point of the pattern-based embedding
    belongs. Because logistic regression is a probabilistic model, we retrieve
    the probabilities of a given observation belonging to a semantic segment,
    thereby obtaining a probabilistic segmentation.

    Parameters
    ----------
    n_segments: int or list of int, default=[2, 3, 4, 5, 6, 7, 8, 9]
        The number of segments. If a list of integers is passed, a clustering
        will be made for each value, and the best clustering is selected using
        the silhouette score. A single integer (Python ``int`` or NumPy integer
        scalar) is wrapped in a one-element list.
    n_jobs: int, default=1
        The number of jobs to use for computing the multiple clusterings. Has
        no effect if ``n_segments`` is an integer.
    **kwargs:
        Additional arguments to be passed to either ``KMeans`` clustering or
        ``LogisticRegression`` (both using Sklearn implementation). This class
        automatically infers which parameters can be passed to either object
        using the ``inspect`` module. If a parameter is valid for both models
        (e.g., ``max_iter``), then it will be passed to both. If an additional
        argument is given, which is not valid for KMeans nor for LogisticRegression,
        a TypeError will be thrown.

        A TypeError will also be raised if ``n_clusters`` is passed to this
        object - even though it is valid for ``KMeans`` - because this parameter
        will be set based on ``n_segments``.

    Attributes
    ----------
    n_segments: list of int
        The candidate numbers of segments (always stored as a sequence).
    k_means_kwargs: dict
        The arguments to pass to SKlearn KMeans.
    logistic_regression_kwargs: dict
        The arguments to pass to SKlearn LogisticRegression.
    logistic_regression_: LogisticRegression
        The fitted SKlearn Logistic Regression model.

    Raises
    ------
    TypeError
        If ``n_clusters`` is given, or if a keyword argument is accepted by
        neither ``KMeans`` nor ``LogisticRegression``.

    References
    ----------
    .. [carpentier2024pattern] Carpentier, Louis, Feremans, Len, Meert, Wannes, Verbeke, Mathias.
       "Pattern-based Time Series Semantic Segmentation with Gradual State Transitions." Proceedings
       of the 2024 SIAM International Conference on Data Mining (SDM). Society for Industrial and
       Applied Mathematics, 2024, doi: `10.1137/1.9781611978032.36 <https://doi.org/10.1137/1.9781611978032.36>`_.
    """
    n_segments: List[int]
    n_jobs: int
    kwargs: dict

    k_means_kwargs: dict
    logistic_regression_kwargs: dict

    logistic_regression_: LogisticRegression

    def __init__(self,
                 n_segments: Union[List[int], int, None] = None,
                 n_jobs: int = 1,
                 **kwargs):

        # Normalise 'n_segments' to a sequence of candidates. NumPy integer scalars
        # are not instances of 'int', so they are checked explicitly; otherwise a
        # bare scalar would be stored and break 'len(self.n_segments)' later on.
        if n_segments is None:
            n_segments = list(range(2, 10))
        elif isinstance(n_segments, (int, np.integer)):
            n_segments = [n_segments]
        # A sequence given by the caller is stored as-is (same object, no copy).
        self.n_segments = n_segments
        self.n_jobs = n_jobs
        self.kwargs = kwargs

        # Separate the kwargs; the signatures are inspected only once each
        k_means_parameters = inspect.signature(KMeans).parameters
        logistic_regression_parameters = inspect.signature(LogisticRegression).parameters
        self.k_means_kwargs = {
            key: value for key, value in kwargs.items() if key in k_means_parameters
        }
        self.logistic_regression_kwargs = {
            key: value for key, value in kwargs.items() if key in logistic_regression_parameters
        }

        # 'n_clusters' is reserved: it is derived from 'n_segments'
        if 'n_clusters' in self.k_means_kwargs:
            raise TypeError("Parameter 'n_clusters' should not be passed!")

        # Check if invalid arguments were given (accepted by neither model)
        invalid_kwargs = [
            arg for arg in kwargs
            if arg not in self.k_means_kwargs and arg not in self.logistic_regression_kwargs
        ]
        if invalid_kwargs:
            raise TypeError(f"Parameters were given that do not belong to K-Means or Logistic Regression: {invalid_kwargs}")


[docs]
    def fit(self, X: np.ndarray, y=None) -> 'LogisticRegressionSegmentor':
        """
        Fit the segmentor: cluster the embedding with KMeans, then train a
        logistic regression model to predict the cluster labels.

        Parameters
        ----------
        X: np.ndarray
            The embedding. It is transposed before being handed to scikit-learn,
            so every column of ``X`` is treated as one observation.
        y: ignored
            Not used by this method.

        Returns
        -------
        self: LogisticRegressionSegmentor
            The fitted instance, exposing ``logistic_regression_``.
        """
        # Transpose once; both KMeans and LogisticRegression consume this view
        embedding = X.T

        # If there is only one value for n_segments given, we can simply compute the clustering
        if len(self.n_segments) == 1:
            clustering = KMeans(n_clusters=self.n_segments[0], **self.k_means_kwargs).fit_predict(embedding)

        # Otherwise, compute one clustering per candidate and select the best one
        else:

            # Compute clusters with different number of segments
            args = [(embedding, n_segments) for n_segments in self.n_segments]
            if self.n_jobs > 1:
                # More workers than candidates would only sit idle, so cap the pool size
                n_workers = min(self.n_jobs, max(len(args), 1))
                with multiprocessing.Pool(n_workers) as pool:
                    pool_results = pool.starmap(self._compute_kmeans_segmentation, args)
            else:
                pool_results = [self._compute_kmeans_segmentation(*arg) for arg in args]

            # Identify the clustering with maximum silhouette score
            # (np.argmax returns the first candidate in case of ties)
            silhouette_scores = [silhouette_avg for silhouette_avg, *_ in pool_results]
            clustering = pool_results[np.argmax(silhouette_scores)][1]

        # Fit the logistic regression model on the discrete cluster labels
        self.logistic_regression_ = LogisticRegression(**self.logistic_regression_kwargs)
        self.logistic_regression_.fit(embedding, clustering)

        # Return self
        return self



[docs]
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Return the class probabilities predicted by the fitted logistic
        regression model for the transposed embedding ``X.T``.

        Raises
        ------
        NotFittedError
            If the ``logistic_regression_`` attribute does not exist yet,
            i.e. the model has not been fitted.
        """
        is_fitted = hasattr(self, 'logistic_regression_')
        if is_fitted:
            model = self.logistic_regression_
            return model.predict_proba(X.T)
        raise NotFittedError('Call the fit method before predicting!')


    def _compute_kmeans_segmentation(self, X: np.ndarray, n_segments: int):
        """
        Cluster ``X`` with KMeans for a given number of clusters and score the
        result with the silhouette method.

        Parameters
        ----------
        X: np.ndarray
            The data to cluster, passed unchanged to ``KMeans.fit_predict``.
        n_segments: int
            The number of clusters for KMeans.

        Returns
        -------
        silhouette_avg: float
            The silhouette score of the clustering, or ``-1`` if the clustering
            cannot be scored (see below).
        segmentation: np.ndarray
            The cluster label assigned to each row of ``X``.
        """
        # Cluster the embedding
        k_means = KMeans(n_clusters=n_segments, **self.k_means_kwargs)
        segmentation = k_means.fit_predict(X)

        # A clustering gets the worst score (-1) if KMeans did not produce the
        # requested number of clusters, or if it has fewer than two clusters;
        # silhouette_score raises a ValueError on a single label.
        n_labels = len(set(segmentation))
        if n_labels != n_segments or n_labels < 2:
            return -1, segmentation

        # Score on all observations below 2000; above that, on 2000 plus 10% of the excess
        n = X.shape[0]
        sample_size = n if n < 2000 else 2000 + int(0.1 * (n - 2000))

        # Reuse the user-provided KMeans 'random_state' (None if absent) so that the
        # subsampling, and hence the model selection, is reproducible when a seed is given.
        silhouette_avg = silhouette_score(
            X,
            segmentation,
            sample_size=sample_size,
            random_state=self.k_means_kwargs.get('random_state'),
        )

        return silhouette_avg, segmentation