Source code for diffpy.srmise.dataclusters

#!/usr/bin/env python
##############################################################################
#
# SrMise            by Luke Granlund
#                   (c) 2014 trustees of the Michigan State University
#                   (c) 2024 trustees of Columbia University in the City of New York
#                   All rights reserved.
#
# File coded by:    Luke Granlund
#
# See LICENSE.txt for license information.
#
##############################################################################
"""Defines class to partition sequences representing the x and y axis into peak-like clusters."""

import logging

import matplotlib.pyplot as plt
import numpy as np

logger = logging.getLogger("diffpy.srmise")



[docs]
class DataClusters:
    """Find clusters corresponding to peaks in the PDF (y-array)

    DataClusters determines which points in inter-atomic distane, r,
    correspond to peaks in the PDF.  The division between clusters
    is contiguous, with borders between clusters likely near relative
    minima in the data.

    Clusters are iteratively formed around points with the largest
    PDF values.  New clusters are added only when the unclustered data
    point under consideration is greater than a given distance (the
    'resolution') from the nearest existing cluster.

    Data members
    ------------
    x : array
      The array of r values.
    y : sequence of y values
      The array of PDF values, G(r)
    res : float
      The clustering resolution, i.e., the number of distance another point has to
      be away from the center of an existing cluster to before a new cluster is
      formed.  A value of zero allows every point to be a cluster.
    data_order : array
      The array of x, y indices ordered by decreasing y
    clusters : ndarray
      The array of cluster ranges
    current_idx : int
      The index of data_order currently considered
    """

    def __init__(self, x, y, res):
        """Constructor

        Parameters
        ----------
        x : array
          The array of r values.
        y : sequence of y values
          The array of PDF values, G(r)
        res : float
          The clustering resolution, i.e., the distance another point has to
          be away from the center of an existing cluster to before a new cluster is
          formed.  A value of zero allows every point to be cluster.
        """
        # Track internal state of clustering.
        self.INIT = 0
        self.READY = 1
        self.CLUSTERING = 2
        self.DONE = 3
        self._clear()
        self._setdata(x, y, res)

        return

    # This iterator operates not over found clusters, but over the process of
    # clustering.  This behavior could cause confusion and should perhaps be
    # altered.
    def __iter__(self):
        return self

    def __eq__(self, other):
        if not isinstance(other, DataClusters):
            return False
        return (
            np.array_equal(self.x, other.x)
            and np.array_equal(self.y, other.y)
            and np.array_equal(self.data_order, other.data_order)
            and np.array_equal(self.clusters, other.clusters)
            and self.res == other.res
            and self.current_idx == other.current_idx
            and self.lastcluster_idx == other.lastcluster_idx
            and self.lastpoint_idx == other.lastpoint_idx
            and self.status == other.status
            and self.INIT == other.INIT
            and self.READY == other.READY
            and self.CLUSTERING == other.CLUSTERING
            and self.DONE == other.DONE
        )

    def _clear(self):
        """
        Clear all data and reset the cluster object to a transient initial state.

        The purpose of this method is to provide a clean state before creating new clustering operations.
        The object is updated in-place and no new instance is returned.

        Returns
        -------
        None
        """
        self.x = np.array([])
        self.y = np.array([])
        self.data_order = np.array([])
        self.clusters = np.array([[]])
        self.res = 0
        self.current_idx = 0
        self.lastcluster_idx = None
        self.lastpoint_idx = None
        self.status = self.INIT
        return


[docs]
    def reset_clusters(self):
        """Reset all progress on clustering."""
        self.clusters = np.array([[self.data_order[-1], self.data_order[-1]]])
        self.current_idx = self.data_order.size - 1
        self.lastcluster_idx = 0
        self.lastpoint_idx = self.data_order[-1]

        if self.status != self.INIT:
            self.status = self.READY
        return


    def _setdata(self, x, y, res):
        """Assign data members for x- and y-coordinates, and resolution.

        Parameters
        ----------
        x : array
          The array of r values.
        y : sequence of y values
          The array of PDF values, G(r)
        res : float
          The clustering resolution, i.e., the distance another point has to
          be away from the center of an existing cluster to before a new cluster is
          formed.  A value of zero allows every point to be cluster.
        """
        if len(x) != len(y):
            raise ValueError("Sequences x and y must have the same length.")
        if res < 0:
            raise ValueError(
                "Value of resolution parameter is less than zero.  Please rerun specifying a non-negative res"
            )
        self.x = x
        self.y = y
        self.res = res
        if x.size == 0:
            self.data_order = np.array([])
            self.clusters = np.array([[]])
            self.current_idx = 0
            self.lastpoint_idx = None
            self.status = self.INIT
        else:
            self.data_order = self.y.argsort()
            self.clusters = np.array([[self.data_order[-1], self.data_order[-1]]])
            self.current_idx = len(self.data_order) - 1
            self.lastpoint_idx = self.data_order[-1]
            self.status = self.READY
        self.lastcluster_idx = None
        return

    def __next__(self):
        """Cluster point with largest y-coordinate left, returning self.

        next() always adds at least one additional point to the existing
        cluster, or raises an exception if all points have been clustered.
        """
        if self.status == self.INIT:
            raise Exception("Cannot cluster next point while status is INIT.")

        if self.status == self.READY:
            self.status = self.CLUSTERING
        elif self.status == self.DONE:
            raise StopIteration

        # Find next unclustered point, if one exists.
        nearest_cluster = [0, 0.0]
        while nearest_cluster[1] == 0.0 and self.current_idx > 0:
            self.current_idx += -1
            test_idx = self.data_order[self.current_idx]
            nearest_cluster = self.find_nearest_cluster(test_idx)

        # Check status
        if self.current_idx <= 0:
            self.status = self.DONE
            if nearest_cluster[1] == 0.0:
                # Last point already clustered, so stop immediately.
                raise StopIteration

        self.lastpoint_idx = test_idx

        if np.abs(nearest_cluster[1]) <= self.res:
            # Add to an existing cluster
            self.lastcluster_idx = nearest_cluster[0]
            if test_idx < self.clusters[nearest_cluster[0], 0]:
                self.clusters[nearest_cluster[0], 0] = test_idx
            else:
                self.clusters[nearest_cluster[0], 1] = test_idx
        else:
            # Make a new cluster
            if nearest_cluster[1] < 0:
                # Insert left of nearest cluster
                self.lastcluster_idx = nearest_cluster[0]
            else:
                # insert right of nearest cluster
                self.lastcluster_idx = nearest_cluster[0] + 1
            self.clusters = np.insert(self.clusters, int(self.lastcluster_idx), [test_idx, test_idx], 0)
        return self


[docs]
    def makeclusters(self):
        """Cluster all remaining data."""
        for i in self:
            pass
        return



[docs]
    def find_nearest_cluster2(self, x):
        """Return [cluster index, distance] for cluster nearest to x.

        Parameters
        ----------
        x : ndarray
            Coordinate of point of interest

        The distance is positive/negative if the point is right/left of the
        nearest cluster.  If the point is within an existing cluster then
        distance = 0.

        Returns
        -------
        array-like
            The index of the nearest cluster, and the distance for cluster nearest to x. None if no cluster
        """
        if self.status == self.INIT:
            raise Exception("Cannot cluster next point while status is INIT.")

        idx = np.searchsorted(self.x, x)
        if idx > 0 or idx >= len(self.x):
            return self.find_nearest_cluster(idx)
        else:
            # Choose adjacent index nearest to x
            if (self.x[idx] - x) < (x - self.x[idx - 1]):
                return self.find_nearest_cluster(idx)
            else:
                return self.find_nearest_cluster(idx - 1)



[docs]
    def find_nearest_cluster(self, idx):
        """Return [cluster index, distance] for cluster nearest to x[idx].

        The distance is positive/negative if the point is right/left of the
        nearest cluster.  If the point is within an existing cluster then
        distance = 0.

        Parameters
        ----------
        idx : array-like
            index of point in self.x of interest.

        Returns
        -------
        array-like
            The array of cluster index and the distacne to the nearest cluster. None if no clusters exist.
        """
        if self.status == self.INIT:
            raise Exception("Cannot cluster next point while status is INIT.")

        clusters_flat = self.clusters.flatten()
        if len(clusters_flat) == 0:
            return None

        flat_idx = clusters_flat.searchsorted(idx)
        near_idx = flat_idx / 2

        if flat_idx == len(clusters_flat):
            # test_idx is right of the last cluster
            return [near_idx - 1, self.x[idx] - self.x[self.clusters[-1, 1]]]
        if clusters_flat[flat_idx] == idx or flat_idx % 2 == 1:
            # idx is within some cluster
            return [near_idx, 0.0]
        if flat_idx == 0:
            # idx is left of the first cluster
            return [near_idx, self.x[idx] - self.x[self.clusters[0, 0]]]

        # Calculate which of the two nearest clusters is closer
        distances = np.array(
            [
                self.x[idx] - self.x[self.clusters[int(near_idx) - 1, 1]],
                self.x[idx] - self.x[self.clusters[int(near_idx), 0]],
            ]
        )
        if distances[0] < np.abs(distances[1]):
            return [near_idx - 1, distances[0]]
        else:
            return [near_idx, distances[1]]



[docs]
    def cluster_is_full(self, cluster_idx):
        """Return whether the given cluster can grow.

        A cluster is full if no unclustered points remain adjacent to its
        boundaries.

        Parameters
        ----------
        cluster_idx : array-like
            The index of the cluster to test

        Returns
        -------
        bools
            True if the cluster is full, False otherwise
        """
        if cluster_idx > 0:
            low = self.clusters[cluster_idx - 1, 1] + 1
        else:
            low = 0
        if cluster_idx < len(self.clusters) - 1:
            high = self.clusters[cluster_idx + 1, 0] - 1
        else:
            high = len(self.data_order) - 1
        return self.clusters[cluster_idx, 0] == low and self.clusters[cluster_idx, 1] == high



[docs]
    def combine_clusters(self, combine):
        """Combine clusters specified by each subarray of cluster indices.

        Clusters to combine must be contiguous, increasing, and have no
        unclustered points between them.

        Parameters
        ----------
        combine : ndarray
            [[leftmost_idx1, ..., rightmost_idx1], ...]

        Returns
        -------
        None
        """
        # Ensure that the same clusters aren't combined multiple times.
        combine_flat = np.array(combine).ravel()
        if len(combine_flat) != len(np.unique(combine_flat)):
            raise ValueError("Cannot combine a single cluster multiple times.")

        for c in combine:
            # Test that all clusters are contiguous and adjacent
            first = c[0]
            for i in range(c[0], c[-1]):
                if c[i + 1 - first] - 1 != c[i - first]:
                    raise ValueError(
                        "".join(
                            [
                                "Clusters  ",
                                str(c[i]),
                                " and ",
                                str(c[i + 1]),
                                " are not contiguous and/or increasing.",
                            ]
                        )
                    )
                if self.clusters[i + 1, 0] - self.clusters[i, 1] != 1:
                    raise ValueError(
                        "".join(
                            [
                                "Clusters  ",
                                str(c[i]),
                                " and ",
                                str(c[i + 1]),
                                " have unclustered points between them.",
                            ]
                        )
                    )

            # update cluster endpoints
            self.clusters[c[0], 1] = self.clusters[c[-1], 1]
        todelete = np.array([c[1:] for c in combine]).ravel()
        self.clusters = np.delete(self.clusters, todelete, 0)



[docs]
    def find_adjacent_clusters(self):
        """Return all cluster indices with no unclustered points between them.

        Return array([[leftmost_idx1,...,rightmost_idx1],...]) such that there
        are no unclustered points between each element in subarray of clusters
        (inclusive).  If no such clusters exist, returns an empty array.
        """
        if self.status == self.INIT:
            raise Exception("Cannot cluster next point while status is INIT.")

        adj = []
        left_idx = 0

        while left_idx < len(self.clusters) - 1:
            while (
                left_idx < len(self.clusters) - 1
                and self.clusters[left_idx + 1, 0] - self.clusters[left_idx, 1] != 1
            ):
                left_idx += 1

            # Not left_idx+1 since left_idx=len(self.clusters)-2 even if no
            # clusters are actually adjacent.
            right_idx = left_idx

            while (
                right_idx < len(self.clusters) - 1
                and self.clusters[right_idx + 1, 0] - self.clusters[right_idx, 1] == 1
            ):
                right_idx += 1

            if right_idx > left_idx:
                adj.append(range(left_idx, right_idx + 1))
            left_idx = right_idx + 1  # set for next possible left_idx
        return np.array(adj)



[docs]
    def cut(self, idx):
        """Return slice(s) for data given cluster index (or indices).

        Parameters
        idx - Cluster index (or sequence of indices).
        """
        data_ids = self.clusters[idx]
        if len(data_ids) == data_ids.size:
            # idx is a scalar, so give single slice object
            return slice(data_ids[0], data_ids[1] + 1)
        else:
            # idx is a list/slice, so give list of slice objects
            return [slice(c[0], c[1] + 1) for c in data_ids]



[docs]
    def cluster_boundaries(self):
        """Return sequence with (x,y) of all cluster boundaries."""
        boundaries = []
        for cluster in self.clusters:
            xlo = np.mean(self.x[cluster[0] - 1 : cluster[0] + 1])
            ylo = np.mean(self.y[cluster[0] - 1 : cluster[0] + 1])
            xhi = np.mean(self.x[cluster[1] : cluster[1] + 2])
            yhi = np.mean(self.y[cluster[1] : cluster[1] + 2])
            boundaries.append((xlo, ylo))
            boundaries.append((xhi, yhi))
        return np.unique(boundaries)



[docs]
    def plot(self, *args, **kwds):
        """Plot the data with vertical lines at the cluster divisions.

        args and kwds passed to matplotlib.plot()
        """
        plt.figure()
        ax = plt.subplot(111)
        plt.ion()
        plt.plot(self.x, self.y, *args, **kwds)
        plt.ioff()
        boundaries = self.cluster_boundaries()
        (ymin, ymax) = ax.get_ylim()
        for b in boundaries:
            plt.axvline(b[0], 0, (b[1] - ymin) / (ymax - ymin), color="k")
        plt.ion()
        ax.figure.canvas.draw()
        return



[docs]
    def animate(self):
        """Animate clustering.  Restores state when complete."""
        clusters = self.clusters
        current_idx = self.current_idx
        lastcluster_idx = self.lastcluster_idx
        lastpoint_idx = self.lastpoint_idx
        status = self.status
        self.reset_clusters()

        fig, ax = plt.subplots()
        canvas = fig.canvas
        background = canvas.copy_from_bbox(ax.bbox)
        ymin, ymax = ax.get_ylim()
        all_lines = []
        for i in self:
            canvas.restore_region(background)
            boundaries = self.cluster_boundaries()
            for i, b in enumerate(boundaries):
                height = (b[1] - ymin) / (ymax - ymin)
                if i < len(all_lines):
                    all_lines[i].set_xdata([b[0], b[0]])
                    all_lines[i].set_ydata([0, height])
                    ax.draw_artist(all_lines[i])
                else:
                    line = plt.axvline(b[0], 0, height, color="k", animated=True)
                    ax.draw_artist(line)
                    all_lines.append(line)
            canvas.blit(ax.bbox)

        self.clusters = clusters
        self.current_idx = current_idx
        self.lastcluster_idx = lastcluster_idx
        self.lastpoint_idx = lastpoint_idx
        self.status = status
        return




# End of class DataClusters


# simple test code
if __name__ == "__main__":

    x = np.array([-2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])
    y = np.array(
        [
            0.0183156,
            0.105399,
            0.36788,
            0.778806,
            1.00012,
            0.780731,
            0.386195,
            0.210798,
            0.386195,
            0.780731,
            1.00012,
            0.778806,
            0.36788,
            0.105399,
            0.0183156,
        ]
    )

    testcluster = DataClusters(x, y, 0.1)
    testcluster.makeclusters()

    print(testcluster.clusters)
    adj = testcluster.find_adjacent_clusters()
    print(adj)
    if len(adj) > 0:
        testcluster.combine_clusters(adj)
    print(testcluster.clusters)