Source code for dclab.downsampling

"""Content-based downsampling of ndarrays"""
import numpy as np

from .cached import Cache


def downsample_rand(a, samples, remove_invalid=False, ret_idx=False):
    """Downsampling by randomly removing points

    The selection is pseudo-random but reproducible: a private random
    number generator with a fixed seed is used, so repeated calls with
    the same input yield the same result and the global numpy random
    state is left untouched.

    Parameters
    ----------
    a: 1d ndarray
        The input array to downsample
    samples: int
        The desired number of samples; if zero, or not smaller than
        the number of available points, no downsampling takes place.
    remove_invalid: bool
        Remove nan and inf values before downsampling
    ret_idx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a`.

    Returns
    -------
    dsa: 1d ndarray
        The pseudo-randomly downsampled array `a`. It has size
        `samples` if downsampling took place; otherwise it holds all
        (valid, if `remove_invalid` is set) points of `a`.
    idx: 1d boolean array with same shape as `a`
        Only returned if `ret_idx` is True.
        A boolean array such that `a[idx] == dsa`
    """
    # Private generator with a fixed seed: the draw is reproducible and
    # the global numpy random state is not modified as a side effect.
    rng = np.random.RandomState(seed=47)

    samples = int(samples)

    if remove_invalid:
        # slice out nans and infs
        bad = ~np.isfinite(a)
        pool = a[~bad]
    else:
        pool = a

    if samples and (samples < pool.shape[0]):
        # draw `samples` distinct positions from the pool
        keep = np.zeros_like(pool, dtype=bool)
        keep_ids = rng.choice(pool.size, size=samples, replace=False)
        keep[keep_ids] = True
        # boolean indexing preserves the original order of the points
        dsa = pool[keep]
    else:
        # nothing to remove
        keep = np.ones_like(pool, dtype=bool)
        dsa = pool

    if remove_invalid:
        # translate the kept values back to the original array
        idx = np.zeros(a.size, dtype=bool)
        idx[~bad] = keep
    else:
        idx = keep

    if ret_idx:
        return dsa, idx
    else:
        return dsa


@Cache
def downsample_grid(a, b, samples, remove_invalid=False, ret_idx=False):
    """Content-based downsampling for faster visualization

    The arrays `a` and `b` make up a 2D scatter plot with high
    and low density values. This method takes out points at
    indices with high density.

    The result is reproducible: a private random number generator
    with a fixed seed is used and the global numpy random state is
    left untouched.

    Parameters
    ----------
    a, b: 1d ndarrays
        The input arrays to downsample
    samples: int
        The desired number of samples; if zero, or not smaller than
        the number of available points, no downsampling takes place.
    remove_invalid: bool
        Remove nan and inf values before downsampling; if set to
        `True`, the actual number of samples returned might be
        smaller than `samples` due to infinite or nan values.
    ret_idx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a` and `b`.

    Returns
    -------
    dsa, dsb: 1d ndarrays of shape (samples,)
        The arrays `a` and `b` downsampled by evenly selecting
        points and pseudo-randomly adding or removing points
        to match `samples`.
    idx: 1d boolean array with same shape as `a`
        Only returned if `ret_idx` is True.
        A boolean array such that `a[idx] == dsa`
    """
    # Private generator with a fixed seed: the draw is reproducible and
    # the global numpy random state is not modified as a side effect.
    rng = np.random.RandomState(seed=47)

    if remove_invalid:
        # Remove nan and inf values straight from the beginning.
        # This might result in arrays smaller than `samples`,
        # but it makes sure that no inf/nan values will be plotted.
        bad = ~valid(a, b)
        ad = a[~bad]
        bd = b[~bad]
    else:
        bad = np.zeros_like(a, dtype=bool)
        ad = a
        bd = b

    # Events of the original arrays that end up in the output
    keep = ~bad

    samples = int(samples)

    if samples and samples < ad.size:
        # The events to keep
        keepd = np.zeros_like(ad, dtype=bool)

        # 1. Produce evenly distributed samples
        # Choosing grid-size:
        # - large numbers tend to show actual structures of the sample,
        #   which is not desired for plotting
        # - small numbers tend to result in too few samples and,
        #   in order to reach the desired samples, the data must be
        #   upsampled again.
        # 300 is about the size of the plot in marker sizes and yields
        # good results.
        grid_size = 300
        xpx = norm(ad, ad, bd) * grid_size
        ypx = norm(bd, bd, ad) * grid_size

        # Only valid events are promoted in this step. However, in
        # step 2, invalid events could be added back. To avoid this
        # scenario, the parameter `remove_invalid` should be set to
        # True.
        candidates = np.flatnonzero(valid(xpx, ypx))
        # Grid cell of each candidate: shift by one and truncate
        # towards zero. A coordinate of exactly zero yields -1, which
        # wraps around to the last cell like a negative array index.
        col = (xpx[candidates] - 1).astype(int) % grid_size
        row = (ypx[candidates] - 1).astype(int) % grid_size
        # Keep the first event (in input order) of every occupied cell;
        # `return_index` yields the first occurrence of each value.
        _, first = np.unique(col * grid_size + row, return_index=True)
        keepd[candidates[first]] = True

        # 2. Make sure that we reach `samples` by adding or
        # removing events.
        diff = np.sum(keepd) - samples
        if diff > 0:
            # Too many samples
            rem = rng.choice(np.flatnonzero(keepd),
                             size=diff,
                             replace=False)
            keepd[rem] = False
        elif diff < 0:
            # Not enough samples
            add = rng.choice(np.flatnonzero(~keepd),
                             size=abs(diff),
                             replace=False)
            keepd[add] = True

        assert np.sum(keepd) == samples, "sanity check"
        asd = ad[keepd]
        bsd = bd[keepd]
        # translate the kept events back to the original arrays
        keep[~bad] = keepd
    else:
        # nothing to remove
        asd = ad
        bsd = bd

    if ret_idx:
        return asd, bsd, keep
    else:
        return asd, bsd


def valid(a, b):
    """Check whether `a` and `b` are not inf or nan

    Works element-wise on arrays (following numpy broadcasting) as
    well as on scalars; the result is True wherever both `a` and `b`
    are finite.
    """
    return np.isfinite(a) & np.isfinite(b)


def norm(a, ref1, ref2):
    """Normalize `a` with the min/max values of `ref1`

    Only those elements of `ref1` are considered for which both
    `ref1` and `ref2` are neither nan nor inf. The minimum of these
    elements is mapped to 0 and the maximum to 1.

    No special handling is done for degenerate references: if all
    considered elements are equal, the division by a zero range
    yields nan or inf values, and an empty selection makes the
    min/max reduction fail.
    """
    ref = ref1[valid(ref1, ref2)]
    # compute the extrema only once
    lower = ref.min()
    upper = ref.max()
    return (a - lower) / (upper - lower)