Source code for dclab.downsampling

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Content-based downsampling of ndarrays"""
from __future__ import division, print_function, unicode_literals

import numpy as np

from .cached import Cache


[docs]def downsample_rand(a, samples, remove_invalid=False, ret_idx=False):
    """Downsampling by randomly removing points

    Parameters
    ----------
    a: 1d ndarray
        The input array to downsample
    samples: int
        The desired number of samples
    remove_invalid: bool
        Remove nan and inf values before downsampling
    ret_idx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a`.

    Returns
    -------
    dsa: 1d ndarray of size `samples`
        The pseudo-randomly downsampled array `a`
    idx: 1d boolean array with same shape as `a`
        Only returned if `ret_idx` is True.
        A boolean array such that `a[idx] == dsa`
    """
    # fixed random state for this method
    rs = np.random.RandomState(seed=47).get_state()
    np.random.set_state(rs)

    samples = int(samples)

    if remove_invalid:
        # slice out nans and infs
        bad = np.isnan(a) | np.isinf(a)
        pool = a[~bad]
    else:
        pool = a

    if samples and (samples < pool.shape[0]):
        keep = np.zeros_like(pool, dtype=bool)
        keep_ids = np.random.choice(np.arange(pool.size),
                                    size=samples,
                                    replace=False)
        keep[keep_ids] = True
        dsa = pool[keep]
    else:
        keep = np.ones_like(pool, dtype=bool)
        dsa = pool

    if remove_invalid:
        # translate the kept values back to the original array
        idx = np.zeros(a.size, dtype=bool)
        idx[~bad] = keep
    else:
        idx = keep

    if ret_idx:
        return dsa, idx
    else:
        return dsa


@Cache
def downsample_grid(a, b, samples, ret_idx=False):
    """Content-based downsampling for faster visualization

    The arrays `a` and `b` make up a 2D scatter plot with high
    and low density values. This method takes out points at
    indices with high density.

    Parameters
    ----------
    a, b: 1d ndarrays
        The input arrays to downsample
    samples: int
        The desired number of samples
    remove_invalid: bool
        Remove nan and inf values before downsampling
    ret_idx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a` and `b`.

    Returns
    -------
    dsa, dsb: 1d ndarrays of shape (samples,)
        The arrays `a` and `b` downsampled by evenly selecting
        points and pseudo-randomly adding or removing points
        to match `samples`.
    idx: 1d boolean array with same shape as `a`
        Only returned if `ret_idx` is True.
        A boolean array such that `a[idx] == dsa`
    """
    # fixed random state for this method
    rs = np.random.RandomState(seed=47).get_state()

    samples = int(samples)

    if samples and samples < a.size:
        # The events to keep
        keep = np.zeros_like(a, dtype=bool)

        # 1. Produce evenly distributed samples
        # Choosing grid-size:
        # - large numbers tend to show actual structures of the sample,
        #   which is not desired for plotting
        # - small numbers tend will not result in too few samples and,
        #   in order to reach the desired samples, the data must be
        #   upsampled again.
        # 300 is about the size of the plot in marker sizes and yields
        # good results.
        grid_size = 300
        xpx = norm(a, a, b) * grid_size
        ypx = norm(b, b, a) * grid_size
        # The events on the grid to process
        toproc = np.ones((grid_size, grid_size), dtype=bool)

        for ii in range(xpx.size):
            xi = xpx[ii]
            yi = ypx[ii]
            # filter for overlapping events
            if valid(xi, yi) and toproc[int(xi-1), int(yi-1)]:
                toproc[int(xi-1), int(yi-1)] = False
                # include event
                keep[ii] = True

        # 2. Make sure that we reach `samples` by adding or
        # removing events.
        diff = np.sum(keep) - samples
        if diff > 0:
            # Too many samples
            rem_indices = np.where(keep)[0]
            np.random.set_state(rs)
            rem = np.random.choice(rem_indices,
                                   size=diff,
                                   replace=False)
            keep[rem] = False
        elif diff < 0:
            # Not enough samples
            add_indices = np.where(~keep)[0]
            np.random.set_state(rs)
            add = np.random.choice(add_indices,
                                   size=abs(diff),
                                   replace=False)
            keep[add] = True

        assert np.sum(keep) == samples, "sanity check"
        asd = a[keep]
        bsd = b[keep]
        assert np.allclose(a[keep], asd, equal_nan=True), "sanity check"
        assert np.allclose(b[keep], bsd, equal_nan=True), "sanity check"
    else:
        keep = np.ones_like(a, dtype=bool)
        asd = a
        bsd = b

    if ret_idx:
        return asd, bsd, keep
    else:
        return asd, bsd


[docs]def valid(a, b):
    """Check whether `a` and `b` are not inf or nan"""
    return ~(np.isnan(a) | np.isinf(a) | np.isnan(b) | np.isinf(b))


[docs]def norm(a, ref1, ref2):
    """
    Normalize `a` with min/max values of `ref1`, using all elements of
    `ref1` where the `ref1` and `ref2` are not nan or inf"""
    ref = ref1[valid(ref1, ref2)]
    return (a-ref.min())/(ref.max()-ref.min())