Source code for dclab.downsampling

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Content-based downsampling of ndarrays"""
from __future__ import division, print_function, unicode_literals

import numpy as np

from .cached import Cache


def downsample_rand(a, samples, remove_invalid=True, retidx=False):
    """Downsampling by randomly removing points

    The selection is pseudo-random but reproducible: a private random
    number generator with a fixed seed is used, so identical input
    always yields identical output and the global numpy random state
    is left untouched.

    Parameters
    ----------
    a: 1d ndarray
        The input array to downsample
    samples: int
        The desired number of samples; if it is zero or not smaller
        than the number of valid points, no points are removed
    remove_invalid: bool
        Remove nan and inf values before downsampling; must be True
        (see Raises)
    retidx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a`.

    Returns
    -------
    dsa: 1d ndarray
        The pseudo-randomly downsampled array `a`; it has `samples`
        entries if downsampling took place, otherwise it holds all
        valid (finite) entries of `a`
    [idx]: 1d boolean array with same shape as `a`
        A boolean array such that `a[idx] == dsa` is all true;
        only returned if `retidx` is True

    Raises
    ------
    ValueError
        If `remove_invalid` is False (not supported yet)
    """
    if not remove_invalid:
        raise ValueError("Downsampling cannot handle inf/nan yet!")

    samples = int(samples)

    # slice out nans and infs
    valid = np.isfinite(a)
    a = a[valid]

    num_valid = a.shape[0]
    if samples and samples < num_valid:
        # Private generator with a fixed seed: same selection as
        # seeding the global generator, but without side effects.
        rng = np.random.RandomState(seed=47)
        keep_ids = rng.choice(num_valid, size=samples, replace=False)
        keep = np.zeros(a.shape, dtype=bool)
        keep[keep_ids] = True
        dsa = a[keep]
    else:
        keep = np.ones(a.shape, dtype=bool)
        dsa = a

    if retidx:
        # translate the kept values back to the original array
        keep_inv = np.zeros_like(valid)
        keep_inv[valid] = keep
        return dsa, keep_inv
    else:
        return dsa


@Cache
def downsample_grid(a, b, samples, remove_invalid=True, retidx=False):
    """Content-based downsampling for faster visualization

    The arrays `a` and `b` make up a 2D scatter plot with high
    and low density values. This method takes out points at
    indices with high density.

    The data are binned onto a regular grid and only the first
    event of every occupied grid cell is kept. Afterwards, events
    are pseudo-randomly removed or added until exactly `samples`
    events remain. A private random number generator with a fixed
    seed is used, so the result is reproducible and the global
    numpy random state is left untouched.

    Parameters
    ----------
    a, b: 1d ndarrays
        The input arrays to downsample
    samples: int
        The desired number of samples; if it is zero or not smaller
        than the number of valid points, no points are removed
    remove_invalid: bool
        Remove nan and inf values before downsampling; must be True
        (see Raises)
    retidx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a` and `b`.

    Returns
    -------
    dsa, dsb: 1d ndarrays
        The arrays `a` and `b` downsampled by evenly selecting
        points and pseudo-randomly adding or removing points
        to match `samples`. If no downsampling took place, all
        valid (finite) points are returned.
    [idx]: 1d boolean array with same shape as `a`
        A boolean array such that `a[idx] == dsa` is all true;
        only returned if `retidx` is True

    Raises
    ------
    ValueError
        If `remove_invalid` is False (not supported yet)
    """
    if not remove_invalid:
        raise ValueError("Downsampling cannot handle inf/nan yet!")

    samples = int(samples)

    # slice out nans and infs (an event is dropped if either
    # coordinate is invalid)
    valid = np.isfinite(a) & np.isfinite(b)
    a = a[valid]
    b = b[valid]

    if samples and samples < a.shape[0]:
        # 1. Produce evenly distributed samples
        # Choosing grid-size:
        # - large numbers tend to show actual structures of the sample,
        #   which is not desired for plotting
        # - small numbers tend to result in too few samples and,
        #   in order to reach the desired samples, the data must be
        #   upsampled again.
        # 300 is about the size of the plot in marker sizes and yields
        # good results.
        grid_size = 300

        # Flattened index of the grid cell of each event
        flat_cell = np.zeros(a.shape, dtype=int)
        for values in (a, b):
            vmin = values.min()
            span = values.max() - vmin
            if span > 0:
                cell = ((values - vmin) / span * grid_size).astype(int)
                # the maximum maps to `grid_size`; put it in the last cell
                np.minimum(cell, grid_size - 1, out=cell)
            else:
                # constant data: everything lies in one cell
                # (avoids a 0/0 division)
                cell = np.zeros(values.shape, dtype=int)
            flat_cell = flat_cell * grid_size + cell

        # Keep only the first event that falls into each grid cell
        keep = np.zeros(a.shape, dtype=bool)
        first_in_cell = np.unique(flat_cell, return_index=True)[1]
        keep[first_in_cell] = True

        # 2. Make sure that we reach `samples` by adding or
        # removing events.
        rng = np.random.RandomState(seed=47)
        diff = int(np.count_nonzero(keep)) - samples
        if diff > 0:
            # Too many samples
            rem = rng.choice(np.flatnonzero(keep),
                             size=diff,
                             replace=False)
            keep[rem] = False
        elif diff < 0:
            # Not enough samples
            add = rng.choice(np.flatnonzero(~keep),
                             size=-diff,
                             replace=False)
            keep[add] = True

        asd = a[keep]
        bsd = b[keep]
    else:
        keep = np.ones(a.shape, dtype=bool)
        asd = a
        bsd = b

    if retidx:
        # translate the kept values back to the original array
        keep_inv = np.zeros_like(valid)
        keep_inv[valid] = keep
        return asd, bsd, keep_inv
    else:
        return asd, bsd