Source code for dclab.downsampling

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Content-based downsampling of ndarrays"""
from __future__ import division, print_function, unicode_literals

import numpy as np

from .cached import Cache


def downsample_rand(a, samples, remove_invalid=True, retidx=False):
    """Downsampling by randomly removing points

    The selection is pseudo-random but reproducible: a private random
    number generator with a fixed seed is used, so the same input always
    yields the same output and the global `np.random` state is left
    untouched.

    Parameters
    ----------
    a: 1d ndarray
        The input array to downsample
    samples: int
        The desired number of samples. If this is zero or not smaller
        than the number of valid values in `a`, no downsampling takes
        place and all valid values are returned.
    remove_invalid: bool
        Remove nan and inf values before downsampling. Only `True` is
        supported at the moment.
    retidx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a`.

    Returns
    -------
    dsa: 1d ndarray
        The pseudo-randomly downsampled array `a`; it has shape
        (samples,) if downsampling took place.
    [idx]: 1d boolean array with same shape as `a`
        A boolean array such that `a[idx] == dsa` is all true
        (only returned if `retidx` is set).

    Raises
    ------
    ValueError
        If `remove_invalid` is False.
    """
    if not remove_invalid:
        raise ValueError("Downsampling cannot handle inf/nan yet!")

    samples = int(samples)

    # slice out nans and infs
    bad = np.isnan(a) | np.isinf(a)
    valid = a[~bad]
    num_valid = valid.shape[0]

    if samples and samples < num_valid:
        # Fixed random state for this method. A private generator yields
        # the same draws as seeding the global one, but does not clobber
        # the random state of the calling application.
        rng = np.random.RandomState(seed=47)
        keep = np.zeros(valid.shape, dtype=bool)
        keep_ids = rng.choice(num_valid, size=samples, replace=False)
        keep[keep_ids] = True
        dsa = valid[keep]
    else:
        keep = np.ones(valid.shape, dtype=bool)
        dsa = valid

    if retidx:
        # translate the kept values back to the original array
        keep_inv = np.zeros_like(bad)
        keep_inv[~bad] = keep
        return dsa, keep_inv
    return dsa
def _grid_bin_index(values, grid_size):
    """Map `values` to integer grid bin indices in [0, grid_size)"""
    vmin = values.min()
    span = values.max() - vmin
    if span == 0:
        # All values are identical: avoid 0/0 (nan) and let every
        # event fall into the same bin.
        span = 1
    px = (values - vmin) / span * grid_size
    # Legacy binning: shift by one, truncate towards zero and let the
    # resulting index -1 wrap around to the last bin (this is what
    # indexing an array with `int(px-1)` does).
    return (px - 1).astype(int) % grid_size


@Cache
def downsample_grid(a, b, samples, remove_invalid=True, retidx=False):
    """Content-based downsampling for faster visualization

    The arrays `a` and `b` make up a 2D scatter plot with high
    and low density values. This method takes out points at
    indices with high density.

    The selection is reproducible: a private random number generator
    with a fixed seed is used and the global `np.random` state is
    left untouched.

    Parameters
    ----------
    a, b: 1d ndarrays
        The input arrays to downsample
    samples: int
        The desired number of samples. If this is zero or not smaller
        than the number of valid events, no downsampling takes place
        and all valid events are returned.
    remove_invalid: bool
        Remove nan and inf values before downsampling. Only `True` is
        supported at the moment.
    retidx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a` and `b`.

    Returns
    -------
    dsa, dsb: 1d ndarrays
        The arrays `a` and `b` downsampled by evenly selecting points
        and pseudo-randomly adding or removing points to match
        `samples`; they have shape (samples,) if downsampling took
        place.
    [idx]: 1d boolean array with same shape as `a`
        A boolean array such that `a[idx] == dsa` is all true
        (only returned if `retidx` is set).

    Raises
    ------
    ValueError
        If `remove_invalid` is False.
    """
    if not remove_invalid:
        raise ValueError("Downsampling cannot handle inf/nan yet!")

    samples = int(samples)

    # slice out nans and infs
    bad = np.isnan(a) | np.isinf(a) | np.isnan(b) | np.isinf(b)
    a = a[~bad]
    b = b[~bad]

    if samples and samples < a.shape[0]:
        # Fixed random state for this method. A private generator yields
        # the same draws as seeding the global one, but does not clobber
        # the random state of the calling application.
        rng = np.random.RandomState(seed=47)

        # The events to keep
        keep = np.zeros(a.shape, dtype=bool)

        # 1. Produce evenly distributed samples
        # Choosing grid-size:
        # - large numbers tend to show actual structures of the sample,
        #   which is not desired for plotting
        # - small numbers tend to result in too few samples and,
        #   in order to reach the desired samples, the data must be
        #   upsampled again.
        # 300 is about the size of the plot in marker sizes and yields
        # good results.
        grid_size = 300
        cell = (_grid_bin_index(a, grid_size) * grid_size
                + _grid_bin_index(b, grid_size))
        # Keep only the first event that falls into each grid cell;
        # all later events in an occupied cell are overlapping.
        first_in_cell = np.unique(cell, return_index=True)[1]
        keep[first_in_cell] = True

        # 2. Make sure that we reach `samples` by adding or
        # removing events.
        diff = np.sum(keep) - samples
        if diff > 0:
            # Too many samples
            rem_indices = np.flatnonzero(keep)
            rem = rng.choice(rem_indices, size=diff, replace=False)
            keep[rem] = False
        elif diff < 0:
            # Not enough samples
            add_indices = np.flatnonzero(~keep)
            add = rng.choice(add_indices, size=abs(diff), replace=False)
            keep[add] = True

        assert np.sum(keep) == samples

        asd = a[keep]
        bsd = b[keep]
    else:
        keep = np.ones(a.shape, dtype=bool)
        asd = a
        bsd = b

    if retidx:
        # translate the kept values back to the original array
        keep_inv = np.zeros_like(bad)
        keep_inv[~bad] = keep
        return asd, bsd, keep_inv
    return asd, bsd