"""Content-based downsampling of ndarrays"""
import numpy as np
from .cached import Cache
def downsample_rand(a, samples, remove_invalid=False, ret_idx=False):
    """Downsampling by randomly removing points

    The selection is pseudo-random but reproducible: every call draws
    from a private generator seeded with the same fixed value, so the
    same input always yields the same output. The global NumPy random
    state is neither read nor modified.

    Parameters
    ----------
    a: 1d ndarray
        The input array to downsample
    samples: int
        The desired number of samples; if it is zero or not smaller
        than the number of available points, no points are removed.
    remove_invalid: bool
        Remove nan and inf values before downsampling
    ret_idx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a`.

    Returns
    -------
    dsa: 1d ndarray
        The pseudo-randomly downsampled array `a`, in the original
        order of `a`. It has `samples` elements whenever points were
        removed; otherwise it contains all (valid) points.
    idx: 1d boolean array with same shape as `a`
        Only returned if `ret_idx` is True.
        A boolean array such that `a[idx] == dsa`
    """
    # Private generator with a fixed seed. Drawing from it gives the same
    # numbers as reseeding the global generator would, but does not
    # clobber the random state of the calling code.
    rng = np.random.RandomState(seed=47)

    samples = int(samples)

    if remove_invalid:
        # slice out nans and infs
        bad = np.isnan(a) | np.isinf(a)
        pool = a[~bad]
    else:
        pool = a

    if samples and (samples < pool.shape[0]):
        keep = np.zeros_like(pool, dtype=bool)
        keep_ids = rng.choice(np.arange(pool.size),
                              size=samples,
                              replace=False)
        keep[keep_ids] = True
        dsa = pool[keep]
    else:
        # nothing to remove: keep the entire pool
        keep = np.ones_like(pool, dtype=bool)
        dsa = pool

    if remove_invalid:
        # translate the kept values back to the original array
        idx = np.zeros(a.size, dtype=bool)
        idx[~bad] = keep
    else:
        idx = keep

    if ret_idx:
        return dsa, idx
    return dsa
@Cache
def downsample_grid(a, b, samples, remove_invalid=False, ret_idx=False):
    """Content-based downsampling for faster visualization

    The arrays `a` and `b` make up a 2D scatter plot with high
    and low density values. This method takes out points at
    indices with high density.

    The procedure has two steps: first, at most one event per cell of
    a fixed-size grid spanning the valid data range is kept (the first
    one in array order); second, events are pseudo-randomly removed or
    added until exactly `samples` events remain. The pseudo-random
    step draws from a private generator with a fixed seed, so results
    are reproducible and the global NumPy random state is untouched.

    Parameters
    ----------
    a, b: 1d ndarrays
        The input arrays to downsample
    samples: int
        The desired number of samples; if it is zero or not smaller
        than the number of available events, no events are removed.
    remove_invalid: bool
        Remove nan and inf values before downsampling; if set to
        `True`, the actual number of samples returned might be
        smaller than `samples` due to infinite or nan values.
        If set to `False`, invalid events are ignored in the grid
        step but may be added back by the pseudo-random step.
    ret_idx: bool
        Also return a boolean array that corresponds to the
        downsampled indices in `a` and `b`.

    Returns
    -------
    dsa, dsb: 1d ndarrays
        The arrays `a` and `b` downsampled by evenly selecting
        points and pseudo-randomly adding or removing points
        to match `samples`. They have `samples` elements whenever
        downsampling took place.
    idx: 1d boolean array with same shape as `a`
        Only returned if `ret_idx` is True.
        A boolean array such that `a[idx] == dsa`
    """
    # Private generator with a fixed seed. Drawing from it gives the same
    # numbers as reseeding the global generator would, but does not
    # clobber the random state of the calling code.
    rng = np.random.RandomState(seed=47)

    if remove_invalid:
        # Remove nan and inf values straight from the beginning.
        # This might result in arrays smaller than `samples`,
        # but it makes sure that no inf/nan values will be plotted.
        bad = ~valid(a, b)
        ad = a[~bad]
        bd = b[~bad]
    else:
        bad = np.zeros_like(a, dtype=bool)
        ad = a
        bd = b

    # Events of the original arrays that end up in the output
    keep = ~bad

    samples = int(samples)

    if samples and samples < ad.size:
        # The events to keep (indexes into `ad` / `bd`)
        keepd = np.zeros_like(ad, dtype=bool)

        # 1. Produce evenly distributed samples
        # Choosing grid-size:
        # - large numbers tend to show actual structures of the sample,
        #   which is not desired for plotting
        # - small numbers tend to result in too few samples and,
        #   in order to reach the desired samples, the data must be
        #   upsampled again.
        # 300 is about the size of the plot in marker sizes and yields
        # good results.
        grid_size = 300
        # `norm` needs at least one valid event to determine the data
        # range; without any, the grid step has nothing to select and
        # only the pseudo-random step below applies.
        if valid(ad, bd).any():
            xpx = norm(ad, ad, bd) * grid_size
            ypx = norm(bd, bd, ad) * grid_size
            # Only valid events take part in this step. However, in
            # step 2, invalid events could be added back. To avoid this
            # scenario, `remove_invalid` should be set to True.
            on_grid = np.flatnonzero(valid(xpx, ypx))
            # Grid cell of each event: truncate `px - 1` towards zero.
            # A result of -1 (event exactly on the minimum) wraps around
            # to the last cell, like negative indexing into the grid.
            col = (xpx[on_grid] - 1).astype(np.intp) % grid_size
            row = (ypx[on_grid] - 1).astype(np.intp) % grid_size
            # Keep the first event (in array order) of every occupied
            # cell; all later events in that cell are overlapping.
            _, first = np.unique(col * grid_size + row, return_index=True)
            keepd[on_grid[first]] = True

        # 2. Make sure that we reach `samples` by adding or
        # removing events.
        diff = np.sum(keepd) - samples
        if diff > 0:
            # Too many samples
            rem_indices = np.where(keepd)[0]
            rem = rng.choice(rem_indices,
                             size=diff,
                             replace=False)
            keepd[rem] = False
        elif diff < 0:
            # Not enough samples
            add_indices = np.where(~keepd)[0]
            add = rng.choice(add_indices,
                             size=abs(diff),
                             replace=False)
            keepd[add] = True

        assert np.sum(keepd) == samples, "sanity check"
        asd = ad[keepd]
        bsd = bd[keepd]
        # translate the kept events back to the original arrays
        keep[~bad] = keepd
    else:
        asd = ad
        bsd = bd

    if ret_idx:
        return asd, bsd, keep
    return asd, bsd
def valid(a, b):
    """Check whether `a` and `b` are not inf or nan

    Parameters
    ----------
    a, b: scalars or ndarrays
        The values to test; the check is performed element-wise.

    Returns
    -------
    ok: boolean scalar or ndarray
        True wherever neither `a` nor `b` is nan or infinite.
    """
    a_invalid = np.isnan(a) | np.isinf(a)
    b_invalid = np.isnan(b) | np.isinf(b)
    either_invalid = a_invalid | b_invalid
    return ~either_invalid
def norm(a, ref1, ref2):
    """Normalize `a` with min/max values of `ref1`

    Only those elements of `ref1` are used for which both `ref1` and
    `ref2` are not nan or inf.

    Parameters
    ----------
    a: ndarray
        The values to normalize
    ref1: ndarray
        The reference array that defines the minimum and maximum
    ref2: ndarray
        Companion of `ref1`; elements of `ref1` are ignored wherever
        `ref2` is nan or inf.

    Returns
    -------
    normed: ndarray
        `a` shifted by the reference minimum and divided by the
        reference range (maximum minus minimum).
    """
    usable = valid(ref1, ref2)
    reference = ref1[usable]
    lower = reference.min()
    upper = reference.max()
    return (a - lower) / (upper - lower)