"""Kernel Density Estimation methods"""
import numpy as np
from scipy.interpolate import RectBivariateSpline
from scipy.stats import gaussian_kde, skew
from .cached import Cache
from .external.statsmodels.nonparametric.kernel_density import KDEMultivariate
[docs]def bin_num_doane(a):
"""Compute number of bins based on Doane's formula
Notes
-----
If the bin width cannot be determined, then a bin
number of 5 is returned.
See Also
--------
bin_width_doane: method used to compute the bin width
"""
bad = np.isnan(a) | np.isinf(a)
data = a[~bad]
acc = bin_width_doane(a)
if acc == 0 or np.isnan(acc):
num = 5
else:
num = int(np.round((data.max() - data.min()) / acc))
return num
[docs]def bin_width_doane(a):
"""Compute contour spacing based on Doane's formula
References
----------
- `<https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width>`_
- `<https://stats.stackexchange.com/questions/55134/
doanes-formula-for-histogram-binning>`_
Notes
-----
Doane's formula is actually designed for histograms. This
function is kept here for backwards-compatibility reasons.
It is highly recommended to use :func:`bin_width_percentile`
instead.
"""
bad = np.isnan(a) | np.isinf(a)
data = a[~bad]
n = data.size
g1 = skew(data)
sigma_g1 = np.sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
k = 1 + np.log2(n) + np.log2(1 + np.abs(g1) / sigma_g1)
acc = (data.max() - data.min()) / k
return acc
[docs]def bin_width_percentile(a):
"""Compute contour spacing based on data percentiles
The 10th and the 90th percentile of the input data are taken.
The spacing then computes to the difference between those
two percentiles divided by 23.
Notes
-----
The Freedman–Diaconis rule uses the interquartile range and
normalizes to the third root of len(a). Such things do not
work very well for RT-DC data, because len(a) is huge. Here
we use just the top and bottom 10th percentiles with a fixed
normalization.
"""
bad = np.isnan(a) | np.isinf(a)
data = a[~bad]
start = np.percentile(data, 10)
end = np.percentile(data, 90)
acc = (end - start) / 23
return acc
[docs]def get_bad_vals(x, y):
return np.isnan(x) | np.isinf(x) | np.isnan(y) | np.isinf(y)
[docs]def ignore_nan_inf(kde_method):
"""Ignores nans and infs from the input data
Invalid positions in the resulting density are set to nan.
"""
def new_kde_method(events_x, events_y, xout=None, yout=None,
*args, **kwargs):
bad_in = get_bad_vals(events_x, events_y)
if xout is None:
density = np.zeros_like(events_x, dtype=float)
bad_out = bad_in
xo = yo = None
else:
density = np.zeros_like(xout, dtype=float)
bad_out = get_bad_vals(xout, yout)
xo = xout[~bad_out]
yo = yout[~bad_out]
# Filter events
ev_x = events_x[~bad_in]
ev_y = events_y[~bad_in]
density[~bad_out] = kde_method(ev_x, ev_y,
xo, yo,
*args, **kwargs)
density[bad_out] = np.nan
return density
doc_add = "\n Notes\n" +\
" -----\n" +\
" This is a wrapped version that ignores nan and inf values."
new_kde_method.__doc__ = kde_method.__doc__ + doc_add
return new_kde_method
[docs]@ignore_nan_inf
@Cache
def kde_gauss(events_x, events_y, xout=None, yout=None):
""" Gaussian Kernel Density Estimation
Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.
Returns
-------
density: ndarray, same shape as `xout`
The KDE for the points in (xout, yout)
See Also
--------
`scipy.stats.gaussian_kde`
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both `xout` and `yout` must be (un)set.")
if xout is None and yout is None:
xout = events_x
yout = events_y
try:
estimator = gaussian_kde([events_x.flatten(), events_y.flatten()])
density = estimator.evaluate([xout.flatten(), yout.flatten()])
except np.linalg.LinAlgError:
# LinAlgError occurs when matrix to solve is singular (issue #117)
density = np.zeros(xout.shape)*np.nan
return density.reshape(xout.shape)
[docs]@ignore_nan_inf
@Cache
def kde_histogram(events_x, events_y, xout=None, yout=None, bins=None):
""" Histogram-based Kernel Density Estimation
Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.
bins: tuple (binsx, binsy)
The number of bins to use for the histogram.
Returns
-------
density: ndarray, same shape as `xout`
The KDE for the points in (xout, yout)
See Also
--------
`numpy.histogram2d`
`scipy.interpolate.RectBivariateSpline`
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both `xout` and `yout` must be (un)set.")
if xout is None and yout is None:
xout = events_x
yout = events_y
if bins is None:
bins = (max(5, bin_num_doane(events_x)),
max(5, bin_num_doane(events_y)))
# Compute the histogram
hist2d, xedges, yedges = np.histogram2d(x=events_x,
y=events_y,
bins=bins,
density=True)
xip = xedges[1:]-(xedges[1]-xedges[0])/2
yip = yedges[1:]-(yedges[1]-yedges[0])/2
estimator = RectBivariateSpline(x=xip, y=yip, z=hist2d)
density = estimator.ev(xout, yout)
density[density < 0] = 0
return density.reshape(xout.shape)
[docs]def kde_none(events_x, events_y, xout=None, yout=None):
"""No Kernel Density Estimation
Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.
Returns
-------
density: ndarray, same shape as `xout`
The KDE for the points in (xout, yout)
Notes
-----
This method is a convenience method that always returns ones in the shape
that the other methods in this module produce.
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both `xout` and `yout` must be (un)set.")
if xout is None and yout is None:
xout = events_x
_ = events_y
return np.ones(xout.shape)
[docs]@ignore_nan_inf
@Cache
def kde_multivariate(events_x, events_y, xout=None, yout=None, bw=None):
""" Multivariate Kernel Density Estimation
Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
bw: tuple (bwx, bwy) or None
The bandwith for kernel density estimation.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.
Returns
-------
density: ndarray, same shape as `xout`
The KDE for the points in (xout, yout)
See Also
--------
`statsmodels.nonparametric.kernel_density.KDEMultivariate`
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both `xout` and `yout` must be (un)set.")
if xout is None and yout is None:
xout = events_x
yout = events_y
if bw is None:
# divide by 2 to make it comparable to histogram KDE
bw = (bin_width_doane(events_x) / 2,
bin_width_doane(events_y) / 2)
positions = np.vstack([xout.flatten(), yout.flatten()])
estimator_ly = KDEMultivariate(data=[events_x.flatten(),
events_y.flatten()],
var_type='cc', bw=bw)
density = estimator_ly.pdf(positions)
return density.reshape(xout.shape)
methods = {"gauss": kde_gauss,
"histogram": kde_histogram,
"none": kde_none,
"multivariate": kde_multivariate}