Source code for dclab.kde_methods

"""Kernel Density Estimation methods"""

import numpy as np
from scipy.interpolate import RectBivariateSpline
from scipy.stats import gaussian_kde, skew

from .cached import Cache
from .external.statsmodels.nonparametric.kernel_density import KDEMultivariate


[docs] def bin_num_doane(a): """Compute number of bins based on Doane's formula Notes ----- If the bin width cannot be determined, then a bin number of 5 is returned. See Also -------- bin_width_doane: method used to compute the bin width """ bad = np.isnan(a) | np.isinf(a) data = a[~bad] acc = bin_width_doane(a) if acc == 0 or np.isnan(acc): num = 5 else: num = int(np.round((data.max() - data.min()) / acc)) return num
[docs] def bin_width_doane(a): """Compute contour spacing based on Doane's formula References ---------- - `<https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width>`_ - `<https://stats.stackexchange.com/questions/55134/ doanes-formula-for-histogram-binning>`_ Notes ----- Doane's formula is actually designed for histograms. This function is kept here for backwards-compatibility reasons. It is highly recommended to use :func:`bin_width_percentile` instead. """ bad = np.isnan(a) | np.isinf(a) data = a[~bad] n = data.size g1 = skew(data) sigma_g1 = np.sqrt(6 * (n - 2) / ((n + 1) * (n + 3))) k = 1 + np.log2(n) + np.log2(1 + np.abs(g1) / sigma_g1) acc = (data.max() - data.min()) / k return acc
[docs] def bin_width_percentile(a): """Compute contour spacing based on data percentiles The 10th and the 90th percentile of the input data are taken. The spacing then computes to the difference between those two percentiles divided by 23. Notes ----- The Freedman–Diaconis rule uses the interquartile range and normalizes to the third root of len(a). Such things do not work very well for RT-DC data, because len(a) is huge. Here we use just the top and bottom 10th percentiles with a fixed normalization. """ bad = np.isnan(a) | np.isinf(a) data = a[~bad] start = np.percentile(data, 10) end = np.percentile(data, 90) acc = (end - start) / 23 return acc
[docs] def get_bad_vals(x, y): return np.isnan(x) | np.isinf(x) | np.isnan(y) | np.isinf(y)
[docs] def ignore_nan_inf(kde_method): """Ignores nans and infs from the input data Invalid positions in the resulting density are set to nan. """ def new_kde_method(events_x, events_y, xout=None, yout=None, *args, **kwargs): bad_in = get_bad_vals(events_x, events_y) if xout is None: density = np.zeros_like(events_x, dtype=np.float64) bad_out = bad_in xo = yo = None else: density = np.zeros_like(xout, dtype=np.float64) bad_out = get_bad_vals(xout, yout) xo = xout[~bad_out] yo = yout[~bad_out] # Filter events ev_x = events_x[~bad_in] ev_y = events_y[~bad_in] density[~bad_out] = kde_method(ev_x, ev_y, xo, yo, *args, **kwargs) density[bad_out] = np.nan return density doc_add = "\n Notes\n" +\ " -----\n" +\ " This is a wrapped version that ignores nan and inf values." new_kde_method.__doc__ = kde_method.__doc__ + doc_add return new_kde_method
[docs] @ignore_nan_inf @Cache def kde_gauss(events_x, events_y, xout=None, yout=None): """ Gaussian Kernel Density Estimation Parameters ---------- events_x, events_y: 1D ndarray The input points for kernel density estimation. Input is flattened automatically. xout, yout: ndarray The coordinates at which the KDE should be computed. If set to none, input coordinates are used. Returns ------- density: ndarray, same shape as `xout` The KDE for the points in (xout, yout) See Also -------- `scipy.stats.gaussian_kde` """ valid_combi = ((xout is None and yout is None) or (xout is not None and yout is not None) ) if not valid_combi: raise ValueError("Both `xout` and `yout` must be (un)set.") if xout is None and yout is None: xout = events_x yout = events_y try: estimator = gaussian_kde([events_x.flatten(), events_y.flatten()]) density = estimator.evaluate([xout.flatten(), yout.flatten()]) except np.linalg.LinAlgError: # LinAlgError occurs when matrix to solve is singular (issue #117) density = np.zeros(xout.shape)*np.nan return density.reshape(xout.shape)
[docs] @ignore_nan_inf @Cache def kde_histogram(events_x, events_y, xout=None, yout=None, bins=None): """ Histogram-based Kernel Density Estimation Parameters ---------- events_x, events_y: 1D ndarray The input points for kernel density estimation. Input is flattened automatically. xout, yout: ndarray The coordinates at which the KDE should be computed. If set to none, input coordinates are used. bins: tuple (binsx, binsy) The number of bins to use for the histogram. Returns ------- density: ndarray, same shape as `xout` The KDE for the points in (xout, yout) See Also -------- `numpy.histogram2d` `scipy.interpolate.RectBivariateSpline` """ valid_combi = ((xout is None and yout is None) or (xout is not None and yout is not None) ) if not valid_combi: raise ValueError("Both `xout` and `yout` must be (un)set.") if xout is None and yout is None: xout = events_x yout = events_y if bins is None: bins = (max(5, bin_num_doane(events_x)), max(5, bin_num_doane(events_y))) # Compute the histogram hist2d, xedges, yedges = np.histogram2d(x=events_x, y=events_y, bins=bins, density=True) xip = xedges[1:]-(xedges[1]-xedges[0])/2 yip = yedges[1:]-(yedges[1]-yedges[0])/2 estimator = RectBivariateSpline(x=xip, y=yip, z=hist2d) density = estimator.ev(xout, yout) density[density < 0] = 0 return density.reshape(xout.shape)
[docs] def kde_none(events_x, events_y, xout=None, yout=None): """No Kernel Density Estimation Parameters ---------- events_x, events_y: 1D ndarray The input points for kernel density estimation. Input is flattened automatically. xout, yout: ndarray The coordinates at which the KDE should be computed. If set to none, input coordinates are used. Returns ------- density: ndarray, same shape as `xout` The KDE for the points in (xout, yout) Notes ----- This method is a convenience method that always returns ones in the shape that the other methods in this module produce. """ valid_combi = ((xout is None and yout is None) or (xout is not None and yout is not None) ) if not valid_combi: raise ValueError("Both `xout` and `yout` must be (un)set.") if xout is None and yout is None: xout = events_x _ = events_y return np.ones(xout.shape)
[docs] @ignore_nan_inf @Cache def kde_multivariate(events_x, events_y, xout=None, yout=None, bw=None): """ Multivariate Kernel Density Estimation Parameters ---------- events_x, events_y: 1D ndarray The input points for kernel density estimation. Input is flattened automatically. bw: tuple (bwx, bwy) or None The bandwith for kernel density estimation. xout, yout: ndarray The coordinates at which the KDE should be computed. If set to none, input coordinates are used. Returns ------- density: ndarray, same shape as `xout` The KDE for the points in (xout, yout) See Also -------- `statsmodels.nonparametric.kernel_density.KDEMultivariate` """ valid_combi = ((xout is None and yout is None) or (xout is not None and yout is not None) ) if not valid_combi: raise ValueError("Both `xout` and `yout` must be (un)set.") if xout is None and yout is None: xout = events_x yout = events_y if bw is None: # divide by 2 to make it comparable to histogram KDE bw = (bin_width_doane(events_x) / 2, bin_width_doane(events_y) / 2) positions = np.vstack([xout.flatten(), yout.flatten()]) estimator_ly = KDEMultivariate(data=[events_x.flatten(), events_y.flatten()], var_type='cc', bw=bw) density = estimator_ly.pdf(positions) return density.reshape(xout.shape)
methods = {"gauss": kde_gauss, "histogram": kde_histogram, "none": kde_none, "multivariate": kde_multivariate}