# Source code for dclab.kde_methods

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Kernel Density Estimation methods"""
from __future__ import division, print_function, unicode_literals

import numpy as np
from scipy.interpolate import RectBivariateSpline
from scipy.stats import gaussian_kde, skew

from .cached import Cache
from .external.statsmodels.nonparametric.kernel_density import KDEMultivariate

[docs]def bin_num_doane(a):
"""Compute number of bins based on Doane's formula

Notes
-----
If the bin width cannot be determined, then a bin
number of 5 is returned.

--------
bin_width_doane: method used to compute the bin width
"""
bad = np.isnan(a) | np.isinf(a)
acc = bin_width_doane(a)
if acc == 0 or np.isnan(acc):
num = 5
else:
num = np.int(np.round((data.max() - data.min()) / acc))
return num

[docs]def bin_width_doane(a):
"""Compute contour spacing based on Doane's formula

References
----------
- <https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width>_
- <https://stats.stackexchange.com/questions/55134/
doanes-formula-for-histogram-binning>_

Notes
-----
Doane's formula is actually designed for histograms. This
function is kept here for backwards-compatibility reasons.
It is highly recommended to use :func:bin_width_percentile
"""
bad = np.isnan(a) | np.isinf(a)
n = data.size
g1 = skew(data)
sigma_g1 = np.sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
k = 1 + np.log2(n) + np.log2(1 + np.abs(g1) / sigma_g1)
acc = (data.max() - data.min()) / k
return acc

[docs]def bin_width_percentile(a):
"""Compute contour spacing based on data percentiles

The 10th and the 90th percentile of the input data are taken.
The spacing then computes to the difference between those
two percentiles divided by 23.

Notes
-----
The Freedmanâ€“Diaconis rule uses the interquartile range and
normalizes to the third root of len(a). Such things do not
work very well for RT-DC data, because len(a) is huge. Here
we use just the top and bottom 10th percentiles with a fixed
normalization.
"""
bad = np.isnan(a) | np.isinf(a)
start = np.percentile(data, 10)
end = np.percentile(data, 90)
acc = (end - start) / 23
return acc

return np.isnan(x) | np.isinf(x) | np.isnan(y) | np.isinf(y)

[docs]def ignore_nan_inf(kde_method):
"""Ignores nans and infs from the input data

Invalid positions in the resulting density are set to nan.
"""
def new_kde_method(events_x, events_y, xout=None, yout=None,
*args, **kwargs):
if xout is None:
density = np.zeros_like(events_x, dtype=float)
xo = yo = None
else:
density = np.zeros_like(xout, dtype=float)
# Filter events
density[~bad_out] = kde_method(ev_x, ev_y,
xo, yo,
*args, **kwargs)
return density

doc_add = "\n    Notes\n" +\
"    -----\n" +\
"    This is a wrapped version that ignores nan and inf values."
new_kde_method.__doc__ = kde_method.__doc__ + doc_add

return new_kde_method

[docs]@ignore_nan_inf
@Cache
def kde_gauss(events_x, events_y, xout=None, yout=None):
""" Gaussian Kernel Density Estimation

Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.

Returns
-------
density: ndarray, same shape as xout
The KDE for the points in (xout, yout)

--------
scipy.stats.gaussian_kde
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both xout and yout must be (un)set.")

if yout is None and yout is None:
xout = events_x
yout = events_y

try:
estimator = gaussian_kde([events_x.flatten(), events_y.flatten()])
density = estimator.evaluate([xout.flatten(), yout.flatten()])
except np.linalg.LinAlgError:
# LinAlgError occurs when matrix to solve is singular (issue #117)
density = np.zeros(xout.shape)*np.nan
return density.reshape(xout.shape)

[docs]@ignore_nan_inf
@Cache
def kde_histogram(events_x, events_y, xout=None, yout=None, bins=None):
""" Histogram-based Kernel Density Estimation

Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.
bins: tuple (binsx, binsy)
The number of bins to use for the histogram.

Returns
-------
density: ndarray, same shape as xout
The KDE for the points in (xout, yout)

--------
numpy.histogram2d
scipy.interpolate.RectBivariateSpline
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both xout and yout must be (un)set.")

if yout is None and yout is None:
xout = events_x
yout = events_y

if bins is None:
bins = (max(5, bin_num_doane(events_x)),
max(5, bin_num_doane(events_y)))

# Compute the histogram
hist2d, xedges, yedges = np.histogram2d(x=events_x,
y=events_y,
bins=bins,
normed=True)
xip = xedges[1:]-(xedges[1]-xedges[0])/2
yip = yedges[1:]-(yedges[1]-yedges[0])/2

estimator = RectBivariateSpline(x=xip, y=yip, z=hist2d)
density = estimator.ev(xout, yout)
density[density < 0] = 0

return density.reshape(xout.shape)

[docs]def kde_none(events_x, events_y, xout=None, yout=None):
""" No Kernel Density Estimation

Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.

Returns
-------
density: ndarray, same shape as xout
The KDE for the points in (xout, yout)

Notes
-----
This method is a convenience method that always returns ones in the shape
that the other methods in this module produce.
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both xout and yout must be (un)set.")

if yout is None and yout is None:
xout = events_x
yout = events_y

return np.ones(xout.shape)

[docs]@ignore_nan_inf
@Cache
def kde_multivariate(events_x, events_y, xout=None, yout=None, bw=None):
""" Multivariate Kernel Density Estimation

Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
bw: tuple (bwx, bwy) or None
The bandwith for kernel density estimation.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.

Returns
-------
density: ndarray, same shape as xout
The KDE for the points in (xout, yout)

--------
statsmodels.nonparametric.kernel_density.KDEMultivariate
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both xout and yout must be (un)set.")

if yout is None and yout is None:
xout = events_x
yout = events_y
if bw is None:
# divide by 2 to make it comparable to histogram KDE
bw = (bin_width_doane(events_x) / 2,
bin_width_doane(events_y) / 2)

positions = np.vstack([xout.flatten(), yout.flatten()])
estimator_ly = KDEMultivariate(data=[events_x.flatten(),
events_y.flatten()],
var_type='cc', bw=bw)

density = estimator_ly.pdf(positions)
return density.reshape(xout.shape)

methods = {"gauss": kde_gauss,
"histogram": kde_histogram,
"none": kde_none,
"multivariate": kde_multivariate}