# Source code for dclab.kde_methods

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Kernel Density Estimation methods
"""
from __future__ import division, print_function, unicode_literals

import numpy as np
from scipy.interpolate import RectBivariateSpline
from scipy.stats import gaussian_kde, skew
from statsmodels.nonparametric.kernel_density import KDEMultivariate

from .cached import Cache

[docs]def bin_num_doane(a):
"""Compute number of bins based on Doane's formula"""
bad = np.isnan(a) + np.isinf(a)
acc = bin_width_doane(a)
num = np.int(np.round((data.max() - data.min()) / acc))
return num

[docs]def bin_width_doane(a):
"""Compute accuracy (bin width) based on Doane's formula"""
# https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
# https://stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
bad = np.isnan(a) + np.isinf(a)
n = data.size
g1 = skew(data)
sigma_g1 = np.sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
k = 1 + np.log2(n) + np.log2(1 + np.abs(g1) / sigma_g1)
acc = (data.max() - data.min()) / k
return acc

return np.isnan(x)+np.isinf(x)+np.isnan(y)+np.isinf(y)

[docs]def ignore_nan_inf(kde_method):
"""Ignores nans and infs from the input data

Invalid positions in the resulting density are set to nan.
"""
def new_kde_method(events_x, events_y, xout=None, yout=None, *args, **kwargs):
if xout is None:
density = np.zeros_like(events_x, dtype=float)
xo = yo = None
else:
density = np.zeros_like(xout, dtype=float)
# Filter events
density[~bad_out] = kde_method(ev_x, ev_y,
xo, yo,
*args, **kwargs)
return density

doc_add = "\n    Notes\n"+\
"    -----\n"+\
"    This is a wrapped version that ignores nan and inf values."
new_kde_method.__doc__ = kde_method.__doc__ + doc_add

return new_kde_method

[docs]@ignore_nan_inf
@Cache
def kde_gauss(events_x, events_y, xout=None, yout=None):
""" Gaussian Kernel Density Estimation

Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.

Returns
-------
density: ndarray, same shape as xout
The KDE for the points in (xout, yout)

--------
scipy.stats.gaussian_kde
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both xout and yout must be (un)set.")

if yout is None and yout is None:
xout = events_x
yout = events_y

try:
estimator = gaussian_kde([events_x.flatten(), events_y.flatten()])
density = estimator.evaluate([xout.flatten(), yout.flatten()])
except np.linalg.LinAlgError:
# LinAlgError occurs when matrix to solve is singular (issue #117)
density = np.zeros(xout.shape)*np.nan
return density.reshape(xout.shape)

[docs]@ignore_nan_inf
@Cache
def kde_histogram(events_x, events_y, xout=None, yout=None, bins=None):
""" Histogram-based Kernel Density Estimation

Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.
bins: tuple (binsx, binsy)
The number of bins to use for the histogram.

Returns
-------
density: ndarray, same shape as xout
The KDE for the points in (xout, yout)

--------
numpy.histogram2d
scipy.interpolate.RectBivariateSpline
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both xout and yout must be (un)set.")

if yout is None and yout is None:
xout = events_x
yout = events_y

if bins is None:
bins = (max(5, bin_num_doane(events_x)),
max(5, bin_num_doane(events_y)))

# Compute the histogram
hist2d, xedges, yedges = np.histogram2d(x=events_x,
y=events_y,
bins=bins,
normed=True)
xip = xedges[1:]-(xedges-xedges)/2
yip = yedges[1:]-(yedges-yedges)/2

estimator = RectBivariateSpline(x=xip, y=yip, z=hist2d)
density = estimator.ev(xout, yout)
density[density<0] = 0

return density.reshape(xout.shape)

[docs]def kde_none(events_x, events_y, xout=None, yout=None):
""" No Kernel Density Estimation

Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.

Returns
-------
density: ndarray, same shape as xout
The KDE for the points in (xout, yout)

Notes
-----
This method is a convenience method that always returns ones in the shape
that the other methods in this module produce.
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both xout and yout must be (un)set.")

if yout is None and yout is None:
xout = events_x
yout = events_y

return np.ones(xout.shape)

[docs]@ignore_nan_inf
@Cache
def kde_multivariate(events_x, events_y, xout=None, yout=None, bw=None):
""" Multivariate Kernel Density Estimation

Parameters
----------
events_x, events_y: 1D ndarray
The input points for kernel density estimation. Input
is flattened automatically.
bw: tuple (bwx, bwy) or None
The bandwith for kernel density estimation.
xout, yout: ndarray
The coordinates at which the KDE should be computed.
If set to none, input coordinates are used.

Returns
-------
density: ndarray, same shape as xout
The KDE for the points in (xout, yout)

--------
statsmodels.nonparametric.kernel_density.KDEMultivariate
"""
valid_combi = ((xout is None and yout is None) or
(xout is not None and yout is not None)
)
if not valid_combi:
raise ValueError("Both xout and yout must be (un)set.")

if yout is None and yout is None:
xout = events_x
yout = events_y
if bw is None:
# divide by 2 to make it comparable to histogram KDE
bw = (bin_width_doane(events_x) / 2,
bin_width_doane(events_y) / 2)

positions = np.vstack([xout.flatten(), yout.flatten()])
estimator_ly = KDEMultivariate(data=[events_x.flatten(),
events_y.flatten()],
var_type='cc', bw=bw)

density = estimator_ly.pdf(positions)
return density.reshape(xout.shape)

methods = {"gauss": kde_gauss,
"histogram": kde_histogram,
"none": kde_none,
"multivariate": kde_multivariate}