Source code for dclab.statistics

```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Statistics computation for RT-DC dataset instances"""
from __future__ import division, print_function, unicode_literals

import numpy as np
import traceback as tb
import warnings

from . import definitions as dfn

class Statistics(object):
    """Wrapper around a single statistical method.

    Creating an instance registers it under its name in the class-level
    dictionary `Statistics.available_methods`. Calling the instance
    computes the statistic for a dataset.
    """
    # Maps method name -> Statistics instance; shared by all instances.
    available_methods = {}

    def __init__(self, name, method, req_feature=False):
        """A helper class for computing statistics

        All statistical methods are registered in the dictionary
        `Statistics.available_methods`.

        Parameters
        ----------
        name: str
            Key under which this instance is registered. An existing
            entry with the same name is replaced.
        method: callable
            Called with one argument: the filtered feature data if
            `req_feature` is True, otherwise the dataset itself.
        req_feature: bool
            Whether the method operates on the data of one feature.
        """
        self.method = method
        self.name = name
        self.req_feature = req_feature
        Statistics.available_methods[name] = self

    def __call__(self, **kwargs):
        """Compute the statistic

        Parameters
        ----------
        ds:
            The dataset (required keyword argument).
        feature: str
            The feature name (required keyword argument if the
            method was registered with `req_feature=True`).

        Returns
        -------
        result:
            The return value of the wrapped method, or `np.nan` if
            there is no data or the wrapped method raised an exception.

        Raises
        ------
        ValueError
            If a required keyword argument is missing.
        """
        data = self._get_data(kwargs)
        if len(data) == 0:
            return np.nan
        try:
            return self.method(data)
        except Exception:
            # Best effort: a failing method must not abort the whole
            # statistics run, so warn (including the traceback) and
            # fall back to nan. `Exception` is used instead of
            # `BaseException` so that KeyboardInterrupt and SystemExit
            # still propagate.
            exc = tb.format_exc().replace("\n", "\n    | ")
            warnings.warn("Failed to compute {} for {}: {}".format(
                self.name, kwargs["ds"].title, exc))
            return np.nan

    def _get_data(self, kwargs):
        """Convenience wrapper to get statistics data

        Returns the filtered feature data if the method requires a
        feature, otherwise the dataset itself.

        Raises
        ------
        ValueError
            If "ds" is missing, or if "feature" is missing although
            the method requires a feature.
        """
        if "ds" not in kwargs:
            raise ValueError("Keyword argument 'ds' missing.")

        ds = kwargs["ds"]

        if not self.req_feature:
            return ds

        if "feature" not in kwargs:
            raise ValueError("Keyword argument 'feature' missing.")
        return self.get_feature(ds, kwargs["feature"])

    def get_feature(self, ds, feat):
        """Return filtered feature data

        The features are filtered according to the user-defined filters,
        using the information in `ds._filter`. In addition, all
        `nan` and `inf` values are purged.

        Parameters
        ----------
        ds: dclab.rtdc_dataset.RTDCBase
            The dataset containing the feature
        feat: str
            The name of the feature; must be a scalar feature

        Returns
        -------
        xout:
            The feature data with filtered events and all non-finite
            values removed.
        """
        if ds.config["filtering"]["enable filters"]:
            x = ds[feat][ds._filter]
        else:
            x = ds[feat]
        # Drop nan and +/-inf so they cannot poison the statistics.
        xout = x[np.isfinite(x)]
        return xout

def flow_rate(ds):
    """Return the flow rate of an RT-DC dataset

    The value is read from the "flow rate" key of the "setup"
    configuration section; `np.nan` is returned if that key is absent.
    """
    setup = ds.config["setup"]
    return setup["flow rate"] if "flow rate" in setup else np.nan

def get_statistics(ds, methods=None, features=None):
    """Compute statistics for an RT-DC dataset

    Parameters
    ----------
    ds: dclab.rtdc_dataset.RTDCBase
        The dataset for which to compute the statistics.
    methods: list of str or None
        The methods with which to compute the statistics.
        The list of available methods is given with
        `dclab.statistics.Statistics.available_methods.keys()`
        If set to `None`, statistics for all methods are computed.
    features: list of str
        Feature name identifiers are defined in
        `dclab.definitions.scalar_feature_names`.
        If set to `None`, statistics for all axes are computed.

    Returns
    -------
    header: list of str
        The header (feature + method names) of the computed statistics.
    values: list of float
        The computed statistics.
    """
    registry = Statistics.available_methods

    if methods is None:
        names = list(registry)
        # Sort the methods in a usable way: dataset-level methods first,
        # followed by the methods that need a feature.
        methods = ([m for m in names if not registry[m].req_feature]
                   + [m for m in names if registry[m].req_feature])

    if features is None:
        features = dfn.scalar_feature_names
    else:
        features = [a.lower() for a in features]

    header = []
    values = []
    # Dataset-level methods that were already computed.
    seen = set()

    # To make sure that all methods are computed for each feature in a block,
    # we loop over all features. It would be easier to loop over the methods,
    # but the resulting statistics would not be human-friendly.
    for ft in features:
        for mt in methods:
            meth = registry[mt]
            if meth.req_feature:
                if ft in ds:
                    values.append(meth(ds=ds, feature=ft))
                else:
                    values.append(np.nan)
                # NOTE(review): the feature identifier is used as label;
                # swap in a human-readable label if `definitions`
                # provides one.
                header.append("{} {}".format(mt, ft))
            elif mt not in seen:
                # Prevent multiple entries of this method.
                seen.add(mt)
                values.append(meth(ds=ds))
                header.append(mt)

    return header, values

def mode(data):
    """Compute an intelligent value for the mode

    The most common value in experimental data is not very useful if
    there are a lot of digits after the comma. This method approaches
    this issue by rounding to a bin size that is determined by the
    Freedman-Diaconis rule.

    Parameters
    ----------
    data: 1d ndarray
        The data for which the mode should be computed.

    Returns
    -------
    mode: float
        The mode computed with the Freedman-Diaconis rule, or `np.nan`
        if the resulting bin size is zero.
    """
    n_events = data.shape[0]
    q25, q75 = np.percentile(data, [25, 75])
    # Freedman-Diaconis: bin width = 2 * IQR / n^(1/3)
    width = 2 * (q75 - q25) / n_events**(1/3)

    if width == 0:
        return np.nan

    # Shift by half a bin width, because we want the center of the bin
    # and not its left corner.
    centers = np.round(data / width) * width + width / 2
    # The most populated bin wins; ties resolve to the smallest value.
    uniq, counts = np.unique(centers, return_counts=True)
    return uniq[np.argmax(counts)]

# Populate `Statistics.available_methods`; creating an instance registers it.
# Feature-based methods (called with the filtered data of one feature)
Statistics("Mean", np.average, req_feature=True)
Statistics("Median", np.median, req_feature=True)
Statistics("Mode", mode, req_feature=True)
Statistics("SD", np.std, req_feature=True)
# Dataset-based methods (called with the RTDCBase instance itself)
Statistics("Events", lambda ds: np.sum(ds._filter))
Statistics("%-gated", lambda ds: np.average(ds._filter) * 100)
Statistics("Flow rate", lambda ds: flow_rate(ds))
```