Source code for dclab.statistics

"""Statistics computation for RT-DC dataset instances"""

import numpy as np
import traceback as tb
import warnings

from . import definitions as dfn



[docs]
class BadMethodWarning(UserWarning):
    """Warning category used when a statistical method fails to compute"""




[docs]
class Statistics(object):
    """A helper class for computing statistics

    Every instance wraps one statistical method and registers itself
    under its name in the class-level dictionary
    `Statistics.available_methods`. Calling an instance computes the
    statistic for a dataset (and, if required, for one feature of it).
    """
    # Registry of all instances created so far, keyed by method name.
    # Shared by all instances; a later instance with the same name
    # replaces the earlier one.
    available_methods = {}

    def __init__(self, name, method, req_feature=False):
        """Create and register a statistical method

        Parameters
        ----------
        name: str
            Name under which the method is registered in
            `Statistics.available_methods`; also used in warning
            messages.
        method: callable
            Called with a single positional argument: the filtered
            feature data if `req_feature` is True, otherwise the
            dataset itself.
        req_feature: bool
            Whether the method operates on the data of one feature
            (True) or on the dataset as a whole (False).
        """
        self.method = method
        self.name = name
        self.req_feature = req_feature
        Statistics.available_methods[name] = self

    def __call__(self, **kwargs):
        """Compute the statistic

        Parameters
        ----------
        ds: dclab.rtdc_dataset.RTDCBase
            The dataset (mandatory keyword argument)
        feature: str
            The feature name (mandatory keyword argument if the method
            was registered with `req_feature=True`)

        Returns
        -------
        result:
            The return value of the wrapped method. `np.nan` is
            returned if there is no data to work with or if the method
            raised an exception; in the latter case a
            :class:`BadMethodWarning` containing the traceback is
            issued.

        Raises
        ------
        ValueError
            If a mandatory keyword argument is missing.
        """
        data = self._get_data(kwargs)
        if len(data) == 0:
            return np.nan

        try:
            return self.method(data)
        except Exception:
            # Only regular errors are converted to a warning. Catching
            # `BaseException` here would also swallow KeyboardInterrupt
            # and SystemExit, which must be allowed to propagate.
            exc = tb.format_exc().replace("\n", "\n    | ")
            warnings.warn("Failed to compute {} for {}: {}".format(
                          self.name, kwargs["ds"].title, exc),
                          BadMethodWarning)
            return np.nan

    def _get_data(self, kwargs):
        """Convenience wrapper to get statistics data

        Returns the filtered feature data if this method requires a
        feature and the dataset itself otherwise.

        Raises
        ------
        ValueError
            If "ds" (or "feature", when required) is not in `kwargs`.
        """
        if "ds" not in kwargs:
            raise ValueError("Keyword argument 'ds' missing.")
        ds = kwargs["ds"]

        if not self.req_feature:
            return ds

        if "feature" not in kwargs:
            raise ValueError("Keyword argument 'feature' missing.")
        return self.get_feature(ds, kwargs["feature"])

    def get_feature(self, ds, feat):
        """Return filtered feature data

        The features are filtered according to the user-defined filters,
        using the information in `ds.filter.all`. In addition, all
        `nan` and `inf` values are purged.

        Parameters
        ----------
        ds: dclab.rtdc_dataset.RTDCBase
            The dataset containing the feature
        feat: str
            The name of the feature; must be a scalar feature

        Returns
        -------
        xout: ndarray
            The feature data with filtered events (only if filters are
            enabled in the dataset configuration) and all `nan`/`inf`
            entries removed.
        """
        x = ds[feat]
        if ds.config["filtering"]["enable filters"]:
            # Apply the boolean event mask only if filtering is enabled
            x = x[ds.filter.all]
        bad = np.isnan(x) | np.isinf(x)
        return x[~bad]





[docs]
def flow_rate(ds):
    """Return the flow rate of an RT-DC dataset

    The value is read from the "flow rate" key of the "setup"
    configuration section; `np.nan` is returned if that key is absent.
    """
    setup = ds.config["setup"]
    if "flow rate" not in setup:
        return np.nan
    return setup["flow rate"]




[docs]
def get_statistics(ds, methods=None, features=None, ret_dict=False):
    """Compute statistics for an RT-DC dataset

    Parameters
    ----------
    ds: dclab.rtdc_dataset.RTDCBase
        The dataset for which to compute the statistics.
    methods: list of str or None
        The methods with which to compute the statistics.
        The list of available methods is given with
        :func:`.available_methods.keys`
        If set to `None`, statistics for all methods are computed.
    features: list of str
        Feature name identifiers are defined by
        :func:`dclab.definitions.feature_exists`.
        If set to `None`, statistics for all scalar features
        available are computed.
    ret_dict: bool
        Instead of returning ``(header, values)``, return a dictionary
        with headers as keys.

    Returns
    -------
    header: list of str
        The header (feature + method names) of the computed statistics.
    values: list of float
        The computed statistics.

    Notes
    -----
    Methods that do not require a feature come first in the output
    (header is the method name). They are followed by one block per
    feature containing all feature-based methods (header is the method
    name and the feature label, separated by a space). Features that
    are not in `ds` yield `np.nan`.
    """
    registry = Statistics.available_methods

    if methods is None:
        # Use every registered method; dataset-level methods go first
        names = list(registry)
        global_names = [nm for nm in names if not registry[nm].req_feature]
        feature_names = [nm for nm in names if registry[nm].req_feature]
        methods = global_names + feature_names

    if features is None:
        features = ds.features_scalar
    else:
        features = [name.lower() for name in features]

    header = []
    values = []

    # Pass 1: methods that work on the dataset as a whole
    for mt in methods:
        stat = registry[mt]
        if stat.req_feature:
            continue
        values.append(stat(ds=ds))
        header.append(mt)

    # Pass 2: feature-based methods. The feature loop is the outer loop
    # so that the statistics of one feature end up next to each other,
    # which is the more readable ordering for humans.
    for ft in features:
        for mt in methods:
            stat = registry[mt]
            if not stat.req_feature:
                continue
            value = stat(ds=ds, feature=ft) if ft in ds else np.nan
            values.append(value)
            label = dfn.get_feature_label(ft, rtdc_ds=ds)
            header.append(" ".join((mt, label)))

    if not ret_dict:
        return header, values
    return dict(zip(header, values))




[docs]
def mode(data):
    """Compute an intelligent value for the mode

    The most common value in experimental data is not very useful if
    there are a lot of digits after the comma. This method approaches
    this issue by rounding to a bin size that is determined by the
    Freedman–Diaconis rule.

    Parameters
    ----------
    data: 1d ndarray
        The data for which the mode should be computed.

    Returns
    -------
    mode: float
        The mode computed with the Freedman-Diaconis rule. `np.nan` is
        returned if `data` is empty or if the bin size is zero (i.e.
        the interquartile range of `data` is zero).
    """
    # size
    n = data.shape[0]
    if n == 0:
        # An empty array has no mode. Without this guard, the argmax
        # below would raise a ValueError for an empty sequence.
        return np.nan

    # interquartile range
    iqr = np.percentile(data, 75) - np.percentile(data, 25)
    # Freedman–Diaconis
    bin_size = 2 * iqr / n**(1/3)

    if bin_size == 0:
        return np.nan

    # Add bin_size/2, because we want the center of the bin and
    # not the left corner of the bin.
    # NOTE(review): `np.round` maps each value to the *nearest* multiple
    # of `bin_size`, so adding `bin_size/2` appears to yield the upper
    # edge of the bin rather than its center. Left unchanged to keep
    # the returned values stable - please confirm the intended behavior.
    databin = np.round(data/bin_size)*bin_size + bin_size/2
    # The most frequently occupied bin is the mode; in case of a tie,
    # the bin with the smallest value wins (first maximum of `counts`).
    bins, counts = np.unique(databin, return_counts=True)
    return bins[np.argmax(counts)]



# Populate `Statistics.available_methods`: every `Statistics` instance
# created below registers itself under its name. The order of creation
# defines the default order of the methods.
# Feature-based methods (they receive the filtered data of one feature)
Statistics(name="Mean", method=np.average, req_feature=True)
# A word on premature optimization: `np.percentile` accepts an array of
# percentiles as its `q` argument, which would probably be faster than
# computing each percentile on its own. Making use of that does not fit
# the design used here (one `Statistics` instance per method); it would
# require a considerable amount of work and make the code harder to
# understand. Since statistics are not computed often and the
# computation is extremely fast anyway for a few millions of events,
# this is deliberately left as it is. Don't optimize this!
Statistics(name="10th Percentile",
           method=lambda values: np.percentile(values, 10),
           req_feature=True)
Statistics(name="25th Percentile",
           method=lambda values: np.percentile(values, 25),
           req_feature=True)
Statistics(name="Median", method=np.median, req_feature=True)
Statistics(name="75th Percentile",
           method=lambda values: np.percentile(values, 75),
           req_feature=True)
Statistics(name="90th Percentile",
           method=lambda values: np.percentile(values, 90),
           req_feature=True)
Statistics(name="Mode", method=mode, req_feature=True)
Statistics(name="SD", method=np.std, req_feature=True)
# Dataset-based methods (they receive the RTDCBase instance itself)
Statistics(name="Events",
           method=lambda ds: np.sum(ds.filter.all))
Statistics(name="%-gated",
           method=lambda ds: np.average(ds.filter.all)*100)
Statistics(name="Flow rate",
           method=lambda ds: flow_rate(ds))