Source code for dclab.statistics

"""Statistics computation for RT-DC dataset instances"""

import numpy as np
import traceback as tb
import warnings

from . import definitions as dfn

[docs] class BadMethodWarning(UserWarning): pass
[docs] class Statistics(object): available_methods = {} def __init__(self, name, method, req_feature=False): """A helper class for computing statistics All statistical methods are registered in the dictionary `Statistics.available_methods`. """ self.method = method = name self.req_feature = req_feature Statistics.available_methods[name] = self def __call__(self, **kwargs): data = self._get_data(kwargs) if len(data) == 0: result = np.nan else: try: result = self.method(data) except BaseException: exc = tb.format_exc().replace("\n", "\n | ") warnings.warn("Failed to compute {} for {}: {}".format(, kwargs["ds"].title, exc), BadMethodWarning) result = np.nan return result def _get_data(self, kwargs): """Convenience wrapper to get statistics data""" if "ds" not in kwargs: raise ValueError("Keyword argument 'ds' missing.") ds = kwargs["ds"] if self.req_feature: if "feature" not in kwargs: raise ValueError("Keyword argument 'feature' missing.") return self.get_feature(ds, kwargs["feature"]) else: return ds
[docs] def get_feature(self, ds, feat): """Return filtered feature data The features are filtered according to the user-defined filters, using the information in `ds.filter.all`. In addition, all `nan` and `inf` values are purged. Parameters ---------- ds: dclab.rtdc_dataset.RTDCBase The dataset containing the feature feat: str The name of the feature; must be a scalar feature """ if ds.config["filtering"]["enable filters"]: x = ds[feat][ds.filter.all] else: x = ds[feat] bad = np.isnan(x) | np.isinf(x) xout = x[~bad] return xout
[docs] def flow_rate(ds): """Return the flow rate of an RT-DC dataset""" conf = ds.config["setup"] if "flow rate" in conf: return conf["flow rate"] else: return np.nan
[docs] def get_statistics(ds, methods=None, features=None): """Compute statistics for an RT-DC dataset Parameters ---------- ds: dclab.rtdc_dataset.RTDCBase The dataset for which to compute the statistics. methods: list of str or None The methods wih which to compute the statistics. The list of available methods is given with `dclab.statistics.Statistics.available_methods.keys()` If set to `None`, statistics for all methods are computed. features: list of str Feature name identifiers are defined by `dclab.definitions.feature_exists`. If set to `None`, statistics for all scalar features available are computed. Returns ------- header: list of str The header (feature + method names) of the computed statistics. values: list of float The computed statistics. """ if methods is None: cls = list(Statistics.available_methods.keys()) # sort the features in a usable way avm = Statistics.available_methods me1 = [m for m in cls if not avm[m].req_feature] me2 = [m for m in cls if avm[m].req_feature] methods = me1 + me2 if features is None: features = ds.features_scalar else: features = [a.lower() for a in features] header = [] values = [] # First loop over all methods that do not require a feature for mt in methods: meth = Statistics.available_methods[mt] if not meth.req_feature: values.append(meth(ds=ds)) header.append(mt) # To make sure that all methods are computed for each feature in a block, # we loop over all features. It would be easier to loop over the methods, # but the ordering of the resulting statistics would not be human-friendly. for ft in features: for mt in methods: meth = Statistics.available_methods[mt] if meth.req_feature: if ft in ds: values.append(meth(ds=ds, feature=ft)) else: values.append(np.nan) label = dfn.get_feature_label(ft, rtdc_ds=ds) header.append(" ".join([mt, label])) return header, values
[docs] def mode(data): """Compute an intelligent value for the mode The most common value in experimental is not very useful if there are a lot of digits after the comma. This method approaches this issue by rounding to bin size that is determined by the Freedman–Diaconis rule. Parameters ---------- data: 1d ndarray The data for which the mode should be computed. Returns ------- mode: float The mode computed with the Freedman-Diaconis rule. """ # size n = data.shape[0] # interquartile range iqr = np.percentile(data, 75)-np.percentile(data, 25) # Freedman–Diaconis bin_size = 2 * iqr / n**(1/3) if bin_size == 0: return np.nan # Add bin_size/2, because we want the center of the bin and # not the left corner of the bin. databin = np.round(data/bin_size)*bin_size + bin_size/2 u, indices = np.unique(databin, return_inverse=True) mode = u[np.argmax(np.bincount(indices))] return mode
# Register all the methods # Methods that require an axis Statistics(name="Mean", req_feature=True, method=np.average) Statistics(name="Median", req_feature=True, method=np.median) Statistics(name="Mode", req_feature=True, method=mode) Statistics(name="SD", req_feature=True, method=np.std) # Methods that work on RTDCBase Statistics(name="Events", method=lambda mm: np.sum(mm.filter.all)) Statistics(name="%-gated", method=lambda mm: np.average(mm.filter.all)*100) Statistics(name="Flow rate", method=lambda mm: flow_rate(mm))