Source code for dclab.definitions

"""Naming conventions"""
import copy
import numpy as np

from .rtdc_dataset.ancillaries import AncillaryFeature
from .parse_funcs import (
    f2dfloatarray, fbool, fint, fintlist, func_types, lcstr)


#: All configuration keywords editable by the user
CFG_ANALYSIS = {
    # filtering parameters
    "filtering": [
        ["hierarchy parent", str, "Hierarchy parent of the dataset"],
        ["remove invalid events", fbool, "Remove events with inf/nan values"],
        ["enable filters", fbool, "Enable filtering"],
        ["limit events", fint, "Upper limit for number of filtered events"],
        ["polygon filters", fintlist, "Polygon filter indices"],
    ],
    # Addition user-defined data
    "calculation": [
        # "emodulus lut" was introduced in 0.32.0 and will replace
        # the deprecated "emodulus model".
        ["emodulus lut", str, "Look-up table identifier"],
        ["emodulus model", lcstr, "Model [DEPRECATED]"],
        ["emodulus medium", str, "Medium used (e.g. CellCarrierB, water)"],
        ["emodulus temperature", float, "Chip temperature [°C]"],
        ["emodulus viscosity", float, "Viscosity [Pa*s] if 'medium' unknown"],
        ["crosstalk fl21", float, "Fluorescence crosstalk, channel 2 to 1"],
        ["crosstalk fl31", float, "Fluorescence crosstalk, channel 3 to 1"],
        ["crosstalk fl12", float, "Fluorescence crosstalk, channel 1 to 2"],
        ["crosstalk fl32", float, "Fluorescence crosstalk, channel 3 to 2"],
        ["crosstalk fl13", float, "Fluorescence crosstalk, channel 1 to 3"],
        ["crosstalk fl23", float, "Fluorescence crosstalk, channel 2 to 3"],
    ]
}

#: All read-only configuration keywords for a measurement
CFG_METADATA = {
    # All parameters related to the actual experiment
    "experiment": [
        ["date", str, "Date of measurement ('YYYY-MM-DD')"],
        ["event count", fint, "Number of recorded events"],
        ["run index", fint, "Index of measurement run"],
        ["sample", str, "Measured sample or user-defined reference"],
        ["time", str, "Start time of measurement ('HH:MM:SS[.S]')"],
    ],
    # All special keywords related to RT-FDC
    # This section should not be present for regular RT-DC measurements.
    "fluorescence": [
        # The baseline offset was introduced in 0.33.0. It is added to
        # the trace data to obtain the actual signal used for data
        # processing (e.g. obtaining the fl1_max feature). This is more
        # robust than adding the offset directly to the trace data, because
        # of the possibility of integer overflows. Furthermore, DCKit can
        # set this parameter without modifying the original trace data
        # to correct/remove negative trace data
        # (see https://github.com/ZELLMECHANIK-DRESDEN/dclab/issues/101).
        # Note that traces accessed from RTDCBase instances are never
        # background-corrected!
        ["baseline 1 offset", fint, "Baseline offset channel 1"],
        ["baseline 2 offset", fint, "Baseline offset channel 2"],
        ["baseline 3 offset", fint, "Baseline offset channel 3"],
        ["bit depth", fint, "Trace bit depth"],
        # If a fluorescence channel is used, a channel name *must* be
        # present. If a channel is not used, the channel name *must not*
        # be present. E.g. if only channels 1 and 2 are used, but there
        # are three channels present, then `channel count` is two,
        # `channels installed` is three, and `channel 3 name` is not set.
        ["channel 1 name", str, "FL1 description"],
        ["channel 2 name", str, "FL2 description"],
        ["channel 3 name", str, "FL3 description"],
        ["channel count", fint, "Number of active channels"],
        ["channels installed", fint, "Number of available channels"],
        # In contrast to `channel ? name`, the laser power *may*
        # be present (but must be set to 0), if a laser line is not used.
        ["laser 1 lambda", float, "Laser 1 wavelength [nm]"],
        ["laser 1 power", float, "Laser 1 output power [%]"],
        ["laser 2 lambda", float, "Laser 2 wavelength [nm]"],
        ["laser 2 power", float, "Laser 2 output power [%]"],
        ["laser 3 lambda", float, "Laser 3 wavelength [nm]"],
        ["laser 3 power", float, "Laser 3 output power [%]"],
        ["laser count", fint, "Number of active lasers"],
        ["lasers installed", fint, "Number of available lasers"],
        ["sample rate", fint, "Trace sample rate [Hz]"],
        ["samples per event", fint, "Samples per event"],
        ["signal max", float, "Upper voltage detection limit [V]"],
        ["signal min", float, "Lower voltage detection limit [V]"],
        ["trace median", fint, "Rolling median filter size for traces"],
    ],
    # All tdms-related parameters
    "fmt_tdms": [
        ["video frame offset", fint, "Missing events at beginning of video"],
    ],
    # All imaging-related keywords
    "imaging": [
        ["flash device", str, "Light source device type"],  # e.g. green LED
        ["flash duration", float, "Light source flash duration [µs]"],
        ["frame rate", float, "Imaging frame rate [Hz]"],
        ["pixel size", float, "Pixel size [µm]"],
        ["roi position x", fint, "Image x coordinate on sensor [px]"],
        ["roi position y", fint, "Image y coordinate on sensor [px]"],
        ["roi size x", fint, "Image width [px]"],
        ["roi size y", fint, "Image height [px]"],
    ],
    # All parameters for online contour extraction from the event images
    "online_contour": [
        # The option "bg empty" was introduced in dclab 0.34.0 and
        # Shape-In 2.2.2.5.
        # Shape-In  writes to the "shapein-warning" log if there are
        # frames with event images (non-empty frames) that had to be
        # used for background correction.
        ["bg empty", fbool, "Background correction from empty frames only"],
        ["bin area min", fint, "Minium pixel area of binary image event"],
        ["bin kernel", fint, "Odd ellipse kernel size, binary image morphing"],
        ["bin threshold", fint, "Binary threshold for avg-bg-corrected image"],
        ["image blur", fint, "Odd sigma for Gaussian blur (21x21 kernel)"],
        ["no absdiff", fbool, "Avoid OpenCV 'absdiff' for avg-bg-correction"],
    ],
    # All online filters
    "online_filter": [
        ["area_ratio max", float, "Maximum porosity"],
        ["area_ratio min", float, "Minimum porosity"],
        ["area_ratio soft limit", fbool, "Soft limit, porosity"],
        ["area_um max", float, "Maximum area [µm²]"],
        ["area_um min", float, "Minimum area [µm²]"],
        ["area_um soft limit", fbool, "Soft limit, area [µm²]"],
        ["aspect max", float, "Maximum aspect ratio of bounding box"],
        ["aspect min", float, "Minimum aspect ratio of bounding box"],
        ["aspect soft limit", fbool, "Soft limit, aspect ratio of bbox"],
        ["deform max", float, "Maximum deformation"],
        ["deform min", float, "Minimum deformation"],
        ["deform soft limit", fbool, "Soft limit, deformation"],
        ["fl1_max max", float, "Maximum FL-1 maximum [a.u.]"],
        ["fl1_max min", float, "Minimum FL-1 maximum [a.u.]"],
        ["fl1_max soft limit", fbool, "Soft limit, FL-1 maximum"],
        ["fl2_max max", float, "Maximum FL-2 maximum [a.u.]"],
        ["fl2_max min", float, "Minimum FL-2 maximum [a.u.]"],
        ["fl2_max soft limit", fbool, "Soft limit, FL-2 maximum"],
        ["fl3_max max", float, "Maximum FL-3 maximum [a.u.]"],
        ["fl3_max min", float, "Minimum FL-3 maximum [a.u.]"],
        ["fl3_max soft limit", fbool, "Soft limit, FL-3 maximum"],
        ["size_x max", fint, "Maximum bounding box size x [µm]"],
        ["size_x min", fint, "Minimum bounding box size x [µm]"],
        ["size_x soft limit", fbool, "Soft limit, bounding box size x"],
        ["size_y max", fint, "Maximum bounding box size y [µm]"],
        ["size_y min", fint, "Minimum bounding box size y [µm]"],
        ["size_y soft limit", fbool, "Soft limit, bounding box size y"],
        # "target*" is only set if measurement is stopped automatically.
        # "target*" is not necessarily reached (e.g. user aborted).
        ["target duration", float, "Target measurement duration [min]"],
        ["target event count", fint, "Target event count for online gating"],
    ],
    # All setup-related keywords, except imaging
    "setup": [
        ["channel width", float, "Width of microfluidic channel [µm]"],
        ["chip identifier", lcstr, "Unique identifier of the chip used"],
        ["chip region", lcstr, "Imaged chip region (channel or reservoir)"],
        ["flow rate", float, "Flow rate in channel [µL/s]"],
        ["flow rate sample", float, "Sample flow rate [µL/s]"],
        ["flow rate sheath", float, "Sheath flow rate [µL/s]"],
        ["identifier", str, "Unique setup identifier"],
        # "medium" is one of CellCarrier, CellCarrierB, water, or other
        ["medium", str, "Medium used"],
        ["module composition", str, "Comma-separated list of modules used"],
        ["software version", str, "Acquisition software with version"],
        ["temperature", float, "Mean chip temperature [°C]"],
    ],
}

#: List of scalar (one scalar value per event) features. This
#: list does not include the `ml_score_???` features. If you
#: need find out whether a feature name is valid, please use
#: :func:`is_valid_feature`.
FEATURES_SCALAR = [
    ["area_cvx", "Convex area [px]"],
    # area_msd is the contour moment M00
    ["area_msd", "Measured area [px]"],
    ["area_ratio", "Porosity (convex to measured area ratio)"],
    # area_um is the convex area per definition
    ["area_um", "Area [µm²]"],
    ["aspect", "Aspect ratio of bounding box"],
    ["bright_avg", "Brightness average within contour [a.u.]"],
    ["bright_sd", "Brightness SD within contour [a.u.]"],
    ["circ", "Circularity"],
    # deform is computed from the convex contour
    ["deform", "Deformation"],
    ["emodulus", "Young's Modulus [kPa]"],
    # fl*_area, fl*_pos, and fl*_width values correspond to the
    # object for which the contour was found. For high concentrations,
    # these values could be error-prone due to the assignment from
    # false objects.
    ["fl1_area", "FL-1 area of peak [a.u.]"],
    # fl1_dist is set to zero if there is only one peak
    ["fl1_dist", "FL-1 distance between two first peaks [µs]"],
    ["fl1_max", "FL-1 maximum [a.u.]"],
    ["fl1_max_ctc", "FL-1 maximum, crosstalk-corrected [a.u.]"],
    ["fl1_npeaks", "FL-1 number of peaks"],
    ["fl1_pos", "FL-1 position of peak [µs]"],
    ["fl1_width", "FL-1 width [µs]"],
    ["fl2_area", "FL-2 area of peak [a.u.]"],
    ["fl2_dist", "FL-2 distance between two first peaks [µs]"],
    ["fl2_max", "FL-2 maximum [a.u.]"],
    ["fl2_max_ctc", "FL-2 maximum, crosstalk-corrected [a.u.]"],
    ["fl2_npeaks", "FL-2 number of peaks"],
    ["fl2_pos", "FL-2 position of peak [µs]"],
    ["fl2_width", "FL-2 width [µs]"],
    ["fl3_area", "FL-3 area of peak [a.u.]"],
    ["fl3_dist", "FL-3 distance between two first peaks [µs]"],
    ["fl3_max", "FL-3 maximum [a.u.]"],
    ["fl3_max_ctc", "FL-3 maximum, crosstalk-corrected [a.u.]"],
    ["fl3_npeaks", "FL-3 number of peaks"],
    ["fl3_pos", "FL-3 position of peak [µs]"],
    ["fl3_width", "FL-3 width [µs]"],
    ["frame", "Video frame number"],
    ["g_force", "Gravitational force in multiples of g"],
    # index starts with 1
    ["index", "Event index (Dataset)"],
    # index_online may have missing values (#71)
    ["index_online", "Event index (Online)"],
    # The inertia ratios of the event contours are defined by the
    # central second order moments of area (sqrt(m20/m02).
    ["inert_ratio_cvx", "Inertia ratio of convex contour"],
    ["inert_ratio_prnc", "Principal inertia ratio of raw contour"],
    ["inert_ratio_raw", "Inertia ratio of raw contour"],
    # This is an ancillary integer feature for visualizing the class
    # membership of individual events based on the `ml_score_???`
    # features.
    ["ml_class", "Most probable ML class"],
    ["nevents", "Total number of events in the same image"],
    ["pc1", "Principal component 1"],
    ["pc2", "Principal component 2"],
    # pos_x and pos_y are computed from the contour moments
    # "m10"/"m00" and "m01"/"m00" of the convex hull of "contour"
    ["pos_x", "Position along channel axis [µm]"],
    ["pos_y", "Position lateral in channel [µm]"],
    ["size_x", "Bounding box size x [µm]"],
    ["size_y", "Bounding box size y [µm]"],
    ["temp", "Chip temperature [°C]"],
    ["temp_amb", "Ambient temperature [°C]"],
    ["tilt", "Absolute tilt of raw contour"],
    ["time", "Event time [s]"],
    # Volume is computed from the raw contour (i.e. with exclusions).
    # Fun fact: If we had decided to compute it from the convex contour,
    # then we would have close to none pixelation effects ¯\_(ツ)_/¯.
    ["volume", "Volume [µm³]"],
]
# Add userdef features
for _i in range(10):
    FEATURES_SCALAR.append(["userdef{}".format(_i),
                            "User defined {}".format(_i)
                            ])

#: list of non-scalar features
FEATURES_NON_SCALAR = [
    # This is a (M, 2)-shaped array with integer contour coordinates
    ["contour", "Binary event contour image"],
    ["image", "Gray scale event image"],
    ["image_bg", "Gray scale event background image"],
    # This is the contour with holes filled
    ["mask", "Binary region labeling the event in the image"],
    # See FLUOR_TRACES for valid keys
    ["trace", "Dictionary of fluorescence traces"],
]

#: List of fluorescence traces
FLUOR_TRACES = [
    "fl1_median",
    "fl1_raw",
    "fl2_median",
    "fl2_raw",
    "fl3_median",
    "fl3_raw",
]


# CFG convenience lists and dicts
_cfg = copy.deepcopy(CFG_METADATA)
_cfg.update(CFG_ANALYSIS)
#: dict with section as keys and config parameter names as values
config_keys = {}
for _key in _cfg:
    config_keys[_key] = [it[0] for it in _cfg[_key]]
#: dict of dicts containing functions to convert input data
config_funcs = {}
for _key in _cfg:
    config_funcs[_key] = {}
    for _subkey, _type, __ in _cfg[_key]:
        config_funcs[_key][_subkey] = _type
#: dict of dicts containing the type of section parameters
config_types = {}
for _key in _cfg:
    config_types[_key] = {}
    for _subkey, _type, __ in _cfg[_key]:
        if _type in func_types:
            _type = func_types[_type]
        config_types[_key][_subkey] = _type
#: dict with metadata description
config_descr = {}
for _key in _cfg:
    config_descr[_key] = {}
    for _subkey, __, _descr in _cfg[_key]:
        config_descr[_key][_subkey] = _descr


# FEATURE convenience lists and dicts
#: list of feature names
feature_names = [_cc[0] for _cc in FEATURES_SCALAR + FEATURES_NON_SCALAR]
#: list of feature labels (same order as :const:`feature_names`
feature_labels = [_cc[1] for _cc in FEATURES_SCALAR + FEATURES_NON_SCALAR]
#: dict for converting feature names to labels
feature_name2label = {}
for _cc in FEATURES_SCALAR + FEATURES_NON_SCALAR:
    feature_name2label[_cc[0]] = _cc[1]

#: list of scalar feature names
scalar_feature_names = [_cc[0] for _cc in FEATURES_SCALAR]


def _add_feature_to_definitions(name, label=None, is_scalar=True):
    """Protected function to populate definitions with feature details.

    Used by temporary features and plugin features to add new feature
    names and labels to `dclab.definitions`.

    Parameters
    ----------
    name: str
        name of a feature
    label: str, optional
        feature label corresponding to the feature name. If set to None, then
        a label is constructed for the feature name.
    is_scalar: bool
        Specify whether the feature of an event is a scalar (True)
        or not (False)

    Raises
    ------
    ValueError
        If the feature already exists.
    """
    allowed_chars = "abcdefghijklmnopqrstuvwxyz_1234567890"
    feat = "".join([f for f in name if f in allowed_chars])
    if feat != name:
        raise ValueError("`feature` must only contain lower-case characters, "
                         f"digits, and underscores; got '{name}'!")
    if label is None:
        label = f"User-defined feature {name}"
    if feature_exists(name):
        raise ValueError(f"Feature '{name}' already exists!")

    # Populate the new feature in all dictionaries and lists
    # (we don't need global here)
    feature_names.append(name)
    feature_labels.append(label)
    feature_name2label[name] = label
    if is_scalar:
        scalar_feature_names.append(name)


def _remove_feature_from_definitions(name):
    """Protected function to remove feature details from definitions.

    Used by temporary features and plugin features to
    remove the feature names and labels from `dclab.definitions`.

    Parameters
    ----------
    name: str
        name of a feature

    Warnings
    --------
    This function should only be used internally, i.e., You should not use
    this function. This function can break things.
    """
    label = get_feature_label(name)
    feature_names.remove(name)
    feature_labels.remove(label)
    feature_name2label.pop(name)
    if name in scalar_feature_names:
        scalar_feature_names.remove(name)


def check_feature_shape(name, data):
    """Check if (non)-scalar feature matches with its data's dimensionality

    Parameters
    ----------
    name: str
        name of the feature
    data: array-like
        data whose dimensionality will be checked

    Raises
    ------
    ValueError
        If the data's shape does not match its scalar description

    Notes
    -----
    Bug: Some contour data in test files have incorrect dimensions.
    Therefore, an exclusive case has been added. This is to be fixed in
    future versions and is not a permanent fix.
    See https://github.com/ZELLMECHANIK-DRESDEN/dclab/issues/117
    for more information.
    """
    if name == "contour":
        # TODO: contour data are difficult to handle, because
        # - they don't have a well-defined shape
        #   (see https://github.com/ZELLMECHANIK-DRESDEN/dclab/issues/117)
        # - they may be lists of lists or a lazy-list implementation
        # - just converting them to an array is not possible: Numpy
        #   issued a deprecation warning for lists of lists that have
        #   different lengths
        pass
    else:
        data = np.array(data)
        if len(data.shape) == 1 and not scalar_feature_exists(name):
            raise ValueError(f"Feature '{name}' is not a scalar feature, but "
                             "a 1D array was given for `data`!")
        elif len(data.shape) != 1 and scalar_feature_exists(name):
            raise ValueError(f"Feature '{name}' is a scalar feature, but the "
                             "`data` array is not 1D!")


def config_key_exists(section, key):
    """Return `True` if the configuration key exists"""
    valid = False
    if section == "user":
        valid = True
    elif section in config_funcs and key in config_funcs[section]:
        valid = True
    elif section == "online_filter":
        if key.endswith("soft limit"):
            # "online_filter:area_um,deform soft limit"
            valid = True
        elif key.endswith("polygon points"):
            valid = True
    return valid


[docs]def feature_exists(name, scalar_only=False):
    """Return True if `name` is a valid feature name

    This function not only checks whether `name` is in
    :const:`feature_names`, but also validates against
    the machine learning scores `ml_score_???` (where
    `?` can be a digit or a lower-case letter in the
    English alphabet).

    Parameters
    ----------
    name: str
        name of a feature
    scalar_only : bool
        Specify whether the check should only search in scalar features

    Returns
    -------
    valid: bool
        True if name is a valid feature, False otherwise.

    See Also
    --------
    scalar_feature_exists: Wraps `feature_exists` with `scalar_only=True`
    """
    valid = False
    if name in scalar_feature_names:
        # scalar feature
        valid = True
    elif not scalar_only and name in feature_names:
        # non-scalar feature
        valid = True
    else:
        # check whether we have an `ml_score_???` feature
        valid_chars = "0123456789abcdefghijklmnopqrstuvwxyz"
        if (name.startswith("ml_score_")
            and len(name) == len("ml_score_???")
            and name[-3] in valid_chars
            and name[-2] in valid_chars
                and name[-1] in valid_chars):
            valid = True
    return valid


def get_config_value_descr(section, key):
    """Return the description of a config value

    Returns `key` if not defined anywhere
    """
    descr = key
    if section == "user":
        pass
    elif section in config_descr and key in config_descr[section]:
        descr = config_descr[section][key]
    elif section == "online_filter":
        if key.endswith("soft limit") or key.endswith("polygon points"):
            # "online_filter:area_um,deform soft limit"
            f1, f2 = key.split(" ", 1)[0].split(",")
            # remove the units with rsplit
            l1 = get_feature_label(f1).rsplit(" [", 1)[0]
            l2 = get_feature_label(f2).rsplit(" [", 1)[0]
            if key.endswith("soft limit"):
                descr = f"Soft limit, polygon ({l1}, {l2})"
            elif key.endswith("polygon points"):
                descr = f"Polygon ({l1}, {l2})"
    return descr


def get_config_value_func(section, key):
    """Return configuration type converter function"""
    func = None
    if section == "user":
        pass
    elif section in config_funcs and key in config_funcs[section]:
        func = config_funcs[section][key]
    elif section == "online_filter":
        if key.endswith("soft limit"):
            # "online_filter:area_um,deform soft limit"
            func = fbool
        elif key.endswith("polygon points"):
            func = f2dfloatarray

    if func is None:
        return lambda x: x
    else:
        return func


def get_config_value_type(section, key):
    """Return the expected type of a config value

    Returns `None` if no type is defined
    """
    typ = None
    if section == "user":
        pass
    elif section in config_types and key in config_types[section]:
        typ = config_types[section][key]
    elif section == "online_filter":
        if key.endswith("soft limit"):
            # "online_filter:area_um,deform soft limit"
            typ = func_types[fbool]
        elif key.endswith("polygon points"):
            typ = func_types[f2dfloatarray]
    return typ


[docs]def get_feature_label(name, rtdc_ds=None):
    """Return the label corresponding to a feature name

    This function not only checks :const:`feature_name2label`,
    but also supports registered `ml_score_???` features.

    Parameters
    ----------
    name: str
        name of a feature

    Returns
    -------
    label: str
        feature label corresponding to the feature name

    Notes
    -----
    TODO: extract feature label from ancillary information when an rtdc_ds is
    given.
    """
    assert feature_exists(name)
    if name in feature_name2label:
        label = feature_name2label[name]
    else:
        # First check whether an ancillary feature with that
        # name exists.
        for af in AncillaryFeature.features:
            if af.feature_name == name:
                labelid = af.data.outputs.index(name)
                label = af.data.output_labels[labelid]
                break
        else:
            # If that did not work, use a generic name.
            label = "ML score {}".format(name[-3:].upper())
    return label


[docs]def scalar_feature_exists(name):
    """Convenience method wrapping `feature_exists(..., scalar_only=True)`"""
    return feature_exists(name, scalar_only=True)