Source code for dclab.rtdc_dataset.feat_anc_core.ancillary_feature

"""Computation of ancillary features

Ancillary features are computed on-the-fly in dclab if the
required data are available. The features are registered here
and are computed when `RTDCBase.__getitem__` is called with
the respective feature name. When `RTDCBase.__contains__` is
called with the feature name, then the feature is not yet
computed, but the prerequisites are evaluated:

.. ipython::
    :okwarning:

    In [1]: import dclab

    In [2]: ds = dclab.new_dataset("data/example.rtdc")

    In [3]: ds.config["calculation"]["emodulus lut"] = "LE-2D-FEM-19"

    In [4]: ds.config["calculation"]["emodulus medium"] = "CellCarrier"

    In [5]: ds.config["calculation"]["emodulus temperature"] = 23.0

    In [6]: ds.config["calculation"]["emodulus viscosity model"] = \
'buyukurganci-2022'

    In [7]: "emodulus" in ds  # nothing is computed yet

    In [8]: ds["emodulus"]  # now data are computed and cached

Once the data has been computed, `RTDCBase` caches it in
the `_ancillaries` property dict together with a hash
that is computed with `AncillaryFeature.hash`. The hash
is computed from the feature data `req_features`, the
configuration metadata `req_config`, and (if it is not a
boolean) the return value of `req_func`.
"""

import hashlib
import warnings

import numpy as np

from ...util import obj2bytes
from ... import definitions as dfn



[docs]
class BadFeatureSizeWarning(UserWarning):
    """Issued when computed feature data are resized to the event count"""




[docs]
class AncillaryFeature:
    """A feature that is computed on demand from data already in a dataset

    Every instance appends itself to the class-level `features` list
    and its name to `feature_names` when it is created, so these two
    lists act as a registry shared by all instances.
    """
    #: All ancillary features registered
    features = []
    #: All feature names registered
    feature_names = []

    def __init__(self, feature_name, method, req_config=None,
                 req_features=None, req_func=lambda x: True, priority=0,
                 data=None, identifier=None):
        """A data feature that is computed from existing data

        Parameters
        ----------
        feature_name: str
            The name of the ancillary feature, e.g. "emodulus".
        method: callable
            The method that computes the feature. This method
            takes an instance of `RTDCBase` as argument.
        req_config: list
            Required configuration parameters to compute the feature,
            e.g. ["calculation", ["emodulus lut", "emodulus viscosity"]]
        req_features: list
            Required existing features in the dataset,
            e.g. ["area_cvx", "deform"]
        req_func: callable
            A function that takes an instance of `RTDCBase` as an
            argument and checks whether any other necessary criteria
            are met. By default, this is a lambda function that returns
            True. The function should return False if the necessary
            criteria are not met. This function may also return a
            hashable object (via :func:`dclab.util.objstr`) instead of
            True, if the criteria are subject to change. In this case,
            the return value is used for identifying the cached
            ancillary feature.

            .. versionchanged:: 0.27.0
                Support non-boolean return values for caching purposes.

        priority: int
            The priority of the feature; if there are multiple
            AncillaryFeature defined for the same feature_name,
            then the priority of the features defines which feature
            returns True in `self.is_available`. A higher value
            means a higher priority.
        data: object or BaseModel
            Any other data relevant for the feature (e.g. the ML
            model for computing 'ml_score_xxx' features)
        identifier: None or str
            A unique identifier (e.g. MD5 hash) of the ancillary
            feature. For PluginFeatures or ML features, this should
            be computed at least from the input file and the feature
            name.

        Notes
        -----
        `req_config` and `req_features` are used to test whether the
        feature can be computed in `self.is_available`.
        """
        if req_features is None:
            req_features = []
        if req_config is None:
            req_config = []
        self.feature_name = feature_name
        self.method = method
        self.req_config = req_config
        self.req_features = req_features
        self.req_func = req_func
        self.priority = priority
        self.data = data
        self.identifier = identifier

        # register this feature
        AncillaryFeature.features.append(self)
        AncillaryFeature.feature_names.append(feature_name)

    def __repr__(self):
        repre = " ".join([
            f"<{self.__class__.__name__}",
            f"'{self.feature_name}'",
            f"(id {self.identifier[:5]}...)" if self.identifier else "(no ID)",
            f"with priority {self.priority}",
            f"at {hex(id(self))}>",
        ])
        return repre


[docs]
    @staticmethod
    def available_features(rtdc_ds):
        """Determine available features for an RT-DC dataset

        Parameters
        ----------
        rtdc_ds: instance of RTDCBase
            The dataset to check availability for

        Returns
        -------
        features: dict
            Dictionary with feature names as keys and instances
            of `AncillaryFeature` as values.
        """
        cols = {}
        for inst in AncillaryFeature.features:
            if inst.is_available(rtdc_ds):
                cols[inst.feature_name] = inst
        return cols



[docs]
    @staticmethod
    def get_instances(feature_name):
        """Return all instances that compute `feature_name`"""
        feats = []
        for ft in AncillaryFeature.features:
            if ft.feature_name == feature_name:
                feats.append(ft)
        return feats



[docs]
    @staticmethod
    def check_data_size(rtdc_ds, data_dict):
        """Check the feature data is the correct size. If it isn't, resize it.

        Parameters
        ----------
        rtdc_ds: instance of RTDCBase
            The dataset from which the features are computed
        data_dict: dict
            Dictionary with `AncillaryFeature.feature_name` as keys and the
            computed data features (to be resized) as values.

        Returns
        -------
        data_dict: dict
            Dictionary with `feature_name` as keys and the correctly resized
            data features as values.
        """
        for key in data_dict:
            dsize = len(rtdc_ds) - len(data_dict[key])
            if dsize > 0:
                msg = "Growing feature {} in {} by {} to match event number!"
                warnings.warn(msg.format(key, rtdc_ds, abs(dsize)),
                              BadFeatureSizeWarning)
                data_dict[key] = np.array(data_dict[key], dtype=float)
                data_dict[key].resize(len(rtdc_ds), refcheck=False)
                data_dict[key][-dsize:] = np.nan
            elif dsize < 0:
                msg = "Shrinking feature {} in {} by {} to match event number!"
                warnings.warn(msg.format(key, rtdc_ds, abs(dsize)),
                              BadFeatureSizeWarning)
                data_dict[key].resize(len(rtdc_ds), refcheck=False)
            if isinstance(data_dict[key], np.ndarray):
                data_dict[key].setflags(write=False)
            elif isinstance(data_dict[key], list):
                for item in data_dict[key]:
                    if isinstance(item, np.ndarray):
                        item.setflags(write=False)
        return data_dict



[docs]
    def compute(self, rtdc_ds):
        """Compute the feature with self.method. All ancillary features that
        share the same method will also be populated automatically.

        Parameters
        ----------
        rtdc_ds: instance of RTDCBase
            The dataset to compute the feature for

        Returns
        -------
        data_dict: dict
            Dictionary with `AncillaryFeature.feature_name` as keys and the
            computed data features (read-only) as values.
        """
        data_dict = self.method(rtdc_ds)
        if not isinstance(data_dict, dict):
            data_dict = {self.feature_name: data_dict}
        data_dict = AncillaryFeature.check_data_size(rtdc_ds, data_dict)
        if self.feature_name not in data_dict:
            raise KeyError(
                f"I expected the feature '{self.feature_name}' to be a key "
                + f"in the dictionary returned by {self}. But I found only "
                + f"the following: {sorted(data_dict.keys())}")
        for key in data_dict:
            dfn.check_feature_shape(key, data_dict[key])
        return data_dict



[docs]
    def hash(self, rtdc_ds):
        """Used for identifying an ancillary computation

        The required features, the used configuration keys/values, and
        the return value of the requirement function are hashed.
        """
        hasher = hashlib.md5()
        # data columns
        for col in self.req_features:
            hasher.update(obj2bytes(rtdc_ds[col]))
        # config keys
        for sec, keys in self.req_config:
            for key in keys:
                val = rtdc_ds.config[sec][key]
                data = "{}:{}={}".format(sec, key, val)
                hasher.update(obj2bytes(data))
        # custom requirement function hash
        reqret = self.req_func(rtdc_ds)
        if not isinstance(reqret, bool):
            # add to hash if not a boolean
            hasher.update(obj2bytes(reqret))
        return hasher.hexdigest()



[docs]
    def is_available(self, rtdc_ds, verbose=False):
        """Check whether the feature is available

        Parameters
        ----------
        rtdc_ds: instance of RTDCBase
            The dataset to check availability for

        Returns
        -------
        available: bool
            `True`, if feature can be computed with `compute`

        Notes
        -----
        This method returns `False` for a feature if there
        is a feature defined with the same name but with
        higher priority (even if the feature would be
        available otherwise).
        """
        # Check config keys
        for item in self.req_config:
            section, keys = item
            if section not in rtdc_ds.config:
                if verbose:
                    print("{} not in config".format(section))
                return False
            else:
                for key in keys:
                    if key not in rtdc_ds.config[section]:
                        if verbose:
                            print("{} not in config['{}']".format(key,
                                                                  section))
                        return False
        # Check features
        for col in self.req_features:
            if col not in rtdc_ds:
                return False
        # Check priorities of other features
        for of in AncillaryFeature.features:
            if of == self:
                # nothing to compare
                continue
            elif of.feature_name == self.feature_name:
                # same feature name
                if of.priority <= self.priority:
                    # lower priority, ignore
                    continue
                else:
                    # higher priority
                    if of.is_available(rtdc_ds):
                        # higher priority is available, thus
                        # this feature is not available
                        return False
                    else:
                        # higher priority not available
                        continue
            else:
                # other feature
                continue
        # Check user-defined function
        if not self.req_func(rtdc_ds):
            return False
        return True