"""Computation of ancillary features
Ancillary features are computed on-the-fly in dclab if the
required data are available. The features are registered here
and are computed when `RTDCBase.__getitem__` is called with
the respective feature name. When `RTDCBase.__contains__` is
called with the feature name, then the feature is not yet
computed, but the prerequisites are evaluated:
.. ipython::
:okwarning:
In [1]: import dclab
In [2]: ds = dclab.new_dataset("data/example.rtdc")
In [3]: ds.config["calculation"]["emodulus medium"] = "CellCarrier"
In [4]: ds.config["calculation"]["emodulus model"] = "elastic sphere"
In [5]: ds.config["calculation"]["emodulus temperature"] = 23.0
In [6]: "emodulus" in ds # nothing is computed
In [7]: ds["emodulus"] # now data is computed and cached
Once the data has been computed, `RTDCBase` caches it in
the `_ancillaries` property dict together with a hash
that is computed with `AncillaryFeature.hash`. The hash
is computed from the feature data `req_features`, the
configuration metadata `req_config`, and (if it is not a
boolean) the return value of `req_func`.
"""
import hashlib
import warnings
import numpy as np
from ...util import obj2str
class BadFeatureSizeWarning(UserWarning):
    """Issued when a computed feature does not match the event count

    `AncillaryFeature.compute` emits this warning whenever the data
    returned by the feature's method has to be grown or shrunk to
    match the length of the dataset.
    """
class AncillaryFeature:
    """Recipe for computing one feature from data that already exists

    Every instance registers itself in the class-level lists
    :attr:`features` and :attr:`feature_names` upon creation.
    """
    #: All ancillary features registered
    features = []
    #: All feature names registered
    feature_names = []

    def __init__(self, feature_name, method, req_config=None,
                 req_features=None, req_func=lambda x: True, priority=0,
                 data=None):
        """A data feature that is computed from existing data

        Parameters
        ----------
        feature_name: str
            The name of the ancillary feature, e.g. "emodulus".
        method: callable
            The method that computes the feature. This method
            takes an instance of `RTDCBase` as argument.
        req_config: list
            Required configuration parameters to compute the feature.
            This is a list of `[section, [key1, key2, ...]]` pairs,
            e.g. [["calculation", ["emodulus model",
            "emodulus viscosity"]]]. Defaults to an empty list.
        req_features: list
            Required existing features in the dataset,
            e.g. ["area_cvx", "deform"]. Defaults to an empty list.
        req_func: callable
            A function that takes an instance of `RTDCBase` as an
            argument and checks whether any other necessary criteria
            are met. By default, this is a lambda function that returns
            True. The function should return False if the necessary
            criteria are not met. This function may also return a
            hashable object (via :func:`dclab.util.obj2str`) instead of
            True, if the criteria are subject to change. In this case,
            the return value is used for identifying the cached
            ancillary feature.

            .. versionchanged:: 0.27.0
                Support non-boolean return values for caching purposes.
        priority: int
            The priority of the feature; if there are multiple
            AncillaryFeature defined for the same feature_name,
            then the priority of the features defines which feature
            returns True in `self.is_available`. A higher value
            means a higher priority.
        data: object
            Any other data relevant for the feature (e.g. the ML
            model for computing 'ml_score_xxx' features)

        Notes
        -----
        `req_config` and `req_features` are used to test whether the
        feature can be computed in `self.is_available`.
        """
        self.feature_name = feature_name
        self.method = method
        # A literal `[]` default would be one list object shared by
        # every instance; create a fresh list per instance instead.
        self.req_config = [] if req_config is None else req_config
        self.req_features = [] if req_features is None else req_features
        self.req_func = req_func
        self.priority = priority
        self.data = data

        # register this feature
        AncillaryFeature.features.append(self)
        AncillaryFeature.feature_names.append(feature_name)

    def __repr__(self):
        return "<{} '{}' (priority {}) at {}>".format(
            self.__class__.__name__,
            self.feature_name,
            self.priority,
            hex(id(self)))

    @staticmethod
    def available_features(rtdc_ds):
        """Determine available features for an RT-DC dataset

        Parameters
        ----------
        rtdc_ds: instance of RTDCBase
            The dataset to check availability for

        Returns
        -------
        features: dict
            Dictionary with feature names as keys and instances
            of `AncillaryFeature` as values.
        """
        # If several available instances share a feature name (equal
        # priority), the one registered last ends up in the dictionary.
        return {inst.feature_name: inst
                for inst in AncillaryFeature.features
                if inst.is_available(rtdc_ds)}

    @staticmethod
    def get_instances(feature_name):
        """Return all instances that compute `feature_name`"""
        return [ft for ft in AncillaryFeature.features
                if ft.feature_name == feature_name]

    @staticmethod
    def _match_event_count(data, size):
        """Return `data` cut or NaN-padded along its first axis to `size`

        A new object is returned; `data` itself is never resized
        in-place, so arrays that are views or that are referenced
        elsewhere are handled safely and trailing dimensions of
        N-dimensional arrays are preserved.
        """
        if isinstance(data, list):
            missing = size - len(data)
            if missing < 0:
                return data[:size]
            return data + [np.nan] * missing

        arr = data if isinstance(data, np.ndarray) else np.asarray(data)
        if len(arr) >= size:
            return arr[:size].copy()
        if arr.dtype.kind in "biu":
            # boolean and integer arrays cannot hold NaN
            arr = arr.astype(float)
        pad_shape = (size - len(arr),) + arr.shape[1:]
        padding = np.full(pad_shape, np.nan, dtype=arr.dtype)
        return np.concatenate((arr, padding), axis=0)

    def compute(self, rtdc_ds):
        """Compute the feature with self.method

        If the length of the computed data does not match the number
        of events in `rtdc_ds`, a `BadFeatureSizeWarning` is issued
        and the data are truncated or padded with NaN along the
        first axis.

        Parameters
        ----------
        rtdc_ds: instance of RTDCBase
            The dataset to compute the feature for

        Returns
        -------
        feature: array- or list-like
            The computed data feature (read-only).
        """
        data = self.method(rtdc_ds)
        n_events = len(rtdc_ds)
        dsize = n_events - len(data)
        if dsize != 0:
            action = "Growing" if dsize > 0 else "Shrinking"
            msg = "{} feature {} in {} by {} to match event number!"
            warnings.warn(
                msg.format(action, self.feature_name, rtdc_ds, abs(dsize)),
                BadFeatureSizeWarning)
            data = self._match_event_count(data, n_events)

        # write-protect the result (for lists: each array item)
        if isinstance(data, np.ndarray):
            data.setflags(write=False)
        elif isinstance(data, list):
            for item in data:
                if isinstance(item, np.ndarray):
                    item.setflags(write=False)
        return data

    def hash(self, rtdc_ds):
        """Used for identifying an ancillary computation

        The data columns and the used configuration keys/values
        are hashed. A non-boolean return value of `self.req_func`
        is included in the hash as well.

        Parameters
        ----------
        rtdc_ds: instance of RTDCBase
            The dataset to compute the hash for

        Returns
        -------
        hash: str
            Hexadecimal MD5 digest
        """
        hasher = hashlib.md5()
        # data columns
        for col in self.req_features:
            hasher.update(obj2str(rtdc_ds[col]))
        # config keys
        for sec, keys in self.req_config:
            for key in keys:
                val = rtdc_ds.config[sec][key]
                data = "{}:{}={}".format(sec, key, val)
                hasher.update(obj2str(data))
        # custom requirement function hash
        reqret = self.req_func(rtdc_ds)
        if not isinstance(reqret, bool):
            # add to hash if not a boolean
            hasher.update(obj2str(reqret))
        return hasher.hexdigest()

    def is_available(self, rtdc_ds, verbose=False):
        """Check whether the feature is available

        Parameters
        ----------
        rtdc_ds: instance of RTDCBase
            The dataset to check availability for
        verbose: bool
            If True, print which configuration section or key
            is missing.

        Returns
        -------
        available: bool
            `True`, if feature can be computed with `compute`

        Notes
        -----
        This method returns `False` for a feature if there
        is a feature defined with the same name but with
        higher priority (even if the feature would be
        available otherwise).
        """
        # Check config keys
        for section, keys in self.req_config:
            if section not in rtdc_ds.config:
                if verbose:
                    print("{} not in config".format(section))
                return False
            for key in keys:
                if key not in rtdc_ds.config[section]:
                    if verbose:
                        print("{} not in config['{}']".format(key,
                                                              section))
                    return False

        # Check features
        for col in self.req_features:
            if col not in rtdc_ds:
                return False

        # Check priorities of other features: an available feature
        # with the same name and a strictly higher priority wins.
        for other in AncillaryFeature.features:
            if (other is not self
                    and other.feature_name == self.feature_name
                    and other.priority > self.priority
                    and other.is_available(rtdc_ds)):
                return False

        # Check user-defined function (evaluated last, as it may rely
        # on the requirements verified above)
        if not self.req_func(rtdc_ds):
            return False
        return True