"""Machine-learning ancillary features

Module: dclab.rtdc_dataset.feat_anc_ml.ml_feature

.. versionadded:: 0.38.0
"""
import hashlib

from ..feat_anc_core import AncillaryFeature
from ...util import obj2bytes

from . import modc

class MachineLearningFeature(AncillaryFeature):
    def __init__(self, feature_name, dc_model, modc_path=None):
        """A user-defined machine-learning feature

        Parameters
        ----------
        feature_name: str
            name of the ML feature score (starts with `ml_score_`)
        dc_model: dclab.rtdc_dataset.feat_anc_ml.ml_model.BaseModel
            ML model to register
        modc_path: str or Path
            path to the original .modc file (if applicable)

        Raises
        ------
        ValueError
            If `feature_name` does not start with `ml_score_` or does
            not have the same length as `ml_score_123`, or if another
            `MachineLearningFeature` with the same feature name is
            already present in `AncillaryFeature.features`.

        Notes
        -----
        `MachineLearningFeature` inherits from :class:`AncillaryFeature
        <dclab.rtdc_dataset.feat_anc_core.ancillary_feature.AncillaryFeature>`.
        """
        if (not feature_name.startswith("ml_score_")
                or len(feature_name) != len("ml_score_123")):
            # The literals carry their own separating spaces so that the
            # message does not run words together.
            raise ValueError(
                "Feature name for MachineLearning Feature must be "
                "in the form 'ml_score_xyz', "
                f"got '{feature_name}'!")
        # Make sure this MachineLearningFeature has not already been
        # registered (for normal features this is ok, but here we want
        # to avoid any possible ambiguity).
        for af in AncillaryFeature.features:
            if (isinstance(af, MachineLearningFeature)
                    and af.feature_name == feature_name):
                raise ValueError(
                    "Cannot register two MachineLearningFeatures "
                    f"for the same feature '{feature_name}'!")

        self.feature_name = feature_name
        self.modc_path = modc_path
        # NOTE(review): the argument of this call was truncated in the
        # reviewed source. `dc_model.info` is assumed to be the model's
        # metadata dictionary -- confirm against `BaseModel`.
        self.ml_feature_info = self._process_ml_info(dc_model.info)

        # Instantiate the super class
        super().__init__(
            feature_name=feature_name,
            method=dc_model.predict,
            req_features=dc_model.inputs,
            data=dc_model,
            identifier=self.ml_feature_info["identifier"],
        )

    def _process_ml_info(self, original_info):
        """Return dictionary with all relevant info for this instance

        Parameters
        ----------
        original_info: dict
            Model metadata. The keys "output features" (which must
            contain `self.feature_name`) and "input features" are
            required; "output labels", "sha256", "description",
            "long description", and "date" are optional.

        Returns
        -------
        feature_info: dict
            Dictionary with the keys "date", "description",
            "long description", "feature name", "feature label",
            "features required", "scalar feature", "modc path", and
            "identifier".

        Raises
        ------
        ValueError
            If `self.feature_name` is not listed in
            ``original_info["output features"]``.
        """
        fidx = original_info["output features"].index(self.feature_name)
        # determine feature label (fall back to a generic one if the
        # model does not provide a non-empty label for this output)
        if ("output labels" in original_info
                and original_info["output labels"][fidx]):
            label = original_info["output labels"][fidx]
        else:
            label = f"ML feature {self.feature_name}"

        # The identifier is an MD5 hex digest over the model metadata,
        # the feature name, and the input features. The update order
        # must not change, otherwise the identifier changes.
        md5hasher = hashlib.md5()
        for key in ("sha256", "description", "long description", "date"):
            md5hasher.update(original_info.get(key, "").encode())
        md5hasher.update(self.feature_name.encode("utf-8"))
        for feat in original_info["input features"]:
            md5hasher.update(feat.encode("utf-8"))
        identifier = md5hasher.hexdigest()

        feature_info = {
            "date": original_info.get("date", "no date"),
            "description": original_info.get(
                "description", "No description provided"),
            "long description": original_info.get(
                "long description", "No long description provided."),
            "feature name": self.feature_name,
            "feature label": label,
            "features required": original_info["input features"],
            "scalar feature": True,
            "modc path": self.modc_path,
            "identifier": identifier,
        }
        return feature_info

    def hash(self, rtdc_ds):
        """Used for identifying an ancillary ML computation

        The required features, the used configuration keys/values, and
        the return value of the requirement function are hashed. In
        addition, the ML model's info dictionary is hashed.

        Parameters
        ----------
        rtdc_ds
            Dataset passed on unchanged to the `hash` method of the
            super class.

        Returns
        -------
        hash: str
            MD5 hex digest combining the super class hash with the
            ML feature info.
        """
        super_hash = super().hash(rtdc_ds)
        hasher = hashlib.md5()
        hasher.update(obj2bytes(super_hash))
        # NOTE(review): the argument of this call was truncated in the
        # reviewed source. `self.ml_feature_info` is assumed, following
        # the docstring ("the ML model's info dictionary is hashed")
        # -- confirm.
        hasher.update(obj2bytes(self.ml_feature_info))
        return hasher.hexdigest()
def load_ml_feature(modc_path):
    """Find and load MachineLearningFeature(s) from a .modc file

    Every model returned by `modc.load_modc` is visited in order and
    one `MachineLearningFeature` is instantiated for each entry in
    the model's `outputs`.

    Parameters
    ----------
    modc_path: str or Path
        pathname to a .modc file

    Returns
    -------
    mlf_list: list of MachineLearningFeature
        list of MachineLearningFeature instances loaded from `modc_path`

    See Also
    --------
    MachineLearningFeature: class handling the ML feature information
    """
    loaded = []
    for model in modc.load_modc(modc_path):
        # one feature instance per output of this model, in output order
        loaded.extend(
            MachineLearningFeature(out_name, model, modc_path)
            for out_name in model.outputs
        )
    return loaded
def remove_all_ml_features():
    """Convenience function for removing all `MachineLearningFeature` instances

    All entries of `MachineLearningFeature.features` that are
    `MachineLearningFeature` instances are passed to
    `remove_ml_feature`, last entry first. Entries of other types
    are left untouched.

    See Also
    --------
    remove_ml_feature: remove a single `MachineLearningFeature` instance
    """
    # Take a snapshot first: `remove_ml_feature` mutates
    # `MachineLearningFeature.features`, and a list should not be
    # modified while it is being iterated over.
    ml_instances = [
        feat for feat in MachineLearningFeature.features
        if isinstance(feat, MachineLearningFeature)
    ]
    for ml_instance in reversed(ml_instances):
        remove_ml_feature(ml_instance)
def remove_ml_feature(ml_instance):
    """Convenience function for removing a `MachineLearningFeature` instance

    The feature name is removed from
    `MachineLearningFeature.feature_names` and the instance itself
    from `MachineLearningFeature.features`.

    Parameters
    ----------
    ml_instance: MachineLearningFeature
        The `MachineLearningFeature` instance to be removed from dclab

    Raises
    ------
    TypeError
        If the `ml_instance` is not a `MachineLearningFeature` instance
    """
    # Reject anything that is not an ML feature before touching the lists.
    if not isinstance(ml_instance, MachineLearningFeature):
        raise TypeError(f"Type {type(ml_instance)} should be an instance "
                        f"of MachineLearningFeature; got '{ml_instance}'!")
    registry = MachineLearningFeature
    registry.feature_names.remove(ml_instance.feature_name)
    registry.features.remove(ml_instance)