# Source code for dclab.ml.models

import abc
import collections
import uuid

import numpy as np

from .mllibs import tensorflow as tf
from ..rtdc_dataset.ancillaries import af_ml_score


class BaseModel(abc.ABC):
    """Abstract base class for machine-learning models

    A model bundles an implementation-specific ``bare_model`` with the
    names of its input and output features. Instances can be used as
    context managers: the model is registered on entry (see
    :func:`BaseModel.register`) and unregistered on exit (see
    :func:`BaseModel.unregister`).
    """

    def __init__(self, bare_model, inputs, outputs, model_name=None,
                 output_labels=None):
        """
        Parameters
        ----------
        bare_model:
            Underlying ML model
        inputs: list of str
            List of model input features, e.g.
            ``["deform", "area_um"]``
        outputs: list of str
            List of output features the model provides in that order, e.g.
            ``["ml_score_rbc", "ml_score_rt1", "ml_score_tfe"]``
        model_name: str or None
            The name of the model; if None (or empty), the first five
            characters of a random UUID are used instead
        output_labels: list of str or None
            List of more descriptive labels for the features, e.g.
            ``["red blood cell", "type 1 cell", "troll cell"]``;
            if None (or empty), `outputs` is used instead
        """
        self.bare_model = bare_model
        self.inputs = inputs
        self.outputs = outputs
        self.name = model_name or str(uuid.uuid4())[:5]
        self.output_labels = output_labels or outputs

    def __enter__(self):
        # Entering the context registers the model ...
        self.register()
        return self

    def __exit__(self, *args):
        # ... and leaving it (with or without an exception) removes it again.
        self.unregister()

    @staticmethod
    @abc.abstractmethod
    def supported_formats():
        """List of dictionaries containing model formats

        Returns
        -------
        fmts: list
            Each item contains the keys "name" (format name),
            "suffix" (saved file suffix), "requirements" (Python
            dependencies).
        """

    @staticmethod
    @abc.abstractmethod
    def load_bare_model(path):
        """Load an implementation-specific model from a file

        This is a static method: it cannot set any instance attribute.
        Implementations return the loaded bare model, which can then
        be passed as `bare_model` to the class constructor.

        Parameters
        ----------
        path: str or path-like
            Path to load the model from

        Returns
        -------
        bare_model: object
            The implementation-specific bare model
        """

    @staticmethod
    @abc.abstractmethod
    def save_bare_model(path, bare_model, save_format=None):
        """Save an implementation-specific model to a file

        Parameters
        ----------
        path: str or path-like
            Path to store model to
        bare_model: object
            The implementation-specific bare model
        save_format: str
            Must be in `supported_formats`
        """

    @abc.abstractmethod
    def predict(self, ds):
        """Return the probabilities of `self.outputs` for `ds`

        Parameters
        ----------
        ds: dclab.rtdc_dataset.RTDCBase
            Dataset to apply the model to

        Returns
        -------
        ofdict: dict
            Output feature dictionary with features as keys
            and 1d ndarrays as values.

        Notes
        -----
        Implementations can call :func:`BaseModel.get_dataset_features`
        to obtain the input feature matrix.
        """

    def get_dataset_features(self, ds, dtype=np.float32):
        """Return the dataset features used for inference

        Parameters
        ----------
        ds: dclab.rtdc_dataset.RTDCBase
            Dataset from which to retrieve the feature data
        dtype: dtype
            All features are cast to this dtype

        Returns
        -------
        fdata: 2d ndarray
            2D array of shape (len(ds), len(self.inputs));
            i.e. to access the array containing the first feature,
            for all events, you would do `fdata[:, 0]`.
        """
        # Pre-allocate the matrix so that every feature is cast to
        # `dtype` upon assignment; columns follow the order of
        # `self.inputs`.
        fdata = np.zeros((len(ds), len(self.inputs)), dtype=dtype)
        for ii, feat in enumerate(self.inputs):
            fdata[:, ii] = ds[feat]
        return fdata

    def register(self):
        """Register this model to the dclab ancillary features"""
        af_ml_score.register(self)

    def unregister(self):
        """Unregister from dclab ancillary features"""
        af_ml_score.unregister(self)


class TensorflowModel(BaseModel):
    """Handle tensorflow models"""

    @staticmethod
    def supported_formats():
        """List of dictionaries containing the supported model formats

        Returns
        -------
        fmts: list
            A single item describing the tensorflow SavedModel format
            with the keys "name", "suffix", and "requirements".
        """
        return [{"name": "tensorflow-SavedModel",
                 "suffix": ".tf",
                 "requirements": "tensorflow"}
                ]

    @staticmethod
    def load_bare_model(path):
        """Load a tensorflow model

        Parameters
        ----------
        path: str or path-like
            Location of the saved model

        Returns
        -------
        bare_model:
            The object returned by ``tf.keras.models.load_model``
        """
        # We don't use tf.saved_model.load, because it does not
        # return a keras layer.
        bare_model = tf.keras.models.load_model(str(path))
        return bare_model

    @staticmethod
    def save_bare_model(path, bare_model, save_format="tensorflow-SavedModel"):
        """Save a tensorflow model

        Parameters
        ----------
        path: str or path-like
            Export directory passed to ``tf.saved_model.save``
        bare_model: object
            The tensorflow model to save
        save_format: str
            Only "tensorflow-SavedModel" is supported
        """
        assert save_format == "tensorflow-SavedModel", \
            "Unsupported save_format: {!r}".format(save_format)
        tf.saved_model.save(obj=bare_model, export_dir=str(path))

    def predict(self, ds, batch_size=32):
        """Return the probabilities of `self.outputs` for `ds`

        Parameters
        ----------
        ds: dclab.rtdc_dataset.RTDCBase
            Dataset to apply the model to
        batch_size: int
            Batch size for inference with tensorflow

        Returns
        -------
        ofdict: dict
            Output feature dictionary with features as keys
            and 1d ndarrays as values.

        Notes
        -----
        Before prediction, this method makes sure that the outputs of
        the model are converted to probabilities. If the final layer
        is one-dimensional and does not have a sigmoid activation,
        then a sigmoid activation layer is added (binary
        classification) ``tf.keras.layers.Activation("sigmoid")``.
        If the final layer has more dimensions and is not a
        ``tf.keras.layers.Softmax()`` layer, then a softmax layer
        is added.
        """
        # Wrap the bare model so that a probability layer can be
        # appended without modifying `self.bare_model`.
        probability_model = tf.keras.Sequential([self.bare_model])
        if self.bare_model.output_shape[1] > 1:
            # Multiple outputs; check for softmax
            if not self.has_softmax_layer():
                probability_model.add(tf.keras.layers.Softmax())
        else:
            # Binary classification; check for sigmoid
            if not self.has_sigmoid_activation():
                probability_model.add(tf.keras.layers.Activation("sigmoid"))

        fdata = self.get_dataset_features(ds)
        tfdata = tf.data.Dataset.from_tensor_slices(fdata).batch(batch_size)
        ret = probability_model.predict(tfdata)
        # Column `ii` of the prediction belongs to output feature `ii`.
        ofdict = collections.OrderedDict()
        for ii, key in enumerate(self.outputs):
            ofdict[key] = ret[:, ii]
        return ofdict

    @staticmethod
    def _final_layer_config(layer_config):
        """Return the configuration of the innermost final layer

        As long as the configuration contains a "layers" list, descend
        into its last item. An empty dictionary is returned if an
        empty "layers" list is encountered.
        """
        while "layers" in layer_config:
            layers = layer_config["layers"]
            if not layers:
                return {}
            layer_config = layers[-1]
        return layer_config

    def has_sigmoid_activation(self, layer_config=None):
        """Return True if final layer has "sigmoid" activation function

        Parameters
        ----------
        layer_config: dict or None
            Model or layer configuration to inspect; if None, the
            configuration of `self.bare_model` is used.
        """
        if layer_config is None:
            layer_config = self.bare_model.get_config()
        final = self._final_layer_config(layer_config)
        # A missing "config" entry must fall back to a dictionary
        # (not a string), otherwise the chained `.get` fails.
        config = final.get("config") or {}
        return config.get("activation", "") == "sigmoid"

    def has_softmax_layer(self, layer_config=None):
        """Return True if final layer is a Softmax layer

        Parameters
        ----------
        layer_config: dict or None
            Model or layer configuration to inspect; if None, the
            configuration of `self.bare_model` is used.
        """
        if layer_config is None:
            layer_config = self.bare_model.get_config()
        final = self._final_layer_config(layer_config)
        # NOTE(review): only the layer class name is compared here; a
        # final layer that uses ``activation="softmax"`` is not
        # detected - confirm whether that case should be covered.
        return final.get("class_name") == "Softmax"