# Source code for dclab.rtdc_dataset.fmt_hdf5.base

"""RT-DC hdf5 format"""
from __future__ import annotations

import io
import json
import pathlib
from typing import Any, BinaryIO, Dict
import warnings

import h5py

from ...external.packaging import parse as parse_version
from ...util import hashobj, hashfile

from ..config import Configuration
from ..core import RTDCBase

from . import events
from . import logs
from . import tables

#: rtdc files exported with dclab prior to this version are not supported
#: (when the "software version" stored in a file starts with "dclab " and
#: is older than this version, opening the file raises
#: `OldFormatNotSupportedError`)
MIN_DCLAB_EXPORT_VERSION = "0.3.3.dev2"


class OldFormatNotSupportedError(BaseException):
    """Raised for .rtdc files exported with an unsupported dclab version"""
    # NOTE(review): this derives from `BaseException`, so a plain
    # `except Exception` does not catch it - confirm this is intended.


class UnknownKeyWarning(UserWarning):
    """Warning category for unknown keys"""
    # NOTE(review): no usage is visible in this part of the module;
    # presumably issued elsewhere - confirm.


class RTDC_HDF5(RTDCBase):
    def __init__(self,
                 h5path: str | pathlib.Path | BinaryIO | io.IOBase,
                 h5kwargs: Dict[str, Any] | None = None,
                 *args,
                 **kwargs):
        """HDF5 file format for RT-DC measurements

        Parameters
        ----------
        h5path: str or pathlib.Path or file-like object
            Path to an '.rtdc' measurement file or a file-like object
        h5kwargs: dict
            Additional keyword arguments given to :class:`h5py.File`.
            The dictionary is copied; the defaults ``rdcc_nbytes`` and
            ``rdcc_w0`` are only set on the copy (see `h5kwargs`
            attribute), never on the object passed by the caller.
        *args:
            Arguments for `RTDCBase`
        **kwargs:
            Keyword arguments for `RTDCBase`

        Attributes
        ----------
        path: pathlib.Path
            Path to the experimental HDF5 (.rtdc) file (or the
            file-like object if one was passed as `h5path`)
        h5kwargs: dict
            Keyword arguments that were used for opening `h5file`
        h5file: h5py.File
            The underlying HDF5 file object

        Raises
        ------
        OldFormatNotSupportedError
            If the file was exported with a dclab version older than
            `MIN_DCLAB_EXPORT_VERSION`; the HDF5 file is closed before
            the exception is raised.
        """
        super().__init__(*args, **kwargs)

        # Any subclass from RTDC_HDF5 is probably a remote-type and should
        # not be able to access local basins. If you do not agree, please
        # enable this in the definition of the subclass.
        self._local_basins_allowed = self.format == "hdf5"

        # Strings are converted to paths; file-like objects are used as-is.
        if isinstance(h5path, (str, pathlib.Path)):
            h5path = pathlib.Path(h5path)

        self._hash = None
        self.path = h5path

        # Work on a copy so that the dictionary passed by the caller is
        # not modified by the defaults set below.
        h5kwargs = {} if h5kwargs is None else dict(h5kwargs)
        # Increase the read cache (which defaults to 1MiB), since
        # normally we have around 2.5MiB image chunks.
        h5kwargs.setdefault("rdcc_nbytes", 10 * 1024 ** 2)
        # Chunk preemption policy of the h5py chunk cache
        h5kwargs.setdefault("rdcc_w0", 0)

        self.h5kwargs = h5kwargs
        self.h5file = h5py.File(h5path, **h5kwargs)

        self._events = events.H5Events(self.h5file)

        # Parse configuration
        self.config = RTDC_HDF5.parse_config(self.h5file)

        # Override logs property with HDF5 data
        self.logs = logs.H5Logs(self.h5file)

        # Override the tables property with HDF5 data
        self.tables = tables.H5Tables(self.h5file)

        # check version
        rtdc_soft = self.config["setup"].get("software version", "unknown")
        if rtdc_soft.startswith("dclab "):
            rtdc_ver = parse_version(rtdc_soft.split(" ")[1])
            if rtdc_ver < parse_version(MIN_DCLAB_EXPORT_VERSION):
                # The instance is unusable; do not leave the file open.
                self.h5file.close()
                msg = "The file {} was created ".format(self.path) \
                      + "with dclab {} which is ".format(rtdc_ver) \
                      + "not supported anymore! Please rerun " \
                      + "dclab-tdms2rtdc / export the data again."
                raise OldFormatNotSupportedError(msg)

        self.title = "{} - M{}".format(
            self.config["experiment"].get("sample", "undefined sample"),
            self.config["experiment"].get("run index", "0"))

    def close(self):
        """Close the underlying HDF5 file"""
        super().close()
        self.h5file.close()

    @property
    def _h5(self):
        """Deprecated alias for the `h5file` attribute"""
        warnings.warn("Access to the underlying HDF5 file is now public. "
                      "Please use the `h5file` attribute instead of `_h5`!",
                      DeprecationWarning,
                      # attribute the warning to the code accessing `_h5`
                      stacklevel=2)
        return self.h5file

    @staticmethod
    def can_open(h5path):
        """Check whether a given file is in the .rtdc file format

        Parameters
        ----------
        h5path: str or pathlib.Path
            Path to the file to check

        Returns
        -------
        canopen: bool
            True if the file has the ".rtdc" suffix or if it can be
            opened as an HDF5 file that contains an "events" group;
            False otherwise (including files that cannot be read).
        """
        h5path = pathlib.Path(h5path)
        if h5path.suffix == ".rtdc":
            return True
        # we don't know the extension; check for the "events" group
        try:
            # The file is passed to h5py as an open file object (this
            # was introduced as a workaround for Python2 where h5py
            # cannot handle unicode file names). Both the file object
            # and the HDF5 handle are closed when leaving the context.
            with h5path.open("rb") as fd, h5py.File(fd, "r") as h5:
                return "events" in h5
        except OSError:
            # not an HDF5 file
            return False

    @staticmethod
    def parse_config(h5path):
        """Parse the RT-DC configuration of an HDF5 file

        `h5path` may be a h5py.File object or an actual path

        The configuration is read from the root attributes of the
        HDF5 file. Each attribute name must have the form
        "section:parameter"; it is stored as
        ``config[section][parameter]``.

        Returns
        -------
        config: Configuration
            The configuration populated from the file attributes
        """
        if isinstance(h5path, h5py.File):
            h5attrs = dict(h5path.attrs)
        else:
            with h5py.File(h5path, mode="r") as fh5:
                h5attrs = dict(fh5.attrs)

        config = Configuration()
        for key, value in h5attrs.items():
            # Convert byte strings to unicode strings
            # https://github.com/h5py/h5py/issues/379
            if isinstance(value, bytes):
                value = value.decode("utf-8")
            section, pname = key.split(":")
            config[section][pname] = value
        return config

    @property
    def hash(self):
        """Hash value based on file name and content

        The value is computed on first access and then cached.
        """
        if self._hash is None:
            tohash = [self.path.name,
                      # Hash a maximum of ~1MB of the hdf5 file
                      hashfile(self.path, blocksize=65536, count=20)]
            self._hash = hashobj(tohash)
        return self._hash

    def basins_get_dicts(self):
        """Return list of dicts for all basins defined in `self.h5file`

        Every dataset in the "basins" group holds the lines of a JSON
        document; the lines are joined with spaces and parsed with
        :func:`json.loads`. An empty list is returned if the file has
        no "basins" group.
        """
        basins = []
        # Do not sort anything here, sorting is done in `RTDCBase`.
        for bk in self.h5file.get("basins", []):
            bdat = list(self.h5file["basins"][bk])
            if isinstance(bdat[0], bytes):
                bdat = [bi.decode("utf-8") for bi in bdat]
            bdict = json.loads(" ".join(bdat))
            basins.append(bdict)
        return basins