Source code for dclab.rtdc_dataset.fmt_hdf5.base

"""RT-DC hdf5 format"""
from __future__ import annotations

import functools
import json
import pathlib
from typing import Any, BinaryIO, Dict
import warnings

import h5py

from ...external.packaging import parse as parse_version
from ...util import hashobj, hashfile

from ..config import Configuration
from ..core import RTDCBase
from .. import feat_basin

from . import events
from . import logs
from . import tables


#: rtdc files exported with dclab prior to this version are not supported
MIN_DCLAB_EXPORT_VERSION = "0.3.3.dev2"


class OldFormatNotSupportedError(BaseException):
    pass


class UnknownKeyWarning(UserWarning):
    pass


[docs]class RTDC_HDF5(RTDCBase): def __init__(self, h5path: str | pathlib.Path | BinaryIO, h5kwargs: Dict[str, Any] = None, *args, **kwargs): """HDF5 file format for RT-DC measurements Parameters ---------- h5path: str or pathlib.Path or file-like object Path to an '.rtdc' measurement file or a file-like object h5kwargs: dict Additional keyword arguments given to :class:`h5py.File` *args: Arguments for `RTDCBase` **kwargs: Keyword arguments for `RTDCBase` Attributes ---------- path: pathlib.Path Path to the experimental HDF5 (.rtdc) file """ super(RTDC_HDF5, self).__init__(*args, **kwargs) if isinstance(h5path, (str, pathlib.Path)): h5path = pathlib.Path(h5path) else: h5path = h5path self._hash = None self.path = h5path # Increase the read cache (which defaults to 1MiB), since # normally we have around 2.5MiB image chunks. if h5kwargs is None: h5kwargs = {} h5kwargs.setdefault("rdcc_nbytes", 10 * 1024 ** 2) h5kwargs.setdefault("rdcc_w0", 0) self.h5kwargs = h5kwargs self.h5file = h5py.File(h5path, **h5kwargs) self._events = events.H5Events(self.h5file) # Parse configuration self.config = RTDC_HDF5.parse_config(h5path) # Override logs property with HDF5 data self.logs = logs.H5Logs(self.h5file) # Override the tables property with HDF5 data self.tables = tables.H5Tables(self.h5file) # check version rtdc_soft = self.config["setup"]["software version"] if rtdc_soft.startswith("dclab "): rtdc_ver = parse_version(rtdc_soft.split(" ")[1]) if rtdc_ver < parse_version(MIN_DCLAB_EXPORT_VERSION): msg = "The file {} was created ".format(self.path) \ + "with dclab {} which is ".format(rtdc_ver) \ + "not supported anymore! Please rerun " \ + "dclab-tdms2rtdc / export the data again." raise OldFormatNotSupportedError(msg) self.title = "{} - M{}".format(self.config["experiment"]["sample"], self.config["experiment"]["run index"]) # Finalize initialization self._finalize_init() def __enter__(self): return self def __exit__(self, type, value, tb): # close the HDF5 file self.h5file.close() @functools.lru_cache() def __len__(self): ec = self.h5file.get("experiment:event count") if ec is not None: return ec else: return super(RTDC_HDF5, self).__len__() @property def _h5(self): warnings.warn("Access to the underlying HDF5 file is now public. " "Please use the `h5file` attribute instead of `_h5`!", DeprecationWarning) return self.h5file
[docs] @staticmethod def can_open(h5path): """Check whether a given file is in the .rtdc file format""" h5path = pathlib.Path(h5path) if h5path.suffix == ".rtdc": return True else: # we don't know the extension; check for the "events" group canopen = False try: # This is a workaround for Python2 where h5py cannot handle # unicode file names. with h5path.open("rb") as fd: h5 = h5py.File(fd, "r") if "events" in h5: canopen = True except IOError: # not an HDF5 file pass return canopen
[docs] @staticmethod def parse_config(h5path): """Parse the RT-DC configuration of an HDF5 file""" with h5py.File(h5path, mode="r") as fh5: h5attrs = dict(fh5.attrs) # Convert byte strings to unicode strings # https://github.com/h5py/h5py/issues/379 for key in h5attrs: if isinstance(h5attrs[key], bytes): h5attrs[key] = h5attrs[key].decode("utf-8") config = Configuration() for key in h5attrs: section, pname = key.split(":") config[section][pname] = h5attrs[key] return config
@property def hash(self): """Hash value based on file name and content""" if self._hash is None: tohash = [self.path.name, # Hash a maximum of ~1MB of the hdf5 file hashfile(self.path, blocksize=65536, count=20)] self._hash = hashobj(tohash) return self._hash
[docs] def basins_get_dicts(self): """Return list of dicts for all basins defined in `self.h5file`""" basins = [] for bk in sorted(self.h5file.get("basins", [])): # `sorted` priority bdat = list(self.h5file["basins"][bk]) if isinstance(bdat[0], bytes): bdat = [bi.decode("utf") for bi in bdat] bdict = json.loads(" ".join(bdat)) basins.append(bdict) return basins
[docs]class HDF5Basin(feat_basin.Basin): basin_format = "hdf5" basin_type = "file"
[docs] def load_dataset(self, location, **kwargs): return RTDC_HDF5(location, enable_basins=False, **kwargs)
[docs] def is_available(self): return pathlib.Path(self.location).exists()