"""RT-DC hdf5 format"""
from __future__ import annotations
import io
import json
import pathlib
import time
from typing import Any, BinaryIO, Dict
import warnings
import h5py
from ...external.packaging import parse as parse_version
from ...util import hashobj, hashfile
from ..config import Configuration
from ..core import RTDCBase
from . import events
from . import logs
from . import tables
#: rtdc files exported with dclab prior to this version are not supported
#: (compared against the "software version" entry of the "setup"
#: configuration section when an `RTDC_HDF5` instance is created)
MIN_DCLAB_EXPORT_VERSION = "0.3.3.dev2"
class OldFormatNotSupportedError(BaseException):
    """Raised for files written by an unsupported, too old dclab version

    The threshold is :const:`MIN_DCLAB_EXPORT_VERSION`.
    """
    # NOTE(review): this derives from `BaseException`, so handlers written
    # as `except Exception:` do not catch it - confirm that this is intended.
class UnknownKeyWarning(UserWarning):
    """Warning category for unknown keys

    NOTE(review): not emitted by the code visible in this module;
    presumably used by callers that encounter unrecognized keys - confirm.
    """
[docs]
class RTDC_HDF5(RTDCBase):
def __init__(self,
             h5path: str | pathlib.Path | BinaryIO | io.IOBase,
             h5kwargs: Dict[str, Any] | None = None,
             *args,
             **kwargs):
    """HDF5 file format for RT-DC measurements

    Parameters
    ----------
    h5path: str or pathlib.Path or file-like object
        Path to an '.rtdc' measurement file or a file-like object
    h5kwargs: dict
        Additional keyword arguments given to :class:`h5py.File`.
        The dictionary is copied; the caller's object is never
        modified. The keyword arguments that were actually used are
        available as `self.h5kwargs`.
    *args:
        Arguments for `RTDCBase`
    **kwargs:
        Keyword arguments for `RTDCBase`

    Raises
    ------
    OSError
        If the file cannot be opened, neither without locking, nor
        with "best-effort" locking, nor (where applicable) in SWMR
        mode.
    OldFormatNotSupportedError
        If the file was written by a dclab version older than
        `MIN_DCLAB_EXPORT_VERSION`.
    """
    super(RTDC_HDF5, self).__init__(*args, **kwargs)

    # Any subclass from RTDC_HDF5 is probably a remote-type and should
    # not be able to access local basins. If you do not agree, please
    # enable this in the definition of the subclass.
    self._local_basins_allowed = self.format == "hdf5"

    # File-like objects are passed on to h5py unchanged.
    if isinstance(h5path, (str, pathlib.Path)):
        h5path = pathlib.Path(h5path)

    self._hash = None

    #: Path to the measurement HDF5 (.rtdc) file
    self.path = h5path

    # Work on a copy: the fallback logic below overwrites "locking",
    # "swmr" and "libver", which must not leak into a dictionary that
    # the caller might reuse for opening other files.
    h5kwargs = dict(h5kwargs) if h5kwargs else {}
    # Increase the read cache (which defaults to 1MiB), since
    # normally we have around 2.5MiB image chunks.
    h5kwargs.setdefault("rdcc_nbytes", 10 * 1024 ** 2)
    h5kwargs.setdefault("rdcc_w0", 0)
    h5kwargs.setdefault("locking", False)
    self.h5kwargs = h5kwargs

    opened_in_swmr_mode = False
    try:
        self.h5file = h5py.File(h5path, **h5kwargs)
    except OSError:
        try:
            h5kwargs["locking"] = "best-effort"
            self.h5file = h5py.File(h5path, **h5kwargs)
        except OSError as e:
            # This could mean that the dataset is currently being written
            # to OR the recording software crashed. Opening the file in
            # read mode is only possible by either clearing the file
            # consistency flags, or reading it in SWMR mode (which is
            # what we try here).
            msg = " ".join(str(arg) for arg in e.args)
            if (
                # 'Unable to synchronously open file (file locking flag
                # values don't match)'
                "file locking flag values don't match" in msg
                # 'Unable to synchronously open file (file is already open
                # for write (may use <h5clear file> to clear file
                # consistency flags))'
                or "h5clear" in msg
            ):
                h5kwargs["locking"] = False
                h5kwargs["swmr"] = True
                h5kwargs["libver"] = "latest"
                self.h5file = h5py.File(h5path, **h5kwargs)
                opened_in_swmr_mode = True
            else:
                raise

    # From here on the file is open. Make sure that the handle is not
    # leaked if any of the remaining initialization steps fails.
    try:
        if opened_in_swmr_mode:
            # The writer is flushing datasets at specific intervals.
            # In case we hit just such a flushing point, then our
            # features might have non-matching lengths.
            for _ in range(5):
                lengths = set()
                for feat in self.h5file["events"].keys():
                    ds = self.h5file["events"][feat]
                    ds.refresh()
                    lengths.add(ds.shape[0])
                if len(lengths) == 1:
                    # We are good.
                    break
                time.sleep(0.1)
            else:
                warnings.warn(
                    f"Feature sizes in {self.path} are not identical.")

        self._events = events.H5Events(self.h5file)

        # Parse configuration
        self.config = RTDC_HDF5.parse_config(self.h5file)

        # Override logs property with HDF5 data
        self.logs = logs.H5Logs(self.h5file)

        # Override the tables property with HDF5 data
        self.tables = tables.H5Tables(self.h5file)

        # check version
        rtdc_soft = self.config["setup"].get("software version",
                                             "unknown")
        if rtdc_soft.startswith("dclab "):
            rtdc_ver = parse_version(rtdc_soft.split(" ")[1])
            if rtdc_ver < parse_version(MIN_DCLAB_EXPORT_VERSION):
                raise OldFormatNotSupportedError(
                    f"The file {self.path} was created "
                    f"with dclab {rtdc_ver} which is "
                    "not supported anymore! Please rerun "
                    "dclab-tdms2rtdc / export the data again.")

        self.title = "{} - M{}".format(
            self.config["experiment"].get("sample", "undefined sample"),
            self.config["experiment"].get("run index", "0"))
    except BaseException:
        self.h5file.close()
        raise
[docs]
def close(self):
    """Close this dataset and the HDF5 file it reads from

    The parent class' `close` is called first; afterwards the
    `h5py.File` handle stored in `self.h5file` is closed.
    """
    super().close()
    h5 = self.h5file
    h5.close()
@property
def _h5(self):
warnings.warn("Access to the underlying HDF5 file is now public. "
"Please use the `h5file` attribute instead of `_h5`!",
DeprecationWarning)
return self.h5file
[docs]
@staticmethod
def can_open(h5path):
"""Check whether a given file is in the .rtdc file format"""
h5path = pathlib.Path(h5path)
if h5path.suffix == ".rtdc":
return True
else:
# we don't know the extension; check for the "events" group
canopen = False
try:
# This is a workaround for Python2 where h5py cannot handle
# unicode file names.
with h5path.open("rb") as fd:
h5 = h5py.File(fd, "r", locking=False)
if "events" in h5:
canopen = True
except IOError:
# not an HDF5 file
pass
return canopen
[docs]
@staticmethod
def parse_config(h5path):
    """Parse the RT-DC configuration of an HDF5 file

    Parameters
    ----------
    h5path: h5py.File or path
        An open `h5py.File` (its attributes are read directly) or
        anything else that `h5py.File` accepts; in the latter case
        the file is opened read-only, without locking, and closed
        again after the attributes have been read.

    Returns
    -------
    config: Configuration
        Configuration populated from the root attributes, whose
        names are expected to have the form "section:parameter".
    """
    if isinstance(h5path, h5py.File):
        raw_attrs = dict(h5path.attrs)
    else:
        with h5py.File(h5path, mode="r", locking=False) as fh5:
            raw_attrs = dict(fh5.attrs)

    # Convert byte strings to unicode strings
    # https://github.com/h5py/h5py/issues/379
    text_attrs = {
        name: value.decode("utf-8") if isinstance(value, bytes) else value
        for name, value in raw_attrs.items()
    }

    config = Configuration()
    for name, value in text_attrs.items():
        section, pname = name.split(":")
        config[section][pname] = value
    return config
@property
def hash(self):
    """Hash value based on file name and content

    The value is computed on first access and cached in `self._hash`.
    """
    if self._hash is not None:
        return self._hash

    location = self.path
    parts = []
    if isinstance(location, pathlib.Path):
        # actual path on file system
        parts.append(location.name)
    elif isinstance(location, str):
        # remote location (when `hash` not defined in subclass)
        parts.append(location)
    # Hash a maximum of ~1MB of the hdf5 file
    # NOTE(review): presumably `hashfile` reads at most
    # `count` blocks of `blocksize` bytes - confirm.
    content_hash = hashfile(location, blocksize=65536, count=20)
    parts.append(content_hash)

    self._hash = hashobj(parts)
    return self._hash
[docs]
def basins_get_dicts(self):
"""Return list of dicts for all basins defined in `self.h5file`"""
return self.basin_get_dicts_from_h5file(self.h5file)
[docs]
@staticmethod
def basin_get_dicts_from_h5file(h5file):
"""Return list of dicts for all basins defined in `h5file`"""
basins = []
# Do not sort anything here, sorting is done in `RTDCBase`.
for bk in h5file.get("basins", []):
bdat = list(h5file["basins"][bk])
if isinstance(bdat[0], bytes):
bdat = [bi.decode("utf") for bi in bdat]
bdict = json.loads(" ".join(bdat))
bdict["key"] = bk
basins.append(bdict)
return basins