Source code for dclab.rtdc_dataset.copier

"""Helper methods for copying .rtdc data"""
from __future__ import annotations

from typing import Literal

import h5py
import h5py.h5o
import hdf5plugin
import numpy as np

from ..definitions import feature_exists, scalar_feature_exists

from .fmt_hdf5 import DEFECTIVE_FEATURES


def rtdc_copy(src_h5file: h5py.Group,
              dst_h5file: h5py.Group,
              features: Literal['all', 'scalar', 'none'] = "all",
              include_logs: bool = True,
              include_tables: bool = True,
              meta_prefix: str = ""):
    """Create a compressed copy of an RT-DC file"""
    # metadata
    for akey in src_h5file.attrs:
        dst_h5file.attrs[akey] = src_h5file.attrs[akey]
    # logs
    if include_logs and "logs" in src_h5file:
        dst_h5file.require_group("logs")
        for lkey in src_h5file["logs"]:
            h5ds_copy(src_loc=src_h5file["logs"],
                      src_name=lkey,
                      dst_loc=dst_h5file["logs"],
                      dst_name=meta_prefix + lkey,
                      recursive=False)
    # tables
    if include_tables and "tables" in src_h5file:
        dst_h5file.require_group("tables")
        for tkey in src_h5file["tables"]:
            # There appears to be a problem with h5copy in some rare
            # situations, so we do not use h5copy, but read and write
            # the table data directly.
            # https://github.com/HDFGroup/hdf5/issues/3214
            # The following caused a Segmentation fault:
            #  h5ds_copy(src_loc=src_h5file["tables"],
            #            src_name=tkey,
            #            dst_loc=dst_h5file["tables"],
            #            dst_name=meta_prefix + tkey,
            #            recursive=False)
            dst_h5file["tables"].create_dataset(
                name=tkey,
                data=src_h5file["tables"][tkey][:],
                fletcher32=True,
                **hdf5plugin.Zstd(clevel=5))
    # events
    if features != "none":
        scalar_only = features == "scalar"
        dst_h5file.require_group("events")
        for feat in src_h5file["events"]:
            if feature_exists(feat, scalar_only=scalar_only):
                # Skip all defective features. These are features that
                # are known to be invalid (e.g. ancillary features that
                # were computed falsely) and must be recomputed by dclab.
                if feat in DEFECTIVE_FEATURES:
                    defective = DEFECTIVE_FEATURES[feat](src_h5file)
                    if defective:
                        continue
                dst = h5ds_copy(src_loc=src_h5file["events"],
                                src_name=feat,
                                dst_loc=dst_h5file["events"],
                                recursive=True)
                if scalar_feature_exists(feat):
                    # complement min/max values for all scalar features
                    for ufunc, attr in [(np.nanmin, "min"),
                                        (np.nanmax, "max"),
                                        (np.nanmean, "mean"),
                                        ]:
                        if attr not in dst.attrs:
                            dst.attrs[attr] = ufunc(dst)
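A minimal usage sketch for rtdc_copy, assuming two hypothetical file paths ("input.rtdc" and "compressed_copy.rtdc"); it opens the source file read-only and writes a compressed copy that contains only the scalar features:

# Usage sketch; file names are hypothetical placeholders.
import h5py

from dclab.rtdc_dataset.copier import rtdc_copy

with h5py.File("input.rtdc", "r") as src, \
        h5py.File("compressed_copy.rtdc", "w") as dst:
    # Copy metadata, logs, and tables, but only the scalar features.
    rtdc_copy(src_h5file=src,
              dst_h5file=dst,
              features="scalar")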
def h5ds_copy(src_loc, src_name, dst_loc, dst_name=None,
              ensure_compression=True, recursive=True):
    """Copy an HDF5 Dataset from one group to another

    Parameters
    ----------
    src_loc: h5py.H5Group
        The source location
    src_name: str
        Name of the dataset in `src_loc`
    dst_loc: h5py.H5Group
        The destination location
    dst_name: str
        The name of the destination dataset, defaults to `src_name`
    ensure_compression: bool
        Whether to make sure that the data are compressed.
        If disabled, then all data from the source are copied
        as-is and not compressed.
    recursive: bool
        Whether to recurse into HDF5 Groups (this is required e.g.
        for copying the "trace" feature)

    Returns
    -------
    dst: h5py.Dataset
        The dataset `dst_loc[dst_name]`

    Raises
    ------
    ValueError:
        If the named source is not an h5py.Dataset
    """
    compression_kwargs = hdf5plugin.Zstd(clevel=5)
    dst_name = dst_name or src_name
    src = src_loc[src_name]
    if isinstance(src, h5py.Dataset):
        if ensure_compression and not is_properly_compressed(src):
            # Chunk size larger than dataset size is not allowed
            # in h5py's `make_new_dset`.
            if src.shape[0] == 0:
                # Ignore empty datasets (this sometimes happens with logs).
                return
            elif src.chunks and src.chunks[0] > src.shape[0]:
                # The chunks in the input file are larger than the dataset
                # shape. So we set the chunks to the shape. Here, we only
                # check for the first axis (event count for feature data),
                # because if the chunks vary in any other dimension then
                # there is something fundamentally wrong with the input
                # dataset (which we don't want to endorse, and where there
                # could potentially be a lot of data put into RAM).
                chunks = list(src.chunks)
                chunks[0] = src.shape[0]
                chunks = tuple(chunks)
            else:
                # original chunk size is fine
                chunks = src.chunks
            # Variable-length strings, compression, and fletcher32 are not
            # a good combination. If we encounter any logs, then we have
            # to write them with fixed-length strings.
            # https://forum.hdfgroup.org/t/fletcher32-filter-on-variable-
            # length-string-datasets-not-suitable-for-filters/9038/4
            if src.dtype.kind == "O":
                # We are looking at logs with variable-length strings.
                max_length = max([len(ii) for ii in src] + [100])
                dtype = f"S{max_length}"
                convert_to_s_fixed = True
            else:
                dtype = src.dtype
                convert_to_s_fixed = False
            # Manually create a compressed version of the dataset.
            dst = dst_loc.create_dataset(name=dst_name,
                                         shape=src.shape,
                                         dtype=dtype,
                                         chunks=chunks,
                                         fletcher32=True,
                                         **compression_kwargs
                                         )
            if convert_to_s_fixed:
                # We are looking at old variable-length log strings.
                dst[:] = src[:].astype(dtype)
            elif chunks is None:
                dst[:] = src[:]
            else:
                for chunk in src.iter_chunks():
                    dst[chunk] = src[chunk]
            # Also write all the attributes
            for key in src.attrs:
                dst.attrs[key] = src.attrs[key]
        else:
            # Copy the Dataset to the destination as-is.
            h5py.h5o.copy(src_loc=src_loc.id,
                          src_name=src_name.encode(),
                          dst_loc=dst_loc.id,
                          dst_name=dst_name.encode(),
                          )
    elif recursive and isinstance(src, h5py.Group):
        dst_rec = dst_loc.require_group(dst_name)
        for key in src:
            h5ds_copy(src_loc=src,
                      src_name=key,
                      dst_loc=dst_rec,
                      ensure_compression=ensure_compression,
                      recursive=recursive)
    else:
        raise ValueError(f"The object {src_name} in {src.file} is not "
                         f"a dataset!")
    return dst_loc[dst_name]
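For illustration, a hedged sketch of copying a single feature dataset between two already-open HDF5 files; the file names and the "deform" feature are assumptions used only as placeholders:

# Usage sketch; paths and the feature name are hypothetical.
import h5py

from dclab.rtdc_dataset.copier import h5ds_copy

with h5py.File("source.rtdc", "r") as src, \
        h5py.File("target.rtdc", "a") as dst:
    dst.require_group("events")
    # Copy one scalar feature; the data are recompressed with Zstd
    # if the source dataset is not already properly compressed.
    dset = h5ds_copy(src_loc=src["events"],
                     src_name="deform",
                     dst_loc=dst["events"],
                     ensure_compression=True,
                     recursive=True)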
def is_properly_compressed(h5obj):
    """Check whether an HDF5 object is properly compressed

    The compression check only returns True if the input file was
    compressed with Zstandard compression using compression level 5
    or higher.
    """
    # Since version 0.43.0, we use Zstandard compression
    # which does not show up in the `compression`
    # attribute of `h5obj`.
    create_plist = h5obj.id.get_create_plist()
    filter_args = create_plist.get_filter_by_id(32015)
    if filter_args is not None and filter_args[1][0] >= 5:
        properly_compressed = True
    else:
        properly_compressed = False
    return properly_compressed
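A short sketch of how this check might be used on an existing dataset (the file path and feature name are assumptions); it inspects the dataset's filter pipeline for the Zstandard filter (ID 32015):

# Usage sketch; path and feature name are hypothetical.
import h5py

from dclab.rtdc_dataset.copier import is_properly_compressed

with h5py.File("input.rtdc", "r") as h5:
    ds = h5["events/deform"]
    if not is_properly_compressed(ds):
        print("Dataset is not Zstd-compressed at level >= 5.")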