Source code for dclab.cli.task_compress

"""Compress .rtdc files"""
from __future__ import annotations

import argparse
import pathlib
import warnings

import hdf5plugin
import h5py

from ..rtdc_dataset import rtdc_copy, RTDCWriter
from .. import util
from .._version import version

from . import common

def compress(
        path_in: str | pathlib.Path | None = None,
        path_out: str | pathlib.Path | None = None,
        force: bool = False,
        check_suffix: bool = True,
        ret_path: bool = False,
) -> pathlib.Path | None:
    """Create a new dataset with all features compressed lossless

    The data are first written to a temporary file, which is renamed
    to the output path only after everything has been written. If
    writing the temporary file fails, the incomplete temporary file is
    removed and the error is re-raised.

    Parameters
    ----------
    path_in: str or pathlib.Path
        file to compress
    path_out: str or pathlib.Path
        output file path
    force: bool
        DEPRECATED
    check_suffix: bool
        check suffixes for input and output files; if False, the
        suffix of `path_in` is accepted in addition to ".rtdc"
    ret_path: bool
        whether to return the output path

    Returns
    -------
    path_out: pathlib.Path (optional)
        output path (with possibly corrected suffix); only returned
        if `ret_path` is True

    Notes
    -----
    If `path_in` or `path_out` is None, then `path_in`, `path_out`,
    and `force` are all taken from the command line arguments
    (see :func:`compress_parser`).
    """
    cmp_kw = hdf5plugin.Zstd(clevel=5)
    if path_out is None or path_in is None:
        # Called as a command-line entry point: read everything from argv.
        parser = compress_parser()
        args = parser.parse_args()
        path_in = args.input
        path_out = args.output
        force = args.force

    allowed_input_suffixes = [".rtdc"]
    if not check_suffix:
        # Accept whatever suffix the input file happens to have.
        allowed_input_suffixes.append(pathlib.Path(path_in).suffix)

    path_in, path_out, path_temp = common.setup_task_paths(
        path_in, path_out, allowed_input_suffixes=allowed_input_suffixes)

    if force:
        warnings.warn(
            "The `force` keyword argument is deprecated since dclab 0.49.0, "
            "because compressed HDF5 Datasets are now copied and there "
            "is no reason to avoid or use force anymore.",
            DeprecationWarning,
            # attribute the warning to the caller, not to this module
            stacklevel=2)

    # command log
    logs = {"dclab-compress": common.get_command_log(paths=[path_in])}

    try:
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            with h5py.File(path_in) as h5, h5py.File(path_temp, "w") as hc:
                rtdc_copy(src_h5file=h5,
                          dst_h5file=hc,
                          features="all",
                          include_basins=True,
                          include_logs=True,
                          include_tables=True,
                          meta_prefix="",
                          )

                hc.require_group("logs")
                # Rename dclab-compress logs copied over from the input
                # file, so that the logs written below do not clash.
                for lkey in ["dclab-compress", "dclab-compress-warnings"]:
                    if lkey in hc["logs"]:
                        # This is cached, so no worry calling it multiple
                        # times.
                        md55m = util.hashfile(path_in, count=80)
                        # rename
                        hc["logs"][f"{lkey}_{md55m}"] = hc["logs"][lkey]
                        del hc["logs"][lkey]

            # warnings log
            if w:
                logs["dclab-compress-warnings"] = common.assemble_warnings(w)

        # Write log file
        with RTDCWriter(path_temp,
                        compression_kwargs=cmp_kw,
                        mode="append") as hw:
            for name, lines in logs.items():
                hw.store_log(name, lines)
    except BaseException:
        # Do not leave a partially written temporary file behind.
        try:
            path_temp.unlink()
        except OSError:
            # Best-effort cleanup (file may not exist or may be locked);
            # the original error below is the one worth reporting.
            pass
        raise

    # Finally, rename temp to out. This is intentionally outside of the
    # cleanup above: at this point the temporary file is complete.
    path_temp.rename(path_out)

    if ret_path:
        return path_out
def compress_parser():
    """Build the command-line argument parser for ``dclab-compress``

    Returns
    -------
    parser: argparse.ArgumentParser
        parser with the positional arguments ``input`` and ``output``,
        the deprecated ``--force`` flag, and a ``--version`` action
    """
    description = (
        "Create a compressed version of an .rtdc file. This can be "
        "used for saving disk space (loss-less compression). The "
        "data generated during an experiment is usually not compressed."
    )
    cli = argparse.ArgumentParser(description=description)
    # positional arguments
    cli.add_argument("input",
                     metavar="INPUT",
                     type=str,
                     help="Input path (.rtdc file)")
    cli.add_argument("output",
                     metavar="OUTPUT",
                     type=str,
                     help="Output path (.rtdc file)")
    # optional arguments
    cli.add_argument("--force",
                     dest="force",
                     action="store_true",
                     help="DEPRECATED")
    cli.set_defaults(force=False)
    cli.add_argument("--version",
                     action="version",
                     version=f"dclab-compress {version}")
    return cli