Source code for dclab.cli.task_condense

"""Create .rtdc files with scalar-only features"""
from __future__ import annotations

import argparse
import pathlib
from typing import List
import warnings

import h5py
import hdf5plugin

from ..rtdc_dataset import (
    fmt_hdf5, new_dataset, rtdc_copy, RTDCWriter, RTDCBase
)
from .. import util
from .._version import version

from . import common


[docs] def condense( path_in: str | pathlib.Path = None, path_out: str | pathlib.Path = None, ancillaries: bool = None, store_ancillary_features: bool = True, store_basin_features: bool = True, check_suffix: bool = True, ret_path: bool = False ): """Create a new dataset with all available scalar-only features Besides the innate scalar features, this also includes all fast-to-compute ancillary and all basin features (`features_loaded`). Parameters ---------- path_in: str or pathlib.Path file to compress path_out: str or pathlib output file path ancillaries: bool DEPRECATED, use `store_ancillary_features` instead store_ancillary_features: bool compute and store ancillary features in the output file store_basin_features: bool copy basin features from the input path to the output file check_suffix: bool check suffixes for input and output files ret_path: bool whether to return the output path Returns ------- path_out: pathlib.Path (optional) output path (with possibly corrected suffix) """ if ancillaries is not None: warnings.warn("Please use `store_ancillary_features` instead of " "`ancillaries`", DeprecationWarning) store_ancillary_features = ancillaries if path_out is None or path_in is None: parser = condense_parser() args = parser.parse_args() path_in = args.input path_out = args.output store_ancillary_features = not args.no_ancillaries store_basin_features = not args.no_basins allowed_input_suffixes = [".rtdc", ".tdms"] if not check_suffix: allowed_input_suffixes.append(pathlib.Path(path_in).suffix) path_in, path_out, path_temp = common.setup_task_paths( path_in, path_out, allowed_input_suffixes=allowed_input_suffixes) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") # We use `store_basin_features` during initialization (to avoid # conflicts with ancillary features) and in the actual function # as well, to correctly determine which features to use. 
with new_dataset(path_in, enable_basins=store_basin_features) as ds, \ h5py.File(path_temp, "w") as h5_cond: condense_dataset(ds=ds, h5_cond=h5_cond, store_ancillary_features=store_ancillary_features, store_basin_features=store_basin_features, warnings_list=w) # Finally, rename temp to out path_temp.rename(path_out) if ret_path: return path_out
def condense_dataset(
        ds: RTDCBase,
        h5_cond: h5py.File,
        ancillaries: bool = None,
        store_ancillary_features: bool = True,
        store_basin_features: bool = True,
        warnings_list: List = None):
    """Condense a dataset using low-level HDF5 methods

    For ancillary and basin features, high-level dclab methods are used.

    Parameters
    ----------
    ds: RTDCBase
        input dataset
    h5_cond: h5py.File
        writable HDF5 file that receives the condensed data
    ancillaries: bool
        DEPRECATED, use `store_ancillary_features` instead
    store_ancillary_features: bool
        store scalar ancillary features that are not already loaded
    store_basin_features: bool
        store scalar basin features that are not already loaded
    warnings_list: list
        recorded warnings; if non-empty, they are written to the
        "dclab-condense-warnings" log
    """
    if ancillaries is not None:
        # stacklevel=2 attributes the warning to the caller, so that it
        # is not hidden by Python's default DeprecationWarning filter.
        warnings.warn("Please use `store_ancillary_features` instead of "
                      "`ancillaries`",
                      DeprecationWarning,
                      stacklevel=2)
        store_ancillary_features = ancillaries

    cmp_kw = hdf5plugin.Zstd(clevel=5)
    cmd_dict = {}

    # If we have an input HDF5 file, then we might readily copy most
    # of the features over using rtdc_copy. If we have a .tdms file,
    # then we have to go the long route.
    if isinstance(ds, fmt_hdf5.RTDC_HDF5):
        rtdc_copy(src_h5file=ds.h5file,
                  dst_h5file=h5_cond,
                  features="scalar",
                  include_basins=True,
                  include_logs=True,
                  include_tables=True,
                  meta_prefix="")

    # The "logs" group is accessed below regardless of the input format.
    h5_cond.require_group("logs")

    # scalar features
    feats_sc = ds.features_scalar
    # loaded (computationally cheap) scalar features
    feats_sc_in = [f for f in ds.features_loaded if f in feats_sc]

    cmd_dict["features_original_innate"] = ds.features_innate

    features = set(feats_sc_in)
    if store_basin_features:
        feats_sc_basin = [f for f in ds.features_basin if
                          (f in feats_sc and f not in feats_sc_in)]
        cmd_dict["features_basin"] = feats_sc_basin
        if feats_sc_basin:
            print(f"Using basin features {feats_sc_basin}")
            features |= set(feats_sc_basin)

    if store_ancillary_features:
        feats_sc_anc = [f for f in ds.features_ancillary if
                        (f in feats_sc and f not in feats_sc_in)]
        cmd_dict["features_ancillary"] = feats_sc_anc
        if feats_sc_anc:
            features |= set(feats_sc_anc)
            print(f"Using ancillary features {feats_sc_anc}")

    # command log
    logs = {"dclab-condense": common.get_command_log(
        paths=[ds.path], custom_dict=cmd_dict)}

    # rename old dclab-condense logs
    for l_key in ["dclab-condense", "dclab-condense-warnings"]:
        if l_key in h5_cond["logs"]:
            # This is cached, so no worry calling it multiple times.
            md5_cfg = util.hashobj(ds.config)
            # rename
            new_log_name = f"{l_key}_{md5_cfg}"
            if new_log_name not in h5_cond["logs"]:
                # If the user repeatedly condensed one file, then there is
                # no benefit in storing the log under a different name (the
                # metadata did not change). Only write the log if it does
                # not already exist.
                h5_cond["logs"][new_log_name] = h5_cond["logs"][l_key]
            # Always free the original name for the new log written below.
            del h5_cond["logs"][l_key]

    with RTDCWriter(h5_cond,
                    mode="append",
                    compression_kwargs=cmp_kw,
                    ) as hw:
        # Write all remaining scalar features to the file
        # (these are *all* scalar features in the case of .tdms data).
        for feat in features:
            if feat not in h5_cond["events"]:
                hw.store_feature(feat=feat, data=ds[feat])

        # collect warnings log
        if warnings_list:
            logs["dclab-condense-warnings"] = \
                common.assemble_warnings(warnings_list)

        # Write logs
        for name in logs:
            hw.store_log(name, logs[name])
def condense_parser():
    """Build the command-line argument parser for ``dclab-condense``

    The parser takes two positional arguments (input and output path),
    the flags ``--no-ancillary-features`` and ``--no-basin-features``
    (stored as `no_ancillaries` and `no_basins`, both False by default),
    and ``--version``.

    Returns
    -------
    parser: argparse.ArgumentParser
        the configured parser
    """
    description = (
        "Reduce an RT-DC measurement to its scalar-only features "
        "(i.e. without `contour`, `image`, `mask`, or `trace`). "
        "All available ancillary features are computed."
    )
    cli = argparse.ArgumentParser(description=description)

    # positional arguments
    cli.add_argument(
        'input',
        metavar="INPUT",
        type=str,
        help='Input path (.tdms or .rtdc file)',
    )
    cli.add_argument(
        'output',
        metavar="OUTPUT",
        type=str,
        help='Output path (.rtdc file)',
    )

    # opt-out switches
    cli.add_argument(
        '--no-ancillary-features',
        dest='no_ancillaries',
        action='store_true',
        help='Do not compute expensive ancillary features '
             'such as volume',
    )
    cli.add_argument(
        '--no-basin-features',
        dest='no_basins',
        action='store_true',
        help='Do not store basin-based feature data from the '
             'input file in the output file',
    )
    cli.set_defaults(no_ancillaries=False, no_basins=False)

    cli.add_argument(
        '--version',
        action='version',
        version=f'dclab-condense {version}',
    )
    return cli