Source code for dclab.cli.task_split

"""Split an .rtdc file into smaller .rtdc files"""
from __future__ import annotations

import argparse
import pathlib
import warnings

import hdf5plugin

from ..rtdc_dataset import fmt_tdms, new_dataset, RTDCWriter
from .._version import version

from . import common


[docs] def split( path_in: str | pathlib.Path = None, path_out: str | pathlib.Path = None, split_events: int = 10000, skip_initial_empty_image: bool = True, skip_final_empty_image: bool = True, ret_out_paths: bool = False, verbose: bool = False): """Split a measurement file Parameters ---------- path_in: str or pathlib.Path path of input measurement file path_out: str or pathlib.Path path to output directory (optional) split_events: int maximum number of events in each output file skip_initial_empty_image: bool remove the first event of the dataset if the image is zero skip_final_empty_image: bool remove the final event of the dataset if the image is zero ret_out_paths: if True, return the list of output file paths verbose: bool if True, print messages to stdout Returns ------- [out_paths]: list of pathlib.Path List of generated files (only if `ret_out_paths` is specified) """ cmp_kw = hdf5plugin.Zstd(clevel=5) if path_in is None: parser = split_parser() args = parser.parse_args() path_in = pathlib.Path(args.path_in).resolve() path_out = args.path_out split_events = args.split_events skip_initial_empty_image = not args.include_empty_boundary_images skip_final_empty_image = not args.include_empty_boundary_images verbose = True if path_out in ["SAME", None]: # default to input directory path_out = path_in.parent path_in = pathlib.Path(path_in) path_out = pathlib.Path(path_out) logs = {"dclab-split": common.get_command_log(paths=[path_in])} paths_gen = [] paths_temp = [] with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") # ignore ResourceWarning: unclosed file <_io.BufferedReader...> warnings.simplefilter("ignore", ResourceWarning) # noqa: F821 if fmt_tdms.NPTDMS_AVAILABLE: # tdms-related warning filters # ignore SlowVideoWarning warnings.simplefilter("ignore", fmt_tdms.event_image.SlowVideoWarning) if skip_initial_empty_image: # If the initial frame is skipped when empty, # suppress any related warning messages. warnings.simplefilter( "ignore", fmt_tdms.event_image.InitialFrameMissingWarning) with new_dataset(path_in) as ds: num_files = len(ds) // split_events if len(ds) % split_events: num_files += 1 for ii in range(num_files): pp = path_out / f"{path_in.stem}_{ii+1:04d}.rtdc" pt = pp.with_suffix(".rtdc~") paths_gen.append(pp) paths_temp.append(pt) if verbose: print(f"Generating {ii+1:d}/{num_files:d}: {pt}") ds.filter.manual[:] = False # reset filter ds.filter.manual[ii*split_events:(ii+1)*split_events] = True common.skip_empty_image_events( ds=ds, initial=skip_initial_empty_image, final=skip_final_empty_image) ds.apply_filter() ds.export.hdf5(path=pt, features=ds.features_innate, logs=True, tables=True, basins=True, filtered=True, compression_kwargs=cmp_kw, ) if w: logs["dclab-split-warnings"] = common.assemble_warnings(w) sample_name = ds.config["experiment"]["sample"] # Add the logs and update sample name for ii, pt in enumerate(paths_temp): meta = {"experiment": {"sample": f"{sample_name} {ii+1}/{num_files}"}} with RTDCWriter(pt, compression_kwargs=cmp_kw) as hw: for name in logs: hw.store_log(name, logs[name]) hw.store_metadata(meta) for pt, pp in zip(paths_temp, paths_gen): pt.rename(pp) if ret_out_paths: return paths_gen
def split_parser(): descr = "Split an RT-DC measurement file (.tdms or .rtdc) into multiple " \ + "smaller .rtdc files." parser = argparse.ArgumentParser(description=descr) parser.add_argument('path_in', metavar="PATH_IN", type=str, help='Input path (.tdms or .rtdc file)') parser.add_argument('--path_out', metavar="PATH_OUT", type=str, default="SAME", help='Output directory (defaults to same directory)') parser.add_argument('--split-events', type=int, default=10000, help='Maximum number of events in each output file') parser.add_argument('--include-empty-boundary-images', dest='include_empty_boundary_images', action='store_true', help='In old versions of Shape-In, the first or last ' + 'images were sometimes not stored in the ' + 'resulting .avi file. In dclab, such images ' + 'are represented as zero-valued images. Set ' + 'this option, if you wish to include these ' + 'events with empty image data.') parser.set_defaults(include_empty_boundary_images=False) parser.add_argument('--version', action='version', version=f'dclab-split {version}') return parser