Source code for dclab.rtdc_dataset.fmt_s3

import re


# `s3fs` is an optional dependency. Record whether it could be imported
# so that `RTDC_S3.__init__` can raise a descriptive error on use instead
# of this module failing at import time.
try:
    import s3fs
except ModuleNotFoundError:
    S3FS_AVAILABLE = False
else:
    S3FS_AVAILABLE = True


from .fmt_hdf5 import RTDC_HDF5


#: Regular expression for matching an S3 object URL (HTTPS only).
#: Note that the host group may be empty and is followed by a catch-all
#: pattern, so any string that starts with "https://" is matched.
REGEXP_S3_URL = re.compile(
    r"^(https:\/\/)"  # protocol
    r"([a-z0-9-\.]*)"  # host
    r".*"  # path on host
)


class RTDC_S3(RTDC_HDF5):
    def __init__(self,
                 url: str,
                 secret_id: str = "",
                 secret_key: str = "",
                 *args, **kwargs):
        """Access RT-DC measurements in an S3-compatible object store

        This is essentially just a wrapper around :class:`.RTDC_HDF5`
        with `s3fs` passing a file object to h5py.

        Parameters
        ----------
        url: str
            Full URL to an object in an S3 instance, in the form
            ``<protocol>://<endpoint>/<path>``. Leading and trailing
            whitespace is ignored.
        secret_id: str
            S3 access identifier
        secret_key: str
            Secret S3 access key
        *args:
            Arguments for `RTDCBase`
        **kwargs:
            Keyword arguments for `RTDCBase`

        Attributes
        ----------
        path: str
            The URL to the object

        Raises
        ------
        ModuleNotFoundError
            If the `s3fs` package is not installed
        ValueError
            If `url` does not contain both a ``://`` protocol separator
            and a ``/`` separating the endpoint from the object path
        """
        if not S3FS_AVAILABLE:
            raise ModuleNotFoundError(
                "Package `s3fs` required for S3 format!")

        # `is_s3_url` strips whitespace before matching, so accept the
        # same padded input here.
        url = url.strip()

        # Split "<proto>://<endpoint>/<path>" and fail with a readable
        # message instead of an opaque tuple-unpacking error.
        proto, proto_sep, s3_string = url.partition("://")
        s3_endpoint, path_sep, s3_path = s3_string.partition("/")
        if not (proto_sep and path_sep):
            raise ValueError(
                f"Invalid S3 URL '{url}', expected the form "
                "'<protocol>://<endpoint>/<path>'!")

        s3fskw = {
            "client_kwargs": {"endpoint_url": f"{proto}://{s3_endpoint}"},
            # A large block size makes loading metadata really slow.
            "default_block_size": 2048,
        }
        if secret_id and secret_key:
            # We have an id-key pair.
            s3fskw["key"] = secret_id
            s3fskw["secret"] = secret_key
            s3fskw["anon"] = False  # this is the default
        else:
            # Anonymous access has to be enabled explicitly.
            # Normally, s3fs would check for credentials in
            # environment variables and does not fall back to
            # anonymous use.
            s3fskw["anon"] = True

        self._fs = s3fs.S3FileSystem(**s3fskw)
        self._f3d = self._fs.open(s3_path, mode='rb')
        try:
            # This also takes care of `_finalize_init`
            super().__init__(*args, h5path=self._f3d, **kwargs)
        except BaseException:
            # Do not leave the remote file object open if the HDF5
            # initialization fails.
            self._f3d.close()
            raise
        # Override self.path with the actual S3 URL
        self.path = url
def is_s3_url(string):
    """Check whether `string` looks like an S3 URL

    Non-string input yields `False`. For strings, surrounding
    whitespace is stripped and the result of matching against
    `REGEXP_S3_URL` is returned as-is, i.e. a match object (truthy)
    on success and `None` otherwise.
    """
    if isinstance(string, str):
        candidate = string.strip()
        return REGEXP_S3_URL.match(candidate)
    return False