Source code for dclab.rtdc_dataset.fmt_http

import hashlib

from ..http_utils import HTTPFile, REQUESTS_AVAILABLE, is_url_available
from ..http_utils import is_http_url  # noqa: F401

from .feat_basin import Basin
from .fmt_hdf5 import RTDC_HDF5


class RTDC_HTTP(RTDC_HDF5):
    def __init__(self,
                 url: str,
                 *args,
                 **kwargs):
        """Access RT-DC measurements via HTTP

        This class allows you to open .rtdc files accessible via an
        HTTP URL, for instance files on an S3 object storage or
        figshare download links.

        This is essentially just a wrapper around :class:`.RTDC_HDF5`
        with :class:`.HTTPFile` passing a file object to h5py.

        Parameters
        ----------
        url: str
            Full URL to an HDF5 file
        *args:
            Arguments for `RTDCBase`
        **kwargs:
            Keyword arguments for `RTDCBase`

        Raises
        ------
        ModuleNotFoundError
            If the `requests` package is not available

        Notes
        -----
        Since this format still requires random access to the file online,
        i.e. not the entire file is downloaded, only parts of it, the
        web server must support range requests.
        """
        if not REQUESTS_AVAILABLE:
            raise ModuleNotFoundError(
                f"Package `requests` required for loading http data '{url}'!")

        self._fhttp = HTTPFile(url)

        try:
            # Initialize the HDF5 dataset. `*args` is written before the
            # `h5path` keyword because that is the order in which Python
            # binds them anyway (positional unpacking first).
            super(RTDC_HTTP, self).__init__(
                *args,
                h5path=self._fhttp,
                **kwargs)
        except BaseException:
            # `close` will never be called on a half-initialized
            # instance, so release the HTTP file object here instead of
            # leaking it, then let the original error propagate.
            self._fhttp.close()
            raise

        # Override self.path with the actual HTTP URL
        #: URL to the file
        self.path = url

    @property
    def hash(self):
        """Identifier of the remote file

        The HTTP ETag is used if the `HTTPFile` object provides one;
        otherwise the MD5 hex digest of `HTTPFile.get_chunk(0)` is
        returned.
        """
        if self._fhttp.etag is not None:
            # Set the HTTP ETag as the hash, it doesn't get
            # more unique than that!
            return self._fhttp.etag
        else:
            # Compute a hash of the first data chunk
            return hashlib.md5(self._fhttp.get_chunk(0)).hexdigest()

    def close(self):
        """Close the HDF5 dataset and the underlying HTTP file object"""
        try:
            super(RTDC_HTTP, self).close()
        finally:
            # Release the HTTP file object even if closing the HDF5
            # dataset raised an exception.
            self._fhttp.close()
class HTTPBasin(Basin):
    """Basin that loads its dataset from an HTTP URL via `RTDC_HTTP`"""
    basin_format = "http"
    basin_type = "remote"

    def __init__(self, *args, **kwargs):
        # Availability cache: None means "not determined yet", True/False
        # is a final verdict. It is assigned before the parent
        # initializer runs, exactly as in the original ordering.
        self._available_verified = None
        super().__init__(*args, **kwargs)

    def _load_dataset(self, location, **kwargs):
        """Open `location` as an `RTDC_HTTP` dataset and return it"""
        return RTDC_HTTP(location, **kwargs)

    def is_available(self):
        """Check for `requests` and object availability

        Caching policy: Once this method returns True, it will always
        return True. A False verdict is cached as well; it is stored
        when `requests` is missing or when the URL check reports
        "forbidden" or "not found". In any other unsuccessful case
        nothing is cached, None is returned, and the next call checks
        the URL again.
        """
        if self._available_verified is not None:
            # A final verdict has been cached by an earlier call
            return self._available_verified

        with self._av_check_lock:
            if REQUESTS_AVAILABLE:
                reachable, why = is_url_available(self.location,
                                                  ret_reason=True)
                if why in ("forbidden", "not found"):
                    # we cannot access the URL in the near future
                    self._available_verified = False
                elif reachable:
                    self._available_verified = True
            else:
                # don't even bother
                self._available_verified = False

        return self._available_verified