Source code for dclab.rtdc_dataset.fmt_http

import hashlib

from ..http_utils import HTTPFile, REQUESTS_AVAILABLE, is_url_available
from ..http_utils import is_http_url  # noqa: F401

from .feat_basin import Basin
from .fmt_hdf5 import RTDC_HDF5


class RTDC_HTTP(RTDC_HDF5):
    def __init__(self, url: str, *args, **kwargs):
        """Access RT-DC measurements via HTTP

        This class allows you to open .rtdc files accessible via an
        HTTP URL, for instance files on an S3 object storage or
        figshare download links.

        This is essentially just a wrapper around :class:`.RTDC_HDF5`
        with :class:`.HTTPFile` passing a file object to h5py.

        Parameters
        ----------
        url: str
            Full URL to an HDF5 file
        *args:
            Arguments for `RTDCBase`
        **kwargs:
            Keyword arguments for `RTDCBase`; if no ``identifier`` is
            given (or it is None), the HTTP ETag is used as identifier
            and, when no ETag is available, the MD5 hex digest of the
            first cached data chunk.

        Attributes
        ----------
        path: str
            The URL to the object

        Raises
        ------
        ModuleNotFoundError
            If the `requests` package is not available.

        Notes
        -----
        Since this format still requires random access to the file
        online, i.e. not the entire file is downloaded, only parts of
        it, the web server must support range requests.

        If anything fails after the HTTP file object has been created
        (determining the identifier or initializing the HDF5 dataset),
        the HTTP file object is closed before the exception is
        re-raised.
        """
        if not REQUESTS_AVAILABLE:
            raise ModuleNotFoundError(
                "Package `requests` required for http format!")

        self._fhttp = HTTPFile(url)

        try:
            if kwargs.get("identifier") is None:
                if self._fhttp.etag is not None:
                    # Set the HTTP ETag as the identifier, it doesn't get
                    # more unique than that!
                    kwargs["identifier"] = self._fhttp.etag
                else:
                    # Compute a hash of the first data chunk
                    kwargs["identifier"] = hashlib.md5(
                        self._fhttp.get_cache_chunk(0)).hexdigest()

            # Initialize the HDF5 dataset
            super(RTDC_HTTP, self).__init__(
                h5path=self._fhttp,
                *args,
                **kwargs)
        except BaseException:
            # Do not leave the HTTP file object open when the dataset
            # could not be set up; the caller never gets an instance
            # on which it could call `close`.
            self._fhttp.close()
            raise

        # Override self.path with the actual HTTP URL
        self.path = url

    def close(self):
        """Close the HDF5 dataset and the underlying HTTP file object

        The HTTP file object is closed even if closing the HDF5
        dataset raises an exception.
        """
        try:
            super(RTDC_HTTP, self).close()
        finally:
            self._fhttp.close()
class HTTPBasin(Basin):
    """Basin whose dataset is opened from an HTTP URL via `RTDC_HTTP`"""
    # format and type labels of this basin class
    basin_format = "http"
    basin_type = "remote"

    def __init__(self, *args, **kwargs):
        """Set up the availability cache and initialize the basin

        All arguments are passed on unchanged to the `Basin`
        initializer.
        """
        # Tri-state cache for `is_available`:
        # None (not determined yet), True, or False
        self._available_verified = None
        super().__init__(*args, **kwargs)

    def _load_dataset(self, location, **kwargs):
        """Return an `RTDC_HTTP` instance for the URL `location`

        Keyword arguments are passed on to `RTDC_HTTP`.
        """
        return RTDC_HTTP(location, **kwargs)

    def is_available(self):
        """Check for `requests` and object availability

        Caching policy: Once this method returns True, it will always
        return True. A "forbidden" or "not found" response is cached
        as False as well. Any other failure is not cached; in that
        case the method returns None and the URL is checked again on
        the next call.

        Returns
        -------
        available: bool or None
            True if the URL was verified to be available, False if
            `requests` is missing or the URL is forbidden/not found,
            None if availability could not be determined.
        """
        # NOTE(review): `self._av_check_lock` and `self.location` are not
        # defined in this class; presumably provided by `Basin`.
        with self._av_check_lock:
            if not REQUESTS_AVAILABLE:
                # don't even bother
                self._available_verified = False
            elif self._available_verified is None:
                reachable, why = is_url_available(self.location,
                                                  ret_reason=True)
                if why in ("forbidden", "not found"):
                    # we cannot access the URL in the near future
                    self._available_verified = False
                elif reachable:
                    self._available_verified = True
        return self._available_verified