Source code for dclab.rtdc_dataset.fmt_http

import hashlib

from ..http_utils import HTTPFile, REQUESTS_AVAILABLE, is_url_available
from ..http_utils import is_http_url  # noqa: F401

from .feat_basin import Basin
from .fmt_hdf5 import RTDC_HDF5



[docs]
class RTDC_HTTP(RTDC_HDF5):
    def __init__(self,
                 url: str,
                 *args, **kwargs):
        """Access RT-DC measurements via HTTP

        This class allows you to open .rtdc files accessible via an
        HTTP URL, for instance files on an S3 object storage or
        figshare download links.

        This is essentially just a wrapper around :class:`.RTDC_HDF5`
        with :class:`.HTTPFile` passing a file object to h5py.

        Parameters
        ----------
        url: str
            Full URL to an HDF5 file
        *args:
            Arguments for `RTDCBase`
        **kwargs:
            Keyword arguments for `RTDCBase`. If no ``identifier`` is
            given (or it is None), the HTTP ETag of the remote object
            is used; if there is no ETag, the MD5 hex digest of the
            first cached data chunk is used instead.

        Attributes
        ----------
        path: str
            The URL to the object

        Raises
        ------
        ModuleNotFoundError
            If the `requests` package is not available.

        Notes
        -----
        Since this format still requires random access to the file online,
        i.e. not the entire file is downloaded, only parts of it, the
        web server must support range requests.
        """
        if not REQUESTS_AVAILABLE:
            raise ModuleNotFoundError(
                "Package `requests` required for http format!")

        self._fhttp = HTTPFile(url)
        try:
            if kwargs.get("identifier") is None:
                if self._fhttp.etag is not None:
                    # Set the HTTP ETag as the identifier, it doesn't get
                    # more unique than that!
                    kwargs["identifier"] = self._fhttp.etag
                else:
                    # Compute a hash of the first data chunk
                    kwargs["identifier"] = hashlib.md5(
                        self._fhttp.get_cache_chunk(0)).hexdigest()

            # Initialize the HDF5 dataset. Positional arguments are
            # unpacked before the `h5path` keyword; this is the same call
            # as before, written in the order Python actually binds it.
            super(RTDC_HTTP, self).__init__(
                *args,
                h5path=self._fhttp,
                **kwargs)
        except BaseException:
            # Setup failed: release the remote file object instead of
            # leaking it, then let the original error propagate.
            self._fhttp.close()
            raise
        # Override self.path with the actual HTTP URL
        self.path = url

    def close(self):
        """Close the HDF5 dataset and the underlying HTTP file object"""
        try:
            super(RTDC_HTTP, self).close()
        finally:
            # Always release the HTTP file, even if closing the HDF5
            # dataset raised an exception.
            self._fhttp.close()





[docs]
class HTTPBasin(Basin):
    """Basin whose dataset is opened from an HTTP URL via `RTDC_HTTP`"""
    basin_format = "http"
    basin_type = "remote"

    def __init__(self, *args, **kwargs):
        # Tri-state cache for `is_available`: None means "not decided
        # yet", True/False are remembered results.
        self._available_verified = None
        super().__init__(*args, **kwargs)

    def _load_dataset(self, location, **kwargs):
        """Open `location` as an :class:`RTDC_HTTP` dataset and return it"""
        return RTDC_HTTP(location, **kwargs)

    def is_available(self):
        """Check for `requests` and object availability

        Caching policy: Once this method returns True, it will always
        return True. A negative result is remembered as well when
        `requests` is missing or when the URL check reports "forbidden"
        or "not found". In all other cases nothing is cached, the
        URL is checked again on the next call, and the return value
        is None.
        """
        with self._av_check_lock:
            if not REQUESTS_AVAILABLE:
                # Without `requests` there is no point in checking the URL
                self._available_verified = False
            elif self._available_verified is None:
                reachable, why = is_url_available(self.location,
                                                  ret_reason=True)
                if why in ("forbidden", "not found"):
                    # The URL will not become accessible in the near future
                    self._available_verified = False
                elif reachable:
                    self._available_verified = True
        return self._available_verified