Source code for dclab.rtdc_dataset.fmt_s3

import functools
import re
import socket
from urllib.parse import urlparse


try:
    import s3fs
except ModuleNotFoundError:
    S3FS_AVAILABLE = False
else:
    S3FS_AVAILABLE = True


from .feat_basin import Basin
from .fmt_hdf5 import RTDC_HDF5


#: Regular expression for matching a full S3 object URL
REGEXP_S3_URL = re.compile(
    r"^(https?:\/\/)"  # protocol (http or https)
    r"([a-z0-9-\.]*)(\:[0-9]*)?\/"  # host:port
    r".+\/"  # bucket
    r".+"  # key
)
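
# Example of a URL accepted by this pattern (hypothetical host and bucket):
# "https://objectstore.example.org:8000/bucket/data.rtdc". Protocol, host,
# bucket, and object key are required; the port is optional.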


class RTDC_S3(RTDC_HDF5):
    def __init__(self,
                 url: str,
                 secret_id: str = "",
                 secret_key: str = "",
                 *args, **kwargs):
        """Access RT-DC measurements in an S3-compatible object store

        This is essentially a wrapper around :class:`.RTDC_HDF5`,
        where `s3fs` provides the file object that is passed to h5py.

        Parameters
        ----------
        url: str
            Full URL to an object in an S3 instance
        secret_id: str
            S3 access identifier
        secret_key: str
            Secret S3 access key
        *args:
            Arguments for `RTDCBase`
        **kwargs:
            Keyword arguments for `RTDCBase`

        Attributes
        ----------
        path: str
            The URL to the object
        """
        if not S3FS_AVAILABLE:
            raise ModuleNotFoundError(
                "Package `s3fs` required for S3 format!")
        s3fskw = get_s3fs_kwargs(url=url,
                                 secret_id=secret_id,
                                 secret_key=secret_key)
        _, s3_path = parse_s3_url(url)
        self._fs = s3fs.S3FileSystem(**s3fskw)
        self._f3d = self._fs.open(s3_path, mode='rb')
        # Initialize the HDF5 dataset
        super(RTDC_S3, self).__init__(
            h5path=self._f3d,
            *args,
            **kwargs)
        # Override self.path with the actual S3 URL
        self.path = url
    def close(self):
        super(RTDC_S3, self).close()
        self._f3d.close()
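
# Usage sketch (hypothetical URL and credentials; assumes the usual dclab
# context-manager interface of `RTDCBase`):
#
#   with RTDC_S3("https://objectstore.example.org/bucket/data.rtdc",
#                secret_id="ACCESS_KEY_ID",
#                secret_key="SECRET_ACCESS_KEY") as ds:
#       print(ds.config["experiment"]["sample"])
#
# For public objects, the credentials can be omitted (anonymous access).
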
class S3Basin(Basin):
    basin_format = "s3"
    basin_type = "remote"

    def __init__(self, *args, **kwargs):
        self._available_verified = False
        super(S3Basin, self).__init__(*args, **kwargs)
    def load_dataset(self, location, **kwargs):
        h5file = RTDC_S3(location, enable_basins=False, **kwargs)
        # If the user specified the features of the basin, store them
        # directly in the .H5Events class of .RTDC_HDF5. This saves us
        # approximately 2 seconds of listing the members of the "events"
        # group in the S3 object.
        h5file._events._features_list = self._features
        return h5file
    def is_available(self):
        """Check for s3fs and object availability

        Caching policy: Once this method returns True, it will always
        return True.
        """
        if not self._available_verified:
            self._available_verified = (
                S3FS_AVAILABLE and is_s3_object_available(self.location))
        return self._available_verified
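
# Usage sketch (hypothetical URL; assumes the `Basin` base class accepts the
# object location as its first argument and exposes the opened dataset via
# its `ds` property):
#
#   bn = S3Basin("https://objectstore.example.org/bucket/data.rtdc")
#   if bn.is_available():
#       ds = bn.ds  # opens the object via `load_dataset`/`RTDC_S3`
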
@functools.lru_cache()
def get_s3fs_kwargs(url: str,
                    secret_id: str = "",
                    secret_key: str = "",
                    ):
    """Return keyword arguments for s3fs

    Parameters
    ----------
    url: str
        full URL to the object
    secret_id: str
        S3 access identifier
    secret_key: str
        Secret S3 access key
    """
    s3_endpoint, _ = parse_s3_url(url)
    s3fskw = {
        "endpoint_url": s3_endpoint,
        # A large block size makes loading metadata really slow.
        "default_block_size": 2**18,
        # We are only ever opening one file per FS instance, so it does
        # not make sense to cache the instances.
        "skip_instance_cache": True,
        # We are not doing any FS directory listings.
        "use_listings_cache": False,
    }
    if secret_id and secret_key:
        # We have an id-key pair.
        s3fskw["key"] = secret_id
        s3fskw["secret"] = secret_key
        s3fskw["anon"] = False  # this is the default
    else:
        # Anonymous access has to be enabled explicitly. Normally, s3fs
        # checks for credentials (e.g. in environment variables) and does
        # not fall back to anonymous access.
        s3fskw["anon"] = True
    return s3fskw
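
# Example (hypothetical URL): without credentials, anonymous access is
# requested explicitly and the endpoint is derived via `parse_s3_url`
# (2**18 == 262144):
#
#   >>> get_s3fs_kwargs("https://objectstore.example.org/bucket/data.rtdc")
#   {'endpoint_url': 'https://objectstore.example.org:443',
#    'default_block_size': 262144,
#    'skip_instance_cache': True,
#    'use_listings_cache': False,
#    'anon': True}
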
def is_s3_object_available(url: str,
                           secret_id: str = "",
                           secret_key: str = "",
                           ):
    """Check whether an S3 object is available

    Parameters
    ----------
    url: str
        full URL to the object
    secret_id: str
        S3 access identifier
    secret_key: str
        Secret S3 access key
    """
    avail = False
    if is_s3_url(url):
        urlp = urlparse(url)
        # default to https if no scheme or port is specified
        port = urlp.port or (80 if urlp.scheme == "http" else 443)
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(1)
            # Try to connect to the host (use `hostname`, since `netloc`
            # may still contain the port and would not resolve).
            try:
                s.connect((urlp.hostname, port))
            except (socket.gaierror, OSError):
                pass
            else:
                # Try to access the object
                s3fskw = get_s3fs_kwargs(url=url,
                                         secret_id=secret_id,
                                         secret_key=secret_key)
                _, s3_path = parse_s3_url(url)
                fs = s3fs.S3FileSystem(**s3fskw)
                try:
                    avail = fs.exists(s3_path)
                except OSError:
                    pass
    return avail
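
# Usage sketch (hypothetical URL): probe availability before opening the
# object, e.g. to fall back to another data source when the object store
# is unreachable:
#
#   url = "https://objectstore.example.org/bucket/data.rtdc"
#   if is_s3_object_available(url):
#       ds = RTDC_S3(url)
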
@functools.lru_cache()
def is_s3_url(string):
    """Check whether `string` is a valid S3 URL using regexp"""
    if not isinstance(string, str):
        return False
    else:
        return REGEXP_S3_URL.match(string.strip()) is not None
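
# Example (hypothetical URLs): only the URL structure is checked, not
# whether the object actually exists:
#
#   >>> is_s3_url("https://objectstore.example.org/bucket/data.rtdc")
#   True
#   >>> is_s3_url("/local/path/data.rtdc")
#   False
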
@functools.lru_cache()
def parse_s3_url(url):
    """Parse S3 `url`, returning `endpoint` URL and `key`"""
    urlp = urlparse(url)
    scheme = urlp.scheme or "https"
    port = urlp.port or (80 if scheme == "http" else 443)
    s3_endpoint = f"{scheme}://{urlp.hostname}:{port}"
    s3_path = urlp.path
    return s3_endpoint, s3_path
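
# Example (hypothetical URL): the endpoint gains the default port when it
# is omitted; the returned key keeps its leading slash:
#
#   >>> parse_s3_url("https://objectstore.example.org/bucket/data.rtdc")
#   ('https://objectstore.example.org:443', '/bucket/data.rtdc')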