storage
DigitalOcean Spaces storage client for downloading PDF documents.
Downloads PDFs from organization-scoped Spaces buckets for Docling processing. Uses boto3 for S3-compatible API access.
Classes
StorageFileNotFoundError
Raised when a file is not found in storage.
StorageClient
Client for interacting with DigitalOcean Spaces (S3-compatible).
Constructor:
def __init__(self, config: Any) -> None
Methods
file_exists
def file_exists(self, organization_id: str, relative_path: str) -> bool
Check if a file exists in Spaces.
Args: organization_id: Organization UUID; relative_path: Relative path within bucket
Returns: True if file exists, False otherwise
download_file
def download_file(self, organization_id: str, relative_path: str) -> bytes
Download a file from Spaces.
Args: organization_id: Organization UUID; relative_path: Relative path within bucket
Returns: File contents as bytes
Raises: ClientError: If file doesn't exist or other S3 errors occur
download_to_file
def download_to_file(self, organization_id: str, relative_path: str, file_obj: BinaryIO) -> None
Download a file from Spaces to a file-like object.
Args: organization_id: Organization UUID; relative_path: Relative path within bucket; file_obj: File-like object to write to
Raises: ClientError: If file doesn't exist or other S3 errors occur
upload_file
def upload_file(self, organization_id: str, relative_path: str, content: str | bytes, content_type: str = 'application/octet-stream') -> None
Upload a file to Spaces.
Args: organization_id: Organization UUID; relative_path: Relative path within bucket (e.g., "path/to/file.md"); content: File content as string or bytes; content_type: MIME type of the content (default: application/octet-stream)
Raises: ClientError: If upload fails
file_exists_infrastructure
def file_exists_infrastructure(self, relative_path: str) -> bool
Check if a file exists in the infrastructure bucket.
Args: relative_path: Relative path within infrastructure bucket
Returns: True if file exists, False otherwise
download_file_from_infrastructure
def download_file_from_infrastructure(self, relative_path: str) -> bytes
Download a file from the infrastructure bucket.
Args: relative_path: Relative path within infrastructure bucket
Returns: File contents as bytes
Raises: ClientError: If file doesn't exist or other S3 errors occur
upload_file_to_infrastructure
def upload_file_to_infrastructure(self, relative_path: str, content: str | bytes, content_type: str = 'application/octet-stream') -> None
Upload a file to the infrastructure bucket.
Args: relative_path: Relative path within infrastructure bucket; content: File content as string or bytes; content_type: MIME type of the content (default: application/octet-stream)
Raises: ClientError: If upload fails
upload_directory_to_infrastructure
def upload_directory_to_infrastructure(self, local_dir: Path, remote_prefix: str) -> None
Upload a directory to the infrastructure bucket as a tar.gz archive.
Args: local_dir: Local directory path to upload; remote_prefix: Remote path prefix (e.g., "tokenizers/ibm-granite--granite-embedding-english-r2.tar.gz")
Raises: ClientError: If upload fails
download_directory_from_infrastructure
def download_directory_from_infrastructure(self, remote_prefix: str, local_dir: Path) -> None
Download a directory from the infrastructure bucket (tar.gz archive) and extract it.
Args: remote_prefix: Remote path prefix (e.g., "tokenizers/ibm-granite--granite-embedding-english-r2.tar.gz"); local_dir: Local directory path to extract to
Raises: ClientError: If download fails