From 9f86971fad711bb0dc091b339d01d52282a293de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Van=C4=8Dura?= Date: Sat, 8 Mar 2025 17:00:53 +0100 Subject: [PATCH] Actor: Replace Docling CLI with docling-serve API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit transitions the Actor from using the full Docling CLI package to the more lightweight docling-serve API. Key changes include: - Redesign Dockerfile to use docling-serve as base image - Update actor.sh to communicate with API instead of running CLI commands - Improve content type handling for various output formats - Update input schema to align with API parameters - Reduce Docker image size from ~6GB to ~600MB - Update documentation and changelog to reflect architectural changes The image size reduction will make the Actor more cost-effective for users while maintaining all existing functionality including OCR capabilities. Issue: No official docling-serve Docker image is currently available, which will be addressed in a future commit. Signed-off-by: Václav Vančura --- .actor/CHANGELOG.md | 18 +++++ .actor/Dockerfile | 69 ++++++---------- .actor/README.md | 69 +++++++--------- .actor/actor.sh | 170 ++++++++++++++++++++++----------------- .actor/input_schema.json | 8 +- 5 files changed, 168 insertions(+), 166 deletions(-) diff --git a/.actor/CHANGELOG.md b/.actor/CHANGELOG.md index aac1191e..5df0a420 100644 --- a/.actor/CHANGELOG.md +++ b/.actor/CHANGELOG.md @@ -5,6 +5,24 @@ All notable changes to the Docling Actor will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [1.1.0] - 2025-03-15 + +### Changed + +- Switched from full Docling CLI to docling-serve API +- Dramatically reduced Docker image size (from ~6GB to ~600MB) +- Improved API compatibility with docling-serve +- Better content type handling for different output formats +- Updated error handling to align with API responses + +### Technical Details + +- Actor Specification v1 +- Using ds4sd/docling-serve:latest base image +- Node.js 20.x for Apify CLI +- Eliminated Python dependencies +- Simplified Docker build process + ## [1.0.0] - 2025-02-07 ### Added diff --git a/.actor/Dockerfile b/.actor/Dockerfile index 1ce4426f..e8b0f73c 100644 --- a/.actor/Dockerfile +++ b/.actor/Dockerfile @@ -1,57 +1,36 @@ -ARG ACTOR_PATH_IN_DOCKER_CONTEXT - -FROM python:3.11-slim-bookworm +FROM ds4sd/docling-serve:latest LABEL maintainer="Vaclav Vancura <@vancura>" LABEL description="Apify Actor for document processing using Docling" -LABEL version="1.0.0" +LABEL version="1.1.0" -RUN groupadd -r appuser && useradd -r -g appuser -s /sbin/nologin appuser && \ - \ - apt-get update && apt-get install -y --no-install-recommends bash curl file git gpg jo jq procps xz-utils && \ - \ - mkdir -p /etc/apt/keyrings && \ - curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \ - echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \ - \ - apt-get update && apt-get install -y nodejs && apt-get clean && \ - \ - rm -rf /var/lib/apt/lists/* && \ - \ - pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir docling==2.17.0 && \ - \ - npm install -g npm@latest && \ - npm install -g apify-cli && \ - npm cache clean --force && \ - \ - mkdir -p /home/appuser && \ - chown -R appuser:appuser /home/appuser && \ - \ - apt-get update && apt-get install -y --no-install-recommends \ - time \ +# Install necessary dependencies 
for the Apify Actor +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + gpg \ + jq \ + xz-utils \ + jo \ procps \ - && apt-get clean \ && rm -rf /var/lib/apt/lists/* +# Install Node.js for Apify CLI +RUN mkdir -p /etc/apt/keyrings && \ + curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \ + echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \ + apt-get update && apt-get install -y nodejs && \ + rm -rf /var/lib/apt/lists/* && \ + npm install -g npm@latest && \ + npm install -g apify-cli && \ + npm cache clean --force + +# Set the working directory WORKDIR /app -RUN mkdir -p /tmp/runtime-root && \ - chmod 0700 /tmp/runtime-root && \ - chown -R appuser:appuser /tmp/runtime-root /app +# Copy actor files +COPY --chown=1000:1000 .actor/ .actor/ -COPY --chown=appuser:appuser .actor/ .actor/ -COPY --chown=appuser:appuser . . - -USER appuser - -# Set environment variables for better resource management -ENV PYTHONUNBUFFERED=1 -ENV MALLOC_ARENA_MAX=2 -ENV EASYOCR_DOWNLOAD_CACHE="/tmp/easyocr" - -# Create cache directory for EasyOCR -RUN mkdir -p /tmp/easyocr && \ - chown -R appuser:appuser /tmp/easyocr +# Security best practice: run as non-root user (docling-serve already uses a non-root user) +USER 1000 ENTRYPOINT [".actor/actor.sh"] diff --git a/.actor/README.md b/.actor/README.md index 445ad7fc..d57aa680 100644 --- a/.actor/README.md +++ b/.actor/README.md @@ -17,14 +17,14 @@ This Actor (specification v1) wraps the [Docling project](https://ds4sd.github.i 5. [Performance & Resources](#performance--resources) 6. [Troubleshooting](#troubleshooting) 7. [Local Development](#local-development) -8. [Requirements & Installation](#requirements--installation) +8. [Architecture](#architecture) 9. [License](#license) 10. [Acknowledgments](#acknowledgments) 11. 
[Security Considerations](#security-considerations) ## Features -- Runs Docling v2.17.0 in a fully managed environment on Apify +- Leverages the lightweight docling-serve API for efficient document processing - Processes multiple document formats: - PDF documents (scanned or digital) - Microsoft Office files (DOCX, XLSX, PPTX) @@ -169,11 +169,10 @@ Content of section 2... The Actor maintains detailed processing logs including: -- Memory usage statistics +- API request and response details - Processing steps and timing - Error messages and stack traces - Input validation results -- OCR processing details (when enabled) Access logs via: @@ -183,18 +182,14 @@ apify key-value-stores get-record DOCLING_LOG ## Performance & Resources -- **Docker Image Size**: ~6 GB (includes OCR libraries and ML models) +- **Docker Image Size**: ~600 MB - **Memory Requirements**: - - Minimum: 4 GB RAM - - Recommended: 8 GB RAM for large documents -- **Memory Monitoring**: - - Real-time memory usage tracking during processing - - Detailed memory statistics in `DOCLING_LOG` - - Automatic failure detection for out-of-memory situations + - Minimum: 2 GB RAM + - Recommended: 4 GB RAM for large or complex documents - **Processing Time**: - - Simple documents: 30-60 seconds - - Complex PDFs with OCR: 2-5 minutes - - Large documents (100+ pages): 5-15 minutes + - Simple documents: 15-30 seconds + - Complex PDFs with OCR: 1-3 minutes + - Large documents (100+ pages): 3-10 minutes ## Troubleshooting @@ -210,10 +205,10 @@ Common issues and solutions: - Check if the image quality is sufficient - Try processing with OCR disabled -3. **Memory Issues** - - For large documents, try splitting them into smaller chunks - - Consider using a higher-memory compute unit - - Disable OCR if not strictly necessary +3. **API Response Issues** + - Check the logs for detailed error messages + - Ensure the document format is supported + - Verify the URL is correctly formatted 4. 
**Output Format Issues** - Verify the output format is supported @@ -227,7 +222,6 @@ The Actor implements comprehensive error handling: - Input validation for document URLs and parameters - Detailed error messages in `DOCLING_LOG` - Proper exit codes for different failure scenarios -- Memory monitoring and out-of-memory detection - Automatic cleanup on failure - Dataset records with processing status @@ -242,7 +236,6 @@ If you wish to develop or modify this Actor locally: - `actor.json` - Actor configuration and metadata - `actor.sh` - Main execution script - `input_schema.json` - Input parameter definitions - - `.dockerignore` - Build optimization rules 4. Run the Actor locally using: ```bash @@ -257,30 +250,24 @@ If you wish to develop or modify this Actor locally: ├── actor.json # Actor metadata ├── actor.sh # Execution script ├── input_schema.json # Input parameters -├── .dockerignore # Build exclusions └── README.md # This documentation ``` -## Requirements & Installation +## Architecture -- An [Apify account](https://console.apify.com/?fpr=docling) (free tier available) -- For local development: - - Docker installed - - Apify CLI (`npm install -g apify-cli`) - - Git for version control -- The Actor's Docker image (~6 GB) includes: - - Python 3.11 with optimized caching (.pyc, .pyo excluded) - - Node.js 20.x - - Docling v2.17.0 and its dependencies - - OCR libraries and ML models +This Actor uses a lightweight architecture based on the `ds4sd/docling-serve` Docker image: -### Build Optimizations - -The Actor uses several optimizations to maintain efficiency: - -- Python cache files (`pycache`, `.pyc`, `.pyo`, `.pyd`) are excluded -- Development artifacts (`.git`, `.env`, `.venv`) are ignored -- Log and test files (`*.log`, `.pytest_cache`, `.coverage`) are excluded from builds +- **Base Image**: `ds4sd/docling-serve:latest` (~600 MB) +- **API Communication**: Uses the RESTful API provided by docling-serve on port 8080 +- **Request Flow**: + 1. 
Actor receives the input parameters + 2. Creates a JSON payload for the docling-serve API + 3. Makes a POST request to the /convert endpoint + 4. Processes the response and stores it in the key-value store +- **Dependencies**: + - Node.js for Apify CLI + - Essential Linux tools (curl, jq, etc.) +- **Security**: Runs as a non-root user for enhanced security ## License @@ -288,12 +275,12 @@ This wrapper project is under the MIT License, matching the original Docling lic ## Acknowledgments -- [Docling](https://ds4sd.github.io/docling/) codebase by IBM +- [Docling](https://ds4sd.github.io/docling/) and [docling-serve](https://github.com/DS4SD/docling-serve) by IBM - [Apify](https://apify.com/?fpr=docling) for the serverless actor environment ## Security Considerations -- Actor runs under a non-root user (appuser) for enhanced security +- Actor runs under a non-root user for enhanced security - Input URLs are validated before processing - Temporary files are securely managed and cleaned up - Process isolation through Docker containerization diff --git a/.actor/actor.sh b/.actor/actor.sh index 3e808472..752bfd7f 100755 --- a/.actor/actor.sh +++ b/.actor/actor.sh @@ -17,13 +17,13 @@ exec 2> >(tee -a "$LOG_FILE" >&2) trap 'echo "Error on line $LINENO"' ERR set -e -# --- Validate Docling installation --- +# --- Define error codes --- -# Check if Docling CLI is installed and in PATH. -if ! command -v docling &>/dev/null; then - echo "Error: Docling CLI is not installed or not in PATH" - exit 1 -fi +readonly ERR_INVALID_INPUT=10 +readonly ERR_URL_INACCESSIBLE=11 +readonly ERR_DOCLING_FAILED=12 +readonly ERR_OUTPUT_MISSING=13 +readonly ERR_STORAGE_FAILED=14 # --- Input parsing --- @@ -36,24 +36,10 @@ INPUT="$(apify actor:get-input || { DOCUMENT_URL="$(echo "${INPUT}" | jq -r '.documentUrl')" OUTPUT_FORMAT="$(echo "${INPUT}" | jq -r '.outputFormat')" -OUTPUT_NAME="output_file.${OUTPUT_FORMAT}" - -# Define error codes. 
-readonly ERR_INVALID_INPUT=10 -readonly ERR_URL_INACCESSIBLE=11 -readonly ERR_DOCLING_FAILED=12 -readonly ERR_OUTPUT_MISSING=13 -readonly ERR_STORAGE_FAILED=14 - -# Update error handling with codes. -if [ -z "$DOCUMENT_URL" ]; then - echo "Error: Missing document URL. Please provide 'documentUrl' in the input" - apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Missing document URL\"}" || true - exit $ERR_INVALID_INPUT -fi +OCR_ENABLED="$(echo "${INPUT}" | jq -r '.ocr')" # If no output format is specified, default to 'md'. -if [ -z "$OUTPUT_FORMAT" ]; then +if [ -z "$OUTPUT_FORMAT" ] || [ "$OUTPUT_FORMAT" = "null" ]; then OUTPUT_FORMAT="md" echo "No output format specified. Defaulting to 'md'" fi @@ -62,10 +48,20 @@ fi case "$OUTPUT_FORMAT" in md | json | html | text | doctags) ;; *) echo "Error: Invalid output format '$OUTPUT_FORMAT'. Supported formats are 'md', 'json', 'html', 'text', and 'doctags'" - exit 1 + apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Invalid output format\"}" || true + exit $ERR_INVALID_INPUT ;; esac +# Set output filename based on format. +OUTPUT_NAME="output_file.${OUTPUT_FORMAT}" + +if [ -z "$DOCUMENT_URL" ] || [ "$DOCUMENT_URL" = "null" ]; then + echo "Error: Missing document URL. Please provide 'documentUrl' in the input" + apify actor:push-data "{\"status\": \"error\", \"error\": \"Missing document URL\"}" || true + exit $ERR_INVALID_INPUT +fi + # Validate URL is accessible. echo "Validating document URL..." if ! curl --output /dev/null --silent --head --fail "${DOCUMENT_URL}"; then @@ -75,65 +71,81 @@ if ! curl --output /dev/null --silent --head --fail "${DOCUMENT_URL}"; then exit $ERR_URL_INACCESSIBLE fi -# --- Build Docling command --- +# --- Create JSON payload for docling-serve API --- -DOC_CONVERT_CMD="docling --verbose '${DOCUMENT_URL}' --to '${OUTPUT_FORMAT}'" +echo "Creating API request for docling-serve..." 
-# If OCR is enabled, add the OCR flag to the command. -if [ "$(echo "${INPUT}" | jq -r '.ocr')" = "true" ]; then - DOC_CONVERT_CMD="${DOC_CONVERT_CMD} --ocr" +# Set OCR flag. +if [ "$OCR_ENABLED" = "true" ]; then + OCR_VALUE="true" +else + OCR_VALUE="false" fi -# Print the exact command that will be executed. -echo "Debug: Command string: $DOC_CONVERT_CMD" -echo "Debug: Full command: /usr/bin/time -v bash -c \"$DOC_CONVERT_CMD\"" - -# --- Process document with Docling --- - -echo "Processing document with Docling CLI..." -echo "Running: $DOC_CONVERT_CMD" - -# Create a timestamp file to ensure the document is processed only once. -TIMESTAMP_FILE="/tmp/docling.timestamp" -touch "$TIMESTAMP_FILE" || { - echo "Error: Failed to create timestamp file" - exit 1 +# Create a temporary file for the JSON payload. +REQUEST_FILE="/tmp/docling_request.json" +cat > "$REQUEST_FILE" << EOF +{ + "document_url": "${DOCUMENT_URL}", + "output_format": "${OUTPUT_FORMAT}", + "ocr": ${OCR_VALUE} } +EOF -echo "Starting document processing with memory monitoring..." -/usr/bin/time -v bash -c "${DOC_CONVERT_CMD}" 2>&1 | tee -a "$LOG_FILE" -DOCLING_EXIT_CODE=${PIPESTATUS[0]} +echo "Request payload:" +cat "$REQUEST_FILE" -# Check if the command failed and handle the error. -if [ $DOCLING_EXIT_CODE -ne 0 ]; then - echo "Error: Docling command failed with exit code $DOCLING_EXIT_CODE" - echo "Memory usage information:" - free -h - df -h +# --- Call docling-serve API --- + +echo "Calling docling-serve API (localhost:8080/convert)..." + +RESPONSE_FILE="/tmp/docling_response.json" +HTTP_CODE=$(curl -s -o "$RESPONSE_FILE" -w "%{http_code}" -X POST \ + -H "Content-Type: application/json" \ + -d @"$REQUEST_FILE" \ + http://localhost:8080/convert) + +echo "API Response Status Code: $HTTP_CODE" + +# Check response status code. 
+if [ "$HTTP_CODE" -ne 200 ]; then + echo "Error: docling-serve API returned error code $HTTP_CODE" + if [ -f "$RESPONSE_FILE" ]; then + echo "Error response:" + cat "$RESPONSE_FILE" + fi + + ERROR_MSG=$(jq -r '.error // "Unknown API error"' "$RESPONSE_FILE" 2>/dev/null || echo "Unknown API error") + + apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"${ERROR_MSG}\"}" || true exit $ERR_DOCLING_FAILED fi -GENERATED_FILE="$(find . -type f -name "*.${OUTPUT_FORMAT}" -newer "$TIMESTAMP_FILE")" +# --- Process API response --- -# If no generated file is found, exit with an error. -if [ -z "$GENERATED_FILE" ]; then - echo "Error: Could not find generated output file with extension .$OUTPUT_FORMAT" +echo "Processing API response..." + +# Extract content from response and save to output file. +if ! jq -r '.content' "$RESPONSE_FILE" > "$OUTPUT_NAME" 2>/dev/null; then + echo "Error: Failed to parse API response or extract content" + echo "Response content:" + cat "$RESPONSE_FILE" + + apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Failed to parse API response\"}" || true exit $ERR_OUTPUT_MISSING fi -mv "${GENERATED_FILE}" "${OUTPUT_NAME}" - -# --- Validate output --- - -# If the output file is not found, exit with an error. +# Validate output file. if [ ! -f "$OUTPUT_NAME" ]; then - echo "Error: Expected output file '$OUTPUT_NAME' was not generated" + echo "Error: Output file was not created" + apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Output file not created\"}" || true exit $ERR_OUTPUT_MISSING fi -# If the output file is empty, exit with an error. +# Validate output file is not empty. if [ ! 
-s "$OUTPUT_NAME" ]; then - echo "Error: Generated output file '$OUTPUT_NAME' is empty" + echo "Error: Output file is empty" + apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Output file is empty\"}" || true exit $ERR_OUTPUT_MISSING fi @@ -142,7 +154,18 @@ echo "Document successfully processed and exported as '$OUTPUT_FORMAT' to file: # --- Store output and log in key-value store --- echo "Pushing processed document to key-value store (record key: OUTPUT_RESULT)..." -apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" <"$OUTPUT_NAME" || { + +CONTENT_TYPE="" +case "$OUTPUT_FORMAT" in + md) CONTENT_TYPE="text/markdown" ;; + json) CONTENT_TYPE="application/json" ;; + html) CONTENT_TYPE="text/html" ;; + text) CONTENT_TYPE="text/plain" ;; + doctags) CONTENT_TYPE="application/json" ;; + *) CONTENT_TYPE="text/plain" ;; +esac + +apify actor:set-value "OUTPUT_RESULT" --contentType "$CONTENT_TYPE" < "$OUTPUT_NAME" || { echo "Error: Failed to push the output document to the key-value store" exit $ERR_STORAGE_FAILED } @@ -154,24 +177,19 @@ apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"output_file\": \"${RESUL echo "Warning: Failed to push data to dataset" } -if [ -f "$LOG_FILE" ]; then - if [ -s "$LOG_FILE" ]; then - echo "Log file is not empty, pushing to key-value store (record key: DOCLING_LOG)..." - apify actor:set-value "DOCLING_LOG" --contentType "text/plain" <"$LOG_FILE" || { - echo "Warning: Failed to push the log file to the key-value store" - } - else - echo "Warning: docling.log file exists but is empty" - fi -else - echo "Warning: No docling.log file found" +# Store logs. +if [ -f "$LOG_FILE" ] && [ -s "$LOG_FILE" ]; then + echo "Pushing log file to key-value store (record key: DOCLING_LOG)..." 
+ apify actor:set-value "DOCLING_LOG" --contentType "text/plain" < "$LOG_FILE" || { + echo "Warning: Failed to push the log file to the key-value store" + } fi # --- Cleanup temporary files --- cleanup() { local exit_code=$? - rm -f "$TIMESTAMP_FILE" || true + rm -f "$REQUEST_FILE" "$RESPONSE_FILE" || true exit $exit_code } diff --git a/.actor/input_schema.json b/.actor/input_schema.json index f58ef1f1..38df5303 100644 --- a/.actor/input_schema.json +++ b/.actor/input_schema.json @@ -1,20 +1,20 @@ { "title": "Docling Actor Input", - "description": "Options for running Docling CLI on the Apify platform.", + "description": "Options for processing documents with Docling via the docling-serve API.", "type": "object", "schemaVersion": 1, "properties": { "documentUrl": { "title": "Document URL", "type": "string", - "description": "URL of the document to process with Docling. Supported formats: images, 'pdf', 'docx', 'pptx', 'xlsx, 'html', 'md', 'xml_pubmed', 'asciidoc', 'xml_uspto'.", + "description": "URL of the document to process. Supported formats: PDF, DOCX, PPTX, XLSX, HTML, MD, XML, images, and more.", "prefill": "https://arxiv.org/pdf/2408.09869.pdf", "editor": "textfield" }, "outputFormat": { "title": "Output Format", "type": "string", - "description": "Specifies the desired output format after processing the document. Supported formats: 'md', 'json', 'html', 'text', 'doctags'.", + "description": "Desired output format after processing the document.", "enum": ["md", "json", "html", "text", "doctags"], "default": "md", "editor": "select" @@ -22,7 +22,7 @@ "ocr": { "title": "Enable OCR", "type": "boolean", - "description": "If true, OCR will be applied to scanned PDFs for text recognition.", + "description": "If enabled, OCR will be applied to scanned documents for text recognition.", "default": true } },