docling/.actor/actor.sh
Václav Vančura 9f86971fad Actor: Replace Docling CLI with docling-serve API
This commit transitions the Actor from using the full Docling CLI package to the more lightweight docling-serve API. Key changes include:

- Redesign Dockerfile to use docling-serve as base image
- Update actor.sh to communicate with API instead of running CLI commands
- Improve content type handling for various output formats
- Update input schema to align with API parameters
- Reduce Docker image size from ~6GB to ~600MB
- Update documentation and changelog to reflect architectural changes

The image size reduction will make the Actor more cost-effective for users while maintaining all existing functionality including OCR capabilities.

Issue: No official docling-serve Docker image is currently available, which will be addressed in a future commit.
Signed-off-by: Václav Vančura <commit@vancura.dev>
2025-03-13 10:39:22 +01:00

200 lines
6.1 KiB
Bash
Executable File

#!/bin/bash
# --- Setup Error Handling ---
# The log file must exist before stdout/stderr are redirected into it.
LOG_FILE="/tmp/docling.log"
if ! touch "$LOG_FILE"; then
  echo "Fatal: Cannot create log file at $LOG_FILE"
  exit 1
fi
# Mirror both output streams into the log file while still emitting them on
# their original descriptors (tee appends, so both streams share one file).
exec > >(tee -a "$LOG_FILE")
exec 2> >(tee -a "$LOG_FILE" >&2)
# Report the failing line number, then abort on the first unhandled failure.
trap 'echo "Error on line $LINENO"' ERR
set -o errexit
# --- Define error codes ---
# Exit statuses used by the script, one per failure category. All five are
# declared read-only in a single statement.
readonly \
  ERR_INVALID_INPUT=10 \
  ERR_URL_INACCESSIBLE=11 \
  ERR_DOCLING_FAILED=12 \
  ERR_OUTPUT_MISSING=13 \
  ERR_STORAGE_FAILED=14
# --- Input parsing ---
# Fetch the Actor input (JSON) and extract the fields this script uses:
# documentUrl, outputFormat and ocr.
echo "Parsing actor input..."
# The failure message is written from the parent shell to stderr. Inside the
# command substitution stdout is captured into INPUT, so a message echoed there
# would be swallowed instead of being shown and logged.
if ! INPUT="$(apify actor:get-input)"; then
  echo "Failed to get input" >&2
  exit 1
fi
# With -r, jq prints the literal string "null" for a missing key; the
# emptiness checks on these variables account for that value.
DOCUMENT_URL="$(jq -r '.documentUrl' <<< "${INPUT}")"
OUTPUT_FORMAT="$(jq -r '.outputFormat' <<< "${INPUT}")"
OCR_ENABLED="$(jq -r '.ocr' <<< "${INPUT}")"
# If no output format is specified, default to 'md'.
if [[ -z "$OUTPUT_FORMAT" || "$OUTPUT_FORMAT" == "null" ]]; then
  OUTPUT_FORMAT="md"
  echo "No output format specified. Defaulting to 'md'"
fi
# Validate the output format.
# Only the formats listed in the first arm are accepted; anything else is
# reported to the dataset and aborts with ERR_INVALID_INPUT.
case "$OUTPUT_FORMAT" in
  md | json | html | text | doctags) ;;
  *)
    echo "Error: Invalid output format '$OUTPUT_FORMAT'. Supported formats are 'md', 'json', 'html', 'text', and 'doctags'"
    # Build the record with jq so quotes or backslashes in the user-supplied
    # URL are escaped instead of producing invalid JSON. Reporting is
    # best-effort: a failed push must not mask the real exit code.
    apify actor:push-data "$(jq -nc --arg url "$DOCUMENT_URL" \
      '{url: $url, status: "error", error: "Invalid output format"}')" || true
    exit "$ERR_INVALID_INPUT"
    ;;
esac
# Set output filename based on format.
OUTPUT_NAME="output_file.${OUTPUT_FORMAT}"
# jq -r yields the string "null" for a missing key, so treat it like empty.
if [[ -z "$DOCUMENT_URL" || "$DOCUMENT_URL" == "null" ]]; then
  echo "Error: Missing document URL. Please provide 'documentUrl' in the input"
  apify actor:push-data '{"status": "error", "error": "Missing document URL"}' || true
  exit "$ERR_INVALID_INPUT"
fi
# Validate URL is accessible.
# Sends a HEAD request; --fail makes curl exit non-zero on HTTP errors (>= 400).
# The URL is passed through --url so that a value beginning with '-' cannot be
# interpreted as a curl option.
echo "Validating document URL..."
if ! curl --output /dev/null --silent --head --fail --url "${DOCUMENT_URL}"; then
  echo "Error: Unable to access document at URL: ${DOCUMENT_URL}"
  echo "Please ensure the URL is valid and publicly accessible."
  # jq escapes the URL so the dataset record stays valid JSON whatever
  # characters the input contains. Reporting is best-effort.
  apify actor:push-data "$(jq -nc --arg url "$DOCUMENT_URL" \
    '{url: $url, status: "error", error: "URL inaccessible"}')" || true
  exit "$ERR_URL_INACCESSIBLE"
fi
# --- Create JSON payload for docling-serve API ---
echo "Creating API request for docling-serve..."
# Set OCR flag.
# Normalise to a JSON boolean literal: only the exact string "true" enables OCR.
if [[ "$OCR_ENABLED" == "true" ]]; then
  OCR_VALUE="true"
else
  OCR_VALUE="false"
fi
# Create a temporary file for the JSON payload.
# The document is generated by jq rather than a heredoc so that quotes,
# backslashes or control characters in the user-supplied URL are escaped and
# cannot corrupt or inject into the request body.
REQUEST_FILE="/tmp/docling_request.json"
jq -n \
  --arg document_url "$DOCUMENT_URL" \
  --arg output_format "$OUTPUT_FORMAT" \
  --argjson ocr "$OCR_VALUE" \
  '{document_url: $document_url, output_format: $output_format, ocr: $ocr}' \
  > "$REQUEST_FILE"
echo "Request payload:"
cat "$REQUEST_FILE"
# --- Call docling-serve API ---
# POST the request file to the local docling-serve endpoint, store the body in
# RESPONSE_FILE and capture the HTTP status code printed by --write-out.
echo "Calling docling-serve API (localhost:8080/convert)..."
RESPONSE_FILE="/tmp/docling_response.json"
# A transport failure (e.g. connection refused) makes curl exit non-zero. Under
# `set -e` that would abort right here and skip the error reporting below, so
# the exit status is captured instead of being allowed to terminate the script.
CURL_STATUS=0
HTTP_CODE="$(curl -s -o "$RESPONSE_FILE" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" \
  -d @"$REQUEST_FILE" \
  http://localhost:8080/convert)" || CURL_STATUS=$?
if ((CURL_STATUS != 0)); then
  echo "Warning: curl exited with status $CURL_STATUS"
fi
# No status at all (curl could not run) is reported the same way curl reports
# "no response received".
HTTP_CODE="${HTTP_CODE:-000}"
echo "API Response Status Code: $HTTP_CODE"
# Check response status code.
# String comparison: a numeric test on an empty or non-numeric value raises a
# test error, which `if` would treat as "condition false", i.e. as success.
if [[ "$HTTP_CODE" != "200" ]]; then
  echo "Error: docling-serve API returned error code $HTTP_CODE"
  if [[ -f "$RESPONSE_FILE" ]]; then
    echo "Error response:"
    cat "$RESPONSE_FILE"
  fi
  # Fall back to a generic message when the body is missing or not JSON.
  ERROR_MSG="$(jq -r '.error // "Unknown API error"' "$RESPONSE_FILE" 2>/dev/null || echo "Unknown API error")"
  # jq escapes both the URL and the server-provided message so the dataset
  # record is always valid JSON. Reporting is best-effort.
  apify actor:push-data "$(jq -nc --arg url "$DOCUMENT_URL" --arg error "$ERROR_MSG" \
    '{url: $url, status: "error", error: $error}')" || true
  exit "$ERR_DOCLING_FAILED"
fi
# --- Process API response ---
echo "Processing API response..."
# Extract content from response and save to output file.
# `-e` together with `// empty` makes jq exit non-zero when `.content` is
# missing or null. Without it jq would print the literal text "null", exit 0,
# and that text would be stored as if it were the converted document.
if ! jq -er '.content // empty' "$RESPONSE_FILE" > "$OUTPUT_NAME" 2>/dev/null; then
  echo "Error: Failed to parse API response or extract content"
  echo "Response content:"
  cat "$RESPONSE_FILE"
  # Records are built with jq so the user-supplied URL is JSON-escaped;
  # reporting is best-effort and must not mask the real exit code.
  apify actor:push-data "$(jq -nc --arg url "$DOCUMENT_URL" \
    '{url: $url, status: "error", error: "Failed to parse API response"}')" || true
  exit "$ERR_OUTPUT_MISSING"
fi
# Validate output file.
if [[ ! -f "$OUTPUT_NAME" ]]; then
  echo "Error: Output file was not created"
  apify actor:push-data "$(jq -nc --arg url "$DOCUMENT_URL" \
    '{url: $url, status: "error", error: "Output file not created"}')" || true
  exit "$ERR_OUTPUT_MISSING"
fi
# Validate output file is not empty.
if [[ ! -s "$OUTPUT_NAME" ]]; then
  echo "Error: Output file is empty"
  apify actor:push-data "$(jq -nc --arg url "$DOCUMENT_URL" \
    '{url: $url, status: "error", error: "Output file is empty"}')" || true
  exit "$ERR_OUTPUT_MISSING"
fi
echo "Document successfully processed and exported as '$OUTPUT_FORMAT' to file: $OUTPUT_NAME"
# --- Store output and log in key-value store ---
echo "Pushing processed document to key-value store (record key: OUTPUT_RESULT)..."
# Choose the MIME type for the stored record. Formats without a dedicated arm
# (including 'text') are stored as plain text.
# NOTE(review): 'doctags' is stored as application/json - confirm this matches
# what the API actually returns for that format.
case "$OUTPUT_FORMAT" in
  md) CONTENT_TYPE="text/markdown" ;;
  html) CONTENT_TYPE="text/html" ;;
  json | doctags) CONTENT_TYPE="application/json" ;;
  *) CONTENT_TYPE="text/plain" ;;
esac
# Upload the converted document; a failure here is fatal because the run would
# otherwise report success without a stored result.
apify actor:set-value "OUTPUT_RESULT" --contentType "$CONTENT_TYPE" < "$OUTPUT_NAME" || {
  echo "Error: Failed to push the output document to the key-value store"
  exit "$ERR_STORAGE_FAILED"
}
# Create dataset record with processing results.
RESULT_URL="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/OUTPUT_RESULT"
echo "Adding record to dataset..."
# The record is generated by jq so the user-supplied URL is JSON-escaped; with
# plain string interpolation a quote or backslash in it would produce an
# invalid record. A failed push only produces a warning.
apify actor:push-data "$(jq -nc --arg url "$DOCUMENT_URL" --arg output_file "$RESULT_URL" \
  '{url: $url, output_file: $output_file, status: "success"}')" || {
  echo "Warning: Failed to push data to dataset"
}
# Store logs.
# Upload the run log only when it is a regular, non-empty file. A failed upload
# is reported as a warning and does not abort the script.
if [[ -f "$LOG_FILE" && -s "$LOG_FILE" ]]; then
  echo "Pushing log file to key-value store (record key: DOCLING_LOG)..."
  if ! apify actor:set-value "DOCLING_LOG" --contentType "text/plain" < "$LOG_FILE"; then
    echo "Warning: Failed to push the log file to the key-value store"
  fi
fi
# --- Cleanup temporary files ---
# EXIT handler: deletes the request/response temp files and re-exits with the
# status the script was already exiting with.
# NOTE(review): the trap is only registered at this point, so exits that happen
# before this line do not run the handler.
cleanup() {
  # Capture the pending exit status first; rm below would overwrite $?.
  local status=$?
  rm -f "$REQUEST_FILE" "$RESPONSE_FILE" || true
  exit "$status"
}
trap cleanup EXIT
echo "Processing completed successfully!"
echo "You can find your results at: ${RESULT_URL}"